Index of Section 3 Manual Pages

Interix / SUATcl_UniChar.3Interix / SUA

Utf(3)                Tcl Library Procedures               Utf(3)



_________________________________________________________________

NAME
       Tcl_UniChar,   Tcl_UniCharCaseMatch,  Tcl_UniCharNcasecmp,
       Tcl_UniCharToUtf,   Tcl_UtfToUniChar,   Tcl_UniCharToUtfD-
       String,      Tcl_UtfToUniCharDString,      Tcl_UniCharLen,
       Tcl_UniCharNcmp,   Tcl_UtfCharComplete,   Tcl_NumUtfChars,
       Tcl_UtfFindFirst,  Tcl_UtfFindLast,  Tcl_UtfNext, Tcl_Utf-
       Prev, Tcl_UniCharAtIndex, Tcl_UtfAtIndex, Tcl_UtfBackslash
       - routines for manipulating UTF-8 strings.

SYNOPSIS
       #include <tcl.h>

       typedef ... Tcl_UniChar;

       int
       Tcl_UniCharToUtf(ch, buf)

       int
       Tcl_UtfToUniChar(src, chPtr)

       char *                                                     |
       Tcl_UniCharToUtfDString(uniStr, numChars, dstPtr)          |

       Tcl_UniChar *                                              |
       Tcl_UtfToUniCharDString(src, len, dstPtr)                  |

       int
       Tcl_UniCharLen(uniStr)

       int
       Tcl_UniCharNcmp(uniStr, uniStr, num)

       int                                                        |
       Tcl_UniCharNcasecmp(uniStr, uniStr, num)                   |

       int                                                        |
       Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)           |

       int
       Tcl_UtfNcmp(src, src, num)

       int
       Tcl_UtfNcasecmp(src, src, num)

       int
       Tcl_UtfCharComplete(src, len)

       int
       Tcl_NumUtfChars(src, len)

       CONST char *                                               |
       Tcl_UtfFindFirst(src, ch)                                  |

       CONST char *                                               |
       Tcl_UtfFindLast(src, ch)                                   |

       CONST char *                                               |
       Tcl_UtfNext(src)                                           |

       CONST char *                                               |
       Tcl_UtfPrev(src, start)                                    |

       Tcl_UniChar
       Tcl_UniCharAtIndex(src, index)

       CONST char *                                               |
       Tcl_UtfAtIndex(src, index)                                 |

       int
       Tcl_UtfBackslash(src, readPtr, dst)

ARGUMENTS
       char                *buf       (out)     Buffer  in  which
                                                the UTF-8  repre-
                                                sentation  of the
                                                Tcl_UniChar    is
                                                stored.   At most
                                                TCL_UTF_MAX bytes
                                                are stored in the
                                                buffer.

       int                 ch         (in)      The   Tcl_UniChar
                                                to  be  converted
                                                or examined.

       Tcl_UniChar         *chPtr     (out)     Filled  with  the
                                                Tcl_UniChar  rep-
                                                resented  by  the
                                                head of the UTF-8
                                                string.

       CONST char          *src       (in)      Pointer   to    a
                                                UTF-8 string.

       CONST Tcl_UniChar   *uniStr    (in)      A null-terminated
                                                Unicode string.

       CONST Tcl_UniChar   *uniPattern(in)      A null-terminated
                                                Unicode string.

       int                 len        (in)      The length of the
                                                UTF-8  string  in
                                                bytes  (not UTF-8
                                                characters).   If
                                                negative,     all
                                                bytes up  to  the
                                                first  null  byte
                                                are used.

       int                 numChars   (in)      The length of the
                                                Unicode string in
                                                characters.  Must
                                                be  greater  than
                                                or equal to 0.

       Tcl_DString         *dstPtr    (in/out)  A  pointer  to  a
                                                previously-ini-
                                                tialized
                                                Tcl_DString.

       unsigned long       num        (in)      The   number   of
                                                characters     to
                                                compare.

       CONST char          *start     (in)      Pointer   to  the
                                                beginning  of   a
                                                UTF-8 string.

       int                 index      (in)      The  index  of  a
                                                character    (not
                                                byte)    in   the
                                                UTF-8 string.

       int                 *readPtr   (out)     If      non-NULL,
                                                filled  with  the
                                                number  of  bytes
                                                in  the backslash
                                                sequence, includ-
                                                ing the backslash
                                                character.

       char                *dst       (out)     Buffer  in  which
                                                the  bytes repre-
                                                sented   by   the
                                                backslash
                                                sequence      are
                                                stored.   At most
                                                TCL_UTF_MAX bytes
                                                are stored in the
                                                buffer.           |

       int                 nocase     (in)                        ||
                                                Specifies whether |
                                                the match  should |
                                                be done case-sen- |
                                                sitive   (0)   or |
                                                case-insensitive  |
                                                (1).
_________________________________________________________________


DESCRIPTION
       These  routines  convert   between   UTF-8   strings   and
       Tcl_UniChars.  A Tcl_UniChar is a Unicode character repre-
       sented as an unsigned, fixed-size quantity.  A UTF-8 char-
       acter  is  a  Unicode  character represented as a varying-
       length sequence of up to TCL_UTF_MAX bytes.   A  multibyte
       UTF-8  sequence  consists  of a lead byte followed by some
       number of trail bytes.

       TCL_UTF_MAX is the maximum number of bytes that  it  takes
       to  represent one Unicode character in the UTF-8 represen-
       tation.

       Tcl_UniCharToUtf stores the  Tcl_UniChar  ch  as  a  UTF-8
       string  starting at buf.  The return value is the number
       of bytes stored in buf.

       Tcl_UtfToUniChar reads one UTF-8 character starting at src
       and  stores  it  as  a  Tcl_UniChar in *chPtr.  The return
       value is the number of bytes read from  src.   The  caller
       must  ensure  that  the  source buffer is long enough such
       that this routine does not run off the end and dereference
       non-existent  or  random  memory;  if the source buffer is
       known to be null-terminated, this will not happen.  If the
       input is not in proper UTF-8 format, Tcl_UtfToUniChar will
       store the first byte of src in  *chPtr  as  a  Tcl_UniChar
       between 0x0000 and 0x00ff and return 1.

       Tcl_UniCharToUtfDString  converts the given Unicode string
       to UTF-8, storing the result in  a  previously-initialized
       Tcl_DString.   You  must  specify  the length of the given
       Unicode string.  The return value  is  a  pointer  to  the
       UTF-8  representation  of the Unicode string.  Storage for
       the  return  value  is  appended  to  the   end   of   the
       Tcl_DString.

       Tcl_UtfToUniCharDString converts the given UTF-8 string to
       Unicode, storing the result in the  previously-initialized
       Tcl_DString.   You  may  either  specify the length of the
       given   UTF-8   string   or   "-1",    in    which    case
       Tcl_UtfToUniCharDString   uses  strlen  to  calculate  the
       length.  The return value is a pointer to the Unicode rep-
       resentation  of  the UTF-8 string.  Storage for the return
       value is appended to the end of the Tcl_DString.  The Uni-
       code string is terminated with a Unicode null character.

       Tcl_UniCharLen  corresponds  to strlen for Unicode charac-
       ters.  It accepts a  null-terminated  Unicode  string  and
       returns  the  number  of Unicode characters (not bytes) in
       that string.

       Tcl_UniCharNcmp  and  Tcl_UniCharNcasecmp  correspond   to
       strncmp and strncasecmp, respectively, for Unicode charac-
       ters.  They accept  two  null-terminated  Unicode  strings
       and the number of characters to compare.  Both strings are
       assumed   to   be   at   least   num   characters    long.
       Tcl_UniCharNcmp   compares  the  two strings character-by-
       character according to the Unicode character ordering.  It
       returns  an integer greater than, equal to, or less than 0
       if the first string is greater than,  equal  to,  or  less
       than  the second string respectively.  Tcl_UniCharNcasecmp
       is the Unicode case insensitive version.

       Tcl_UniCharCaseMatch  is   the   Unicode   equivalent   to |
       Tcl_StringCaseMatch.  It accepts a null-terminated Unicode |
       string, a Unicode pattern, and a boolean value  specifying |
       whether  the  match  should  be case sensitive and returns |
       whether the string matches the pattern.

       Tcl_UtfNcmp corresponds to strncmp for UTF-8  strings.  It
       accepts  two  null-terminated UTF-8 strings and the number
       of characters to compare.  (Both strings are assumed to be
       at  least  num characters long.)  Tcl_UtfNcmp compares the
       two strings character-by-character according to  the  Uni-
       code  character  ordering.   It returns an integer greater
       than, equal to, or less than 0  if  the  first  string  is
       greater  than,  equal  to,  or less than the second string
       respectively.

       Tcl_UtfNcasecmp  corresponds  to  strncasecmp  for   UTF-8
       strings.   It is similar to Tcl_UtfNcmp except comparisons
       ignore differences in case when comparing upper, lower  or
       title case characters.

       Tcl_UtfCharComplete  returns  1 if the source UTF-8 string
       src of length len bytes is long enough to  be  decoded  by
       Tcl_UtfToUniChar,  or 0 otherwise.  This function does not
       guarantee that the UTF-8 string is properly formed.   This
       routine is used by procedures that are operating on a byte
       at a time and need to know if a full Tcl_UniChar has  been
       seen.

       Tcl_NumUtfChars  corresponds  to strlen for UTF-8 strings.
       It returns the number of Tcl_UniChars that are represented
       by  the UTF-8 string src.  The length of the source string
       is len bytes.  If the length is negative, all bytes up  to
       the first null byte are used.

       Tcl_UtfFindFirst  corresponds to strchr for UTF-8 strings.
       It returns a  pointer  to  the  first  occurrence  of  the
       Tcl_UniChar  ch  in  the null-terminated UTF-8 string src.
       The null  terminator  is  considered  part  of  the  UTF-8
       string.

       Tcl_UtfFindLast  corresponds to strrchr for UTF-8 strings.
       It returns  a  pointer  to  the  last  occurrence  of  the
       Tcl_UniChar  ch  in  the null-terminated UTF-8 string src.
       The null  terminator  is  considered  part  of  the  UTF-8
       string.

       Given  src,  a pointer to some location in a UTF-8 string,
       Tcl_UtfNext returns a pointer to the next UTF-8  character
       in the string.  The caller must not ask for the next char-
       acter after the last character in the string if the string
       is not terminated by a null character.

       Given  src,  a  pointer to some location in a UTF-8 string
       (or to a null byte immediately following such  a  string),
       Tcl_UtfPrev  returns  a  pointer  to the closest preceding
       byte that starts a UTF-8 character.   This  function  will
       not  back  up to a position before start, the start of the
       UTF-8 string.  If src was already  at  start,  the  return
       value will be start.

       Tcl_UniCharAtIndex  corresponds to a C string array deref-
       erence or the  Pascal  Ord()  function.   It  returns  the
       Tcl_UniChar  represented  at  the specified character (not
       byte) index in the UTF-8 string src.   The  source  string
       must contain at least index characters.  Behavior is unde-
       fined if a negative index is given.

       Tcl_UtfAtIndex returns a pointer to the specified  charac-
       ter  (not byte) index in the UTF-8 string src.  The source
       string must contain at least index  characters.   This  is
       equivalent to calling Tcl_UtfNext index times.  If a nega-
       tive index is given, the  return  pointer  points  to  the
       first character in the source string.

       Tcl_UtfBackslash is a utility procedure used by several of
       the Tcl commands.  It  parses  a  backslash  sequence  and
       stores  the properly formed UTF-8 character represented by
       the backslash sequence in the output buffer dst.  At  most
       TCL_UTF_MAX  bytes are stored in the buffer.  Tcl_UtfBack-
       slash modifies *readPtr to contain the number of bytes  in
       the backslash sequence, including the backslash character.
       The return value is the number of bytes stored in the out-
       put buffer.

       See  the  Tcl  manual  entry  for information on the valid
       backslash sequences.  All of the  sequences  described  in
       the Tcl manual entry are supported by Tcl_UtfBackslash.


KEYWORDS
       utf, unicode, backslash



Tcl                            8.1                         Utf(3)

Interix / SUAHosted at SUA Community for Interix, SUA and SFUInterix / SUA