123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351 |
- /*
- * gutf8.c: UTF-8 conversion
- *
- * Author:
- * Atsushi Enomoto <atsushi@ximian.com>
- *
- * (C) 2006 Novell, Inc.
- * Copyright 2012 Xamarin Inc
- */
- #include "config.h"
- #include <stdio.h>
- #include <glib.h>
- /*
- * Index into the table below with the first byte of a UTF-8 sequence to get
- * the number of bytes that are supposed to follow it to complete the sequence.
- *
- * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is left
- * as-is for anyone who may want to do such conversion, which was allowed in
- * earlier algorithms.
- */
- const guchar g_utf8_jump_table[256] = {
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
- };
- static gchar *
- utf8_case_conv (const gchar *str, gssize len, gboolean upper)
- {
- gunichar *ustr;
- glong i, ulen;
- gchar *utf8;
-
- ustr = g_utf8_to_ucs4_fast (str, (glong) len, &ulen);
- for (i = 0; i < ulen; i++)
- ustr[i] = upper ? g_unichar_toupper (ustr[i]) : g_unichar_tolower (ustr[i]);
- utf8 = g_ucs4_to_utf8 (ustr, ulen, NULL, NULL, NULL);
- g_free (ustr);
-
- return utf8;
- }
- gchar *
- g_utf8_strup (const gchar *str, gssize len)
- {
- return utf8_case_conv (str, len, TRUE);
- }
- gchar *
- g_utf8_strdown (const gchar *str, gssize len)
- {
- return utf8_case_conv (str, len, FALSE);
- }
- static gboolean
- utf8_validate (const unsigned char *inptr, size_t len)
- {
- const unsigned char *ptr = inptr + len;
- unsigned char c;
-
- /* Everything falls through when TRUE... */
- switch (len) {
- default:
- return FALSE;
- case 4:
- if ((c = (*--ptr)) < 0x80 || c > 0xBF)
- return FALSE;
-
- if ((c == 0xBF || c == 0xBE) && ptr[-1] == 0xBF) {
- if (ptr[-2] == 0x8F || ptr[-2] == 0x9F ||
- ptr[-2] == 0xAF || ptr[-2] == 0xBF)
- return FALSE;
- }
- case 3:
- if ((c = (*--ptr)) < 0x80 || c > 0xBF)
- return FALSE;
- case 2:
- if ((c = (*--ptr)) < 0x80 || c > 0xBF)
- return FALSE;
-
- /* no fall-through in this inner switch */
- switch (*inptr) {
- case 0xE0: if (c < 0xA0) return FALSE; break;
- case 0xED: if (c > 0x9F) return FALSE; break;
- case 0xEF: if (c == 0xB7 && (ptr[1] > 0x8F && ptr[1] < 0xB0)) return FALSE;
- if (c == 0xBF && (ptr[1] == 0xBE || ptr[1] == 0xBF)) return FALSE;
- break;
- case 0xF0: if (c < 0x90) return FALSE; break;
- case 0xF4: if (c > 0x8F) return FALSE; break;
- default: if (c < 0x80) return FALSE; break;
- }
- case 1: if (*inptr >= 0x80 && *inptr < 0xC2) return FALSE;
- }
-
- if (*inptr > 0xF4)
- return FALSE;
-
- return TRUE;
- }
- /**
- * g_utf8_validate:
- * @str: a utf-8 encoded string
- * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string)
- * @end: output parameter to mark the end of the valid input
- *
- * Checks @utf for being valid UTF-8. @str is assumed to be
- * null-terminated. This function is not super-strict, as it will
- * allow longer UTF-8 sequences than necessary. Note that Java is
- * capable of producing these sequences if provoked. Also note, this
- * routine checks for the 4-byte maximum size, but does not check for
- * 0x10ffff maximum value.
- *
- * Return value: %TRUE if @str is valid or %FALSE otherwise.
- **/
- gboolean
- g_utf8_validate (const gchar *str, gssize max_len, const gchar **end)
- {
- guchar *inptr = (guchar *) str;
- gboolean valid = TRUE;
- guint length, min;
- gssize n = 0;
-
- if (max_len == 0)
- return FALSE;
-
- if (max_len < 0) {
- while (*inptr != 0) {
- length = g_utf8_jump_table[*inptr];
- if (!utf8_validate (inptr, length)) {
- valid = FALSE;
- break;
- }
-
- inptr += length;
- }
- } else {
- while (n < max_len) {
- if (*inptr == 0) {
- /* Note: return FALSE if we encounter nul-byte
- * before max_len is reached. */
- valid = FALSE;
- break;
- }
-
- length = g_utf8_jump_table[*inptr];
- min = MIN (length, max_len - n);
-
- if (!utf8_validate (inptr, min)) {
- valid = FALSE;
- break;
- }
-
- if (min < length) {
- valid = FALSE;
- break;
- }
-
- inptr += length;
- n += length;
- }
- }
-
- if (end != NULL)
- *end = (gchar *) inptr;
-
- return valid;
- }
- gunichar
- g_utf8_get_char_validated (const gchar *str, gssize max_len)
- {
- unsigned char *inptr = (unsigned char *) str;
- gunichar u = *inptr;
- int n, i;
-
- if (max_len == 0)
- return -2;
-
- if (u < 0x80) {
- /* simple ascii case */
- return u;
- } else if (u < 0xc2) {
- return -1;
- } else if (u < 0xe0) {
- u &= 0x1f;
- n = 2;
- } else if (u < 0xf0) {
- u &= 0x0f;
- n = 3;
- } else if (u < 0xf8) {
- u &= 0x07;
- n = 4;
- } else if (u < 0xfc) {
- u &= 0x03;
- n = 5;
- } else if (u < 0xfe) {
- u &= 0x01;
- n = 6;
- } else {
- return -1;
- }
-
- if (max_len > 0) {
- if (!utf8_validate (inptr, MIN (max_len, n)))
- return -1;
-
- if (max_len < n)
- return -2;
- } else {
- if (!utf8_validate (inptr, n))
- return -1;
- }
-
- for (i = 1; i < n; i++)
- u = (u << 6) | (*++inptr ^ 0x80);
-
- return u;
- }
- glong
- g_utf8_strlen (const gchar *str, gssize max_len)
- {
- const guchar *inptr = (const guchar *) str;
- glong clen = 0, len = 0, n;
-
- if (max_len == 0)
- return 0;
-
- if (max_len < 0) {
- while (*inptr) {
- inptr += g_utf8_jump_table[*inptr];
- len++;
- }
- } else {
- while (len < max_len && *inptr) {
- n = g_utf8_jump_table[*inptr];
- if ((clen + n) > max_len)
- break;
-
- inptr += n;
- clen += n;
- len++;
- }
- }
-
- return len;
- }
- gunichar
- g_utf8_get_char (const gchar *src)
- {
- unsigned char *inptr = (unsigned char *) src;
- gunichar u = *inptr;
- int n, i;
-
- if (u < 0x80) {
- /* simple ascii case */
- return u;
- } else if (u < 0xe0) {
- u &= 0x1f;
- n = 2;
- } else if (u < 0xf0) {
- u &= 0x0f;
- n = 3;
- } else if (u < 0xf8) {
- u &= 0x07;
- n = 4;
- } else if (u < 0xfc) {
- u &= 0x03;
- n = 5;
- } else {
- u &= 0x01;
- n = 6;
- }
-
- for (i = 1; i < n; i++)
- u = (u << 6) | (*++inptr ^ 0x80);
-
- return u;
- }
- gchar *
- g_utf8_offset_to_pointer (const gchar *str, glong offset)
- {
- const gchar *p = str;
- if (offset > 0) {
- do {
- p = g_utf8_next_char (p);
- offset --;
- } while (offset > 0);
- }
- else if (offset < 0) {
- const gchar *jump = str;
- do {
- // since the minimum size of a character is 1
- // we know we can step back at least offset bytes
- jump = jump + offset;
-
- // if we land in the middle of a character
- // walk to the beginning
- while ((*jump & 0xc0) == 0x80)
- jump --;
-
- // count how many characters we've actually walked
- // by going forward
- p = jump;
- do {
- p = g_utf8_next_char (p);
- offset ++;
- } while (p < jump);
-
- } while (offset < 0);
- }
-
- return (gchar *)p;
- }
- glong
- g_utf8_pointer_to_offset (const gchar *str, const gchar *pos)
- {
- const gchar *inptr, *inend;
- glong offset = 0;
- glong sign = 1;
-
- if (pos == str)
- return 0;
-
- if (str < pos) {
- inptr = str;
- inend = pos;
- } else {
- inptr = pos;
- inend = str;
- sign = -1;
- }
-
- do {
- inptr = g_utf8_next_char (inptr);
- offset++;
- } while (inptr < inend);
-
- return offset * sign;
- }
|