123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386 |
- /**
- * \file
- * string encoding conversions
- *
- * Author:
- * Dick Porter (dick@ximian.com)
- *
- * (C) 2003 Ximian, Inc.
- */
- #include <config.h>
- #include <glib.h>
- #include <string.h>
- #include "strenc.h"
- #include "strenc-internals.h"
- #include "mono-error.h"
- #include "mono-error-internals.h"
/*
 * Number of continuation bytes expected after a UTF-8 lead byte, indexed by
 * the value of that lead byte.  Every byte below 0xC0 maps to 0, as do 0xFE
 * and 0xFF.  Lead bytes 0xF8-0xFD map to 4 or 5; the validators in this file
 * treat any sequence longer than four bytes as invalid.
 */
static const char trailingBytesForUTF8[256] = {
	/* 0x00 - 0x0F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x10 - 0x1F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x20 - 0x2F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x30 - 0x3F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x40 - 0x4F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x50 - 0x5F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x60 - 0x6F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x70 - 0x7F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x80 - 0x8F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0x90 - 0x9F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0xA0 - 0xAF */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0xB0 - 0xBF */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
	/* 0xC0 - 0xCF */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0xD0 - 0xDF */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
	/* 0xE0 - 0xEF */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
	/* 0xF0 - 0xFF */ 3,3,3,3, 3,3,3,3, 4,4,4,4, 5,5,0,0
};
- /**
- * mono_unicode_from_external:
- * \param in pointers to the buffer.
- * \param bytes number of bytes in the string.
- * Tries to turn a NULL-terminated string into UTF-16.
- *
- * First, see if it's valid UTF-8, in which case just turn it directly
- * into UTF-16. Next, run through the colon-separated encodings in
- * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
- * returning the first successful conversion to UTF-16. If no
- * conversion succeeds, return NULL.
- *
- * Callers must free the returned string if not NULL. \p bytes holds the number
- * of bytes in the returned string, not including the terminator.
- */
- gunichar2 *mono_unicode_from_external (const gchar *in, gsize *bytes)
- {
- gchar *res=NULL;
- gchar **encodings;
- gchar *encoding_list;
- int i;
- glong lbytes;
-
- if(in==NULL) {
- return(NULL);
- }
-
- encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
- if(encoding_list==NULL) {
- encoding_list = g_strdup("");
- }
-
- encodings=g_strsplit (encoding_list, ":", 0);
- g_free (encoding_list);
- for(i=0;encodings[i]!=NULL; i++) {
- /* "default_locale" is a special case encoding */
- if(!strcmp (encodings[i], "default_locale")) {
- gchar *utf8=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
- if(utf8!=NULL) {
- res=(gchar *) g_utf8_to_utf16 (utf8, -1, NULL, &lbytes, NULL);
- *bytes = (gsize) lbytes;
- }
- g_free (utf8);
- } else {
- /* Don't use UTF16 here. It returns the <FF FE> prepended to the string */
- res = g_convert (in, strlen (in), "UTF8", encodings[i], NULL, bytes, NULL);
- if (res != NULL) {
- gchar *ptr = res;
- res = (gchar *) g_utf8_to_utf16 (res, -1, NULL, &lbytes, NULL);
- *bytes = (gsize) lbytes;
- g_free (ptr);
- }
- }
- if(res!=NULL) {
- g_strfreev (encodings);
- *bytes *= 2;
- return((gunichar2 *)res);
- }
- }
-
- g_strfreev (encodings);
-
- if(g_utf8_validate (in, -1, NULL)) {
- glong items_written;
- gunichar2 *unires=g_utf8_to_utf16 (in, -1, NULL, &items_written, NULL);
- items_written *= 2;
- *bytes = items_written;
- return(unires);
- }
- return(NULL);
- }
- /**
- * mono_utf8_from_external:
- * \param in pointer to the string buffer.
- * Tries to turn a NULL-terminated string into UTF8.
- *
- * First, see if it's valid UTF-8, in which case there's nothing more
- * to be done. Next, run through the colon-separated encodings in
- * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
- * returning the first successful conversion to UTF-8. If no
- * conversion succeeds, return NULL.
- *
- * Callers must free the returned string if not NULL.
- *
- * This function is identical to \c mono_unicode_from_external, apart
- * from returning UTF-8 not UTF-16; it's handy in a few places to work
- * in UTF-8.
- */
- gchar *mono_utf8_from_external (const gchar *in)
- {
- gchar *res=NULL;
- gchar **encodings;
- gchar *encoding_list;
- int i;
-
- if(in==NULL) {
- return(NULL);
- }
-
- encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
- if(encoding_list==NULL) {
- encoding_list = g_strdup("");
- }
-
- encodings=g_strsplit (encoding_list, ":", 0);
- g_free (encoding_list);
- for(i=0;encodings[i]!=NULL; i++) {
-
- /* "default_locale" is a special case encoding */
- if(!strcmp (encodings[i], "default_locale")) {
- res=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
- if(res!=NULL && !g_utf8_validate (res, -1, NULL)) {
- g_free (res);
- res=NULL;
- }
- } else {
- res=g_convert (in, -1, "UTF8", encodings[i], NULL,
- NULL, NULL);
- }
- if(res!=NULL) {
- g_strfreev (encodings);
- return(res);
- }
- }
-
- g_strfreev (encodings);
-
- if(g_utf8_validate (in, -1, NULL)) {
- return(g_strdup (in));
- }
- return(NULL);
- }
- /**
- * mono_unicode_to_external:
- * \param uni a UTF-16 string to convert to an external representation.
- * Turns NULL-terminated UTF-16 into either UTF-8, or the first
- * working item in \c MONO_EXTERNAL_ENCODINGS if set. If no conversions
- * work, then UTF-8 is returned.
- * Callers must free the returned string.
- */
- gchar *mono_unicode_to_external (const gunichar2 *uni)
- {
- return mono_unicode_to_external_checked (uni, NULL);
- }
- gchar *mono_unicode_to_external_checked (const gunichar2 *uni, MonoError *err)
- {
- gchar *utf8;
- gchar *encoding_list;
- GError *gerr = NULL;
-
- /* Turn the unicode into utf8 to start with, because its
- * easier to work with gchar * than gunichar2 *
- */
- utf8=g_utf16_to_utf8 (uni, -1, NULL, NULL, &gerr);
- if (utf8 == NULL) {
- mono_error_set_argument (err, "uni", gerr->message);
- g_error_free (gerr);
- return utf8;
- }
-
- encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
- if(encoding_list==NULL) {
- /* Do UTF8 */
- return(utf8);
- } else {
- gchar *res, **encodings;
- int i;
-
- encodings=g_strsplit (encoding_list, ":", 0);
- g_free (encoding_list);
- for(i=0; encodings[i]!=NULL; i++) {
- if(!strcmp (encodings[i], "default_locale")) {
- res=g_locale_from_utf8 (utf8, -1, NULL, NULL,
- NULL);
- } else {
- res=g_convert (utf8, -1, encodings[i], "UTF8",
- NULL, NULL, NULL);
- }
- if(res!=NULL) {
- g_free (utf8);
- g_strfreev (encodings);
-
- return(res);
- }
- }
-
- g_strfreev (encodings);
- }
-
- /* Nothing else worked, so just return the utf8 */
- return(utf8);
- }
- /**
- * mono_utf8_validate_and_len
- * \param source Pointer to putative UTF-8 encoded string.
- * Checks \p source for being valid UTF-8. \p utf is assumed to be
- * null-terminated.
- * \returns TRUE if \p source is valid.
- * \p oEnd will equal the null terminator at the end of the string if valid.
- * if not valid, it will equal the first charater of the invalid sequence.
- * \p oLength will equal the length to \p oEnd
- **/
- gboolean
- mono_utf8_validate_and_len (const gchar *source, glong* oLength, const gchar** oEnd)
- {
- gboolean retVal = TRUE;
- gboolean lastRet = TRUE;
- guchar* ptr = (guchar*) source;
- guchar* srcPtr;
- guint length;
- guchar a;
- *oLength = 0;
- while (*ptr != 0) {
- length = trailingBytesForUTF8 [*ptr] + 1;
- srcPtr = (guchar*) ptr + length;
- switch (length) {
- default: retVal = FALSE;
- /* Everything else falls through when "TRUE"... */
- case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
- if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
- if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
- *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
- retVal = FALSE;
- }
- case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
- case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
- switch (*ptr) {
- /* no fall-through in this inner switch */
- case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
- case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
- case 0xEF: {
- if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
- else if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE;
- break;
- }
- case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
- case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
- default: if (a < (guchar) 0x80) retVal = FALSE;
- }
- case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
- }
- if (*ptr > (guchar) 0xF4)
- retVal = FALSE;
- //If the string is invalid, set the end to the invalid byte.
- if (!retVal && lastRet) {
- if (oEnd != NULL)
- *oEnd = (gchar*) ptr;
- lastRet = FALSE;
- }
- ptr += length;
- (*oLength)++;
- }
- if (retVal && oEnd != NULL)
- *oEnd = (gchar*) ptr;
- return retVal;
- }
- /**
- * mono_utf8_validate_and_len_with_bounds
- * \param source: Pointer to putative UTF-8 encoded string.
- * \param max_bytes: Max number of bytes that can be decoded.
- *
- * Checks \p source for being valid UTF-8. \p utf is assumed to be
- * null-terminated.
- *
- * This function returns FALSE if it needs to decode characters beyond \p max_bytes.
- *
- * \returns TRUE if \p source is valid.
- * \p oEnd will equal the null terminator at the end of the string if valid.
- * if not valid, it will equal the first charater of the invalid sequence.
- * \p oLength will equal the length to \p oEnd
- **/
- gboolean
- mono_utf8_validate_and_len_with_bounds (const gchar *source, glong max_bytes, glong* oLength, const gchar** oEnd)
- {
- gboolean retVal = TRUE;
- gboolean lastRet = TRUE;
- guchar* ptr = (guchar*) source;
- guchar *end = ptr + max_bytes;
- guchar* srcPtr;
- guint length;
- guchar a;
- *oLength = 0;
- if (max_bytes < 1) {
- if (oEnd)
- *oEnd = (gchar*) ptr;
- return FALSE;
- }
- while (*ptr != 0) {
- length = trailingBytesForUTF8 [*ptr] + 1;
- srcPtr = (guchar*) ptr + length;
-
- /* since *ptr is not zero we must ensure that we can decode the current char + the byte after
- srcPtr points to the first byte after the current char.*/
- if (srcPtr >= end) {
- retVal = FALSE;
- break;
- }
- switch (length) {
- default: retVal = FALSE;
- /* Everything else falls through when "TRUE"... */
- case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
- if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
- if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
- *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
- retVal = FALSE;
- }
- case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
- case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
- switch (*ptr) {
- /* no fall-through in this inner switch */
- case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
- case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
- case 0xEF: {
- if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
- else if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE;
- break;
- }
- case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
- case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
- default: if (a < (guchar) 0x80) retVal = FALSE;
- }
- case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
- }
- if (*ptr > (guchar) 0xF4)
- retVal = FALSE;
- //If the string is invalid, set the end to the invalid byte.
- if (!retVal && lastRet) {
- if (oEnd != NULL)
- *oEnd = (gchar*) ptr;
- lastRet = FALSE;
- }
- ptr += length;
- (*oLength)++;
- }
- if (retVal && oEnd != NULL)
- *oEnd = (gchar*) ptr;
- return retVal;
- }
|