/* strenc.c */
  1. /**
  2. * \file
  3. * string encoding conversions
  4. *
  5. * Author:
  6. * Dick Porter (dick@ximian.com)
  7. *
  8. * (C) 2003 Ximian, Inc.
  9. */
  10. #include <config.h>
  11. #include <glib.h>
  12. #include <string.h>
  13. #include "strenc.h"
  14. #include "strenc-internals.h"
  15. #include "mono-error.h"
  16. #include "mono-error-internals.h"
  17. static const char trailingBytesForUTF8[256] = {
  18. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  19. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  20. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  21. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  22. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  23. 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  24. 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  25. 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0
  26. };
  27. /**
  28. * mono_unicode_from_external:
  29. * \param in pointers to the buffer.
  30. * \param bytes number of bytes in the string.
  31. * Tries to turn a NULL-terminated string into UTF-16.
  32. *
  33. * First, see if it's valid UTF-8, in which case just turn it directly
  34. * into UTF-16. Next, run through the colon-separated encodings in
  35. * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
  36. * returning the first successful conversion to UTF-16. If no
  37. * conversion succeeds, return NULL.
  38. *
  39. * Callers must free the returned string if not NULL. \p bytes holds the number
  40. * of bytes in the returned string, not including the terminator.
  41. */
  42. gunichar2 *mono_unicode_from_external (const gchar *in, gsize *bytes)
  43. {
  44. gchar *res=NULL;
  45. gchar **encodings;
  46. gchar *encoding_list;
  47. int i;
  48. glong lbytes;
  49. if(in==NULL) {
  50. return(NULL);
  51. }
  52. encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
  53. if(encoding_list==NULL) {
  54. encoding_list = g_strdup("");
  55. }
  56. encodings=g_strsplit (encoding_list, ":", 0);
  57. g_free (encoding_list);
  58. for(i=0;encodings[i]!=NULL; i++) {
  59. /* "default_locale" is a special case encoding */
  60. if(!strcmp (encodings[i], "default_locale")) {
  61. gchar *utf8=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
  62. if(utf8!=NULL) {
  63. res=(gchar *) g_utf8_to_utf16 (utf8, -1, NULL, &lbytes, NULL);
  64. *bytes = (gsize) lbytes;
  65. }
  66. g_free (utf8);
  67. } else {
  68. /* Don't use UTF16 here. It returns the <FF FE> prepended to the string */
  69. res = g_convert (in, strlen (in), "UTF8", encodings[i], NULL, bytes, NULL);
  70. if (res != NULL) {
  71. gchar *ptr = res;
  72. res = (gchar *) g_utf8_to_utf16 (res, -1, NULL, &lbytes, NULL);
  73. *bytes = (gsize) lbytes;
  74. g_free (ptr);
  75. }
  76. }
  77. if(res!=NULL) {
  78. g_strfreev (encodings);
  79. *bytes *= 2;
  80. return((gunichar2 *)res);
  81. }
  82. }
  83. g_strfreev (encodings);
  84. if(g_utf8_validate (in, -1, NULL)) {
  85. glong items_written;
  86. gunichar2 *unires=g_utf8_to_utf16 (in, -1, NULL, &items_written, NULL);
  87. items_written *= 2;
  88. *bytes = items_written;
  89. return(unires);
  90. }
  91. return(NULL);
  92. }
  93. /**
  94. * mono_utf8_from_external:
  95. * \param in pointer to the string buffer.
  96. * Tries to turn a NULL-terminated string into UTF8.
  97. *
  98. * First, see if it's valid UTF-8, in which case there's nothing more
  99. * to be done. Next, run through the colon-separated encodings in
  100. * \c MONO_EXTERNAL_ENCODINGS and do an \c iconv conversion on each,
  101. * returning the first successful conversion to UTF-8. If no
  102. * conversion succeeds, return NULL.
  103. *
  104. * Callers must free the returned string if not NULL.
  105. *
  106. * This function is identical to \c mono_unicode_from_external, apart
  107. * from returning UTF-8 not UTF-16; it's handy in a few places to work
  108. * in UTF-8.
  109. */
  110. gchar *mono_utf8_from_external (const gchar *in)
  111. {
  112. gchar *res=NULL;
  113. gchar **encodings;
  114. gchar *encoding_list;
  115. int i;
  116. if(in==NULL) {
  117. return(NULL);
  118. }
  119. encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
  120. if(encoding_list==NULL) {
  121. encoding_list = g_strdup("");
  122. }
  123. encodings=g_strsplit (encoding_list, ":", 0);
  124. g_free (encoding_list);
  125. for(i=0;encodings[i]!=NULL; i++) {
  126. /* "default_locale" is a special case encoding */
  127. if(!strcmp (encodings[i], "default_locale")) {
  128. res=g_locale_to_utf8 (in, -1, NULL, NULL, NULL);
  129. if(res!=NULL && !g_utf8_validate (res, -1, NULL)) {
  130. g_free (res);
  131. res=NULL;
  132. }
  133. } else {
  134. res=g_convert (in, -1, "UTF8", encodings[i], NULL,
  135. NULL, NULL);
  136. }
  137. if(res!=NULL) {
  138. g_strfreev (encodings);
  139. return(res);
  140. }
  141. }
  142. g_strfreev (encodings);
  143. if(g_utf8_validate (in, -1, NULL)) {
  144. return(g_strdup (in));
  145. }
  146. return(NULL);
  147. }
  148. /**
  149. * mono_unicode_to_external:
  150. * \param uni a UTF-16 string to convert to an external representation.
  151. * Turns NULL-terminated UTF-16 into either UTF-8, or the first
  152. * working item in \c MONO_EXTERNAL_ENCODINGS if set. If no conversions
  153. * work, then UTF-8 is returned.
  154. * Callers must free the returned string.
  155. */
  156. gchar *mono_unicode_to_external (const gunichar2 *uni)
  157. {
  158. return mono_unicode_to_external_checked (uni, NULL);
  159. }
  160. gchar *mono_unicode_to_external_checked (const gunichar2 *uni, MonoError *err)
  161. {
  162. gchar *utf8;
  163. gchar *encoding_list;
  164. GError *gerr = NULL;
  165. /* Turn the unicode into utf8 to start with, because its
  166. * easier to work with gchar * than gunichar2 *
  167. */
  168. utf8=g_utf16_to_utf8 (uni, -1, NULL, NULL, &gerr);
  169. if (utf8 == NULL) {
  170. mono_error_set_argument (err, "uni", gerr->message);
  171. g_error_free (gerr);
  172. return utf8;
  173. }
  174. encoding_list=g_getenv ("MONO_EXTERNAL_ENCODINGS");
  175. if(encoding_list==NULL) {
  176. /* Do UTF8 */
  177. return(utf8);
  178. } else {
  179. gchar *res, **encodings;
  180. int i;
  181. encodings=g_strsplit (encoding_list, ":", 0);
  182. g_free (encoding_list);
  183. for(i=0; encodings[i]!=NULL; i++) {
  184. if(!strcmp (encodings[i], "default_locale")) {
  185. res=g_locale_from_utf8 (utf8, -1, NULL, NULL,
  186. NULL);
  187. } else {
  188. res=g_convert (utf8, -1, encodings[i], "UTF8",
  189. NULL, NULL, NULL);
  190. }
  191. if(res!=NULL) {
  192. g_free (utf8);
  193. g_strfreev (encodings);
  194. return(res);
  195. }
  196. }
  197. g_strfreev (encodings);
  198. }
  199. /* Nothing else worked, so just return the utf8 */
  200. return(utf8);
  201. }
  202. /**
  203. * mono_utf8_validate_and_len
  204. * \param source Pointer to putative UTF-8 encoded string.
  205. * Checks \p source for being valid UTF-8. \p utf is assumed to be
  206. * null-terminated.
  207. * \returns TRUE if \p source is valid.
  208. * \p oEnd will equal the null terminator at the end of the string if valid.
  209. * if not valid, it will equal the first charater of the invalid sequence.
  210. * \p oLength will equal the length to \p oEnd
  211. **/
  212. gboolean
  213. mono_utf8_validate_and_len (const gchar *source, glong* oLength, const gchar** oEnd)
  214. {
  215. gboolean retVal = TRUE;
  216. gboolean lastRet = TRUE;
  217. guchar* ptr = (guchar*) source;
  218. guchar* srcPtr;
  219. guint length;
  220. guchar a;
  221. *oLength = 0;
  222. while (*ptr != 0) {
  223. length = trailingBytesForUTF8 [*ptr] + 1;
  224. srcPtr = (guchar*) ptr + length;
  225. switch (length) {
  226. default: retVal = FALSE;
  227. /* Everything else falls through when "TRUE"... */
  228. case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
  229. if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
  230. if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
  231. *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
  232. retVal = FALSE;
  233. }
  234. case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
  235. case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
  236. switch (*ptr) {
  237. /* no fall-through in this inner switch */
  238. case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
  239. case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
  240. case 0xEF: {
  241. if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
  242. else if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE;
  243. break;
  244. }
  245. case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
  246. case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
  247. default: if (a < (guchar) 0x80) retVal = FALSE;
  248. }
  249. case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
  250. }
  251. if (*ptr > (guchar) 0xF4)
  252. retVal = FALSE;
  253. //If the string is invalid, set the end to the invalid byte.
  254. if (!retVal && lastRet) {
  255. if (oEnd != NULL)
  256. *oEnd = (gchar*) ptr;
  257. lastRet = FALSE;
  258. }
  259. ptr += length;
  260. (*oLength)++;
  261. }
  262. if (retVal && oEnd != NULL)
  263. *oEnd = (gchar*) ptr;
  264. return retVal;
  265. }
  266. /**
  267. * mono_utf8_validate_and_len_with_bounds
  268. * \param source: Pointer to putative UTF-8 encoded string.
  269. * \param max_bytes: Max number of bytes that can be decoded.
  270. *
  271. * Checks \p source for being valid UTF-8. \p utf is assumed to be
  272. * null-terminated.
  273. *
  274. * This function returns FALSE if it needs to decode characters beyond \p max_bytes.
  275. *
  276. * \returns TRUE if \p source is valid.
  277. * \p oEnd will equal the null terminator at the end of the string if valid.
  278. * if not valid, it will equal the first charater of the invalid sequence.
  279. * \p oLength will equal the length to \p oEnd
  280. **/
  281. gboolean
  282. mono_utf8_validate_and_len_with_bounds (const gchar *source, glong max_bytes, glong* oLength, const gchar** oEnd)
  283. {
  284. gboolean retVal = TRUE;
  285. gboolean lastRet = TRUE;
  286. guchar* ptr = (guchar*) source;
  287. guchar *end = ptr + max_bytes;
  288. guchar* srcPtr;
  289. guint length;
  290. guchar a;
  291. *oLength = 0;
  292. if (max_bytes < 1) {
  293. if (oEnd)
  294. *oEnd = (gchar*) ptr;
  295. return FALSE;
  296. }
  297. while (*ptr != 0) {
  298. length = trailingBytesForUTF8 [*ptr] + 1;
  299. srcPtr = (guchar*) ptr + length;
  300. /* since *ptr is not zero we must ensure that we can decode the current char + the byte after
  301. srcPtr points to the first byte after the current char.*/
  302. if (srcPtr >= end) {
  303. retVal = FALSE;
  304. break;
  305. }
  306. switch (length) {
  307. default: retVal = FALSE;
  308. /* Everything else falls through when "TRUE"... */
  309. case 4: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
  310. if ((a == (guchar) 0xBF || a == (guchar) 0xBE) && *(srcPtr-1) == (guchar) 0xBF) {
  311. if (*(srcPtr-2) == (guchar) 0x8F || *(srcPtr-2) == (guchar) 0x9F ||
  312. *(srcPtr-2) == (guchar) 0xAF || *(srcPtr-2) == (guchar) 0xBF)
  313. retVal = FALSE;
  314. }
  315. case 3: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
  316. case 2: if ((a = (*--srcPtr)) < (guchar) 0x80 || a > (guchar) 0xBF) retVal = FALSE;
  317. switch (*ptr) {
  318. /* no fall-through in this inner switch */
  319. case 0xE0: if (a < (guchar) 0xA0) retVal = FALSE; break;
  320. case 0xED: if (a > (guchar) 0x9F) retVal = FALSE; break;
  321. case 0xEF: {
  322. if (a == (guchar)0xB7 && (*(srcPtr+1) > (guchar) 0x8F && *(srcPtr+1) < 0xB0)) retVal = FALSE;
  323. else if (a == (guchar)0xBF && (*(srcPtr+1) == (guchar) 0xBE || *(srcPtr+1) == 0xBF)) retVal = FALSE;
  324. break;
  325. }
  326. case 0xF0: if (a < (guchar) 0x90) retVal = FALSE; break;
  327. case 0xF4: if (a > (guchar) 0x8F) retVal = FALSE; break;
  328. default: if (a < (guchar) 0x80) retVal = FALSE;
  329. }
  330. case 1: if (*ptr >= (guchar ) 0x80 && *ptr < (guchar) 0xC2) retVal = FALSE;
  331. }
  332. if (*ptr > (guchar) 0xF4)
  333. retVal = FALSE;
  334. //If the string is invalid, set the end to the invalid byte.
  335. if (!retVal && lastRet) {
  336. if (oEnd != NULL)
  337. *oEnd = (gchar*) ptr;
  338. lastRet = FALSE;
  339. }
  340. ptr += length;
  341. (*oLength)++;
  342. }
  343. if (retVal && oEnd != NULL)
  344. *oEnd = (gchar*) ptr;
  345. return retVal;
  346. }