gutf8.c 7.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351
  1. /*
  2. * gutf8.c: UTF-8 conversion
  3. *
  4. * Author:
  5. * Atsushi Enomoto <atsushi@ximian.com>
  6. *
  7. * (C) 2006 Novell, Inc.
  8. * Copyright 2012 Xamarin Inc
  9. */
  10. #include "config.h"
  11. #include <stdio.h>
  12. #include <glib.h>
  13. /*
  14. * Index into the table below with the first byte of a UTF-8 sequence to get
  15. * the number of bytes that are supposed to follow it to complete the sequence.
  16. *
  17. * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is left
  18. * as-is for anyone who may want to do such conversion, which was allowed in
  19. * earlier algorithms.
  20. */
  21. const guchar g_utf8_jump_table[256] = {
  22. 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  23. 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  24. 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  25. 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  26. 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  27. 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  28. 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  29. 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
  30. };
  31. static gchar *
  32. utf8_case_conv (const gchar *str, gssize len, gboolean upper)
  33. {
  34. gunichar *ustr;
  35. glong i, ulen;
  36. gchar *utf8;
  37. ustr = g_utf8_to_ucs4_fast (str, (glong) len, &ulen);
  38. for (i = 0; i < ulen; i++)
  39. ustr[i] = upper ? g_unichar_toupper (ustr[i]) : g_unichar_tolower (ustr[i]);
  40. utf8 = g_ucs4_to_utf8 (ustr, ulen, NULL, NULL, NULL);
  41. g_free (ustr);
  42. return utf8;
  43. }
  44. gchar *
  45. g_utf8_strup (const gchar *str, gssize len)
  46. {
  47. return utf8_case_conv (str, len, TRUE);
  48. }
  49. gchar *
  50. g_utf8_strdown (const gchar *str, gssize len)
  51. {
  52. return utf8_case_conv (str, len, FALSE);
  53. }
  54. static gboolean
  55. utf8_validate (const unsigned char *inptr, size_t len)
  56. {
  57. const unsigned char *ptr = inptr + len;
  58. unsigned char c;
  59. /* Everything falls through when TRUE... */
  60. switch (len) {
  61. default:
  62. return FALSE;
  63. case 4:
  64. if ((c = (*--ptr)) < 0x80 || c > 0xBF)
  65. return FALSE;
  66. if ((c == 0xBF || c == 0xBE) && ptr[-1] == 0xBF) {
  67. if (ptr[-2] == 0x8F || ptr[-2] == 0x9F ||
  68. ptr[-2] == 0xAF || ptr[-2] == 0xBF)
  69. return FALSE;
  70. }
  71. case 3:
  72. if ((c = (*--ptr)) < 0x80 || c > 0xBF)
  73. return FALSE;
  74. case 2:
  75. if ((c = (*--ptr)) < 0x80 || c > 0xBF)
  76. return FALSE;
  77. /* no fall-through in this inner switch */
  78. switch (*inptr) {
  79. case 0xE0: if (c < 0xA0) return FALSE; break;
  80. case 0xED: if (c > 0x9F) return FALSE; break;
  81. case 0xEF: if (c == 0xB7 && (ptr[1] > 0x8F && ptr[1] < 0xB0)) return FALSE;
  82. if (c == 0xBF && (ptr[1] == 0xBE || ptr[1] == 0xBF)) return FALSE;
  83. break;
  84. case 0xF0: if (c < 0x90) return FALSE; break;
  85. case 0xF4: if (c > 0x8F) return FALSE; break;
  86. default: if (c < 0x80) return FALSE; break;
  87. }
  88. case 1: if (*inptr >= 0x80 && *inptr < 0xC2) return FALSE;
  89. }
  90. if (*inptr > 0xF4)
  91. return FALSE;
  92. return TRUE;
  93. }
  94. /**
  95. * g_utf8_validate:
  96. * @str: a utf-8 encoded string
  97. * @max_len: max number of bytes to validate (or -1 to validate the entire null-terminated string)
  98. * @end: output parameter to mark the end of the valid input
  99. *
  100. * Checks @utf for being valid UTF-8. @str is assumed to be
  101. * null-terminated. This function is not super-strict, as it will
  102. * allow longer UTF-8 sequences than necessary. Note that Java is
  103. * capable of producing these sequences if provoked. Also note, this
  104. * routine checks for the 4-byte maximum size, but does not check for
  105. * 0x10ffff maximum value.
  106. *
  107. * Return value: %TRUE if @str is valid or %FALSE otherwise.
  108. **/
  109. gboolean
  110. g_utf8_validate (const gchar *str, gssize max_len, const gchar **end)
  111. {
  112. guchar *inptr = (guchar *) str;
  113. gboolean valid = TRUE;
  114. guint length, min;
  115. gssize n = 0;
  116. if (max_len == 0)
  117. return FALSE;
  118. if (max_len < 0) {
  119. while (*inptr != 0) {
  120. length = g_utf8_jump_table[*inptr];
  121. if (!utf8_validate (inptr, length)) {
  122. valid = FALSE;
  123. break;
  124. }
  125. inptr += length;
  126. }
  127. } else {
  128. while (n < max_len) {
  129. if (*inptr == 0) {
  130. /* Note: return FALSE if we encounter nul-byte
  131. * before max_len is reached. */
  132. valid = FALSE;
  133. break;
  134. }
  135. length = g_utf8_jump_table[*inptr];
  136. min = MIN (length, max_len - n);
  137. if (!utf8_validate (inptr, min)) {
  138. valid = FALSE;
  139. break;
  140. }
  141. if (min < length) {
  142. valid = FALSE;
  143. break;
  144. }
  145. inptr += length;
  146. n += length;
  147. }
  148. }
  149. if (end != NULL)
  150. *end = (gchar *) inptr;
  151. return valid;
  152. }
  153. gunichar
  154. g_utf8_get_char_validated (const gchar *str, gssize max_len)
  155. {
  156. unsigned char *inptr = (unsigned char *) str;
  157. gunichar u = *inptr;
  158. int n, i;
  159. if (max_len == 0)
  160. return -2;
  161. if (u < 0x80) {
  162. /* simple ascii case */
  163. return u;
  164. } else if (u < 0xc2) {
  165. return -1;
  166. } else if (u < 0xe0) {
  167. u &= 0x1f;
  168. n = 2;
  169. } else if (u < 0xf0) {
  170. u &= 0x0f;
  171. n = 3;
  172. } else if (u < 0xf8) {
  173. u &= 0x07;
  174. n = 4;
  175. } else if (u < 0xfc) {
  176. u &= 0x03;
  177. n = 5;
  178. } else if (u < 0xfe) {
  179. u &= 0x01;
  180. n = 6;
  181. } else {
  182. return -1;
  183. }
  184. if (max_len > 0) {
  185. if (!utf8_validate (inptr, MIN (max_len, n)))
  186. return -1;
  187. if (max_len < n)
  188. return -2;
  189. } else {
  190. if (!utf8_validate (inptr, n))
  191. return -1;
  192. }
  193. for (i = 1; i < n; i++)
  194. u = (u << 6) | (*++inptr ^ 0x80);
  195. return u;
  196. }
  197. glong
  198. g_utf8_strlen (const gchar *str, gssize max_len)
  199. {
  200. const guchar *inptr = (const guchar *) str;
  201. glong clen = 0, len = 0, n;
  202. if (max_len == 0)
  203. return 0;
  204. if (max_len < 0) {
  205. while (*inptr) {
  206. inptr += g_utf8_jump_table[*inptr];
  207. len++;
  208. }
  209. } else {
  210. while (len < max_len && *inptr) {
  211. n = g_utf8_jump_table[*inptr];
  212. if ((clen + n) > max_len)
  213. break;
  214. inptr += n;
  215. clen += n;
  216. len++;
  217. }
  218. }
  219. return len;
  220. }
  221. gunichar
  222. g_utf8_get_char (const gchar *src)
  223. {
  224. unsigned char *inptr = (unsigned char *) src;
  225. gunichar u = *inptr;
  226. int n, i;
  227. if (u < 0x80) {
  228. /* simple ascii case */
  229. return u;
  230. } else if (u < 0xe0) {
  231. u &= 0x1f;
  232. n = 2;
  233. } else if (u < 0xf0) {
  234. u &= 0x0f;
  235. n = 3;
  236. } else if (u < 0xf8) {
  237. u &= 0x07;
  238. n = 4;
  239. } else if (u < 0xfc) {
  240. u &= 0x03;
  241. n = 5;
  242. } else {
  243. u &= 0x01;
  244. n = 6;
  245. }
  246. for (i = 1; i < n; i++)
  247. u = (u << 6) | (*++inptr ^ 0x80);
  248. return u;
  249. }
  250. gchar *
  251. g_utf8_offset_to_pointer (const gchar *str, glong offset)
  252. {
  253. const gchar *p = str;
  254. if (offset > 0) {
  255. do {
  256. p = g_utf8_next_char (p);
  257. offset --;
  258. } while (offset > 0);
  259. }
  260. else if (offset < 0) {
  261. const gchar *jump = str;
  262. do {
  263. // since the minimum size of a character is 1
  264. // we know we can step back at least offset bytes
  265. jump = jump + offset;
  266. // if we land in the middle of a character
  267. // walk to the beginning
  268. while ((*jump & 0xc0) == 0x80)
  269. jump --;
  270. // count how many characters we've actually walked
  271. // by going forward
  272. p = jump;
  273. do {
  274. p = g_utf8_next_char (p);
  275. offset ++;
  276. } while (p < jump);
  277. } while (offset < 0);
  278. }
  279. return (gchar *)p;
  280. }
  281. glong
  282. g_utf8_pointer_to_offset (const gchar *str, const gchar *pos)
  283. {
  284. const gchar *inptr, *inend;
  285. glong offset = 0;
  286. glong sign = 1;
  287. if (pos == str)
  288. return 0;
  289. if (str < pos) {
  290. inptr = str;
  291. inend = pos;
  292. } else {
  293. inptr = pos;
  294. inend = str;
  295. sign = -1;
  296. }
  297. do {
  298. inptr = g_utf8_next_char (inptr);
  299. offset++;
  300. } while (inptr < inend);
  301. return offset * sign;
  302. }