utf8.c

Go to the documentation of this file.
00001 
00005 /*
00006 All original material Copyright (C) 2002-2010 UFO: Alien Invasion.
00007 
00008 Copyright (C) 1997-2001 Id Software, Inc.
00009 
00010 This program is free software; you can redistribute it and/or
00011 modify it under the terms of the GNU General Public License
00012 as published by the Free Software Foundation; either version 2
00013 of the License, or (at your option) any later version.
00014 
00015 This program is distributed in the hope that it will be useful,
00016 but WITHOUT ANY WARRANTY; without even the implied warranty of
00017 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
00018 
00019 See the GNU General Public License for more details.
00020 
00021 You should have received a copy of the GNU General Public License
00022 along with this program; if not, write to the Free Software
00023 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
00024 */
00025 
00026 #include "utf8.h"
00027 #include <string.h>
00028 
/**
 * @brief Remove one (possibly multibyte) character from a string, in place.
 * @param[in,out] s NUL-terminated string to edit
 * @param[in] pos Byte offset into @c s selecting the character to remove;
 * it does not have to address the first byte of that character
 * @return Byte offset at which the removed range began
 * @note Bytes are classified with UTF8_CONTINUATION_BYTE, which is defined
 * outside this file.
 */
int UTF8_delete_char (char *s, int pos)
{
    int first;
    int after = pos;

    /* Walk backwards over continuation bytes to reach the byte that
     * begins the character (never moving before the start of s). */
    for (first = pos; first > 0 && UTF8_CONTINUATION_BYTE(s[first]); first--)
        ;

    /* Unless pos already addresses the terminator, step over the byte at
     * pos and then over every continuation byte that follows it. */
    if (s[after] != '\0') {
        do {
            after++;
        } while (s[after] != '\0' && UTF8_CONTINUATION_BYTE(s[after]));
    }

    /* Close the gap. Source and destination overlap, so memmove is
     * required; the "+ 1" carries the terminating NUL along. */
    memmove(s + first, s + after, strlen(s + after) + 1);

    return first;
}
00051 
/**
 * @brief Insert one character, encoded as UTF-8, into a string in place.
 * @param[in,out] s Buffer holding a NUL-terminated string
 * @param[in] n Size limit the grown string is checked against
 * @param[in] pos Byte offset in @c s at which the encoded bytes are written
 * @param[in] c Code point to insert
 * @return Number of bytes inserted, or 0 if nothing was inserted because
 * UTF8_encoded_len() rejected @c c or the size check failed
 */
int UTF8_insert_char (char *s, int n, int pos, int c)
{
    const int encoded = UTF8_encoded_len(c);
    /* Bytes from pos up to and including the terminating NUL. */
    const int rest = strlen(&s[pos]) + 1;
    char *out = s + pos;

    /* NOTE(review): with pos inside the string, pos + rest + encoded is the
     * byte count of the grown string including its terminator. Because the
     * comparison is ">=", a result that would occupy exactly n bytes is
     * refused - confirm that this spare byte is intended. */
    if (encoded == 0 || pos + rest + encoded >= n)
        return 0;

    /* Open a gap of "encoded" bytes; the terminator moves with the tail. */
    memmove(out + encoded, out, rest);

    switch (encoded) {
    case 1:     /* c <= 0x7f: stored as-is */
        out[0] = c;
        break;
    case 2:     /* c <= 0x7ff: 11 bits split 5 + 6 */
        out[0] = 0xc0 | (c >> 6);
        out[1] = 0x80 | (c & 0x3f);
        break;
    case 3:     /* c <= 0xffff: 16 bits split 4 + 6 + 6 */
        out[0] = 0xe0 | (c >> 12);
        out[1] = 0x80 | ((c >> 6) & 0x3f);
        out[2] = 0x80 | (c & 0x3f);
        break;
    case 4:     /* c <= 0x10ffff: 21 bits split 3 + 6 + 6 + 6 */
        out[0] = 0xf0 | (c >> 18);
        out[1] = 0x80 | ((c >> 12) & 0x3f);
        out[2] = 0x80 | ((c >> 6) & 0x3f);
        out[3] = 0x80 | (c & 0x3f);
        break;
    default:
        break;
    }

    return encoded;
}
00092 
/**
 * @brief Length in bytes of a UTF-8 sequence, judged by its first byte.
 * @param[in] c First byte of the sequence
 * @return 1 to 4 for a byte that can start a sequence; 0 for a
 * continuation byte (10xxxxxx) and for 0xf8..0xff, which would start the
 * 5 and 6 byte forms that UTF-8 no longer allows
 */
int UTF8_char_len (unsigned char c)
{
    if ((c & 0x80) == 0x00)     /* 0xxxxxxx */
        return 1;
    if ((c & 0xe0) == 0xc0)     /* 110xxxxx */
        return 2;
    if ((c & 0xf0) == 0xe0)     /* 1110xxxx */
        return 3;
    if ((c & 0xf8) == 0xf0)     /* 11110xxx */
        return 4;

    /* Either 10xxxxxx (continuation byte) or 11111xxx (obsolete form). */
    return 0;
}
00119 
/**
 * @brief Number of bytes needed to encode a code point as UTF-8.
 * @param[in] c Code point
 * @return 1 to 4 for values in the range 0..0x10FFFF, 0 for any value
 * outside that range (negative, or above the highest defined Unicode code)
 */
int UTF8_encoded_len (int c)
{
    /* A negative value is not a code point. Without this check it would
     * satisfy "c <= 0x7F" and be reported as a one byte encoding. */
    if (c < 0)
        return 0;
    if (c <= 0x7F)
        return 1;
    if (c <= 0x07FF)
        return 2;
    if (c <= 0xFFFF)
        return 3;
    if (c <= 0x10FFFF)  /* highest defined Unicode code */
        return 4;
    return 0;
}
00136 
/**
 * @brief Count the characters (not the bytes) of a UTF-8 string.
 * @param[in] str NUL-terminated string
 * @return Number of characters before the terminating NUL. A byte that
 * UTF8_char_len() does not accept as the start of a sequence is counted
 * as one character of its own.
 */
size_t UTF8_strlen (const char *str)
{
    size_t result = 0;

    while (*str != '\0') {
        int remaining = UTF8_char_len((unsigned char)*str);

        /* UTF8_char_len() reports 0 for a byte that cannot start a
         * sequence; advancing by 0 would never terminate, so such a byte
         * is stepped over on its own. */
        if (remaining == 0)
            remaining = 1;

        /* Advance over the sequence, but stop at the terminator: a
         * sequence cut short by the end of the string must not carry the
         * pointer past the NUL. */
        do {
            str++;
            remaining--;
        } while (remaining > 0 && *str != '\0');

        result++;
    }
    return result;
}
00154 
/**
 * @brief Bounded string copy that always terminates the destination and
 * never cuts a multibyte UTF-8 character in half.
 * @param[out] dest Destination buffer of at least @c limit bytes
 * @param[in] src NUL-terminated source string
 * @param[in] limit Size of @c dest in bytes, terminator included
 * @return @c dest
 * @note With a @c limit of 0 nothing is written, not even a terminator.
 */
char *UTF8_strncpyz (char *dest, const char *src, size_t limit)
{
    size_t length;

    /* A zero-sized buffer has no room for the terminator. Without this
     * guard "limit - 1" wraps around to the largest size_t value, nothing
     * is truncated and the copy below overruns dest. */
    if (limit == 0)
        return dest;

    length = strlen(src);
    if (length > limit - 1) {
        length = limit - 1;
        /* The last byte kept belongs to a multibyte character: find the
         * first byte of that character and drop the whole character if
         * it would extend beyond the truncation point. */
        if (length > 0 && (unsigned char) src[length - 1] >= 0x80) {
            size_t i = length - 1;
            while (i > 0 && ((unsigned char) src[i] & 0xC0) == 0x80)
                i--;
            if ((size_t) UTF8_char_len((unsigned char) src[i]) + i > length)
                length = i;
        }
    }

    memcpy(dest, src, length);
    dest[length] = '\0';

    return dest;
}
00183