convert.h
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #if !defined(INCLUDED_CONVERT_H)
00023 #define INCLUDED_CONVERT_H
00024
00027
00028 #include "debugging/debugging.h"
00029 #include <algorithm>
00030 #include <glib/gunicode.h>
00031 #include <glib/gconvert.h>
00032
00033 #include "character.h"
00034
00036 inline std::size_t utf8_character_length (const char* character)
00037 {
00038 if ((*character & 0xE0) == 0xC0) {
00039 return 2;
00040 } else if ((*character & 0xF0) == 0xE0) {
00041 return 3;
00042 } else if ((*character & 0xF8) == 0xF0) {
00043 return 4;
00044 } else if ((*character & 0xFC) == 0xF8) {
00045 return 5;
00046 } else if ((*character & 0xFE) == 0xFC) {
00047 return 6;
00048 }
00049 ERROR_MESSAGE("");
00050 return 0;
00051 }
00052
00053 struct UTF8Character
00054 {
00055 const char* buffer;
00056 std::size_t length;
00057 UTF8Character () :
00058 buffer(0), length(0)
00059 {
00060 }
00061 UTF8Character (const char* bytes) :
00062 buffer(bytes), length(utf8_character_length(bytes))
00063 {
00064 }
00065 };
00066
00067 inline bool operator< (const UTF8Character& self, const UTF8Character& other)
00068 {
00069 return std::lexicographical_compare(self.buffer, self.buffer + self.length, other.buffer, other.buffer
00070 + other.length);
00071 }
00072
00074 template<typename TextOutputStreamType>
00075 inline TextOutputStreamType& ostream_write (TextOutputStreamType& ostream, const UTF8Character& c)
00076 {
00077 for (const char* p = c.buffer; p != c.buffer + c.length; ++p) {
00078 ostream << HexChar(*p);
00079 }
00080 return ostream;
00081 }
00082
00086 class CharacterSet
00087 {
00088 const char* m_charSet;
00089 public:
00090 CharacterSet ()
00091 {
00092 if (g_get_charset(&m_charSet) != FALSE) {
00093 m_charSet = 0;
00094 }
00095 }
00096 bool isUTF8 () const
00097 {
00098 return m_charSet == 0;
00099 }
00100 const char* get () const
00101 {
00102 return m_charSet;
00103 }
00104 };
00105
00106 typedef LazyStatic<CharacterSet> GlobalCharacterSet;
00107
00109 inline CharacterSet& globalCharacterSet ()
00110 {
00111 return GlobalCharacterSet::instance();
00112 }
00113
00114 class UTF8CharacterToExtendedASCII
00115 {
00116 public:
00117 UTF8Character m_utf8;
00118 char m_c;
00119 UTF8CharacterToExtendedASCII () :
00120 m_c('\0')
00121 {
00122 }
00123 UTF8CharacterToExtendedASCII (const UTF8Character& utf8, char c) :
00124 m_utf8(utf8), m_c(c)
00125 {
00126 }
00127 };
00128
00129 inline bool operator< (const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other)
00130 {
00131 return self.m_utf8 < other.m_utf8;
00132 }
00133
/// \brief Maps a character to a table index in [0, 127] by keeping only its low seven bits.
inline std::size_t extended_ascii_to_index (char c)
{
	const unsigned char byte = static_cast<unsigned char> (c);
	const std::size_t lowSevenBits = byte & 0x7Fu;
	return lowSevenBits;
}
00138
/// \brief Inverse of extended_ascii_to_index for indices in [0, 127]: sets the high bit of \p i
/// and returns the result as a char.
inline char extended_ascii_for_index (std::size_t i)
{
	const std::size_t highBit = 0x80;
	const std::size_t code = i | highBit;
	return static_cast<char> (code);
}
00143
00148 class ExtendedASCIICharacterSet
00149 {
00150 typedef char UTF8CharBuffer[6];
00151 UTF8CharBuffer m_converted[128];
00152 UTF8Character m_decodeMap[128];
00153 UTF8CharacterToExtendedASCII m_encodeMap[128];
00154 public:
00155 ExtendedASCIICharacterSet ()
00156 {
00157 if (!globalCharacterSet().isUTF8()) {
00158 GIConv descriptor = g_iconv_open("UTF-8", globalCharacterSet().get());
00159 for (std::size_t i = 1; i < 128; ++i) {
00160 char c = extended_ascii_for_index(i);
00161 char* inbuf = &c;
00162 std::size_t inbytesleft = 1;
00163 char* outbuf = m_converted[i];
00164 std::size_t outbytesleft = 6;
00165 if (g_iconv(descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft) != (size_t) (-1)) {
00166 UTF8Character utf8(m_converted[i]);
00167 m_decodeMap[i] = utf8;
00168 m_encodeMap[i] = UTF8CharacterToExtendedASCII(utf8, c);
00169 }
00170 }
00171 g_iconv_close(descriptor);
00172 std::sort(m_encodeMap, m_encodeMap + 128);
00173 }
00174 }
00177 void print () const
00178 {
00179 globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
00180 for (std::size_t i = 1; i < 128; ++i) {
00181 if (m_decodeMap[i].buffer != 0) {
00182 globalOutputStream() << extended_ascii_for_index(i) << " = " << m_decodeMap[i] << "\n";
00183 }
00184 }
00185 }
00188 const UTF8Character& decode (char c) const
00189 {
00190 ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
00191 ASSERT_MESSAGE(!char_is_ascii(c), "decode: ascii character");
00192 ASSERT_MESSAGE(m_decodeMap[extended_ascii_to_index(c)].buffer != 0, "decode: invalid character: " << HexChar(c));
00193 return m_decodeMap[extended_ascii_to_index(c)];
00194 }
00197 char encode (const UTF8Character& c) const
00198 {
00199 ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
00200 ASSERT_MESSAGE(!char_is_ascii(*c.buffer), "encode: ascii character");
00201 std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range =
00202 std::equal_range(m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII(c, 0));
00203 ASSERT_MESSAGE(range.first != range.second, "encode: invalid character: " << c);
00204 return (*range.first).m_c;
00205 }
00206 };
00207
00208 typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
00209
00211 inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet ()
00212 {
00213 return GlobalExtendedASCIICharacterSet::instance();
00214 }
00215
00216 class ConvertUTF8ToLocale
00217 {
00218 public:
00219 StringRange m_range;
00220 ConvertUTF8ToLocale (const char* string) :
00221 m_range(StringRange(string, string + strlen(string)))
00222 {
00223 }
00224 ConvertUTF8ToLocale (const StringRange& range) :
00225 m_range(range)
00226 {
00227 }
00228 };
00229
00231 template<typename TextOutputStreamType>
00232 inline TextOutputStreamType& ostream_write (TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert)
00233 {
00234 if (globalCharacterSet().isUTF8()) {
00235 return ostream << convert.m_range;
00236 }
00237
00238 for (const char* p = convert.m_range.first; p != convert.m_range.last;) {
00239 if (!char_is_ascii(*p)) {
00240 UTF8Character c(p);
00241 ostream << globalExtendedASCIICharacterSet().encode(c);
00242 p += c.length;
00243 } else {
00244 ostream << *p++;
00245 }
00246 }
00247 return ostream;
00248 }
00249
00250 class ConvertLocaleToUTF8
00251 {
00252 public:
00253 StringRange m_range;
00254 ConvertLocaleToUTF8 (const char* string) :
00255 m_range(StringRange(string, string + strlen(string)))
00256 {
00257 }
00258 ConvertLocaleToUTF8 (const StringRange& range) :
00259 m_range(range)
00260 {
00261 }
00262 };
00263
00265 template<typename TextOutputStreamType>
00266 inline TextOutputStreamType& ostream_write (TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert)
00267 {
00268 if (globalCharacterSet().isUTF8()) {
00269 return ostream << convert.m_range;
00270 }
00271
00272 for (const char* p = convert.m_range.first; p != convert.m_range.last; ++p) {
00273 if (!char_is_ascii(*p)) {
00274 UTF8Character c(globalExtendedASCIICharacterSet().decode(*p));
00275 ostream.write(c.buffer, c.length);
00276 } else {
00277 ostream << *p;
00278 }
00279 }
00280 return ostream;
00281 }
00282
00283 #endif