convert.h

Go to the documentation of this file.
00001 /*
00002  Copyright (C) 2001-2006, William Joseph.
00003  All Rights Reserved.
00004 
00005  This file is part of GtkRadiant.
00006 
00007  GtkRadiant is free software; you can redistribute it and/or modify
00008  it under the terms of the GNU General Public License as published by
00009  the Free Software Foundation; either version 2 of the License, or
00010  (at your option) any later version.
00011 
00012  GtkRadiant is distributed in the hope that it will be useful,
00013  but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015  GNU General Public License for more details.
00016 
00017  You should have received a copy of the GNU General Public License
00018  along with GtkRadiant; if not, write to the Free Software
00019  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
00020  */
00021 
00022 #if !defined(INCLUDED_CONVERT_H)
00023 #define INCLUDED_CONVERT_H
00024 
00027 
00028 #include "debugging/debugging.h"
00029 #include <algorithm>
00030 #include <glib/gunicode.h>
00031 #include <glib/gconvert.h>
00032 
00033 #include "character.h"
00034 
00036 inline std::size_t utf8_character_length (const char* character)
00037 {
00038     if ((*character & 0xE0) == 0xC0) { // 110xxxxx
00039         return 2;
00040     } else if ((*character & 0xF0) == 0xE0) { // 1110xxxx
00041         return 3;
00042     } else if ((*character & 0xF8) == 0xF0) { // 11110xxx
00043         return 4;
00044     } else if ((*character & 0xFC) == 0xF8) { // 111110xx
00045         return 5;
00046     } else if ((*character & 0xFE) == 0xFC) { // 1111110x
00047         return 6;
00048     }
00049     ERROR_MESSAGE("");
00050     return 0;
00051 }
00052 
00053 struct UTF8Character
00054 {
00055         const char* buffer;
00056         std::size_t length;
00057         UTF8Character () :
00058             buffer(0), length(0)
00059         {
00060         }
00061         UTF8Character (const char* bytes) :
00062             buffer(bytes), length(utf8_character_length(bytes))
00063         {
00064         }
00065 };
00066 
00067 inline bool operator< (const UTF8Character& self, const UTF8Character& other)
00068 {
00069     return std::lexicographical_compare(self.buffer, self.buffer + self.length, other.buffer, other.buffer
00070             + other.length);
00071 }
00072 
00074 template<typename TextOutputStreamType>
00075 inline TextOutputStreamType& ostream_write (TextOutputStreamType& ostream, const UTF8Character& c)
00076 {
00077     for (const char* p = c.buffer; p != c.buffer + c.length; ++p) {
00078         ostream << HexChar(*p);
00079     }
00080     return ostream;
00081 }
00082 
00086 class CharacterSet
00087 {
00088         const char* m_charSet;
00089     public:
00090         CharacterSet ()
00091         {
00092             if (g_get_charset(&m_charSet) != FALSE) {
00093                 m_charSet = 0;
00094             }
00095         }
00096         bool isUTF8 () const
00097         {
00098             return m_charSet == 0;
00099         }
00100         const char* get () const
00101         {
00102             return m_charSet;
00103         }
00104 };
00105 
00106 typedef LazyStatic<CharacterSet> GlobalCharacterSet;
00107 
00109 inline CharacterSet& globalCharacterSet ()
00110 {
00111     return GlobalCharacterSet::instance();
00112 }
00113 
00114 class UTF8CharacterToExtendedASCII
00115 {
00116     public:
00117         UTF8Character m_utf8;
00118         char m_c;
00119         UTF8CharacterToExtendedASCII () :
00120             m_c('\0')
00121         {
00122         }
00123         UTF8CharacterToExtendedASCII (const UTF8Character& utf8, char c) :
00124             m_utf8(utf8), m_c(c)
00125         {
00126         }
00127 };
00128 
00129 inline bool operator< (const UTF8CharacterToExtendedASCII& self, const UTF8CharacterToExtendedASCII& other)
00130 {
00131     return self.m_utf8 < other.m_utf8;
00132 }
00133 
/// \brief Maps a character to a table index in [0, 127] by keeping only its low seven bits.
inline std::size_t extended_ascii_to_index (char c)
{
    const int lowSevenBits = c & 0x7F;
    return static_cast<std::size_t> (lowSevenBits);
}
00138 
/// \brief Inverse of extended_ascii_to_index for high-bit characters: sets bit 7 of \p i
/// and narrows the result to a \c char.
inline char extended_ascii_for_index (std::size_t i)
{
    const std::size_t withHighBit = i | 0x80;
    return static_cast<char> (withHighBit);
}
00143 
00148 class ExtendedASCIICharacterSet
00149 {
00150         typedef char UTF8CharBuffer[6];
00151         UTF8CharBuffer m_converted[128];
00152         UTF8Character m_decodeMap[128];
00153         UTF8CharacterToExtendedASCII m_encodeMap[128];
00154     public:
00155         ExtendedASCIICharacterSet ()
00156         {
00157             if (!globalCharacterSet().isUTF8()) {
00158                 GIConv descriptor = g_iconv_open("UTF-8", globalCharacterSet().get());
00159                 for (std::size_t i = 1; i < 128; ++i) {
00160                     char c = extended_ascii_for_index(i);
00161                     char* inbuf = &c;
00162                     std::size_t inbytesleft = 1;
00163                     char* outbuf = m_converted[i];
00164                     std::size_t outbytesleft = 6;
00165                     if (g_iconv(descriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft) != (size_t) (-1)) {
00166                         UTF8Character utf8(m_converted[i]);
00167                         m_decodeMap[i] = utf8;
00168                         m_encodeMap[i] = UTF8CharacterToExtendedASCII(utf8, c);
00169                     }
00170                 }
00171                 g_iconv_close(descriptor);
00172                 std::sort(m_encodeMap, m_encodeMap + 128);
00173             }
00174         }
00177         void print () const
00178         {
00179             globalOutputStream() << "UTF-8 conversion required from charset: " << globalCharacterSet().get() << "\n";
00180             for (std::size_t i = 1; i < 128; ++i) {
00181                 if (m_decodeMap[i].buffer != 0) {
00182                     globalOutputStream() << extended_ascii_for_index(i) << " = " << m_decodeMap[i] << "\n";
00183                 }
00184             }
00185         }
00188         const UTF8Character& decode (char c) const
00189         {
00190             ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
00191             ASSERT_MESSAGE(!char_is_ascii(c), "decode: ascii character");
00192             ASSERT_MESSAGE(m_decodeMap[extended_ascii_to_index(c)].buffer != 0, "decode: invalid character: " << HexChar(c));
00193             return m_decodeMap[extended_ascii_to_index(c)];
00194         }
00197         char encode (const UTF8Character& c) const
00198         {
00199             ASSERT_MESSAGE(!globalCharacterSet().isUTF8(), "locale is utf8, no conversion required");
00200             ASSERT_MESSAGE(!char_is_ascii(*c.buffer), "encode: ascii character");
00201             std::pair<const UTF8CharacterToExtendedASCII*, const UTF8CharacterToExtendedASCII*> range =
00202                     std::equal_range(m_encodeMap, m_encodeMap + 128, UTF8CharacterToExtendedASCII(c, 0));
00203             ASSERT_MESSAGE(range.first != range.second, "encode: invalid character: " << c);
00204             return (*range.first).m_c;
00205         }
00206 };
00207 
00208 typedef LazyStatic<ExtendedASCIICharacterSet> GlobalExtendedASCIICharacterSet;
00209 
00211 inline ExtendedASCIICharacterSet& globalExtendedASCIICharacterSet ()
00212 {
00213     return GlobalExtendedASCIICharacterSet::instance();
00214 }
00215 
00216 class ConvertUTF8ToLocale
00217 {
00218     public:
00219         StringRange m_range;
00220         ConvertUTF8ToLocale (const char* string) :
00221             m_range(StringRange(string, string + strlen(string)))
00222         {
00223         }
00224         ConvertUTF8ToLocale (const StringRange& range) :
00225             m_range(range)
00226         {
00227         }
00228 };
00229 
00231 template<typename TextOutputStreamType>
00232 inline TextOutputStreamType& ostream_write (TextOutputStreamType& ostream, const ConvertUTF8ToLocale& convert)
00233 {
00234     if (globalCharacterSet().isUTF8()) {
00235         return ostream << convert.m_range;
00236     }
00237 
00238     for (const char* p = convert.m_range.first; p != convert.m_range.last;) {
00239         if (!char_is_ascii(*p)) {
00240             UTF8Character c(p);
00241             ostream << globalExtendedASCIICharacterSet().encode(c);
00242             p += c.length;
00243         } else {
00244             ostream << *p++;
00245         }
00246     }
00247     return ostream;
00248 }
00249 
00250 class ConvertLocaleToUTF8
00251 {
00252     public:
00253         StringRange m_range;
00254         ConvertLocaleToUTF8 (const char* string) :
00255             m_range(StringRange(string, string + strlen(string)))
00256         {
00257         }
00258         ConvertLocaleToUTF8 (const StringRange& range) :
00259             m_range(range)
00260         {
00261         }
00262 };
00263 
00265 template<typename TextOutputStreamType>
00266 inline TextOutputStreamType& ostream_write (TextOutputStreamType& ostream, const ConvertLocaleToUTF8& convert)
00267 {
00268     if (globalCharacterSet().isUTF8()) {
00269         return ostream << convert.m_range;
00270     }
00271 
00272     for (const char* p = convert.m_range.first; p != convert.m_range.last; ++p) {
00273         if (!char_is_ascii(*p)) {
00274             UTF8Character c(globalExtendedASCIICharacterSet().decode(*p));
00275             ostream.write(c.buffer, c.length);
00276         } else {
00277             ostream << *p;
00278         }
00279     }
00280     return ostream;
00281 }
00282 
00283 #endif