hashfunc.h

Go to the documentation of this file.
00001 /*
00002 Copyright (C) 2001-2006, William Joseph.
00003 All Rights Reserved.
00004 
00005 This file is part of GtkRadiant.
00006 
00007 GtkRadiant is free software; you can redistribute it and/or modify
00008 it under the terms of the GNU General Public License as published by
00009 the Free Software Foundation; either version 2 of the License, or
00010 (at your option) any later version.
00011 
00012 GtkRadiant is distributed in the hope that it will be useful,
00013 but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 GNU General Public License for more details.
00016 
00017 You should have received a copy of the GNU General Public License
00018 along with GtkRadiant; if not, write to the Free Software
00019 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
00020 */
00021 
00022 #if !defined(INCLUDED_CONTAINER_HASHFUNC_H)
00023 #define INCLUDED_CONTAINER_HASHFUNC_H
00024 
00025 #include <string>
00026 #include <cctype>
00027 #include "string/string.h"
00028 #include "container/array.h"
00029 typedef  unsigned long  int  ub4;   /* unsigned 4-byte quantities */
00030 typedef  unsigned       char ub1;
00031 
00032 inline ub1 ub1_as_ub1_nocase(ub1 byte) {
00033     return std::tolower(byte);
00034 }
00035 
00036 inline ub4 ub1x4_as_ub4_nocase(const ub1 bytes[4]) {
00037     ub4 result;
00038     reinterpret_cast<ub1*>(&result)[0] = ub1_as_ub1_nocase(bytes[0]);
00039     reinterpret_cast<ub1*>(&result)[1] = ub1_as_ub1_nocase(bytes[1]);
00040     reinterpret_cast<ub1*>(&result)[2] = ub1_as_ub1_nocase(bytes[2]);
00041     reinterpret_cast<ub1*>(&result)[3] = ub1_as_ub1_nocase(bytes[3]);
00042     return result;
00043 }
00044 
00045 class ub1_default_traits {
00046 public:
00047     static ub1 as_ub1(ub1 byte) {
00048         return byte;
00049     }
00050 };
00051 
00052 class ub1_nocase_traits {
00053 public:
00054     static ub1 as_ub1(ub1 byte) {
00055         return ub1_as_ub1_nocase(byte);
00056     }
00057 };
00058 
00059 class ub1x4_default_traits {
00060 public:
00061     static ub4 as_ub4(const ub1 bytes[4]) {
00062         return *reinterpret_cast<const ub4*>(bytes);
00063     }
00064 };
00065 
00066 class ub1x4_nocase_traits {
00067 public:
00068     static ub4 as_ub4(const ub1 bytes[4]) {
00069         return ub1x4_as_ub4_nocase(bytes);
00070     }
00071 };
00072 
00073 class ub4_default_traits {
00074 public:
00075     static ub4 as_ub4(ub4 i) {
00076         return i;
00077     }
00078 };
00079 
00080 class ub4_nocase_traits {
00081 public:
00082     static ub4 as_ub4(ub4 i) {
00083         return ub1x4_as_ub4_nocase(reinterpret_cast<const ub1*>(&i));
00084     }
00085 };
00086 
00087 // lookup2.c
00088 // By Bob Jenkins, 1996.  bob_jenkins@burtleburtle.net.  You may use this
00089 // code any way you wish, private, educational, or commercial.  It's free.
00090 
00091 #define hashsize(n) ((ub4)1<<(n))
00092 #define hashmask(n) (hashsize(n)-1)
00093 
00094 /*
00095 --------------------------------------------------------------------
00096 mix -- mix 3 32-bit values reversibly.
00097 For every delta with one or two bit set, and the deltas of all three
00098   high bits or all three low bits, whether the original value of a,b,c
00099   is almost all zero or is uniformly distributed,
00100 * If mix() is run forward or backward, at least 32 bits in a,b,c
00101   have at least 1/4 probability of changing.
00102 * If mix() is run forward, every bit of c will change between 1/3 and
00103   2/3 of the time.  (Well, 22/100 and 78/100 for some 2-bit deltas.)
00104 mix() was built out of 36 single-cycle latency instructions in a
00105   structure that could supported 2x parallelism, like so:
00106       a -= b;
00107       a -= c; x = (c>>13);
00108       b -= c; a ^= x;
00109       b -= a; x = (a<<8);
00110       c -= a; b ^= x;
00111       c -= b; x = (b>>13);
00112       ...
00113   Unfortunately, superscalar Pentiums and Sparcs can't take advantage
00114   of that parallelism.  They've also turned some of those single-cycle
00115   latency instructions into multi-cycle latency instructions.  Still,
00116   this is the fastest good hash I could find.  There were about 2^^68
00117   to choose from.  I only looked at a billion or so.
00118 --------------------------------------------------------------------
00119 */
00120 #define mix(a,b,c) \
00121 { \
00122   a -= b; a -= c; a ^= (c>>13); \
00123   b -= c; b -= a; b ^= (a<<8); \
00124   c -= a; c -= b; c ^= (b>>13); \
00125   a -= b; a -= c; a ^= (c>>12);  \
00126   b -= c; b -= a; b ^= (a<<16); \
00127   c -= a; c -= b; c ^= (b>>5); \
00128   a -= b; a -= c; a ^= (c>>3);  \
00129   b -= c; b -= a; b ^= (a<<10); \
00130   c -= a; c -= b; c ^= (b>>15); \
00131 }
00132 
00133 /* same, but slower, works on systems that might have 8 byte ub4's */
00134 #define mix2(a,b,c) \
00135 { \
00136   a -= b; a -= c; a ^= (c>>13); \
00137   b -= c; b -= a; b ^= (a<< 8); \
00138   c -= a; c -= b; c ^= ((b&0xffffffff)>>13); \
00139   a -= b; a -= c; a ^= ((c&0xffffffff)>>12); \
00140   b -= c; b -= a; b = (b ^ (a<<16)) & 0xffffffff; \
00141   c -= a; c -= b; c = (c ^ (b>> 5)) & 0xffffffff; \
00142   a -= b; a -= c; a = (a ^ (c>> 3)) & 0xffffffff; \
00143   b -= c; b -= a; b = (b ^ (a<<10)) & 0xffffffff; \
00144   c -= a; c -= b; c = (c ^ (b>>15)) & 0xffffffff; \
00145 }
00146 
00147 /*
00148 --------------------------------------------------------------------
00149 hash() -- hash a variable-length key into a 32-bit value
00150   k     : the key (the unaligned variable-length array of bytes)
00151   len   : the length of the key, counting by bytes
00152   level : can be any 4-byte value
00153 Returns a 32-bit value.  Every bit of the key affects every bit of
00154 the return value.  Every 1-bit and 2-bit delta achieves avalanche.
00155 About 36+6len instructions.
00156 
00157 The best hash table sizes are powers of 2.  There is no need to do
00158 mod a prime (mod is sooo slow!).  If you need less than 32 bits,
00159 use a bitmask.  For example, if you need only 10 bits, do
00160   h = (h & hashmask(10));
00161 In which case, the hash table should have hashsize(10) elements.
00162 
00163 If you are hashing n strings (ub1 **)k, do it like this:
00164   for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
00165 
00166 See http://burlteburtle.net/bob/hash/evahash.html
00167 Use for hash table lookup, or anything where one collision in 2^32 is
00168 acceptable.  Do NOT use for cryptographic purposes.
00169 --------------------------------------------------------------------
00170 */
00171 
00172 template<typename UB1Traits, typename UB4x1Traits>
00173 inline ub4 hash(
00174     const ub1 *k,        /* the key */
00175     ub4  length,   /* the length of the key */
00176     ub4  initval,    /* the previous hash, or an arbitrary value */
00177     const UB1Traits& ub1traits,
00178     const UB4x1Traits& ub4x1traits
00179 ) {
00180     register ub4 a, b, c, len;
00181 
00182     /* Set up the internal state */
00183     len = length;
00184     a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */
00185     c = initval;           /* the previous hash value */
00186 
00187     /*---------------------------------------- handle most of the key */
00188     while (len >= 12) {
00189         a += (k[0] + ((ub4)UB1Traits::as_ub1(k[1]) << 8) + ((ub4)UB1Traits::as_ub1(k[2]) << 16) + ((ub4)UB1Traits::as_ub1(k[3]) << 24));
00190         b += (k[4] + ((ub4)UB1Traits::as_ub1(k[5]) << 8) + ((ub4)UB1Traits::as_ub1(k[6]) << 16) + ((ub4)UB1Traits::as_ub1(k[7]) << 24));
00191         c += (k[8] + ((ub4)UB1Traits::as_ub1(k[9]) << 8) + ((ub4)UB1Traits::as_ub1(k[10]) << 16) + ((ub4)UB1Traits::as_ub1(k[11]) << 24));
00192         mix(a, b, c);
00193         k += 12;
00194         len -= 12;
00195     }
00196 
00197     /*------------------------------------- handle the last 11 bytes */
00198     c += length;
00199     switch (len) {           /* all the case statements fall through */
00200     case 11:
00201         c += ((ub4)UB1Traits::as_ub1(k[10]) << 24);
00202     case 10:
00203         c += ((ub4)UB1Traits::as_ub1(k[9]) << 16);
00204     case 9 :
00205         c += ((ub4)UB1Traits::as_ub1(k[8]) << 8);
00206         /* the first byte of c is reserved for the length */
00207     case 8 :
00208         b += ((ub4)UB1Traits::as_ub1(k[7]) << 24);
00209     case 7 :
00210         b += ((ub4)UB1Traits::as_ub1(k[6]) << 16);
00211     case 6 :
00212         b += ((ub4)UB1Traits::as_ub1(k[5]) << 8);
00213     case 5 :
00214         b += UB1Traits::as_ub1(k[4]);
00215     case 4 :
00216         a += ((ub4)UB1Traits::as_ub1(k[3]) << 24);
00217     case 3 :
00218         a += ((ub4)UB1Traits::as_ub1(k[2]) << 16);
00219     case 2 :
00220         a += ((ub4)UB1Traits::as_ub1(k[1]) << 8);
00221     case 1 :
00222         a += UB1Traits::as_ub1(k[0]);
00223         /* case 0: nothing left to add */
00224     }
00225     mix(a, b, c);
00226     /*-------------------------------------------- report the result */
00227     return c;
00228 }
00229 
00230 /*
00231 --------------------------------------------------------------------
00232  This works on all machines.  hash2() is identical to hash() on
00233  little-endian machines, except that the length has to be measured
00234  in ub4s instead of bytes.  It is much faster than hash().  It
00235  requires
00236  -- that the key be an array of ub4's, and
00237  -- that all your machines have the same endianness, and
00238  -- that the length be the number of ub4's in the key
00239 --------------------------------------------------------------------
00240 */
00241 template<typename UB4Traits>
00242 inline ub4 hash2(
00243     const ub4 *k,        /* the key */
00244     ub4  length,   /* the length of the key, in ub4s */
00245     ub4  initval, /* the previous hash, or an arbitrary value */
00246     const UB4Traits& ub4traits
00247 ) {
00248     register ub4 a, b, c, len;
00249 
00250     /* Set up the internal state */
00251     len = length;
00252     a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */
00253     c = initval;           /* the previous hash value */
00254 
00255     /*---------------------------------------- handle most of the key */
00256     while (len >= 3) {
00257         a += UB4Traits::as_ub4(k[0]);
00258         b += UB4Traits::as_ub4(k[1]);
00259         c += UB4Traits::as_ub4(k[2]);
00260         mix(a, b, c);
00261         k += 3;
00262         len -= 3;
00263     }
00264 
00265     /*-------------------------------------- handle the last 2 ub4's */
00266     c += length;
00267     switch (len) {           /* all the case statements fall through */
00268         /* c is reserved for the length */
00269     case 2 :
00270         b += UB4Traits::as_ub4(k[1]);
00271     case 1 :
00272         a += UB4Traits::as_ub4(k[0]);
00273         /* case 0: nothing left to add */
00274     }
00275     mix(a, b, c);
00276     /*-------------------------------------------- report the result */
00277     return c;
00278 }
00279 
00280 typedef ub4 hash_t;
00281 
00282 inline hash_t hash_ub1(const ub1* key, std::size_t len, hash_t previous = 0) {
00283     return hash(key, ub4(len), previous, ub1_default_traits(), ub1x4_default_traits());
00284 }
00285 
00286 inline hash_t hash_ub1_nocase(const ub1* key, std::size_t len, hash_t previous = 0) {
00287     return hash(key, ub4(len), previous, ub1_nocase_traits(), ub1x4_nocase_traits());
00288 }
00289 
00290 template<typename UB4Traits>
00291 inline hash_t hash_ub4(const ub4* key, std::size_t len, const UB4Traits& traits, hash_t previous = 0) {
00292     return hash2(key, ub4(len), previous, traits);
00293 }
00294 
00295 inline ub4 hash_combine(ub4 left, ub4 right) {
00296     return hash_ub1(reinterpret_cast<const ub1*>(&left), 4, right);
00297 }
00298 
00299 template<typename POD>
00300 inline hash_t pod_hash(const POD& pod) {
00301     return hash_ub1(reinterpret_cast<const ub1*>(&pod), sizeof(POD));
00302 }
00303 
00304 inline hash_t string_hash(const char* string, hash_t previous = 0) {
00305     return hash_ub1(reinterpret_cast<const ub1*>(string), string_length(string), previous);
00306 }
00307 
00308 inline hash_t string_hash_nocase(const char* string, hash_t previous = 0) {
00309     return hash_ub1_nocase(reinterpret_cast<const ub1*>(string), string_length(string), previous);
00310 }
00311 
00312 struct RawStringHash {
00313     typedef hash_t hash_type;
00314     hash_type operator()(const std::string& string) const {
00315         return string_hash(string.c_str());
00316     }
00317 };
00318 
00319 struct HashString {
00320     typedef hash_t hash_type;
00321     hash_type operator()(const std::string& string) const {
00322         return string_hash(string.c_str());
00323     }
00324 };
00325 
00326 struct HashStringNoCase {
00327     typedef hash_t hash_type;
00328     hash_type operator()(const std::string& string) const {
00329         return string_hash_nocase(string.c_str());
00330     }
00331 };
00332 
00337 inline std::size_t string_length_ub4(const char* string) {
00338     return ((string_length(string) >> 2) + 1) << 2;
00339 }
00340 
00343 template < typename UB4Traits = ub4_default_traits >
00344 class HashKey {
00345     Array<ub4> m_key;
00346     hash_t m_hash;
00347 
00348     void copy(const HashKey& other) {
00349         std::copy(other.m_key.begin(), other.m_key.end(), m_key.begin());
00350         m_hash = other.m_hash;
00351     }
00352     void copy(const char* string) {
00353         strncpy(reinterpret_cast<char*>(m_key.data()), string, m_key.size());
00354         for (Array<ub4>::iterator i = m_key.begin(); i != m_key.end(); ++i) {
00355             *i = UB4Traits::as_ub4(*i);
00356         }
00357         m_hash = hash_ub4(m_key.data(), m_key.size(), ub4_default_traits());
00358     }
00359     bool equal(const HashKey& other) const {
00360         return m_hash == other.m_hash && m_key.size() == other.m_key.size()
00361                && std::equal(m_key.begin(), m_key.end(), other.m_key.begin());
00362     }
00363 
00364 public:
00365     HashKey(const HashKey& other) : m_key(other.m_key.size()) {
00366         copy(other);
00367     }
00368     HashKey(const char* string) : m_key(string_length_ub4(string)) {
00369         copy(string);
00370     }
00371     HashKey& operator=(const char* string) {
00372         m_key.resize(string_length_ub4(string));
00373         copy(string);
00374         return *this;
00375     }
00376     bool operator==(const HashKey& other) const {
00377         return equal(other);
00378     }
00379     bool operator!=(const HashKey& other) const {
00380         return !equal(other);
00381     }
00382     hash_t hash() const {
00383         return m_hash;
00384     }
00385 };
00386 
00388 struct HashKeyHasher {
00389     typedef hash_t hash_type;
00390     hash_type operator()(const HashKey<ub4_default_traits>& key) const {
00391         return key.hash();
00392     }
00393 };
00394 
00395 
00396 
00397 #endif

Generated by  doxygen 1.6.2