Ticket #8915: hashmappy2.patch
File hashmappy2.patch, 14.8 KB (added by , 16 years ago) 


common/hashmap.cpp
24 24 */ 25 25 26 26 // The hash map (associative array) implementation in this file is 27 // based on code by Andrew Y. Ng, 1996: 27 // based on the PyDict implementation of CPython. The erase() method 28 // is based on example code in the Wikipedia article on Hash tables. 28 29 29 /*30 * Copyright (c) 19982003 Massachusetts Institute of Technology.31 * This code was developed as part of the Haystack research project32 * (http://haystack.lcs.mit.edu/). Permission is hereby granted,33 * free of charge, to any person obtaining a copy of this software34 * and associated documentation files (the "Software"), to deal in35 * the Software without restriction, including without limitation36 * the rights to use, copy, modify, merge, publish, distribute,37 * sublicense, and/or sell copies of the Software, and to permit38 * persons to whom the Software is furnished to do so, subject to39 * the following conditions:40 *41 * The above copyright notice and this permission notice shall be42 * included in all copies or substantial portions of the Software.43 *44 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,45 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES46 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND47 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT48 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,49 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING50 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR51 * OTHER DEALINGS IN THE SOFTWARE.52 */53 54 30 #include "common/hashmap.h" 55 31 56 32 namespace Common { 57 33 58 // const char *:34 // Hash function for strings, taken from CPython. 
59 35 uint hashit(const char *p) { 60 uint hash = 0;36 uint hash = *p << 7; 61 37 byte c; 62 while ((c = *p++)) 63 hash = (hash * 31 + c); 64 return hash; 38 int size = 0; 39 while ((c = *p++)) { 40 hash = (1000003 * hash) ^ c; 41 size++; 42 } 43 return hash ^ size; 65 44 } 66 45 46 // Like hashit, but converts every char to lowercase before hashing. 67 47 uint hashit_lower(const char *p) { 68 uint hash = 0;48 uint hash = tolower(*p) << 7; 69 49 byte c; 70 while ((c = *p++)) 71 hash = (hash * 31 + tolower(c)); 72 return hash; 50 int size = 0; 51 while ((c = *p++)) { 52 hash = (1000003 * hash) ^ tolower(c); 53 size++; 54 } 55 return hash ^ size; 73 56 } 74 57 75 // The following table is taken from the GNU ISO C++ Library's hashtable.h file.76 static const uint primes[] = {77 53ul, 97ul, 193ul, 389ul, 769ul,78 1543ul, 3079ul, 6151ul, 12289ul, 24593ul,79 49157ul, 98317ul, 196613ul, 393241ul, 786433ul,80 1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul,81 50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul,82 1610612741ul, 3221225473ul, 4294967291ul83 };84 85 uint nextTableSize(uint x) {86 int i = 0;87 while (x >= primes[i])88 i++;89 return primes[i];90 }91 92 58 #ifdef DEBUG_HASH_COLLISIONS 93 59 static double 94 60 g_collisions = 0, … … 98 64 g_size = 0; 99 65 static int g_max_capacity = 0, g_max_size = 0; 100 66 static int g_totalHashmaps = 0; 67 static int g_stats[4] = {0,0,0,0}; 101 68 102 69 void updateHashCollisionStats(int collisions, int lookups, int arrsize, int nele) { 103 70 g_collisions += collisions; … … 108 75 g_size += nele; 109 76 g_totalHashmaps++; 110 77 78 if (3*nele <= 2*8) 79 g_stats[0]++; 80 if (3*nele <= 2*16) 81 g_stats[1]++; 82 if (3*nele <= 2*32) 83 g_stats[2]++; 84 if (3*nele <= 2*64) 85 g_stats[3]++; 86 111 87 g_max_capacity = MAX(g_max_capacity, arrsize); 112 88 g_max_size = MAX(g_max_size, nele); 113 89 … … 118 94 100 * g_collPerLook / g_totalHashmaps, 119 95 g_size / g_totalHashmaps, g_max_size, 120 96 g_capacity / 
g_totalHashmaps, g_max_capacity); 97 fprintf(stdout, " %d less than %d; %d less than %d; %d less than %d; %d less than %d\n", 98 g_stats[0], 2*8/3, 99 g_stats[1],2*16/3, 100 g_stats[2],2*32/3, 101 g_stats[3],2*64/3); 102 103 // TODO: 104 // * Should record the maximal size of the map during its lifetime, not that at its death 105 // * Should do some statistics: how many maps are less than 2/3*8, 2/3*16, 2/3*32, ... 121 106 } 122 107 #endif 123 108 
common/hashmap.h
24 24 */ 25 25 26 26 // The hash map (associative array) implementation in this file is 27 // based on code by Andrew Y. Ng, 1996: 27 // based on the PyDict implementation of CPython. The erase() method 28 // is based on example code in the Wikipedia article on Hash tables. 28 29 29 /*30 * Copyright (c) 19982003 Massachusetts Institute of Technology.31 * This code was developed as part of the Haystack research project32 * (http://haystack.lcs.mit.edu/). Permission is hereby granted,33 * free of charge, to any person obtaining a copy of this software34 * and associated documentation files (the "Software"), to deal in35 * the Software without restriction, including without limitation36 * the rights to use, copy, modify, merge, publish, distribute,37 * sublicense, and/or sell copies of the Software, and to permit38 * persons to whom the Software is furnished to do so, subject to39 * the following conditions:40 *41 * The above copyright notice and this permission notice shall be42 * included in all copies or substantial portions of the Software.43 *44 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,45 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES46 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND47 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT48 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,49 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING50 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR51 * OTHER DEALINGS IN THE SOFTWARE.52 */53 54 30 #ifndef COMMON_HASHMAP_H 55 31 #define COMMON_HASHMAP_H 56 32 … … 74 50 75 51 namespace Common { 76 52 77 // The table sizes ideally are primes. 
We use a helper function to find78 // suitable table sizes.79 uint nextTableSize(uint x);80 81 82 53 // Enable the following #define if you want to check how many collisions the 83 54 // code produces (many collisions indicate either a bad hash function, or a 84 55 // hash table that is too small). … … 136 107 } 137 108 #endif 138 109 110 enum { 111 HASHMAP_PERTURB_SHIFT = 5, 112 HASHMAP_MIN_CAPACITY = 16, 113 114 // The quotient of the next two constants controls how much the 115 // internal storage of the hashmap may fill up before being 116 // increased automatically. 117 // Note: the quotient of these two must be between and different 118 // from 0 and 1. 119 HASHMAP_LOADFACTOR_NUMERATOR = 2, 120 HASHMAP_LOADFACTOR_DENOMINATOR = 3 121 }; 122 139 123 Node **_storage; // hashtable of size arrsize. 140 uint _ capacity;124 uint _mask; /**< Capacity of the HashMap minus one; must be a power of two minus one */ 141 125 uint _size; 142 126 143 127 HashFunc _hash; … … 153 137 void assign(const HM_t &map); 154 138 int lookup(const Key &key) const; 155 139 int lookupAndCreateIfMissing(const Key &key); 156 void expand _array(uint newsize);140 void expandStorage(uint newCapacity); 157 141 158 142 template<class T> friend class IteratorImpl; 159 143 … … 175 159 176 160 NodeType *deref() const { 177 161 assert(_hashmap != 0); 178 assert(_idx < _hashmap->_capacity);162 assert(_idx <= _hashmap->_mask); 179 163 Node *node = _hashmap->_storage[_idx]; 180 164 assert(node != 0); 181 165 return node; … … 196 180 assert(_hashmap); 197 181 do { 198 182 _idx++; 199 } while (_idx < _hashmap->_capacity&& _hashmap->_storage[_idx] == 0);200 if (_idx > = _hashmap->_capacity)183 } while (_idx <= _hashmap->_mask && _hashmap->_storage[_idx] == 0); 184 if (_idx > _hashmap->_mask) 201 185 _idx = (uint)-1; 202 186 203 187 return *this; … … 247 231 248 232 iterator begin() { 249 233 // Find and return the first non-empty entry 250 for (uint ctr = 0; ctr < _capacity; ++ctr) {234 for (uint ctr = 0; ctr <= 
_mask; ++ctr) { 251 235 if (_storage[ctr]) 252 236 return iterator(ctr, this); 253 237 } … … 259 243 260 244 const_iterator begin() const { 261 245 // Find and return the first non-empty entry 262 for (uint ctr = 0; ctr < _capacity; ++ctr) {246 for (uint ctr = 0; ctr <= _mask; ++ctr) { 263 247 if (_storage[ctr]) 264 248 return const_iterator(ctr, this); 265 249 } … … 302 286 _nodePool(sizeof(Node)), 303 287 #endif 304 288 _defaultVal() { 305 _ capacity = nextTableSize(0);306 _storage = new Node *[ _capacity];289 _mask = HASHMAP_MIN_CAPACITY - 1; 290 _storage = new Node *[HASHMAP_MIN_CAPACITY]; 307 291 assert(_storage != NULL); 308 memset(_storage, 0, _capacity* sizeof(Node *));292 memset(_storage, 0, HASHMAP_MIN_CAPACITY * sizeof(Node *)); 309 293 310 294 _size = 0; 311 295 … … 334 318 */ 335 319 template<class Key, class Val, class HashFunc, class EqualFunc> 336 320 HashMap<Key, Val, HashFunc, EqualFunc>::~HashMap() { 337 for (uint ctr = 0; ctr < _capacity; ++ctr)321 for (uint ctr = 0; ctr <= _mask; ++ctr) 338 322 if (_storage[ctr] != NULL) 339 323 freeNode(_storage[ctr]); 340 324 341 325 delete[] _storage; 342 326 #ifdef DEBUG_HASH_COLLISIONS 343 327 extern void updateHashCollisionStats(int, int, int, int); 344 updateHashCollisionStats(_collisions, _lookups, _ capacity, _size);328 updateHashCollisionStats(_collisions, _lookups, _mask+1, _size); 345 329 #endif 346 330 } 347 331 … … 354 338 */ 355 339 template<class Key, class Val, class HashFunc, class EqualFunc> 356 340 void HashMap<Key, Val, HashFunc, EqualFunc>::assign(const HM_t &map) { 357 _ capacity = map._capacity;358 _storage = new Node *[_ capacity];341 _mask = map._mask; 342 _storage = new Node *[_mask+1]; 359 343 assert(_storage != NULL); 360 memset(_storage, 0, _capacity* sizeof(Node *));344 memset(_storage, 0, (_mask+1) * sizeof(Node *)); 361 345 362 346 // Simply clone the map given to us, one by one. 
363 347 _size = 0; 364 for (uint ctr = 0; ctr < _capacity; ++ctr) {348 for (uint ctr = 0; ctr <= _mask; ++ctr) { 365 349 if (map._storage[ctr] != NULL) { 366 350 _storage[ctr] = allocNode(map._storage[ctr]->_key); 367 351 _storage[ctr]->_value = map._storage[ctr]->_value; … … 375 359 376 360 template<class Key, class Val, class HashFunc, class EqualFunc> 377 361 void HashMap<Key, Val, HashFunc, EqualFunc>::clear(bool shrinkArray) { 378 for (uint ctr = 0; ctr < _capacity; ++ctr) {362 for (uint ctr = 0; ctr <= _mask; ++ctr) { 379 363 if (_storage[ctr] != NULL) { 380 364 freeNode(_storage[ctr]); 381 365 _storage[ctr] = NULL; 382 366 } 383 367 } 384 368 385 if (shrinkArray && _ capacity > nextTableSize(0)) {369 if (shrinkArray && _mask >= HASHMAP_MIN_CAPACITY) { 386 370 delete[] _storage; 387 371 388 _ capacity = nextTableSize(0);389 _storage = new Node *[ _capacity];372 _mask = HASHMAP_MIN_CAPACITY - 1; 373 _storage = new Node *[HASHMAP_MIN_CAPACITY]; 390 374 assert(_storage != NULL); 391 memset(_storage, 0, _capacity* sizeof(Node *));375 memset(_storage, 0, HASHMAP_MIN_CAPACITY * sizeof(Node *)); 392 376 } 393 377 394 378 _size = 0; 395 379 } 396 380 397 381 template<class Key, class Val, class HashFunc, class EqualFunc> 398 void HashMap<Key, Val, HashFunc, EqualFunc>::expand_array(uint newsize) { 399 assert(newsize > _capacity); 400 uint ctr, dex; 382 void HashMap<Key, Val, HashFunc, EqualFunc>::expandStorage(uint newCapacity) { 383 assert(newCapacity > _mask+1); 401 384 402 385 const uint old_size = _size; 403 const uint old_ capacity = _capacity;386 const uint old_mask = _mask; 404 387 Node **old_storage = _storage; 405 388 406 389 // allocate a new array 407 390 _size = 0; 408 _ capacity = newsize;409 _storage = new Node *[ _capacity];391 _mask = newCapacity - 1; 392 _storage = new Node *[newCapacity]; 410 393 assert(_storage != NULL); 411 memset(_storage, 0, _capacity * sizeof(Node *));394 memset(_storage, 0, newCapacity * sizeof(Node *)); 412 395 413 396 // rehash all 
the old elements 414 for ( ctr = 0; ctr < old_capacity; ++ctr) {397 for (uint ctr = 0; ctr <= old_mask; ++ctr) { 415 398 if (old_storage[ctr] == NULL) 416 399 continue; 417 400 … … 419 402 // Since we know that no key exists twice in the old table, we 420 403 // can do this slightly better than by calling lookup, since we 421 404 // don't have to call _equal(). 422 dex = _hash(old_storage[ctr]->_key) % _capacity; 423 while (_storage[dex] != NULL) { 424 dex = (dex + 1) % _capacity; 405 const uint hash = _hash(old_storage[ctr]->_key); 406 uint idx = hash & _mask; 407 for (uint perturb = hash; _storage[idx] != NULL; perturb >>= HASHMAP_PERTURB_SHIFT) { 408 idx = (5 * idx + perturb + 1) & _mask; 425 409 } 426 410 427 _storage[ dex] = old_storage[ctr];411 _storage[idx] = old_storage[ctr]; 428 412 _size++; 429 413 } 430 414 … … 439 423 440 424 template<class Key, class Val, class HashFunc, class EqualFunc> 441 425 int HashMap<Key, Val, HashFunc, EqualFunc>::lookup(const Key &key) const { 442 uint ctr = _hash(key) % _capacity; 426 const uint hash = _hash(key); 427 uint ctr = hash & _mask; 428 for (uint perturb = hash; ; perturb >>= HASHMAP_PERTURB_SHIFT) { 429 if (_storage[ctr] == NULL || _equal(_storage[ctr]->_key, key)) 430 break; 443 431 444 while (_storage[ctr] != NULL && !_equal(_storage[ctr]->_key, key)) { 445 ctr = (ctr + 1) % _capacity; 432 ctr = (5 * ctr + perturb + 1) & _mask; 446 433 447 434 #ifdef DEBUG_HASH_COLLISIONS 448 435 _collisions++; … … 453 440 _lookups++; 454 441 fprintf(stderr, "collisions %d, lookups %d, ratio %f in HashMap %p; size %d num elements %d\n", 455 442 _collisions, _lookups, ((double) _collisions / (double)_lookups), 456 (const void *)this, _ capacity, _size);443 (const void *)this, _mask+1, _size); 457 444 #endif 458 445 459 446 return ctr; … … 467 454 _storage[ctr] = allocNode(key); 468 455 _size++; 469 456 470 // Keep the load factor below 75%. 
471 if (_size > _capacity * 75 / 100) { 472 expand_array(nextTableSize(_capacity)); 457 // Keep the load factor below a certain threshold. 458 uint capacity = _mask + 1; 459 if (_size * HASHMAP_LOADFACTOR_DENOMINATOR > capacity * HASHMAP_LOADFACTOR_NUMERATOR) { 460 capacity = capacity < 500 ? (capacity * 4) : (capacity * 2); 461 expandStorage(capacity); 473 462 ctr = lookup(key); 474 463 } 475 464 } … … 520 509 template<class Key, class Val, class HashFunc, class EqualFunc> 521 510 void HashMap<Key, Val, HashFunc, EqualFunc>::erase(const Key &key) { 522 511 // This is based on code in the Wikipedia article on Hash tables. 523 uint i = lookup(key); 512 513 const uint hash = _hash(key); 514 uint i = hash & _mask; 515 uint perturb; 516 517 for (perturb = hash; ; perturb >>= HASHMAP_PERTURB_SHIFT) { 518 if (_storage[i] == NULL || _equal(_storage[i]->_key, key)) 519 break; 520 521 i = (5 * i + perturb + 1) & _mask; 522 } 523 524 524 if (_storage[i] == NULL) 525 525 return; // key wasn't present, so no work has to be done 526 526 527 // If we remove a key, we must check all subsequent keys and possibly 527 528 // reinsert them. 528 529 uint j = i; 529 530 freeNode(_storage[i]); 530 531 _storage[i] = NULL; 531 while (true) {532 for (perturb = hash; ; perturb >>= HASHMAP_PERTURB_SHIFT) { 532 533 // Look at the next table slot 533 j = ( j + 1) % _capacity;534 j = (5 * j + perturb + 1) & _mask; 534 535 // If the next slot is empty, we are done 535 536 if (_storage[j] == NULL) 536 537 break; 537 538 // Compute the slot where the content of the next slot should normally be, 538 539 // assuming an empty table, and check whether we have to move it. 539 uint k = _hash(_storage[j]->_key) % _capacity;540 uint k = _hash(_storage[j]->_key) & _mask; 540 541 if ((j > i && (k <= i || k > j)) || 541 542 (j < i && (k <= i && k > j)) ) { 542 543 _storage[i] = _storage[j];