Ticket #8915: hashmap-py.patch

File hashmap-py.patch, 10.7 KB (added by fingolfin, 16 years ago)
  • common/hashmap.cpp

     
    2424 */
    2525
    2626// The hash map (associative array) implementation in this file is
    27 // based on code by Andrew Y. Ng, 1996:
     27// based on the PyDict implementation of CPython. The erase() method
     28// is based on example code in the Wikipedia article on Hash tables.
    2829
    29 /*
    30  * Copyright (c) 1998-2003 Massachusetts Institute of Technology.
    31  * This code was developed as part of the Haystack research project
    32  * (http://haystack.lcs.mit.edu/). Permission is hereby granted,
    33  * free of charge, to any person obtaining a copy of this software
    34  * and associated documentation files (the "Software"), to deal in
    35  * the Software without restriction, including without limitation
    36  * the rights to use, copy, modify, merge, publish, distribute,
    37  * sublicense, and/or sell copies of the Software, and to permit
    38  * persons to whom the Software is furnished to do so, subject to
    39  * the following conditions:
    40  *
    41  * The above copyright notice and this permission notice shall be
    42  * included in all copies or substantial portions of the Software.
    43  *
    44  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    45  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    46  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    47  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    48  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    49  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    50  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    51  * OTHER DEALINGS IN THE SOFTWARE.
    52  */
    53 
    5430#include "common/hashmap.h"
    5531
    5632namespace Common {
    5733
    58 // const char *:
     34// Hash function for strings, taken from CPython.
    5935uint hashit(const char *p) {
    60         uint hash = 0;
     36        uint hash = *p << 7;
    6137        byte c;
    62         while ((c = *p++))
    63                 hash = (hash * 31 + c);
    64         return hash;
     38        int size = 0;
     39        while ((c = *p++)) {
     40                hash = (1000003 * hash) ^ c;
     41                size++;
     42        }
     43        return hash ^ size;
    6544}
    6645
     46// Like hashit, but converts every char to lowercase before hashing.
    6747uint hashit_lower(const char *p) {
    68         uint hash = 0;
     48        uint hash = tolower(*p) << 7;
    6949        byte c;
    70         while ((c = *p++))
    71                 hash = (hash * 31 + tolower(c));
    72         return hash;
     50        int size = 0;
     51        while ((c = *p++)) {
     52                hash = (1000003 * hash) ^ tolower(c);
     53                size++;
     54        }
     55        return hash ^ size;
    7356}
    7457
    75 // The following table is taken from the GNU ISO C++ Library's hashtable.h file.
    76 static const uint primes[] = {
    77         53ul,         97ul,         193ul,       389ul,       769ul,
    78         1543ul,       3079ul,       6151ul,      12289ul,     24593ul,
    79         49157ul,      98317ul,      196613ul,    393241ul,    786433ul,
    80         1572869ul,    3145739ul,    6291469ul,   12582917ul,  25165843ul,
    81         50331653ul,   100663319ul,  201326611ul, 402653189ul, 805306457ul,
    82         1610612741ul, 3221225473ul, 4294967291ul
    83 };
    84 
    85 uint nextTableSize(uint x) {
    86         int i = 0;
    87         while (x >= primes[i])
    88                 i++;
    89         return primes[i];
    90 }
    91 
    9258#ifdef DEBUG_HASH_COLLISIONS
    9359static double
    9460        g_collisions = 0,
     
    9864        g_size = 0;
    9965static int g_max_capacity = 0, g_max_size = 0;
    10066static int g_totalHashmaps = 0;
     67static int g_stats[4] = {0,0,0,0};
    10168
    10269void updateHashCollisionStats(int collisions, int lookups, int arrsize, int nele) {
    10370        g_collisions += collisions;
     
    10875        g_size += nele;
    10976        g_totalHashmaps++;
    11077       
     78        if (3*nele <= 2*8)
     79                g_stats[0]++;
     80        if (3*nele <= 2*16)
     81                g_stats[1]++;
     82        if (3*nele <= 2*32)
     83                g_stats[2]++;
     84        if (3*nele <= 2*64)
     85                g_stats[3]++;
     86       
    11187        g_max_capacity = MAX(g_max_capacity, arrsize);
    11288        g_max_size = MAX(g_max_size, nele);
    11389
     
    11894                100 * g_collPerLook / g_totalHashmaps,
    11995                g_size / g_totalHashmaps, g_max_size,
    12096                g_capacity / g_totalHashmaps, g_max_capacity);
     97        fprintf(stdout, "  %d less than %d; %d less than %d; %d less than %d; %d less than %d\n",
     98                        g_stats[0], 2*8/3,
     99                        g_stats[1],2*16/3,
     100                        g_stats[2],2*32/3,
     101                        g_stats[3],2*64/3);
     102
     103        // TODO:
     104        // * Should record the maximal size of the map during its lifetime, not that at its death
     105        // * Should do some statistics: how many maps are less than 2/3*8, 2/3*16, 2/3*32, ...
    121106}
    122107#endif
    123108
  • common/hashmap.h

     
    2424 */
    2525
    2626// The hash map (associative array) implementation in this file is
    27 // based on code by Andrew Y. Ng, 1996:
     27// based on the PyDict implementation of CPython. The erase() method
     28// is based on example code in the Wikipedia article on Hash tables.
    2829
    29 /*
    30  * Copyright (c) 1998-2003 Massachusetts Institute of Technology.
    31  * This code was developed as part of the Haystack research project
    32  * (http://haystack.lcs.mit.edu/). Permission is hereby granted,
    33  * free of charge, to any person obtaining a copy of this software
    34  * and associated documentation files (the "Software"), to deal in
    35  * the Software without restriction, including without limitation
    36  * the rights to use, copy, modify, merge, publish, distribute,
    37  * sublicense, and/or sell copies of the Software, and to permit
    38  * persons to whom the Software is furnished to do so, subject to
    39  * the following conditions:
    40  *
    41  * The above copyright notice and this permission notice shall be
    42  * included in all copies or substantial portions of the Software.
    43  *
    44  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    45  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    46  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    47  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    48  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    49  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    50  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    51  * OTHER DEALINGS IN THE SOFTWARE.
    52  */
    53 
    5430#ifndef COMMON_HASHMAP_H
    5531#define COMMON_HASHMAP_H
    5632
     
    7450
    7551namespace Common {
    7652
    77 // The table sizes ideally are primes. We use a helper function to find
    78 // suitable table sizes.
    79 uint nextTableSize(uint x);
    80 
    81 
    8253// Enable the following #define if you want to check how many collisions the
    8354// code produces (many collisions indicate either a bad hash function, or a
    8455// hash table that is too small).
    85 //#define DEBUG_HASH_COLLISIONS
     56#define DEBUG_HASH_COLLISIONS
    8657
    8758
    8859/**
     
    136107        }
    137108#endif
    138109
     110        enum {
     111                HASHMAP_PERTURB_SHIFT = 5,
     112                HASHMAP_MIN_CAPACITY = 8
     113        };
     114
    139115        Node **_storage;        // hashtable of size arrsize.
    140         uint _capacity;
     116        uint _capacity;         /**< Current capacity of the HashMap; must be a power of two */
    141117        uint _size;
    142118
    143119        HashFunc _hash;
     
    302278        _nodePool(sizeof(Node)),
    303279#endif
    304280        _defaultVal() {
    305         _capacity = nextTableSize(0);
     281        _capacity = HASHMAP_MIN_CAPACITY;
    306282        _storage = new Node *[_capacity];
    307283        assert(_storage != NULL);
    308284        memset(_storage, 0, _capacity * sizeof(Node *));
     
    382358                }
    383359        }
    384360
    385         if (shrinkArray && _capacity > nextTableSize(0)) {
     361        if (shrinkArray && _capacity > HASHMAP_MIN_CAPACITY) {
    386362                delete[] _storage;
    387363
    388                 _capacity = nextTableSize(0);
     364                _capacity = HASHMAP_MIN_CAPACITY;
    389365                _storage = new Node *[_capacity];
    390366                assert(_storage != NULL);
    391367                memset(_storage, 0, _capacity * sizeof(Node *));
     
    397373template<class Key, class Val, class HashFunc, class EqualFunc>
    398374void HashMap<Key, Val, HashFunc, EqualFunc>::expand_array(uint newsize) {
    399375        assert(newsize > _capacity);
    400         uint ctr, dex;
    401376
    402377        const uint old_size = _size;
    403378        const uint old_capacity = _capacity;
     
    411386        memset(_storage, 0, _capacity * sizeof(Node *));
    412387
    413388        // rehash all the old elements
    414         for (ctr = 0; ctr < old_capacity; ++ctr) {
     389        for (uint ctr = 0; ctr < old_capacity; ++ctr) {
    415390                if (old_storage[ctr] == NULL)
    416391                        continue;
    417392
     
    419394                // Since we know that no key exists twice in the old table, we
    420395                // can do this slightly better than by calling lookup, since we
    421396                // don't have to call _equal().
    422                 dex = _hash(old_storage[ctr]->_key) % _capacity;
    423                 while (_storage[dex] != NULL) {
    424                         dex = (dex + 1) % _capacity;
     397                const uint hash = _hash(old_storage[ctr]->_key);
     398                uint idx = hash & (_capacity - 1);
     399                for (uint perturb = hash; _storage[idx] != NULL; perturb >>= HASHMAP_PERTURB_SHIFT) {
     400                        idx = (5 * idx + perturb + 1) & (_capacity - 1);
    425401                }
    426402
    427                 _storage[dex] = old_storage[ctr];
     403                _storage[idx] = old_storage[ctr];
    428404                _size++;
    429405        }
    430406
     
    439415
    440416template<class Key, class Val, class HashFunc, class EqualFunc>
    441417int HashMap<Key, Val, HashFunc, EqualFunc>::lookup(const Key &key) const {
    442         uint ctr = _hash(key) % _capacity;
     418        const uint hash = _hash(key);
     419        uint ctr = hash & (_capacity - 1);
     420        for (uint perturb = hash; ; perturb >>= HASHMAP_PERTURB_SHIFT) {
     421                if (_storage[ctr] == NULL || _equal(_storage[ctr]->_key, key))
     422                        break;
    443423
    444         while (_storage[ctr] != NULL && !_equal(_storage[ctr]->_key, key)) {
    445                 ctr = (ctr + 1) % _capacity;
     424                ctr = (5 * ctr + perturb + 1) & (_capacity - 1);
    446425
    447426#ifdef DEBUG_HASH_COLLISIONS
    448427                _collisions++;
     
    467446                _storage[ctr] = allocNode(key);
    468447                _size++;
    469448
    470                 // Keep the load factor below 75%.
    471                 if (_size > _capacity * 75 / 100) {
    472                         expand_array(nextTableSize(_capacity));
     449                // Keep the load factor below 2/3.
     450                if (3 * _size > _capacity * 2) {
     451                        expand_array(_capacity * 2);
    473452                        ctr = lookup(key);
    474453                }
    475454        }
     
    520499template<class Key, class Val, class HashFunc, class EqualFunc>
    521500void HashMap<Key, Val, HashFunc, EqualFunc>::erase(const Key &key) {
    522501        // This is based on code in the Wikipedia article on Hash tables.
    523         uint i = lookup(key);
     502
     503        const uint hash = _hash(key);
     504        uint i = hash & (_capacity - 1);
     505        uint perturb;
     506
     507        for (perturb = hash; ; perturb >>= HASHMAP_PERTURB_SHIFT) {
     508                if (_storage[i] == NULL || _equal(_storage[i]->_key, key))
     509                        break;
     510
     511                i = (5 * i + perturb + 1) & (_capacity - 1);
     512        }
     513
    524514        if (_storage[i] == NULL)
    525515                return; // key wasn't present, so no work has to be done
     516
    526517        // If we remove a key, we must check all subsequent keys and possibly
    527518        // reinsert them.
    528519        uint j = i;
    529520        freeNode(_storage[i]);
    530521        _storage[i] = NULL;
    531         while (true) {
     522        for (perturb = hash; ; perturb >>= HASHMAP_PERTURB_SHIFT) {
    532523                // Look at the next table slot
    533                 j = (j + 1) % _capacity;
     524                j = (5 * j + perturb + 1) & (_capacity - 1);
    534525                // If the next slot is empty, we are done
    535526                if (_storage[j] == NULL)
    536527                        break;
    537528                // Compute the slot where the content of the next slot should normally be,
    538529                // assuming an empty table, and check whether we have to move it.
    539                 uint k = _hash(_storage[j]->_key) % _capacity;
     530                uint k = _hash(_storage[j]->_key) & (_capacity - 1);
    540531                if ((j > i && (k <= i || k > j)) ||
    541532                    (j < i && (k <= i && k > j)) ) {
    542533                        _storage[i] = _storage[j];