Ticket #8915: hashmap-py2.patch

File hashmap-py2.patch, 14.8 KB (added by fingolfin, 16 years ago)

Revised patch

  • common/hashmap.cpp

     
    2424 */
    2525
    2626// The hash map (associative array) implementation in this file is
    27 // based on code by Andrew Y. Ng, 1996:
     27// based on the PyDict implementation of CPython. The erase() method
     28// is based on example code in the Wikipedia article on Hash tables.
    2829
    29 /*
    30  * Copyright (c) 1998-2003 Massachusetts Institute of Technology.
    31  * This code was developed as part of the Haystack research project
    32  * (http://haystack.lcs.mit.edu/). Permission is hereby granted,
    33  * free of charge, to any person obtaining a copy of this software
    34  * and associated documentation files (the "Software"), to deal in
    35  * the Software without restriction, including without limitation
    36  * the rights to use, copy, modify, merge, publish, distribute,
    37  * sublicense, and/or sell copies of the Software, and to permit
    38  * persons to whom the Software is furnished to do so, subject to
    39  * the following conditions:
    40  *
    41  * The above copyright notice and this permission notice shall be
    42  * included in all copies or substantial portions of the Software.
    43  *
    44  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    45  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    46  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    47  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    48  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    49  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    50  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    51  * OTHER DEALINGS IN THE SOFTWARE.
    52  */
    53 
    5430#include "common/hashmap.h"
    5531
    5632namespace Common {
    5733
    58 // const char *:
     34// Hash function for strings, taken from CPython.
    5935uint hashit(const char *p) {
    60         uint hash = 0;
     36        uint hash = *p << 7;
    6137        byte c;
    62         while ((c = *p++))
    63                 hash = (hash * 31 + c);
    64         return hash;
     38        int size = 0;
     39        while ((c = *p++)) {
     40                hash = (1000003 * hash) ^ c;
     41                size++;
     42        }
     43        return hash ^ size;
    6544}
    6645
     46// Like hashit, but converts every char to lowercase before hashing.
    6747uint hashit_lower(const char *p) {
    68         uint hash = 0;
     48        uint hash = tolower(*p) << 7;
    6949        byte c;
    70         while ((c = *p++))
    71                 hash = (hash * 31 + tolower(c));
    72         return hash;
     50        int size = 0;
     51        while ((c = *p++)) {
     52                hash = (1000003 * hash) ^ tolower(c);
     53                size++;
     54        }
     55        return hash ^ size;
    7356}
    7457
    75 // The following table is taken from the GNU ISO C++ Library's hashtable.h file.
    76 static const uint primes[] = {
    77         53ul,         97ul,         193ul,       389ul,       769ul,
    78         1543ul,       3079ul,       6151ul,      12289ul,     24593ul,
    79         49157ul,      98317ul,      196613ul,    393241ul,    786433ul,
    80         1572869ul,    3145739ul,    6291469ul,   12582917ul,  25165843ul,
    81         50331653ul,   100663319ul,  201326611ul, 402653189ul, 805306457ul,
    82         1610612741ul, 3221225473ul, 4294967291ul
    83 };
    84 
    85 uint nextTableSize(uint x) {
    86         int i = 0;
    87         while (x >= primes[i])
    88                 i++;
    89         return primes[i];
    90 }
    91 
    9258#ifdef DEBUG_HASH_COLLISIONS
    9359static double
    9460        g_collisions = 0,
     
    9864        g_size = 0;
    9965static int g_max_capacity = 0, g_max_size = 0;
    10066static int g_totalHashmaps = 0;
     67static int g_stats[4] = {0,0,0,0};
    10168
    10269void updateHashCollisionStats(int collisions, int lookups, int arrsize, int nele) {
    10370        g_collisions += collisions;
     
    10875        g_size += nele;
    10976        g_totalHashmaps++;
    11077       
     78        if (3*nele <= 2*8)
     79                g_stats[0]++;
     80        if (3*nele <= 2*16)
     81                g_stats[1]++;
     82        if (3*nele <= 2*32)
     83                g_stats[2]++;
     84        if (3*nele <= 2*64)
     85                g_stats[3]++;
     86       
    11187        g_max_capacity = MAX(g_max_capacity, arrsize);
    11288        g_max_size = MAX(g_max_size, nele);
    11389
     
    11894                100 * g_collPerLook / g_totalHashmaps,
    11995                g_size / g_totalHashmaps, g_max_size,
    12096                g_capacity / g_totalHashmaps, g_max_capacity);
     97        fprintf(stdout, "  %d less than %d; %d less than %d; %d less than %d; %d less than %d\n",
     98                        g_stats[0], 2*8/3,
     99                        g_stats[1],2*16/3,
     100                        g_stats[2],2*32/3,
     101                        g_stats[3],2*64/3);
     102
     103        // TODO:
     104        // * Should record the maximal size of the map during its lifetime, not that at its death
     105        // * Should do some statistics: how many maps are less than 2/3*8, 2/3*16, 2/3*32, ...
    121106}
    122107#endif
    123108
  • common/hashmap.h

     
    2424 */
    2525
    2626// The hash map (associative array) implementation in this file is
    27 // based on code by Andrew Y. Ng, 1996:
     27// based on the PyDict implementation of CPython. The erase() method
     28// is based on example code in the Wikipedia article on Hash tables.
    2829
    29 /*
    30  * Copyright (c) 1998-2003 Massachusetts Institute of Technology.
    31  * This code was developed as part of the Haystack research project
    32  * (http://haystack.lcs.mit.edu/). Permission is hereby granted,
    33  * free of charge, to any person obtaining a copy of this software
    34  * and associated documentation files (the "Software"), to deal in
    35  * the Software without restriction, including without limitation
    36  * the rights to use, copy, modify, merge, publish, distribute,
    37  * sublicense, and/or sell copies of the Software, and to permit
    38  * persons to whom the Software is furnished to do so, subject to
    39  * the following conditions:
    40  *
    41  * The above copyright notice and this permission notice shall be
    42  * included in all copies or substantial portions of the Software.
    43  *
    44  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    45  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    46  * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    47  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    48  * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    49  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    50  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    51  * OTHER DEALINGS IN THE SOFTWARE.
    52  */
    53 
    5430#ifndef COMMON_HASHMAP_H
    5531#define COMMON_HASHMAP_H
    5632
     
    7450
    7551namespace Common {
    7652
    77 // The table sizes ideally are primes. We use a helper function to find
    78 // suitable table sizes.
    79 uint nextTableSize(uint x);
    80 
    81 
    8253// Enable the following #define if you want to check how many collisions the
    8354// code produces (many collisions indicate either a bad hash function, or a
    8455// hash table that is too small).
     
    136107        }
    137108#endif
    138109
     110        enum {
     111                HASHMAP_PERTURB_SHIFT = 5,
     112                HASHMAP_MIN_CAPACITY = 16,
     113               
     114                // The quotient of the next two constants controls how much the
     115                // internal storage of the hashmap may fill up before being
     116                // increased automatically.
     117                // Note: the quotient of these two must lie strictly between
     118                // 0 and 1 (i.e. 0 < quotient < 1).
     119                HASHMAP_LOADFACTOR_NUMERATOR = 2,
     120                HASHMAP_LOADFACTOR_DENOMINATOR = 3
     121        };
     122
    139123        Node **_storage;        // hashtable of size arrsize.
    140         uint _capacity;
     124        uint _mask;             /**< Capacity of the HashMap minus one; must be a power of two minus one */
    141125        uint _size;
    142126
    143127        HashFunc _hash;
     
    153137        void assign(const HM_t &map);
    154138        int lookup(const Key &key) const;
    155139        int lookupAndCreateIfMissing(const Key &key);
    156         void expand_array(uint newsize);
     140        void expandStorage(uint newCapacity);
    157141
    158142        template<class T> friend class IteratorImpl;
    159143
     
    175159
    176160                NodeType *deref() const {
    177161                        assert(_hashmap != 0);
    178                         assert(_idx < _hashmap->_capacity);
     162                        assert(_idx <= _hashmap->_mask);
    179163                        Node *node = _hashmap->_storage[_idx];
    180164                        assert(node != 0);
    181165                        return node;
     
    196180                        assert(_hashmap);
    197181                        do {
    198182                                _idx++;
    199                         } while (_idx < _hashmap->_capacity && _hashmap->_storage[_idx] == 0);
    200                         if (_idx >= _hashmap->_capacity)
     183                        } while (_idx <= _hashmap->_mask && _hashmap->_storage[_idx] == 0);
     184                        if (_idx > _hashmap->_mask)
    201185                                _idx = (uint)-1;
    202186
    203187                        return *this;
     
    247231
    248232        iterator        begin() {
    249233                // Find and return the first non-empty entry
    250                 for (uint ctr = 0; ctr < _capacity; ++ctr) {
     234                for (uint ctr = 0; ctr <= _mask; ++ctr) {
    251235                        if (_storage[ctr])
    252236                                return iterator(ctr, this);
    253237                }
     
    259243
    260244        const_iterator  begin() const {
    261245                // Find and return the first non-empty entry
    262                 for (uint ctr = 0; ctr < _capacity; ++ctr) {
     246                for (uint ctr = 0; ctr <= _mask; ++ctr) {
    263247                        if (_storage[ctr])
    264248                                return const_iterator(ctr, this);
    265249                }
     
    302286        _nodePool(sizeof(Node)),
    303287#endif
    304288        _defaultVal() {
    305         _capacity = nextTableSize(0);
    306         _storage = new Node *[_capacity];
     289        _mask = HASHMAP_MIN_CAPACITY - 1;
     290        _storage = new Node *[HASHMAP_MIN_CAPACITY];
    307291        assert(_storage != NULL);
    308         memset(_storage, 0, _capacity * sizeof(Node *));
     292        memset(_storage, 0, HASHMAP_MIN_CAPACITY * sizeof(Node *));
    309293
    310294        _size = 0;
    311295
     
    334318 */
    335319template<class Key, class Val, class HashFunc, class EqualFunc>
    336320HashMap<Key, Val, HashFunc, EqualFunc>::~HashMap() {
    337         for (uint ctr = 0; ctr < _capacity; ++ctr)
     321        for (uint ctr = 0; ctr <= _mask; ++ctr)
    338322                if (_storage[ctr] != NULL)
    339323                  freeNode(_storage[ctr]);
    340324
    341325        delete[] _storage;
    342326#ifdef DEBUG_HASH_COLLISIONS
    343327        extern void updateHashCollisionStats(int, int, int, int);
    344         updateHashCollisionStats(_collisions, _lookups, _capacity, _size);
     328        updateHashCollisionStats(_collisions, _lookups, _mask+1, _size);
    345329#endif
    346330}
    347331
     
    354338 */
    355339template<class Key, class Val, class HashFunc, class EqualFunc>
    356340void HashMap<Key, Val, HashFunc, EqualFunc>::assign(const HM_t &map) {
    357         _capacity = map._capacity;
    358         _storage = new Node *[_capacity];
     341        _mask = map._mask;
     342        _storage = new Node *[_mask+1];
    359343        assert(_storage != NULL);
    360         memset(_storage, 0, _capacity * sizeof(Node *));
     344        memset(_storage, 0, (_mask+1) * sizeof(Node *));
    361345
    362346        // Simply clone the map given to us, one by one.
    363347        _size = 0;
    364         for (uint ctr = 0; ctr < _capacity; ++ctr) {
     348        for (uint ctr = 0; ctr <= _mask; ++ctr) {
    365349                if (map._storage[ctr] != NULL) {
    366350                        _storage[ctr] = allocNode(map._storage[ctr]->_key);
    367351                        _storage[ctr]->_value = map._storage[ctr]->_value;
     
    375359
    376360template<class Key, class Val, class HashFunc, class EqualFunc>
    377361void HashMap<Key, Val, HashFunc, EqualFunc>::clear(bool shrinkArray) {
    378         for (uint ctr = 0; ctr < _capacity; ++ctr) {
     362        for (uint ctr = 0; ctr <= _mask; ++ctr) {
    379363                if (_storage[ctr] != NULL) {
    380364                        freeNode(_storage[ctr]);
    381365                        _storage[ctr] = NULL;
    382366                }
    383367        }
    384368
    385         if (shrinkArray && _capacity > nextTableSize(0)) {
     369        if (shrinkArray && _mask >= HASHMAP_MIN_CAPACITY) {
    386370                delete[] _storage;
    387371
    388                 _capacity = nextTableSize(0);
    389                 _storage = new Node *[_capacity];
     372                _mask = HASHMAP_MIN_CAPACITY;
     373                _storage = new Node *[HASHMAP_MIN_CAPACITY];
    390374                assert(_storage != NULL);
    391                 memset(_storage, 0, _capacity * sizeof(Node *));
     375                memset(_storage, 0, HASHMAP_MIN_CAPACITY * sizeof(Node *));
    392376        }
    393377
    394378        _size = 0;
    395379}
    396380
    397381template<class Key, class Val, class HashFunc, class EqualFunc>
    398 void HashMap<Key, Val, HashFunc, EqualFunc>::expand_array(uint newsize) {
    399         assert(newsize > _capacity);
    400         uint ctr, dex;
     382void HashMap<Key, Val, HashFunc, EqualFunc>::expandStorage(uint newCapacity) {
     383        assert(newCapacity > _mask+1);
    401384
    402385        const uint old_size = _size;
    403         const uint old_capacity = _capacity;
     386        const uint old_mask = _mask;
    404387        Node **old_storage = _storage;
    405388
    406389        // allocate a new array
    407390        _size = 0;
    408         _capacity = newsize;
    409         _storage = new Node *[_capacity];
     391        _mask = newCapacity - 1;
     392        _storage = new Node *[newCapacity];
    410393        assert(_storage != NULL);
    411         memset(_storage, 0, _capacity * sizeof(Node *));
     394        memset(_storage, 0, newCapacity * sizeof(Node *));
    412395
    413396        // rehash all the old elements
    414         for (ctr = 0; ctr < old_capacity; ++ctr) {
     397        for (uint ctr = 0; ctr <= old_mask; ++ctr) {
    415398                if (old_storage[ctr] == NULL)
    416399                        continue;
    417400
     
    419402                // Since we know that no key exists twice in the old table, we
    420403                // can do this slightly better than by calling lookup, since we
    421404                // don't have to call _equal().
    422                 dex = _hash(old_storage[ctr]->_key) % _capacity;
    423                 while (_storage[dex] != NULL) {
    424                         dex = (dex + 1) % _capacity;
     405                const uint hash = _hash(old_storage[ctr]->_key);
     406                uint idx = hash & _mask;
     407                for (uint perturb = hash; _storage[idx] != NULL; perturb >>= HASHMAP_PERTURB_SHIFT) {
     408                        idx = (5 * idx + perturb + 1) & _mask;
    425409                }
    426410
    427                 _storage[dex] = old_storage[ctr];
     411                _storage[idx] = old_storage[ctr];
    428412                _size++;
    429413        }
    430414
     
    439423
    440424template<class Key, class Val, class HashFunc, class EqualFunc>
    441425int HashMap<Key, Val, HashFunc, EqualFunc>::lookup(const Key &key) const {
    442         uint ctr = _hash(key) % _capacity;
     426        const uint hash = _hash(key);
     427        uint ctr = hash & _mask;
     428        for (uint perturb = hash; ; perturb >>= HASHMAP_PERTURB_SHIFT) {
     429                if (_storage[ctr] == NULL || _equal(_storage[ctr]->_key, key))
     430                        break;
    443431
    444         while (_storage[ctr] != NULL && !_equal(_storage[ctr]->_key, key)) {
    445                 ctr = (ctr + 1) % _capacity;
     432                ctr = (5 * ctr + perturb + 1) & _mask;
    446433
    447434#ifdef DEBUG_HASH_COLLISIONS
    448435                _collisions++;
     
    453440        _lookups++;
    454441        fprintf(stderr, "collisions %d, lookups %d, ratio %f in HashMap %p; size %d num elements %d\n",
    455442                _collisions, _lookups, ((double) _collisions / (double)_lookups),
    456                 (const void *)this, _capacity, _size);
     443                (const void *)this, _mask+1, _size);
    457444#endif
    458445
    459446        return ctr;
     
    467454                _storage[ctr] = allocNode(key);
    468455                _size++;
    469456
    470                 // Keep the load factor below 75%.
    471                 if (_size > _capacity * 75 / 100) {
    472                         expand_array(nextTableSize(_capacity));
     457                // Keep the load factor below a certain threshold.
     458                uint capacity = _mask + 1;
     459                if (_size * HASHMAP_LOADFACTOR_DENOMINATOR > capacity * HASHMAP_LOADFACTOR_NUMERATOR) {
     460                        capacity = capacity < 500 ? (capacity * 4) : (capacity * 2);
     461                        expandStorage(capacity);
    473462                        ctr = lookup(key);
    474463                }
    475464        }
     
    520509template<class Key, class Val, class HashFunc, class EqualFunc>
    521510void HashMap<Key, Val, HashFunc, EqualFunc>::erase(const Key &key) {
    522511        // This is based on code in the Wikipedia article on Hash tables.
    523         uint i = lookup(key);
     512
     513        const uint hash = _hash(key);
     514        uint i = hash & _mask;
     515        uint perturb;
     516
     517        for (perturb = hash; ; perturb >>= HASHMAP_PERTURB_SHIFT) {
     518                if (_storage[i] == NULL || _equal(_storage[i]->_key, key))
     519                        break;
     520
     521                i = (5 * i + perturb + 1) & _mask;
     522        }
     523
    524524        if (_storage[i] == NULL)
    525525                return; // key wasn't present, so no work has to be done
     526
    526527        // If we remove a key, we must check all subsequent keys and possibly
    527528        // reinsert them.
    528529        uint j = i;
    529530        freeNode(_storage[i]);
    530531        _storage[i] = NULL;
    531         while (true) {
     532        for (perturb = hash; ; perturb >>= HASHMAP_PERTURB_SHIFT) {
    532533                // Look at the next table slot
    533                 j = (j + 1) % _capacity;
     534                j = (5 * j + perturb + 1) & _mask;
    534535                // If the next slot is empty, we are done
    535536                if (_storage[j] == NULL)
    536537                        break;
    537538                // Compute the slot where the content of the next slot should normally be,
    538539                // assuming an empty table, and check whether we have to move it.
    539                 uint k = _hash(_storage[j]->_key) % _capacity;
     540                uint k = _hash(_storage[j]->_key) & _mask;
    540541                if ((j > i && (k <= i || k > j)) ||
    541542                    (j < i && (k <= i && k > j)) ) {
    542543                        _storage[i] = _storage[j];