一步一步剖析Dictionary实现原理
程序员文章站
2022-05-15 10:31:20
目录 关键的字段和Entry结构 添加键值(Add) 取键值(Find) 移除键值(Remove) 再插入键值 本文是对c#中Dictionary内部实现原理进行简单的剖析。如有表述错误,欢迎指正。 主要对照源码来解析,目前对照源码的版本是.Net Framwork 4.8,源码地址。 1. 关键的 ......
目录
- 关键的字段和entry结构
- 添加键值(add)
- 取键值(find)
- 移除键值(remove)
- 再插入键值
本文是对c#中dictionary内部实现原理进行简单的剖析。如有表述错误,欢迎指正。
主要对照源码来解析,目前对照源码的版本是.net framwork 4.8,。
1. 关键的字段和entry结构
struct entry { public int hashcode; // key的hashcode & 0x7fffffff public int next; // 指向链表下一个元素的地址(实际就是entries的索引),最后一个元素为-1 public tkey key; public tvalue value; } entry[] entries; //存放键值 int[] buckets; //存储entries最新元素的索引,其存储位置由取模结果决定。例:假设键值存储在entries的第1元素的位置上,且hashcode和长度的取模结果为2,那么buckets[2] = 1 int count = 0; //已存储键值的个数 int version; //记录版本,防止迭代过程中集合被更改 iequalitycomparer<tkey> _comparer; int freelist; //entries中最新空元素的索引 int freecount; //entries中空元素的个数
2. 添加键值(add)
public void add(tkey key, tvalue value) { insert(key, value, true); } private void insert(tkey key, tvalue value, bool add) { if( key == null ) { throwhelper.throwargumentnullexception(exceptionargument.key); } if (buckets == null) initialize(0); int hashcode = comparer.gethashcode(key) & 0x7fffffff; //取模 int targetbucket = hashcode % buckets.length; #if feature_randomized_string_hashing int collisioncount = 0; #endif for (int i = buckets[targetbucket]; i >= 0; i = entries[i].next) { if (entries[i].hashcode == hashcode && comparer.equals(entries[i].key, key)) { if (add) { throwhelper.throwargumentexception(exceptionresource.argument_addingduplicate); } //对于已存在的key重新赋值 entries[i].value = value; version++; return; } #if feature_randomized_string_hashing collisioncount++; #endif } int index; if (freecount > 0) { //存在entries中存在空元素 index = freelist; freelist = entries[index].next; freecount--; } else { if (count == entries.length) { //扩容:取大于count * 2的最小素数作为entries和bucket的新容量(即数组长度.length) resize(); targetbucket = hashcode % buckets.length; } index = count; count++; } entries[index].hashcode = hashcode; entries[index].next = buckets[targetbucket]; entries[index].key = key; entries[index].value = value; //存取链表的头元素的索引(即entries最后存入的元素的在enties中的索引) //便于取key的时每次从链表的头元素开始遍历,详细见findentry(tkey key)函数 buckets[targetbucket] = index; version++; #if feature_randomized_string_hashing #if feature_coreclr // in case we hit the collision threshold we'll need to switch to the comparer which is using randomized string hashing // in this case will be equalitycomparer<string>.default. // note, randomized string hashing is turned on by default on coreclr so equalitycomparer<string>.default will // be using randomized string hashing if (collisioncount > hashhelpers.hashcollisionthreshold && comparer == nonrandomizedstringequalitycomparer.default) { comparer = (iequalitycomparer<tkey>) equalitycomparer<string>.default; resize(entries.length, true); } #else if(collisioncount > hashhelpers.hashcollisionthreshold && hashhelpers.iswellknownequalitycomparer(comparer)) { //如果碰撞次数(单链表长度)大于设置的最大碰撞阈值,需要扩容 comparer = (iequalitycomparer<tkey>) hashhelpers.getrandomizedequalitycomparer(comparer); resize(entries.length, true); } #endif // feature_coreclr #endif } ****************************************************************************************************************************************** static void foo() { var dicdata = new dictionary<int, int>(); //添加键值 new list<int> { 1, 2, 4 }.foreach(item => add(item, dicdata)); new list<int> { 22, 29, 36, 20 }.foreach(item => add(item, dicdata)); } static void add(int key, dictionary<int, int> dicdata) { dicdata.add(key, key); }
2.1 数组entries和buckets初始化
private void initialize(int capacity) { //取大于capacity的最小质数(素数) int size = hashhelpers.getprime(capacity); buckets = new int[size]; for (int i = 0; i < buckets.length; i++) buckets[i] = -1; entries = new entry[size]; freelist = -1; } **************************************************** internal static class hashhelpers { ...... public const int hashcollisionthreshold = 100; //碰撞阈值 ...... public static readonly int[] primes = { 3, 7, 11, 17, 23, 29, 37, 47, 59, 71, 89, 107, 131, 163, 197, 239, 293, 353, 431, 521, 631, 761, 919, 1103, 1327, 1597, 1931, 2333, 2801, 3371, 4049, 4861, 5839, 7013, 8419, 10103, 12143, 14591, 17519, 21023, 25229, 30293, 36353, 43627, 52361, 62851, 75431, 90523, 108631, 130363, 156437, 187751, 225307, 270371, 324449, 389357, 467237, 560689, 672827, 807403, 968897, 1162687, 1395263, 1674319, 2009191, 2411033, 2893249, 3471899, 4166287, 4999559, 5999471, 7199369}; //质数(素数)组 ...... public static int getprime(int min) { if (min < 0) throw new argumentexception(environment.getresourcestring("arg_htcapacityoverflow")); contract.endcontractblock(); //查找primes是否有满足的质数(素数) for (int i = 0; i < primes.length; i++) { int prime = primes[i]; if (prime >= min) return prime; } //outside of our predefined table. //compute the hard way. //primes没有查找到满足的质数(素数),自行计算 for (int i = (min | 1); i < int32.maxvalue;i+=2) { if (isprime(i) && ((i - 1) % hashtable.hashprime != 0)) return i; } return min; } }
2.2 添加键值{1,1},则
hashcode = 1; targetbucket = hascode % buckets.length; //targetbucket = 1 next = buckets[targetbucket]; //next = -1 buckets[targetbucket] = index; //buckets[1] = 0
2.3 添加键值{2,2},则
hashcode = 2; targetbucket = hascode % buckets.length; //targetbucket = 2 next = buckets[targetbucket]; //next = -1 buckets[targetbucket] = index; //buckets[2] = 1
2.4 添加键值{4,4},则
hashcode = 4; targetbucket = hascode % buckets.length; //targetbucket = 1 next = buckets[targetbucket]; //next = 0 buckets[targetbucket] = index; //buckets[1] = 2
接下来将entries数组以单链表的形式呈现(即enteries数组横向);
2.5 在继续添加键值之前,需要扩容操作,因为entries数组长度为3且都已有元素。扩容后需要对buckets和entries每个元素的next需要重新赋值;
private void resize() { //扩容的大小:取大于(当前容量*2)的最小素数 //例: resize(hashhelpers.expandprime(count), false); } private void resize(int newsize, bool forcenewhashcodes) { contract.assert(newsize >= entries.length); //实例化buckets,并将每个元素置为-1 int[] newbuckets = new int[newsize]; for (int i = 0; i < newbuckets.length; i++) newbuckets[i] = -1; entry[] newentries = new entry[newsize]; array.copy(entries, 0, newentries, 0, count); //如果是hash碰撞扩容,使用新hashcode函数重新计算hash值 if(forcenewhashcodes) { for (int i = 0; i < count; i++) { if(newentries[i].hashcode != -1) { newentries[i].hashcode = (comparer.gethashcode(newentries[i].key) & 0x7fffffff); } } } //重建单链表 for (int i = 0; i < count; i++) { if (newentries[i].hashcode >= 0) { //取模重新设置next值和buckets int bucket = newentries[i].hashcode % newsize; newentries[i].next = newbuckets[bucket]; newbuckets[bucket] = i; } } buckets = newbuckets; entries = newentries; } ******************************************************************* internal static class hashhelpers { ...... public static readonly int[] primes = { 3, 7, 11, 17, 23, 29, 37, 47, 59, 71, 89, 107, 131, 163, 197, 239, 293, 353, 431, 521, 631, 761, 919, 1103, 1327, 1597, 1931, 2333, 2801, 3371, 4049, 4861, 5839, 7013, 8419, 10103, 12143, 14591, 17519, 21023, 25229, 30293, 36353, 43627, 52361, 62851, 75431, 90523, 108631, 130363, 156437, 187751, 225307, 270371, 324449, 389357, 467237, 560689, 672827, 807403, 968897, 1162687, 1395263, 1674319, 2009191, 2411033, 2893249, 3471899, 4166287, 4999559, 5999471, 7199369}; //质数(素数)组 ...... // this is the maximum prime smaller than array.maxarraylength public const int maxprimearraylength = 0x7feffffd; //数组最大长度的最小质数 public static int expandprime(int oldsize) { //翻倍 int newsize = 2 * oldsize; // allow the hashtables to grow to maximum possible size (~2g elements) before encoutering capacity overflow. // note that this check works even when _items.length overflowed thanks to the (uint) cast //翻倍的大小不能超过【数组最大长度的最小质数】 if ((uint)newsize > maxprimearraylength && maxprimearraylength > oldsize) { contract.assert( maxprimearraylength == getprime(maxprimearraylength), "invalid maxprimearraylength"); return maxprimearraylength; } //取最小的质数(素数) return getprime(newsize); } public static int getprime(int min) { if (min < 0) throw new argumentexception(environment.getresourcestring("arg_htcapacityoverflow")); contract.endcontractblock(); //查找primes是否有满足的质数(素数) for (int i = 0; i < primes.length; i++) { int prime = primes[i]; if (prime >= min) return prime; } //outside of our predefined table. //compute the hard way. //primes没有查找到满足的质数(素数),自行计算 for (int i = (min | 1); i < int32.maxvalue;i+=2) { if (isprime(i) && ((i - 1) % hashtable.hashprime != 0)) return i; } return min; } }
2.6 继续添加键值{22,22},{29,29},{36,36},{40,40},添加完后其内部存储结果如下
3. 取键值(find)
public tvalue this[tkey key] { get { //取key对应值在entries的索引 int i = findentry(key); if (i >= 0) return entries[i].value; throwhelper.throwkeynotfoundexception(); return default(tvalue); } set { //更新key对应的值 insert(key, value, false); } } private int findentry(tkey key) { if( key == null) { throwhelper.throwargumentnullexception(exceptionargument.key); } if (buckets != null) { int hashcode = comparer.gethashcode(key) & 0x7fffffff; //遍历单链表 for (int i = buckets[hashcode % buckets.length]; i >= 0; i = entries[i].next) { if (entries[i].hashcode == hashcode && comparer.equals(entries[i].key, key)) return i; } } return -1; } ********************************************************************************************* static void foo() { ...... //取key=22 var val =dicdata[22];
}
简化取key对应值的代码
var hashcode =comparer.gethashcode(key) & 0x7fffffff; // 22 var targetbuget = hashcode % buckets.length; //取模运算 1 var i = bucket[targetbuget]; //链表头元素的索引 bucket[1] = 5 //遍历单链表 for (; i >= 0; i = entries[i].next) { if (entries[i].hashcode == hashcode && comparer.equals(entries[i].key, key)) return i; }
4. 移除键值(remove)
public bool remove(tkey key) { if(key == null) { throwhelper.throwargumentnullexception(exceptionargument.key); } if (buckets != null) { int hashcode = comparer.gethashcode(key) & 0x7fffffff; int bucket = hashcode % buckets.length; int last = -1; //其原理先取出键值,然后记录entries空闲的索引(freelist)和空闲个数(freecount) for (int i = buckets[bucket]; i >= 0; last = i, i = entries[i].next) { if (entries[i].hashcode == hashcode && comparer.equals(entries[i].key, key)) { if (last < 0) { buckets[bucket] = entries[i].next; } else { entries[last].next = entries[i].next; } entries[i].hashcode = -1; //建立空闲链表 entries[i].next = freelist; entries[i].key = default(tkey); entries[i].value = default(tvalue); //保存entryies中空元素的索引 //便于插入新键值时,放在当前索引的位置,减少entryies空间上的浪费 freelist = i; //空元素的个数加1 freecount++; version++; return true; } } } return false; } ******************************************************************* static void foo() { ...... //移除 new list<int> { 22, 29 }.foreach(item => dicdata.remove(item)); }
4.1 移除key=22后,freelist = 3, freecount = 1,
4.2 移除key=36后,freelist = 5, freecount = 2,
5. 再插入键值
如上图,当移除掉{36,36}后,会发现又诞生一个含有两个元素的“新链表”(上图灰色框)。这个作用就是为了插入新键值时,按照“新链表”记录的索引顺序插入到entries数组中。
例:添加键值{22,22},{25,25},此时freelist = 5,freecount = 2;
-
给entries[5]赋值,freelist = 3, freecount = 1;
- 给entries[3]赋值,freelist = -1, freecount = 0;
希望此文能够让你对于dictionary内部实现有所认识。
上一篇: 这种泥鳅的做法,你有见过吗