思科VPP源码分析(内存管理)
2016-10-20 15:30
537 查看
思科今年开源的vpp项目,在intel开源的dpdk之上,构建的数据包处理框架。 dpdk组件已经成功榨干硬件IO性能,剩下的瓶颈落在业务处理部分,其中最关键的又在内存访问优化上。 内存优化一句话概括:提高CPU L1,L2,L3 cache命中率。这里将分析vpp内存管理部分源码。
vec变长数组(vec_bootstrap.h)
len是数组元素个数,不是字节长度。每个数组元素都看作是等大小。
自定义头部后接vec 组合使用。组合vec将引申出其他内存管理结构。
_vec_find(v)
v指向数据部分,宏返回vec头部地址
_vec_round_size(s)
s按sizeof (uword)大小,向上对齐
vec_header_bytes (uword header_bytes)
自定义头部+vec 组合这种模型下,header_bytes代表自定义头部大小。函数返回自定义头部+vec总长度,按sizeof (vec_header_t)向上对齐。
vec_header (void *v, uword header_bytes)
自定义头部+vec 组合,按sizeof (vec_header_t)向上对齐这种模型下,v指向数据部分地址,header_bytes代表自定义头部大小,返回自定义头部地址。
vec_header_end (void *v, uword header_bytes)
自定义头部+vec 组合,按sizeof (vec_header_t)向上对齐这种模型下,v指向自定义头部地址,header_bytes代表自定义头部大小,返回数据部分地址。
vec_aligned_header (void *v, uword header_bytes, uword align)
自定义头部+vec 组合,按align向上对齐这种模型下,v指向数据部分地址,header_bytes代表自定义头部大小,返回自定义头部地址。
vec_aligned_header_end (void *v, uword header_bytes, uword align)
自定义头部+vec 组合,按align向上对齐这种模型下,v指向自定义头部地址,header_bytes代表自定义头部大小,返回数据部分地址。
_vec_len(v)
v指向数据部分,返回vec头部的vec_header_t->len
vec_len(v)
v指向数据部分,v等于null时返回0,否则返回vec头部的vec_header_t->len
vec_reset_length(v)
v指向数据部分,vec头部的vec_header_t->len置0
vec_bytes(v)
v指向数据部分,返回数据部分总字节大小
vec_capacity(v,b)
v指向data起始地址,b代表user_header大小,返回该mheap_elt_t字节大小,即vec的最大可占用内存。博主觉得该函数放在这里破坏了该头件的独立性。
vec_max_len(v)
类似vec_capacity(v,b)内存布局,v指向data起始地址,返回该vec可容纳最大数据块数目。
vec_end(v)
v指向data起始地址,返回vec数据部分末尾的下一个字节。
vec_is_member(v,e)
v指向data起始地址,返回e是否在该vec地址范围内。
vec_elt_at_index(v,i)
v指向data起始地址,返回第i个数据块的地址
vec_elt(v,i)
v指向data起始地址,返回第i个数据块的内容
vec_foreach(var,vec)
vec_foreach_backwards(var,vec)
vec_foreach_index(var,v)
迭代宏
mheap(mheap_bootstrap.h, mheap.c)
mheap整体视图
注意每个elt大小不一定相等。这里的vec_header_t把后面数据看成一个字节一个字节的元素,因此vec->len实际是整个数据部分字节长度。
mheap_elt_t视图
基本函数操作在mheap_bootstrap.h中
该头文件基本函数,结合上文内容很容易理解,这里不再详细叙述。
mheap.c中会对上图基本结构更近一步索引管理,也是mheap的核心部分。
mheap_maybe_lock (void *v)
mheap_maybe_unlock (void *v)
自旋锁加锁解锁函数,如果是持有锁的本CPU再次进入,可以无条件获取锁。
mheap_get_aligned (void v, uword n_user_data_bytes, uword align, uword align_offset, uword offset_return)
mheap内存分配接口函数
void * mheap_get_aligned (void *v, uword n_user_data_bytes, uword align, uword align_offset, uword * offset_return) { mheap_t *h; uword offset; u64 cpu_times[2]; cpu_times[0] = clib_cpu_time_now (); //align至少是4字节大小,并且是2的幂 align = clib_max (align, STRUCT_SIZE_OF (mheap_elt_t, user_data[0])); align = max_pow2 (align); /* Correct align offset to be smaller than alignment. */ align_offset &= (align - 1); /* Align offset must be multiple of minimum object size. */ if (align_offset % STRUCT_SIZE_OF (mheap_elt_t, user_data[0]) != 0) { *offset_return = MHEAP_GROUNDED; return v; } /* Round requested size. */ //各种对齐,确保mheap_elt_t头部和数据部分都是STRUCT_SIZE_OF (mheap_elt_t, user_data[0]),之后再用专门章节详细分析数据结构对齐背后的作者设计目的 n_user_data_bytes = clib_max (n_user_data_bytes, MHEAP_MIN_USER_DATA_BYTES); n_user_data_bytes = round_pow2 (n_user_data_bytes, STRUCT_SIZE_OF (mheap_elt_t, user_data[0])); if (!v) v = mheap_alloc (0, 64 << 20);//分配完整的一个mheap结构 mheap_maybe_lock (v); h = mheap_header (v); if (h->flags & MHEAP_FLAG_VALIDATE) mheap_validate (v);//遍历mheap的空闲块链表,验证是否正确链接。 /* First search free lists for object. */ //从空闲块链表中获取数据块 offset = mheap_get_search_free_list (v, &n_user_data_bytes, align, align_offset); h = mheap_header (v); /* If that fails allocate object at end of heap by extending vector. */ if (offset == MHEAP_GROUNDED && _vec_len (v) < h->max_size) { //如果现有空闲块没有满足要求的块,则从mheap中新分配块,注意h->max_size代表该mheap最大内存量,_vec_len (v)代表目前使用量 v = mheap_get_extend_vector (v, n_user_data_bytes, align, align_offset, &offset); //mheap_get_extend_vector()并不会导致mheap重新分配,博主认为下面重新计算h毫无必要 h = mheap_header (v); //如果扩充成功则统计值加1 h->stats.n_vector_expands += offset != MHEAP_GROUNDED; } *offset_return = offset; if (offset != MHEAP_GROUNDED) { h->n_elts += 1; if (h->flags & MHEAP_FLAG_TRACE) { /* Recursion block for case when we are traceing main clib heap. */ h->flags &= ~MHEAP_FLAG_TRACE; mheap_get_trace (v, offset, n_user_data_bytes); h->flags |= MHEAP_FLAG_TRACE; } } if (h->flags & MHEAP_FLAG_VALIDATE) mheap_validate (v); mheap_maybe_unlock (v); cpu_times[1] = clib_cpu_time_now (); h->stats.n_clocks_get += cpu_times[1] - cpu_times[0]; h->stats.n_gets += 1; return v; }
void * mheap_alloc (void *memory, uword size) { uword flags = 0; if (memory != 0) flags |= MHEAP_FLAG_DISABLE_VM; //这里指的是利用cpu的SIMD向量指令加快缓存块的cache查找,之后会详细介绍 #ifdef CLIB_HAVE_VEC128 flags |= MHEAP_FLAG_SMALL_OBJECT_CACHE; #endif return mheap_alloc_with_flags (memory, size, flags); }
void * mheap_alloc_with_flags (void *memory, uword memory_size, uword flags) { mheap_t *h; void *v; uword size; if (!mheap_page_size) mheap_page_size = clib_mem_get_page_size (); if (!memory) { /* No memory given, try to VM allocate some. */ //通常用的是UNIX模式,该分配函数将mmap()匿名内存使用 memory = clib_mem_vm_alloc (memory_size); if (!memory) return 0; /* No memory region implies we have virtual memory. */ flags &= ~MHEAP_FLAG_DISABLE_VM; } /* Make sure that given memory is page aligned. */ { uword am, av, ah; //把memory向上对mheap_page_size 大小对齐 am = pointer_to_uword (memory); av = mheap_page_round (am); //计算对齐后的mheap头部地址 v = uword_to_pointer (av, void *); h = mheap_header (v); ah = pointer_to_uword (h); //mheap头部地址小于memory起始地址,则增加一个mheap_page_size。这样mheap头部地址既在mmap()分配的内存范围中,也满足对齐要求。 while (ah < am) ah += mheap_page_size; h = uword_to_pointer (ah, void *); v = mheap_vector (h); if (PREDICT_FALSE (memory + memory_size < v)) { /* * This will happen when the requested memory_size is too * small to cope with the heap header and/or memory alignment. */ clib_mem_vm_free (memory, memory_size); return 0; } //size代表mheap数据部分大小 size = memory + memory_size - v; } /* VM map header so we can use memory. */ if (!(flags & MHEAP_FLAG_DISABLE_VM)) clib_mem_vm_map (h, sizeof (h[0])); /* Zero vector header: both heap header and vector length. */ memset (h, 0, sizeof (h[0])); _vec_len (v) = 0; h->vm_alloc_offset_from_header = (void *) h - memory; h->vm_alloc_size = memory_size; h->max_size = size; h->owner_cpu = ~0; /* Set flags based on those given less builtin-flags. */ h->flags |= (flags & ~MHEAP_FLAG_TRACE); /* Unmap remainder of heap until we will be ready to use it. */ //虽然不解除映射也不会影响性能。但是这样好处是一旦访问了未分配的内存会导致段错误,提示开发人员编码错误 if (!(h->flags & MHEAP_FLAG_DISABLE_VM)) mheap_vm (v, MHEAP_VM_UNMAP | MHEAP_VM_ROUND_UP, (clib_address_t) v, h->max_size); /* Initialize free list heads to empty. */ //MHEAP_GROUNDED代表该bin下没有空闲数据块,值为~0。所以这里用0xFF初始化。 memset (h->first_free_elt_uoffset_by_bin, 0xFF, sizeof (h->first_free_elt_uoffset_by_bin)); return v; }
/* Search free lists for object with given size and alignment. */ static uword mheap_get_search_free_list (void *v, uword * n_user_bytes_arg, uword align, uword align_offset) { mheap_t *h = mheap_header (v); uword bin, n_user_bytes, i, bi; n_user_bytes = *n_user_bytes_arg; //计算bin值,用来索引对应大小空闲块链表 bin = user_data_size_to_bin_index (n_user_bytes); //如果请求的是满足条件的块,则从一个额外的cache中获取,避免了搜索空闲块链表开销,还可以利用CPU的SIMD向量指令来加速匹配。 if (MHEAP_HAVE_SMALL_OBJECT_CACHE && (h->flags & MHEAP_FLAG_SMALL_OBJECT_CACHE) && bin < 255 && align == STRUCT_SIZE_OF (mheap_elt_t, user_data[0]) && align_offset == 0) { //cache中寻找数据块,之后会详细分析 uword r = mheap_get_small_object (h, bin); h->stats.n_small_object_cache_attempts += 1; if (r != MHEAP_GROUNDED) { h->stats.n_small_object_cache_hits += 1; return r; } } //h->non_empty_free_elt_heads是一个位图,记录了对应bin链表是否为空 for (i = bin / BITS (uword); i < ARRAY_LEN (h->non_empty_free_elt_heads); i++) { uword non_empty_bin_mask = h->non_empty_free_elt_heads[i]; /* No need to search smaller bins. */ //把比bin小的位置的位图略掉 if (i == bin / BITS (uword)) non_empty_bin_mask &= ~pow2_mask (bin % BITS (uword)); /* Search each occupied free bin which is large enough. */ //foreach_set_bit()从右往左遍历每个bit,为1的bit调用mheap_get_search_free_bin() //foreach_set_bit()中first_set (uword x)用来计算从右算第一个为1的bit代表的数 foreach_set_bit (bi, non_empty_bin_mask, ( { uword r = mheap_get_search_free_bin (v, bi + i * BITS (uword), n_user_bytes_arg, align, align_offset); if (r != MHEAP_GROUNDED) return r;} )); } return MHEAP_GROUNDED; }
/*如果每次都是计算bin值再从链表中找空闲块就太麻烦了,对满足特定条件块,也应该是vpp使用频率最多的那种类型的块做了个cache,只需要计算bin值就能直接得到可用空闲块。该函数就是干这事。*/ always_inline uword mheap_get_small_object (mheap_t * h, uword bin) { /*cache中c->bins.as_u8数组每个元素记录了(bin+1),为0代表该位置没有空闲块。c->offsets记录对应bin的空闲块。 mheap_small_object_cache_t *c = &h->small_object_cache; //查找c->bins.as_u8数组中,所有值为(bin+1)的索引组成的位图 uword mask = mheap_small_object_cache_mask (c, bin + 1); uword offset = MHEAP_GROUNDED; //如果位图不为0,说明有空闲块,查找位图最左边那个为1bit的索引,返 回对应的c->offsets值,即可用空闲块。 if (mask) { uword i = min_log2 (mask); uword o = c->offsets[i]; ASSERT (o != MHEAP_GROUNDED); c->bins.as_u8[i] = 0; offset = o; } return offset; }
always_inline uword mheap_small_object_cache_mask (mheap_small_object_cache_t * c, uword bin) { uword mask; /* $$$$ ELIOT FIXME: add Altivec version of this routine */ //本函数依赖于CPU支持向量指令。 #if !defined (CLIB_HAVE_VEC128) || defined (__ALTIVEC__) mask = 0; #else //返回一个由16字节组成的向量,每个字节的值都是bin u8x16 b = u8x16_splat (bin); ASSERT (bin < 256); /*u8x16_is_equal()比较两个128bit位向量,每个字节进行对比,如果相等,返回字节为1,一共返回16个字节,组成一个返回值128bit向量。u8x16_compare_byte_mask()把返回的128bit向量转化成位图,共16个bit位图*/ #define _(i) ((uword) u8x16_compare_byte_mask (u8x16_is_equal (b, c->bins.as_u8x16[i])) << (uword) ((i)*16)) mask = _(0) | _(1); if (BITS (uword) > 32) mask |= _(2) | _(3); #undef _ #endif //32位系统返回的是32bit位图,64位系统返回64bit位图 return mask; }
/*该函数搜索bin对应的空闲块链表。找到一个大小满足要求的空闲块是很容易的,本函数额外大量代码花在了满足空闲块的对齐要求上。align:首地址相对于heap偏移对齐,align_offset:首地址对齐后,再向左做个偏移。
分配内存时从对应的slot中查找,值得注意的是查找算法为了满足参数中对齐要求,很有可能从空闲块中间部分分配内存,这样首尾部分空闲块重新链入新的slot。一切的一切都是为了数据对齐。如果分配的内存是一个带头部的数据结构,例如vec+data类型。那么分配内存时需要把头部大小作为align_offset参数,使数据部分满足起始地址align对齐。*/
static uword mheap_get_search_free_bin (void *v, uword bin, uword * n_user_data_bytes_arg, uword align, uword align_offset) { mheap_t *h = mheap_header (v); mheap_elt_t *e; /* Free object is at offset f0 ... f1; Allocatted object is at offset o0 ... o1. */ //o0,f0是用于首部,f0,f1用于尾部,lo_free_usize切分后首部空间,hi_free_usize切分后尾部空间 word o0, o1, f0, f1, search_n_user_data_bytes; word lo_free_usize, hi_free_usize; ASSERT (h->first_free_elt_uoffset_by_bin[bin] != MHEAP_GROUNDED); e = mheap_elt_at_uoffset (v, h->first_free_elt_uoffset_by_bin[bin]); search_n_user_data_bytes = *n_user_data_bytes_arg; /* Silence compiler warning. */ o0 = o1 = f0 = f1 = 0; h->stats.free_list.n_search_attempts += 1; /* Find an object that is large enough with correct alignment at given alignment offset. */ while (1) { uword this_object_n_user_data_bytes = mheap_elt_data_bytes (e); ASSERT (e->is_free); if (bin < MHEAP_N_SMALL_OBJECT_BINS) ASSERT (this_object_n_user_data_bytes >= search_n_user_data_bytes); h->stats.free_list.n_objects_searched += 1; if (this_object_n_user_data_bytes < search_n_user_data_bytes) goto next; /* Bounds of free object: from f0 to f1. */ //f0指向原始块首部地址偏移,f1指向原始块尾部地址偏移 f0 = ((void *) e->user_data - v); f1 = f0 + this_object_n_user_data_bytes; /* Place candidate object at end of free block and align as requested. */ //从尾部往左分配目标块,o0指向首部偏移并且满足对齐要求 o0 = ((f1 - search_n_user_data_bytes) & ~(align - 1)) - align_offset; //o0比f0还小了,右偏移一个align while (o0 < f0) o0 += align; /* Make sure that first free fragment is either empty or large enough to be valid. */ //确保切分后首部空闲块至少有MHEAP_ELT_OVERHEAD_BYTES大小,这是要给切去的块做头部的 while (1) { lo_free_usize = o0 != f0 ? o0 - f0 - MHEAP_ELT_OVERHEAD_BYTES : 0; if (o0 <= f0 || lo_free_usize >= (word) MHEAP_MIN_USER_DATA_BYTES) break; //越偏移lo_free_usize越小,什么鬼,博主强烈认为这是bug,应该是+= o0 -= align; } //o0~o1之间就是我们要找的内存了,大小,对齐性都满足 o1 = o0 + search_n_user_data_bytes; /* Does it fit? */ if (o0 >= f0 && o1 <= f1) goto found; next: /* Reached end of free list without finding large enough object. */ if (e->free_elt.next_uoffset == MHEAP_GROUNDED) return MHEAP_GROUNDED; /* Otherwise keep searching for large enough object. */ e = mheap_elt_at_uoffset (v, e->free_elt.next_uoffset); } found: /* Free fragment at end. */ hi_free_usize = f1 != o1 ? f1 - o1 - MHEAP_ELT_OVERHEAD_BYTES : 0; /* If fragment at end is too small to be a new object, give user's object a bit more space than requested. */ //从中间切开得到我们要的内存,尾部如果太小,就不单独做成空闲块了,直接给我们用。 if (hi_free_usize < (word) MHEAP_MIN_USER_DATA_BYTES) { search_n_user_data_bytes += f1 - o1; o1 = f1; hi_free_usize = 0; } /* Need to make sure that relevant memory areas are mapped. */ //切去的内存mmap下 if (!(h->flags & MHEAP_FLAG_DISABLE_VM)) { mheap_elt_t *f0_elt = mheap_elt_at_uoffset (v, f0); mheap_elt_t *f1_elt = mheap_elt_at_uoffset (v, f1); mheap_elt_t *o0_elt = mheap_elt_at_uoffset (v, o0); mheap_elt_t *o1_elt = mheap_elt_at_uoffset (v, o1); uword f0_page_start, f0_page_end; uword o0_page_start, o0_page_end; /* Free elt is mapped. Addresses after that may not be mapped. */ f0_page_start = mheap_page_round (pointer_to_uword (f0_elt->user_data)); f0_page_end = mheap_page_truncate (pointer_to_uword (f1_elt)); o0_page_start = mheap_page_truncate (pointer_to_uword (o0_elt)); o0_page_end = mheap_page_round (pointer_to_uword (o1_elt->user_data)); if (o0_page_start < f0_page_start) o0_page_start = f0_page_start; if (o0_page_end > f0_page_end) o0_page_end = f0_page_end; if (o0_page_end > o0_page_start) clib_mem_vm_map (uword_to_pointer (o0_page_start, void *), o0_page_end - o0_page_start); } /* Remove free object from free list. */ remove_free_elt (v, e, bin); /* Free fragment at begining. */ //切分后首部块重新链入bin表 if (lo_free_usize > 0) { ASSERT (lo_free_usize >= (word) MHEAP_MIN_USER_DATA_BYTES); mheap_elt_set_size (v, f0, lo_free_usize, /* is_free */ 1); new_free_elt (v, f0, lo_free_usize); } mheap_elt_set_size (v, o0, search_n_user_data_bytes, /* is_free */ 0); //切分后尾部块重新链入bin表 if (hi_free_usize > 0) { uword uo = o1 + MHEAP_ELT_OVERHEAD_BYTES; mheap_elt_set_size (v, uo, hi_free_usize, /* is_free */ 1); new_free_elt (v, uo, hi_free_usize); } /* Return actual size of block. */ //分配的elt的数据部分大小 *n_user_data_bytes_arg = search_n_user_data_bytes; h->stats.free_list.n_objects_found += 1; //返回分配的elt的数据部分偏移值 return o0; }
user_data_size_to_bin_index (uword n_user_data_bytes)
空闲mheap_elt_t按照数据部分大小分类链接到不同slot中。该函数计算bin值。
/* Find bin for objects with size at least n_user_data_bytes. */ always_inline uword user_data_size_to_bin_index (uword n_user_data_bytes) { uword n_user_data_words; word small_bin, large_bin; /* User size must be at least big enough to hold free elt. */ //n_user_data_bytes至少要能包含mheap_elt_t头部 n_user_data_bytes = clib_max (n_user_data_bytes, MHEAP_MIN_USER_DATA_BYTES); /* Round to words. */ n_user_data_words = (round_pow2 (n_user_data_bytes, MHEAP_USER_DATA_WORD_BYTES) / MHEAP_USER_DATA_WORD_BYTES); ASSERT (n_user_data_words > 0); //除去头部的纯数据部分长度,换算成占用word个数,即为bin值 small_bin = n_user_data_words - (MHEAP_MIN_USER_DATA_BYTES / MHEAP_USER_DATA_WORD_BYTES); ASSERT (small_bin >= 0); large_bin = MHEAP_N_SMALL_OBJECT_BINS + max_log2 (n_user_data_bytes) - MHEAP_LOG2_N_SMALL_OBJECT_BINS; //1 ~ MHEAP_N_SMALL_OBJECT_BINS每个值对应单独bin //>= MHEAP_N_SMALL_OBJECT_BINS多个值映射到一个bin中。 //可见,分配内存越小,越能精准快速定位到可选空闲块。 return small_bin < MHEAP_N_SMALL_OBJECT_BINS ? small_bin : large_bin; }
vector
实际使用的缓存一般是如下结构:
user_header指具体数据结构头部,比如:mhash_t,pool_header_t等等
相关文章推荐
- 思科VPP源码分析(ethernet node分析)
- 思科VPP源码分析(多线程支持分析)
- 思科VPP源码分析(feature机制分析)
- 思科VPP源码分析(CLI支持分析)
- 思科VPP源码分析(trace机制分析)
- 思科VPP源码分析(dpdk node分析)
- 思科VPP源码分析(dpo机制源码分析)
- 思科VPP源码分析(路由框架分析二)
- 思科VPP源码分析(node调度框架)
- 思科VPP源码分析(Bihash分析)
- Memcached源码分析之内存管理
- Linux-0.11内核源码分析系列:内存管理try_to_share()与share_page()函数分析
- Cocos2d-X3.0 刨根问底(四)----- 内存管理源码分析
- linux内存管理源码分析 - 概述
- cocos2d-x 源码分析 : Ref (CCObject) 源码分析 cocos2d-x内存管理策略
- Linux-0.11内核源码分析系列:内存管理up_wp_page()与do_wp_page()函数分析
- nginx源码分析—内存池结构ngx_pool_t及内存管理
- Nginx源码分析---内存池结构ngx_pool_t及内存管理
- linux内核源码分析(内存管理)--之数据结构