首页
Web开发
Windows程序
编程语言
数据库
移动开发
系统相关
微信
其他好文
会员
首页
>
系统相关
> 详细
Memcached源码分析之slabs.c
时间:
2016-09-07 19:03:22
阅读:
216
评论:
0
收藏:
0
[点我收藏+]
标签:
#include
"memcached.h"
#include
<sys/stat.h>
#include
<sys/socket.h>
#include
<sys/signal.h>
#include
<sys/resource.h>
#include
<fcntl.h>
#include
<netinet/in.h>
#include
<errno.h>
#include
<stdlib.h>
#include
<stdio.h>
#include
<string.h>
#include
<assert.h>
#include
<pthread.h>
/* Bookkeeping for one slab class: a pool of equally sized chunks. */
typedef
struct
{
/* Size in bytes of one item (chunk) in this class. */
unsigned
int
size
;
/* How many items fit in one slab (a slab is also called a "page"). */
unsigned
int
perslab
;
/*
 * Head of this class's free-item list.  do_slabs_free() pushes chunks onto
 * it (every chunk of a newly added slab goes through that path too) and
 * do_slabs_alloc() pops chunks off it.
 * Original author's note: older memcached versions kept only recycled items
 * on this list and tracked fresh slab space through end_page_ptr and
 * end_page_free; this version no longer does that.
 */
void
*
slots
;
/* Number of free items currently on the slots list. */
unsigned
int
sl_curr
;
/* How many slabs have been allocated for this class. */
unsigned
int
slabs
;
/*
 * Array of pointers to the slabs owned by this class.  `slabs` entries are
 * in use and `list_size` is the array's capacity; when the two are equal
 * grow_slab_list() enlarges the array.
 */
void
**
slab_list
;
/* Capacity (in entries) of slab_list. */
unsigned
int
list_size
;
/* index+1 of dying slab, or zero if none */
unsigned
int
killing
;
/* The number of requested bytes */
size_t
requested
;
}
slabclass_t
;
/* Per-class bookkeeping, indexed by slab class id. */
static
slabclass_t
slabclass
[
MAX_NUMBER_OF_SLAB_CLASSES
];
/* Upper bound on slab memory; set from the limit argument of slabs_init(). */
static
size_t
mem_limit
=
0
;
/* Bytes already handed out as slab pages (updated by do_slabs_newslab()). */
static
size_t
mem_malloced
=
0
;
/* Id of the largest slab class in use; set by slabs_init(). */
static
int
power_largest
;
/* Start of the preallocated region, or NULL when preallocation is not in use. */
static
void
*
mem_base
=
NULL
;
/* Next unused byte inside the preallocated region. */
static
void
*
mem_current
=
NULL
;
/* Bytes still unused in the preallocated region. */
static
size_t
mem_avail
=
0
;
/* Taken by the public slabs_*() wrappers around the do_slabs_*() workers. */
static
pthread_mutex_t
slabs_lock
=
PTHREAD_MUTEX_INITIALIZER
;
/*
 * Held by the rebalance thread while it runs; also taken by
 * slabs_reassign() and slabs_rebalancer_pause().
 */
static
pthread_mutex_t
slabs_rebalance_lock
=
PTHREAD_MUTEX_INITIALIZER
;
/* Forward declarations of file-local helpers that are defined further down. */
static
int
do_slabs_newslab
(
const
unsigned
int
id
);
static
void
*
memory_allocate
(
size_t
size
);
static
void
do_slabs_free
(
void
*
ptr
,
const
size_t
size
,
unsigned
int
id
);
static
void
slabs_preallocate
(
const
unsigned
int
maxslabs
);
/*
 * Find the slab class that fits an item of the given size.
 *
 * Walks the classes upward from POWER_SMALLEST and returns the id of the
 * first one whose chunk size is at least `size`.  Returns 0 when `size` is 0
 * or when the item does not fit even in the largest class.
 */
unsigned int slabs_clsid(const size_t size) {
    int id;

    if (size == 0)
        return 0;

    for (id = POWER_SMALLEST; size > slabclass[id].size; id++) {
        if (id == power_largest) {
            /* won't fit in the biggest slab */
            return 0;
        }
    }
    return id;
}
/**
 * Initialise the slab allocator.
 *
 * limit    - upper bound on slab memory; stored in mem_limit (the original
 *            author notes this is the value of the -m start-up option).
 * factor   - growth factor between the chunk sizes of consecutive classes.
 * prealloc - when true, try to malloc() the whole limit up front and then
 *            give every class one slab carved out of that region.
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc) {
    int i;
    unsigned int size = sizeof(item) + settings.chunk_size;
    char *t_initial_malloc;

    mem_limit = limit;

    if (prealloc) {
        /* Grab the whole limit in one piece; later slab pages are cut from it. */
        mem_base = malloc(mem_limit);
        if (mem_base == NULL) {
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        } else {
            mem_current = mem_base;
            mem_avail = mem_limit;
        }
    }

    /* Build the table of slab classes. */
    memset(slabclass, 0, sizeof(slabclass));

    for (i = POWER_SMALLEST;
         i < POWER_LARGEST && size <= settings.item_size_max / factor;
         i++) {
        /* Make sure items are always n-byte aligned */
        unsigned int misalign = size % CHUNK_ALIGN_BYTES;
        if (misalign)
            size += CHUNK_ALIGN_BYTES - misalign;

        slabclass[i].size = size;
        slabclass[i].perslab = settings.item_size_max / slabclass[i].size;
        size *= factor;

        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    /* The last class holds exactly one item of the maximum item size. */
    power_largest = i;
    slabclass[power_largest].size = settings.item_size_max;
    slabclass[power_largest].perslab = 1;
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",
                i, slabclass[i].size, slabclass[i].perslab);
    }

    /* An environment variable can seed the "already malloced" counter. */
    t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
    if (t_initial_malloc) {
        mem_malloced = (size_t)atol(t_initial_malloc);
    }

    if (prealloc) {
        slabs_preallocate(power_largest);
    }
}
/**
 * Give each slab class one slab up front, starting at POWER_SMALLEST and
 * stopping once `maxslabs` classes have been served.  Called from
 * slabs_init() when preallocation was requested.  Exits the process if a
 * slab cannot be allocated.
 */
static void slabs_preallocate(const unsigned int maxslabs) {
    unsigned int served = 0;
    int id = POWER_SMALLEST;

    while (id <= POWER_LARGEST) {
        served++;
        if (served > maxslabs)
            return;

        if (do_slabs_newslab(id) == 0) {
            fprintf(stderr, "Error while preallocating slab memory!\n"
                "If using -L or other prealloc options, max memory must be "
                "at least %d megabytes.\n", power_largest);
            exit(1);
        }
        id++;
    }
}
/*
 * Make sure slabclass[id].slab_list has room for one more slab pointer.
 *
 * The array is full when `slabs` (entries in use) equals `list_size`
 * (capacity).  In that case the capacity is doubled, or set to 16 for an
 * empty list, via realloc().
 *
 * Returns 1 when there is room (possibly after growing), 0 when realloc()
 * failed; on failure the existing list is left untouched.
 */
static int grow_slab_list(const unsigned int id) {
    slabclass_t *cls = &slabclass[id];
    size_t capacity;
    void *grown;

    if (cls->slabs != cls->list_size)
        return 1;

    capacity = (cls->list_size == 0) ? 16 : cls->list_size * 2;
    grown = realloc(cls->slab_list, capacity * sizeof(void *));
    if (grown == 0)
        return 0;

    cls->list_size = capacity;
    cls->slab_list = grown;
    return 1;
}
/**
 * Cut a whole slab page into chunks of the class's size and put each chunk
 * on that class's free list by passing it to do_slabs_free().
 */
static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
    slabclass_t *cls = &slabclass[id];
    char *chunk = ptr;
    int n = 0;

    while (n < cls->perslab) {
        /* A "free" of size 0 just links the chunk into the slots list. */
        do_slabs_free(chunk, 0, id);
        chunk += cls->size;
        n++;
    }
}
/**
为slabclass[id]分配新的slab,仅当当前的slabclass中slots没有空闲的空间才调用
此函数分配新的slab
*/
static
int
do_slabs_newslab
(
const
unsigned
int
id
)
{
slabclass_t
*
p
=
&
slabclass
[
id
];
int
len
=
settings
.
slab_reassign
?
settings
.
item_size_max
:
p
->
size
*
p
->
perslab
;
//先判断是否开启了自定义slab大小,如果没有就按默认的,即约1M
char
*
ptr
;
/**
下面if的逻辑是:
如果内存超出了限制,分配失败进入if,返回0
否则调用grow_slab_list检查是否要增大slab_list的大小
如果在grow_slab_list返回失败,则不继续分配空间,进入if,返回0
否则分配空间memory_allocate,如果分配失败,同样进入if,返回0;
*/
if
((
mem_limit
&&
mem_malloced
+
len
>
mem_limit
&&
p
->
slabs
>
0
)
||
(
grow_slab_list
(
id
)
==
0
)
||
((
ptr
=
memory_allocate
((
size_t
)
len
))
==
0
))
{
MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED
(
id
);
return
0
;
}
memset
(
ptr
,
0
,
(
size_t
)
len
);
//清干净内存空间
split_slab_page_into_freelist
(
ptr
,
id
);
//把新申请的slab放到slots中去
p
->
slab_list
[
p
->
slabs
++]
=
ptr
;
//把新的slab加到slab_list数组中
mem_malloced
+=
len
;
//记下已分配的空间大小
MEMCACHED_SLABS_SLABCLASS_ALLOCATE
(
id
);
return
1
;
}
/**
根据item大小和slabsclass分配空间
*/
static
void
*
do_slabs_alloc
(
const
size_t
size
,
unsigned
int
id
)
{
slabclass_t
*
p
;
void
*
ret
=
NULL
;
item
*
it
=
NULL
;
if
(
id
<
POWER_SMALLEST
||
id
>
power_largest
)
{
//默认最大是200,最小是1
MEMCACHED_SLABS_ALLOCATE_FAILED
(
size
,
0
);
return
NULL
;
}
p
=
&
slabclass
[
id
];
//slabclass是一个全局变量,是各个slabclass对象数组,在这取得当前id对应的slabclass
assert
(
p
->
sl_curr
==
0
||
((
item
*)
p
->
slots
)->
slabs_clsid
==
0
);
/* fail unless we have space at the end of a recently allocated page,
we have something on our freelist, or we could allocate a new page */
/**
下面这个if的逻辑相当于:
如果p->sl_curr==0,即slots链表中没有空闲的空间,则do_slabs_newslab分配新slab
如果p->sl_curr==0,且do_slabs_newslab分配新slab失败,则进入if,ret = NULL,否则进入下面的elseif
*/
if
(!
(
p
->
sl_curr
!=
0
||
do_slabs_newslab
(
id
)
!=
0
))
{
/* We don‘t have more memory available */
ret
=
NULL
;
}
else
if
(
p
->
sl_curr
!=
0
)
{
//如果进入此分支是因为slots链表中还有空闲的空间
/* return off our freelist */
//把空闲的item分配出去
it
=
(
item
*)
p
->
slots
;
p
->
slots
=
it
->
next
;
if
(
it
->
next
)
it
->
next
->
prev
=
0
;
p
->
sl_curr
--;
ret
=
(
void
*)
it
;
}
if
(
ret
)
{
p
->
requested
+=
size
;
//分配成功,记下已分配的字节数
MEMCACHED_SLABS_ALLOCATE
(
size
,
id
,
p
->
size
,
ret
);
}
else
{
MEMCACHED_SLABS_ALLOCATE_FAILED
(
size
,
id
);
}
return
ret
;
}
/**
这个函数的命名虽然叫do_slabs_free,听上去好像是释放空间,其实质是把空间变成可用。
怎样的空间才算可用?就是加到当前slabclass的slots链表中而已。
所以新申请的slab也会调用这个函数,让整个slab变为可用。
ps: 以前的memcached版本slots链表保存的是回收的item空间,而
现在保存的是所有可用的item空间。
*/
static
void
do_slabs_free
(
void
*
ptr
,
const
size_t
size
,
unsigned
int
id
)
{
slabclass_t
*
p
;
item
*
it
;
assert
(((
item
*)
ptr
)->
slabs_clsid
==
0
);
assert
(
id
>=
POWER_SMALLEST
&&
id
<=
power_largest
);
if
(
id
<
POWER_SMALLEST
||
id
>
power_largest
)
return
;
MEMCACHED_SLABS_FREE
(
size
,
id
,
ptr
);
p
=
&
slabclass
[
id
];
it
=
(
item
*)
ptr
;
it
->
it_flags
|=
ITEM_SLABBED
;
//把item标记为slabbed状态
it
->
prev
=
0
;
it
->
next
=
p
->
slots
;
//插入到slots链表中
if
(
it
->
next
)
it
->
next
->
prev
=
it
;
p
->
slots
=
it
;
p
->
sl_curr
++;
//空闲item数加1
p
->
requested
-=
size
;
return
;
}
/*
 * Compare a length-delimited string with a NUL-terminated one.
 *
 * nzlength - number of bytes of `nz` to consider.
 * nz       - string that need not be NUL-terminated.
 * z        - NUL-terminated string to compare against.
 *
 * Returns 0 when both have the same length and the same bytes, -1 otherwise.
 */
static int nz_strcmp(int nzlength, const char *nz, const char *z) {
    int zlength = strlen(z);

    if (zlength != nzlength)
        return -1;
    if (strncmp(nz, z, zlength) != 0)
        return -1;
    return 0;
}
/*
 * Emit statistics through the add_stats callback.
 *
 * stat_type - NULL for the general counters, otherwise one of "items",
 *             "slabs" or "sizes" (nkey bytes long, not necessarily
 *             NUL-terminated).
 * nkey      - length of stat_type.
 * add_stats - callback receiving each statistic; must not be NULL.
 * c         - opaque cookie handed to the callback.
 *
 * Returns true when the request was handled, false when add_stats is NULL
 * or stat_type is not recognised.
 */
bool get_stats(const char *stat_type, int nkey, ADD_STAT add_stats, void *c) {
    if (add_stats == NULL)
        return false;

    if (!stat_type) {
        /* prepare general statistics for the engine */
        STATS_LOCK();
        APPEND_STAT("bytes", "%llu", (unsigned long long)stats.curr_bytes);
        APPEND_STAT("curr_items", "%u", stats.curr_items);
        APPEND_STAT("total_items", "%u", stats.total_items);
        STATS_UNLOCK();
        item_stats_totals(add_stats, c);
        return true;
    }

    if (nz_strcmp(nkey, stat_type, "items") == 0) {
        item_stats(add_stats, c);
        return true;
    }
    if (nz_strcmp(nkey, stat_type, "slabs") == 0) {
        slabs_stats(add_stats, c);
        return true;
    }
    if (nz_strcmp(nkey, stat_type, "sizes") == 0) {
        item_stats_sizes(add_stats, c);
        return true;
    }

    return false;
}
/*
 * Report per-class slab statistics through add_stats, followed by the
 * number of classes that own at least one slab and the total bytes
 * allocated.  A final add_stats(NULL, 0, NULL, 0, c) call is made after
 * the last statistic.
 */
static void do_slabs_stats(ADD_STAT add_stats, void *c) {
    int i;
    int total = 0;
    /* Get the per-thread stats which contain some interesting aggregates */
    struct thread_stats thread_stats;

    threadlocal_stats_aggregate(&thread_stats);

    for (i = POWER_SMALLEST; i <= power_largest; i++) {
        slabclass_t *cls = &slabclass[i];
        uint32_t chunks_per_page, pages;
        /* Scratch variables used by APPEND_NUM_STAT. */
        char key_str[STAT_KEY_LEN];
        char val_str[STAT_VAL_LEN];
        int klen = 0, vlen = 0;

        /* Classes without any slab are not reported. */
        if (cls->slabs == 0)
            continue;

        pages = cls->slabs;
        chunks_per_page = cls->perslab;

        APPEND_NUM_STAT(i, "chunk_size", "%u", cls->size);
        APPEND_NUM_STAT(i, "chunks_per_page", "%u", chunks_per_page);
        APPEND_NUM_STAT(i, "total_pages", "%u", pages);
        APPEND_NUM_STAT(i, "total_chunks", "%u", pages * chunks_per_page);
        APPEND_NUM_STAT(i, "used_chunks", "%u",
                        pages * chunks_per_page - cls->sl_curr);
        APPEND_NUM_STAT(i, "free_chunks", "%u", cls->sl_curr);
        /* Stat is dead, but displaying zero instead of removing it. */
        APPEND_NUM_STAT(i, "free_chunks_end", "%u", 0);
        APPEND_NUM_STAT(i, "mem_requested", "%llu",
                        (unsigned long long)cls->requested);
        APPEND_NUM_STAT(i, "get_hits", "%llu",
                (unsigned long long)thread_stats.slab_stats[i].get_hits);
        APPEND_NUM_STAT(i, "cmd_set", "%llu",
                (unsigned long long)thread_stats.slab_stats[i].set_cmds);
        APPEND_NUM_STAT(i, "delete_hits", "%llu",
                (unsigned long long)thread_stats.slab_stats[i].delete_hits);
        APPEND_NUM_STAT(i, "incr_hits", "%llu",
                (unsigned long long)thread_stats.slab_stats[i].incr_hits);
        APPEND_NUM_STAT(i, "decr_hits", "%llu",
                (unsigned long long)thread_stats.slab_stats[i].decr_hits);
        APPEND_NUM_STAT(i, "cas_hits", "%llu",
                (unsigned long long)thread_stats.slab_stats[i].cas_hits);
        APPEND_NUM_STAT(i, "cas_badval", "%llu",
                (unsigned long long)thread_stats.slab_stats[i].cas_badval);
        APPEND_NUM_STAT(i, "touch_hits", "%llu",
                (unsigned long long)thread_stats.slab_stats[i].touch_hits);
        total++;
    }

    APPEND_STAT("active_slabs", "%d", total);
    APPEND_STAT("total_malloced", "%llu", (unsigned long long)mem_malloced);
    add_stats(NULL, 0, NULL, 0, c);
}
/**
 * Obtain `size` bytes for a slab page.
 *
 * Two strategies, selected by whether slabs_init() set up mem_base:
 *  1) no preallocated region: plain malloc();
 *  2) preallocated region: carve the bytes off the front of what is left,
 *     rounding the consumed amount up to CHUNK_ALIGN_BYTES.
 *
 * Returns NULL when malloc() fails or the region has fewer than `size`
 * bytes left.
 */
static void *memory_allocate(size_t size) {
    void *carved;
    size_t remainder;

    if (mem_base == NULL) {
        /* We are not using a preallocated large memory chunk */
        return malloc(size);
    }

    if (size > mem_avail)
        return NULL;

    carved = mem_current;

    /* mem_current pointer _must_ be aligned!!! */
    remainder = size % CHUNK_ALIGN_BYTES;
    if (remainder)
        size += CHUNK_ALIGN_BYTES - remainder;

    mem_current = ((char *)mem_current) + size;
    /* The rounded-up size may exceed what was left; clamp at zero. */
    mem_avail = (size < mem_avail) ? mem_avail - size : 0;

    return carved;
}
/*
 * Locked wrapper around do_slabs_alloc(): takes slabs_lock, asks class `id`
 * for a chunk for an item of `size` bytes, releases the lock and returns
 * whatever do_slabs_alloc() returned (NULL on failure).
 */
void *slabs_alloc(size_t size, unsigned int id) {
    void *chunk;

    pthread_mutex_lock(&slabs_lock);
    chunk = do_slabs_alloc(size, id);
    pthread_mutex_unlock(&slabs_lock);

    return chunk;
}
/*
 * Locked wrapper around do_slabs_free(): hands the chunk `ptr` back to slab
 * class `id` while holding slabs_lock.  `size` is the byte count to remove
 * from the class's requested counter.
 */
void slabs_free(void *ptr, size_t size, unsigned int id) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_free(ptr, size, id);
    pthread_mutex_unlock(&slabs_lock);
}
/*
 * Locked wrapper around do_slabs_stats(): emits the slab statistics through
 * `add_stats` (with cookie `c`) while holding slabs_lock.
 */
void slabs_stats(ADD_STAT add_stats, void *c) {
    pthread_mutex_lock(&slabs_lock);
    do_slabs_stats(add_stats, c);
    pthread_mutex_unlock(&slabs_lock);
}
/*
 * Adjust the requested-bytes counter of slab class `id` when an item's size
 * changes from `old` to `ntotal` bytes.  Runs under slabs_lock.  Aborts the
 * process if `id` is not a valid slab class.
 */
void slabs_adjust_mem_requested(unsigned int id, size_t old, size_t ntotal) {
    slabclass_t *cls;

    pthread_mutex_lock(&slabs_lock);

    if (id < POWER_SMALLEST || id > power_largest) {
        fprintf(stderr, "Internal error! Invalid slab class\n");
        abort();
    }

    cls = &slabclass[id];
    cls->requested = cls->requested - old + ntotal;

    pthread_mutex_unlock(&slabs_lock);
}
/*
 * Signalled by stop_slab_maintenance_thread().
 * NOTE(review): no waiter on this condition is visible in this file.
 */
static
pthread_cond_t
maintenance_cond
=
PTHREAD_COND_INITIALIZER
;
/*
 * The rebalance thread waits on this (with slabs_rebalance_lock) while idle;
 * do_slabs_reassign() signals it when a move has been requested.
 */
static
pthread_cond_t
slab_rebalance_cond
=
PTHREAD_COND_INITIALIZER
;
/* Loop condition of slab_maintenance_thread(); cleared to stop it. */
static
volatile
int
do_run_slab_thread
=
1
;
/* Loop condition of slab_rebalance_thread(); cleared to stop it. */
static
volatile
int
do_run_slab_rebalance_thread
=
1
;
/* Default number of chunks examined per slab_rebalance_move() call. */
#define
DEFAULT_SLAB_BULK_CHECK
1
/*
 * Number of chunks slab_rebalance_move() examines per call; can be
 * overridden through the MEMCACHED_SLAB_BULK_CHECK environment variable
 * (see start_slab_maintenance_thread()).
 */
int
slab_bulk_check
=
DEFAULT_SLAB_BULK_CHECK
;
/*
 * Begin moving one slab page from class slab_rebal.s_clsid to class
 * slab_rebal.d_clsid.
 *
 * Runs with cache_lock and slabs_lock held.  Validates the request, makes
 * sure the destination's slab list has room, picks the source's first slab
 * as the victim and records its bounds in slab_rebal.
 *
 * Returns 0 on success, or a negative code when nothing was started:
 *   -2  source or destination class id is out of range, or both are equal
 *   -1  the destination's slab list could not be grown
 *   -3  the source class has fewer than two slabs
 */
static int slab_rebalance_start(void) {
    slabclass_t *s_cls = NULL;
    int no_go = 0;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    if (slab_rebal.s_clsid < POWER_SMALLEST ||
        slab_rebal.s_clsid > power_largest  ||
        slab_rebal.d_clsid < POWER_SMALLEST ||
        slab_rebal.d_clsid > power_largest  ||
        slab_rebal.s_clsid == slab_rebal.d_clsid)
        no_go = -2;

    /*
     * Only touch slabclass[] once the ids are known to be valid; indexing
     * the table (or growing a list) with an out-of-range id would read and
     * write outside the array.
     */
    if (no_go == 0) {
        s_cls = &slabclass[slab_rebal.s_clsid];

        if (!grow_slab_list(slab_rebal.d_clsid)) {
            no_go = -1;
        }

        if (s_cls->slabs < 2)
            no_go = -3;
    }

    if (no_go != 0) {
        pthread_mutex_unlock(&slabs_lock);
        pthread_mutex_unlock(&cache_lock);
        return no_go; /* Should use a wrapper function... */
    }

    /* The first slab of the source class is the one being taken away. */
    s_cls->killing = 1;

    slab_rebal.slab_start = s_cls->slab_list[s_cls->killing - 1];
    slab_rebal.slab_end   = (char *)slab_rebal.slab_start +
        (s_cls->size * s_cls->perslab);
    slab_rebal.slab_pos   = slab_rebal.slab_start;
    slab_rebal.done       = 0;

    /* Also tells do_item_get to search for items in this slab */
    slab_rebalance_signal = 2;

    if (settings.verbose > 1) {
        fprintf(stderr, "Started a slab rebalance\n");
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    STATS_LOCK();
    stats.slab_reassign_running = true;
    STATS_UNLOCK();

    return 0;
}
/* Outcome of examining one chunk in slab_rebalance_move(). */
enum
move_status
{
/* Nothing to do for this chunk; the value every chunk starts with. */
MOVE_PASS
=
0
,
/* Chunk was reclaimed; it is then cleared and tagged with slabs_clsid 255. */
MOVE_DONE
,
/* Chunk is still in use; the reference taken is dropped and it counts as busy. */
MOVE_BUSY
,
/* item_trylock() failed for the chunk's hash value; it counts as busy. */
MOVE_LOCKED
};
/*
 * Examine up to slab_bulk_check chunks of the slab being moved, starting at
 * slab_rebal.slab_pos, and try to reclaim each one.
 *
 * Runs with cache_lock and slabs_lock held.  A chunk already tagged with
 * slabs_clsid 255 is skipped.  Free chunks are unlinked from the source
 * class's free list, linked-but-idle items are unlinked from the cache, and
 * anything else is counted as busy.  When the end of the slab is reached the
 * scan restarts from the top if any chunk was busy, otherwise
 * slab_rebal.done is raised.
 *
 * Returns the number of busy/locked chunks seen during this call.
 */
static int slab_rebalance_move(void) {
    slabclass_t *s_cls;
    int checked;
    int busy_seen = 0;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    s_cls = &slabclass[slab_rebal.s_clsid];

    for (checked = 0; checked < slab_bulk_check; checked++) {
        item *it = slab_rebal.slab_pos;
        enum move_status status = MOVE_PASS;

        if (it->slabs_clsid != 255) {
            uint32_t hv = hash(ITEM_key(it), it->nkey);
            void *hold_lock = item_trylock(hv);

            if (hold_lock == NULL) {
                status = MOVE_LOCKED;
            } else {
                int refs = refcount_incr(&it->refcount);

                switch (refs) {
                case 1:
                    /* item is unlinked, unused */
                    if (it->it_flags & ITEM_SLABBED) {
                        /* remove from slab freelist */
                        if (s_cls->slots == it) {
                            s_cls->slots = it->next;
                        }
                        if (it->next)
                            it->next->prev = it->prev;
                        if (it->prev)
                            it->prev->next = it->next;
                        s_cls->sl_curr--;
                        status = MOVE_DONE;
                    } else {
                        status = MOVE_BUSY;
                    }
                    break;

                case 2:
                    /* item is linked but not busy */
                    if ((it->it_flags & ITEM_LINKED) != 0) {
                        do_item_unlink_nolock(it, hv);
                        status = MOVE_DONE;
                    } else {
                        /* refcount == 1 + !ITEM_LINKED means the item is being
                         * uploaded to, or was just unlinked but hasn't been freed
                         * yet. Let it bleed off on its own and try again later */
                        status = MOVE_BUSY;
                    }
                    break;

                default:
                    if (settings.verbose > 2) {
                        fprintf(stderr, "Slab reassign hit a busy item: refcount: %d (%d -> %d)\n",
                            it->refcount, slab_rebal.s_clsid, slab_rebal.d_clsid);
                    }
                    status = MOVE_BUSY;
                    break;
                }
                item_trylock_unlock(hold_lock);
            }
        }

        if (status == MOVE_DONE) {
            /* Wipe the header and tag the chunk as already handled. */
            it->refcount = 0;
            it->it_flags = 0;
            it->slabs_clsid = 255;
        } else if (status == MOVE_BUSY || status == MOVE_LOCKED) {
            /* A busy chunk still carries the reference taken above. */
            if (status == MOVE_BUSY)
                refcount_decr(&it->refcount);
            slab_rebal.busy_items++;
            busy_seen++;
        }
        /* MOVE_PASS: nothing to do */

        slab_rebal.slab_pos = (char *)slab_rebal.slab_pos + s_cls->size;
        if (slab_rebal.slab_pos >= slab_rebal.slab_end)
            break;
    }

    if (slab_rebal.slab_pos >= slab_rebal.slab_end) {
        if (slab_rebal.busy_items) {
            /* Some items were busy, start again from the top */
            slab_rebal.slab_pos = slab_rebal.slab_start;
            slab_rebal.busy_items = 0;
        } else {
            slab_rebal.done++;
        }
    }

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    return busy_seen;
}
/*
 * Complete a slab move once every chunk of the victim slab is clear.
 *
 * Under cache_lock and slabs_lock: removes the slab from the source class's
 * list (the last entry takes its place), zeroes the page, appends it to the
 * destination class and splits it into that class's free list, then resets
 * slab_rebal and slab_rebalance_signal.  Afterwards the statistics are
 * updated.
 */
static void slab_rebalance_finish(void) {
    slabclass_t *from;
    slabclass_t *to;

    pthread_mutex_lock(&cache_lock);
    pthread_mutex_lock(&slabs_lock);

    from = &slabclass[slab_rebal.s_clsid];
    to   = &slabclass[slab_rebal.d_clsid];

    /* At this point the stolen slab is completely clear */
    from->slab_list[from->killing - 1] = from->slab_list[from->slabs - 1];
    from->slabs--;
    from->killing = 0;

    memset(slab_rebal.slab_start, 0, (size_t)settings.item_size_max);

    to->slab_list[to->slabs++] = slab_rebal.slab_start;
    split_slab_page_into_freelist(slab_rebal.slab_start,
        slab_rebal.d_clsid);

    /* Return the rebalancer to its idle state. */
    slab_rebal.done       = 0;
    slab_rebal.s_clsid    = 0;
    slab_rebal.d_clsid    = 0;
    slab_rebal.slab_start = NULL;
    slab_rebal.slab_end   = NULL;
    slab_rebal.slab_pos   = NULL;

    slab_rebalance_signal = 0;

    pthread_mutex_unlock(&slabs_lock);
    pthread_mutex_unlock(&cache_lock);

    STATS_LOCK();
    stats.slab_reassign_running = false;
    stats.slabs_moved++;
    STATS_UNLOCK();

    if (settings.verbose > 1) {
        fprintf(stderr, "finished a slab move\n");
    }
}
/*
 * Decide whether the automover should move a slab, and between which
 * classes.
 *
 * Does real work at most once every 10 units of current_time.  A class
 * becomes the source after at least three consecutive runs with no new
 * evictions while owning more than two pages.  The class with the most new
 * evictions becomes the destination once it has been the top evictor for
 * three runs in a row.
 *
 * On success stores the chosen class ids in *src and *dst and returns 1;
 * otherwise returns 0 and leaves them untouched.
 */
static int slab_automove_decision(int *src, int *dst) {
    static uint64_t evicted_old[POWER_LARGEST];
    static unsigned int slab_zeroes[POWER_LARGEST];
    static unsigned int slab_winner = 0;
    static unsigned int slab_wins   = 0;
    static rel_time_t next_run;
    uint64_t evicted_new[POWER_LARGEST];
    uint64_t evicted_max = 0;
    unsigned int highest_slab = 0;
    unsigned int total_pages[POWER_LARGEST];
    int source = 0;
    int dest = 0;
    int i;

    /* Run less frequently than the slabmove tester. */
    if (current_time < next_run)
        return 0;
    next_run = current_time + 10;

    item_stats_evictions(evicted_new);

    /* Snapshot the page count of every class. */
    pthread_mutex_lock(&cache_lock);
    for (i = POWER_SMALLEST; i < power_largest; i++) {
        total_pages[i] = slabclass[i].slabs;
    }
    pthread_mutex_unlock(&cache_lock);

    /* Find a candidate source; something with zero evicts 3+ times */
    for (i = POWER_SMALLEST; i < power_largest; i++) {
        uint64_t evicted_diff = evicted_new[i] - evicted_old[i];

        if (evicted_diff == 0 && total_pages[i] > 2) {
            slab_zeroes[i]++;
            if (source == 0 && slab_zeroes[i] >= 3)
                source = i;
        } else {
            slab_zeroes[i] = 0;
            if (evicted_diff > evicted_max) {
                evicted_max = evicted_diff;
                highest_slab = i;
            }
        }
        evicted_old[i] = evicted_new[i];
    }

    /* Pick a valid destination */
    if (slab_winner != 0 && slab_winner == highest_slab) {
        slab_wins++;
        if (slab_wins >= 3)
            dest = slab_winner;
    } else {
        slab_wins = 1;
        slab_winner = highest_slab;
    }

    if (!(source && dest))
        return 0;

    *src = source;
    *dst = dest;
    return 1;
}
/* Slab maintenance thread entry point.
 * Does not use spinlocks since it is not timing sensitive. Burn less CPU and
 * go to sleep if locks are contended.
 *
 * While do_run_slab_thread is set: with slab_automove == 1 it asks
 * slab_automove_decision() once a second and requests a reassignment when
 * one is suggested; otherwise it just sleeps five seconds at a time.
 */
static void *slab_maintenance_thread(void *arg) {
    int src, dest;

    while (do_run_slab_thread) {
        if (settings.slab_automove != 1) {
            /* Don't wake as often if we're not enabled.
             * This is lazier than setting up a condition right now. */
            sleep(5);
            continue;
        }

        if (slab_automove_decision(&src, &dest) == 1) {
            /* Blind to the return codes. It will retry on its own */
            slabs_reassign(src, dest);
        }
        sleep(1);
    }
    return NULL;
}
/* Slab mover thread.
 * Sits waiting for a condition to jump off and shovel some memory about.
 *
 * Holds slabs_rebalance_lock for as long as it is working and gives it up
 * only inside pthread_cond_wait() while idle.  slab_rebalance_signal drives
 * the work: 1 means "start a move", any other non-zero value with a slab
 * selected means "keep moving"; once slab_rebal.done is set the move is
 * finished.  The loop ends when do_run_slab_rebalance_thread is cleared.
 */
static void *slab_rebalance_thread(void *arg) {
    int was_busy = 0;

    /* So we first pass into cond_wait with the mutex held */
    mutex_lock(&slabs_rebalance_lock);

    while (do_run_slab_rebalance_thread) {
        if (slab_rebalance_signal == 1) {
            if (slab_rebalance_start() < 0) {
                /* Handle errors with more specifity as required. */
                slab_rebalance_signal = 0;
            }

            was_busy = 0;
        } else if (slab_rebalance_signal && slab_rebal.slab_start != NULL) {
            was_busy = slab_rebalance_move();
        }

        if (slab_rebal.done) {
            slab_rebalance_finish();
        } else if (was_busy) {
            /* Stuck waiting for some items to unlock, so slow down a bit
             * to give them a chance to free up */
            usleep(50);
        }

        if (slab_rebalance_signal == 0) {
            /* always hold this lock while we're running */
            pthread_cond_wait(&slab_rebalance_cond, &slabs_rebalance_lock);
        }
    }

    /*
     * Release the lock before exiting.  Leaving it held would make every
     * later slabs_rebalancer_pause() block forever and every
     * slabs_reassign() report REASSIGN_RUNNING.
     */
    pthread_mutex_unlock(&slabs_rebalance_lock);
    return NULL;
}
/*
 * Pick a source class for a slab move, round-robin.
 *
 * Continues from where the previous call stopped (the position is kept in a
 * static variable), wraps around after power_largest, skips `dst`, and
 * returns the first class owning more than one slab.  Returns -1 when a
 * full round finds none.
 */
static int slabs_reassign_pick_any(int dst) {
    static int cur = POWER_SMALLEST - 1;
    int remaining = power_largest - POWER_SMALLEST + 1;

    while (remaining > 0) {
        remaining--;

        cur++;
        if (cur > power_largest)
            cur = POWER_SMALLEST;

        if (cur != dst && slabclass[cur].slabs > 1)
            return cur;
    }
    return -1;
}
/*
 * Request that one slab be moved from class `src` to class `dst`.
 *
 * A `src` of -1 lets slabs_reassign_pick_any() choose the source.  On
 * success the ids are stored in slab_rebal, slab_rebalance_signal is set to
 * 1 and the rebalance thread is signalled.
 *
 * Returns REASSIGN_RUNNING if a move is already in progress,
 * REASSIGN_SRC_DST_SAME if both ids are equal, REASSIGN_BADCLASS if either
 * id is out of range (this includes a failed automatic pick),
 * REASSIGN_NOSPARE if the source owns fewer than two slabs, and REASSIGN_OK
 * otherwise.
 */
static enum reassign_result_type do_slabs_reassign(int src, int dst) {
    int src_valid;
    int dst_valid;

    if (slab_rebalance_signal != 0)
        return REASSIGN_RUNNING;

    if (src == dst)
        return REASSIGN_SRC_DST_SAME;

    /* Special indicator to choose ourselves. */
    if (src == -1) {
        src = slabs_reassign_pick_any(dst);
        /* TODO: If we end up back at -1, return a new error type */
    }

    src_valid = (src >= POWER_SMALLEST && src <= power_largest);
    dst_valid = (dst >= POWER_SMALLEST && dst <= power_largest);
    if (!src_valid || !dst_valid)
        return REASSIGN_BADCLASS;

    if (slabclass[src].slabs < 2)
        return REASSIGN_NOSPARE;

    slab_rebal.s_clsid = src;
    slab_rebal.d_clsid = dst;

    slab_rebalance_signal = 1;
    pthread_cond_signal(&slab_rebalance_cond);

    return REASSIGN_OK;
}
enum
reassign_result_type slabs_reassign
(
int
src
,
int
dst
)
{
enum
reassign_result_type ret
;
if
(
pthread_mutex_trylock
(&
slabs_rebalance_lock
)
!=
0
)
{
return
REASSIGN_RUNNING
;
}
ret
=
do_slabs_reassign
(
src
,
dst
);
pthread_mutex_unlock
(&
slabs_rebalance_lock
);
return
ret
;
}
/*
 * Pause the rebalancer by taking slabs_rebalance_lock.
 * If we hold this lock, rebalancer can't wake up or move.
 * Undo with slabs_rebalancer_resume().
 */
void slabs_rebalancer_pause(void)
{
    pthread_mutex_lock(&slabs_rebalance_lock);
}
/*
 * Let the rebalancer run again by releasing slabs_rebalance_lock, which a
 * preceding slabs_rebalancer_pause() acquired.
 */
void slabs_rebalancer_resume(void)
{
    pthread_mutex_unlock(&slabs_rebalance_lock);
}
/*
 * Thread handles filled in by start_slab_maintenance_thread() and joined by
 * stop_slab_maintenance_thread().
 */
static
pthread_t
maintenance_tid
;
static
pthread_t
rebalance_tid
;
/**
 * Start the slab maintenance thread and the slab rebalance thread.
 *
 * Resets the rebalance state, applies the optional
 * MEMCACHED_SLAB_BULK_CHECK environment variable (chunks examined per
 * slab_rebalance_move() call), initialises the rebalance condition variable
 * and mutex, and creates both threads.
 *
 * Returns 0 on success, -1 if a synchronisation object or a thread could
 * not be created (a message is printed to stderr).
 */
int start_slab_maintenance_thread(void) {
    int ret;
    char *env;

    slab_rebalance_signal = 0;
    slab_rebal.slab_start = NULL;

    env = getenv("MEMCACHED_SLAB_BULK_CHECK");
    if (env != NULL) {
        slab_bulk_check = atoi(env);
        /*
         * slab_rebalance_move() loops while its counter is below
         * slab_bulk_check, so a zero or negative value would make it
         * examine nothing and a move could never complete.  Fall back to
         * the default for any non-positive (or unparsable) setting.
         */
        if (slab_bulk_check <= 0) {
            slab_bulk_check = DEFAULT_SLAB_BULK_CHECK;
        }
    }

    if (pthread_cond_init(&slab_rebalance_cond, NULL) != 0) {
        fprintf(stderr, "Can't initialize rebalance condition\n");
        return -1;
    }
    if ((ret = pthread_mutex_init(&slabs_rebalance_lock, NULL)) != 0) {
        fprintf(stderr, "Can't initialize rebalance mutex: %s\n",
                strerror(ret));
        return -1;
    }

    if ((ret = pthread_create(&maintenance_tid, NULL,
                              slab_maintenance_thread, NULL)) != 0) {
        fprintf(stderr, "Can't create slab maint thread: %s\n", strerror(ret));
        return -1;
    }
    if ((ret = pthread_create(&rebalance_tid, NULL,
                              slab_rebalance_thread, NULL)) != 0) {
        fprintf(stderr, "Can't create rebal thread: %s\n", strerror(ret));
        return -1;
    }
    return 0;
}
/**
停止slab维护线程,逻辑和停止哈希表维护线程一样。
*/
void
stop_slab_maintenance_thread
(
void
)
{
mutex_lock
(&
cache_lock
);
do_run_slab_thread
=
0
;
do_run_slab_rebalance_thread
=
0
;
pthread_cond_signal
(&
maintenance_cond
);
pthread_mutex_unlock
(&
cache_lock
);
/* Wait for the maintenance thread to stop */
pthread_join
(
maintenance_tid
,
NULL
);
pthread_join
(
rebalance_tid
,
NULL
);
}
Memcached源码分析之slabs.c
标签:
原文地址:http://www.cnblogs.com/guolanzhu/p/5850272.html
踩
(
0
)
赞
(
0
)
举报
评论
一句话评论(
0
)
登录后才能评论!
分享档案
更多>
2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)
周排行
更多
gitlab 在linux安装环境下存储地址
2021-07-29
当 Mac 未检测到外部显示器时如何修复它
2021-07-29
Ubuntu18.04安装qemu遇到问题-qemu : Depends: qemu-system (>= 1:2.11+dfsg-1ubuntu7)
2021-07-28
[Linux]Shell编程【待续】
2021-07-28
Linux系统资源查看
2021-07-27
Archlinux爬坑指南
2021-07-27
[Linux]Linux发展历程
2021-07-27
非桌面系统 (ubuntu)安装google-chrome
2021-07-27
在Ubuntu18.04系统中源码安装 gcc7.3.0
2021-07-23
Linux快捷键杂记
2021-07-22
友情链接
兰亭集智
国之画
百度统计
站长统计
阿里云
chrome插件
新版天听网
关于我们
-
联系我们
-
留言反馈
© 2014
mamicode.com
版权所有 联系我们:gaon5@hotmail.com
迷上了代码!