Tags: linux block bio io scheduler
A while ago I read through the Linux block layer code, mainly the implementation of NOOP, the simplest of Linux's I/O schedulers.
Below I summarize the work flow of the Linux block layer,
and also fold in the SCSI device-probing part.
Every block device has a request_queue, and a queue can choose a scheduler to schedule its requests. Linux implements several schedulers, of which NOOP is the simplest. Let's look at how noop works.
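Which scheduler a queue is using can be checked (and changed) through sysfs. A minimal user-space sketch, assuming a disk named sda; the bracketed entry in the output is the active elevator:

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("%s", line);	/* e.g. "noop deadline [cfq]" */
	fclose(f);
	return 0;
}

Writing a scheduler name back into the same file switches the queue's elevator at run time.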
First, look at the elevator structures: struct elevator_ops, struct elevator_queue, and struct elevator_type are the three data structures the elevator is built from.
/*
* identifies an elevator type, such as AS or deadline
*/
struct elevator_type
{
/* managed by elevator core */
struct kmem_cache *icq_cache;
/* fields provided by elevator implementation */
struct elevator_ops ops;
size_t icq_size; /* see iocontext.h */
size_t icq_align; /* ditto */
struct elv_fs_entry *elevator_attrs;
char elevator_name[ELV_NAME_MAX];
struct module *elevator_owner;
/* managed by elevator core */
char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */
struct list_head list;
};
/*
* each queue has an elevator_queue associated with it
*/
struct elevator_queue
{
struct elevator_type *type;
void *elevator_data;
struct kobject kobj;
struct mutex sysfs_lock;
unsigned int registered:1;
DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
};

elevator_ops is the key structure: it defines the operations a scheduler provides.
struct elevator_ops
{
elevator_merge_fn *elevator_merge_fn;
elevator_merged_fn *elevator_merged_fn;
elevator_merge_req_fn *elevator_merge_req_fn;
elevator_allow_merge_fn *elevator_allow_merge_fn;
elevator_bio_merged_fn *elevator_bio_merged_fn;
elevator_dispatch_fn *elevator_dispatch_fn;
elevator_add_req_fn *elevator_add_req_fn;
elevator_activate_req_fn *elevator_activate_req_fn;
elevator_deactivate_req_fn *elevator_deactivate_req_fn;
elevator_completed_req_fn *elevator_completed_req_fn;
elevator_request_list_fn *elevator_former_req_fn;
elevator_request_list_fn *elevator_latter_req_fn;
elevator_init_icq_fn *elevator_init_icq_fn; /* see iocontext.h */
elevator_exit_icq_fn *elevator_exit_icq_fn; /* ditto */
elevator_set_req_fn *elevator_set_req_fn;
elevator_put_req_fn *elevator_put_req_fn;
elevator_may_queue_fn *elevator_may_queue_fn;
elevator_init_fn *elevator_init_fn;
elevator_exit_fn *elevator_exit_fn;
};

elevator_merge_fn looks up a request that a bio can be merged into.
elevator_merge_req_fn deletes the request left redundant after two requests are merged.
elevator_dispatch_fn takes the element at the front of the scheduler's queue and dispatches it to the request_queue's dispatch list, where it waits to be serviced.
elevator_add_req_fn adds a new request to the scheduler's queue.
elevator_queue_empty_fn (no longer a member of the elevator_ops shown above) checked whether the scheduler's queue was empty.
elevator_set_req_fn and elevator_put_req_fn are called when a new request is created and when the request's memory is released, respectively.
elevator_init_fn initializes a scheduler instance.
typedef void (request_fn_proc) (struct request_queue *q);
The typedef simplifies the function-pointer declarations:
request_fn_proc *request_fn is equivalent to void (*request_fn)(struct request_queue *q).
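The same pattern is easy to try in user space. A minimal sketch (this struct request_queue is a stand-in, not the kernel's):

#include <stdio.h>

struct request_queue { int id; };

/* the kernel's pattern: typedef the function type itself, not a pointer type */
typedef void (request_fn_proc)(struct request_queue *q);

static void my_request_fn(struct request_queue *q)
{
	printf("processing queue %d\n", q->id);
}

int main(void)
{
	/* request_fn_proc *fn is the same as void (*fn)(struct request_queue *) */
	request_fn_proc *fn = my_request_fn;
	struct request_queue q = { .id = 1 };

	fn(&q);
	return 0;
}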
The SCSI probe path is scsi_add_device -> __scsi_add_device -> scsi_probe_and_add_lun, which allocates the device via scsi_alloc_sdev:
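For reference, a low-level driver kicks off this path roughly as follows. A hedged sketch: my_probe_lun is hypothetical, and the 0/0/0 channel/id/lun values are placeholders:

#include <linux/err.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_host.h>

/* ask the midlayer to probe one LUN on a host we already own */
static int my_probe_lun(struct Scsi_Host *shost)
{
	struct scsi_device *sdev;

	sdev = __scsi_add_device(shost, 0 /* channel */, 0 /* id */,
				 0 /* lun */, NULL /* hostdata */);
	if (IS_ERR(sdev))
		return PTR_ERR(sdev);
	scsi_device_put(sdev);	/* drop the reference the probe returned */
	return 0;
}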
/**
* scsi_alloc_sdev - allocate and setup a scsi_Device
* @starget: which target to allocate a &scsi_device for
* @lun: which lun
* @hostdata: usually NULL and set by ->slave_alloc instead
*
* Description:
* Allocate, initialize for io, and return a pointer to a scsi_Device.
* Stores the @shost, @channel, @id, and @lun in the scsi_Device, and
* adds scsi_Device to the appropriate list.
*
* Return value:
* scsi_Device pointer, or NULL on failure.
**/
static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
unsigned int lun, void *hostdata)
{
struct scsi_device *sdev;
int display_failure_msg = 1, ret;
struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
extern void scsi_evt_thread(struct work_struct *work);
extern void scsi_requeue_run_queue(struct work_struct *work);
sdev = kzalloc(sizeof(*sdev) + shost->transportt->device_size,
GFP_ATOMIC);
if (!sdev)
goto out;
sdev->vendor = scsi_null_device_strs;
sdev->model = scsi_null_device_strs;
sdev->rev = scsi_null_device_strs;
sdev->host = shost;
sdev->queue_ramp_up_period = SCSI_DEFAULT_RAMP_UP_PERIOD;
sdev->id = starget->id;
sdev->lun = lun;
sdev->channel = starget->channel;
sdev->sdev_state = SDEV_CREATED;
INIT_LIST_HEAD(&sdev->siblings);
INIT_LIST_HEAD(&sdev->same_target_siblings);
INIT_LIST_HEAD(&sdev->cmd_list);
INIT_LIST_HEAD(&sdev->starved_entry);
INIT_LIST_HEAD(&sdev->event_list);
spin_lock_init(&sdev->list_lock);
INIT_WORK(&sdev->event_work, scsi_evt_thread);
INIT_WORK(&sdev->requeue_work, scsi_requeue_run_queue);
sdev->sdev_gendev.parent = get_device(&starget->dev);
sdev->sdev_target = starget;
/* usually NULL and set by ->slave_alloc instead */
sdev->hostdata = hostdata;
/* if the device needs this changing, it may do so in the
* slave_configure function */
sdev->max_device_blocked = SCSI_DEFAULT_DEVICE_BLOCKED;
/*
* Some low level driver could use device->type
*/
sdev->type = -1;
/*
* Assume that the device will have handshaking problems,
* and then fix this field later if it turns out it
* doesn't
*/
sdev->borken = 1;
sdev->request_queue = scsi_alloc_queue(sdev);
if (!sdev->request_queue) {
/* release fn is set up in scsi_sysfs_device_initialise, so
* have to free and put manually here */
put_device(&starget->dev);
kfree(sdev);
goto out;
}
WARN_ON_ONCE(!blk_get_queue(sdev->request_queue));
sdev->request_queue->queuedata = sdev;
scsi_adjust_queue_depth(sdev, 0, sdev->host->cmd_per_lun);
scsi_sysfs_device_initialize(sdev);
if (shost->hostt->slave_alloc) {
ret = shost->hostt->slave_alloc(sdev);
if (ret) {
/*
* if LLDD reports slave not present, don't clutter
* console with alloc failure messages
*/
if (ret == -ENXIO)
display_failure_msg = 0;
goto out_device_destroy;
}
}
return sdev;
out_device_destroy:
__scsi_remove_device(sdev);
out:
if (display_failure_msg)
printk(ALLOC_FAILURE_MSG, __func__);
return NULL;
}

scsi_alloc_queue allocates the device's request queue and hooks up the SCSI handlers:

struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
{
struct request_queue *q;
/* scsi_request_fn is SCSI's request-processing function */
q = __scsi_alloc_queue(sdev->host, scsi_request_fn);
if (!q)
return NULL;
blk_queue_prep_rq(q, scsi_prep_fn);
blk_queue_softirq_done(q, scsi_softirq_done);
blk_queue_rq_timed_out(q, scsi_times_out);
blk_queue_lld_busy(q, scsi_lld_busy);
return q;
}

struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
request_fn_proc *request_fn)
{
struct request_queue *q;
struct device *dev = shost->dma_dev;
q = blk_init_queue(request_fn, NULL);
if (!q)
return NULL;
/*
* this limit is imposed by hardware restrictions
*/
blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize,
SCSI_MAX_SG_CHAIN_SEGMENTS));
if (scsi_host_prot_dma(shost)) {
shost->sg_prot_tablesize =
min_not_zero(shost->sg_prot_tablesize,
(unsigned short)SCSI_MAX_PROT_SG_SEGMENTS);
BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize);
}
blk_queue_max_hw_sectors(q, shost->max_sectors);
blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
blk_queue_segment_boundary(q, shost->dma_boundary);
dma_set_seg_boundary(dev, shost->dma_boundary);
blk_queue_max_segment_size(q, dma_get_max_seg_size(dev));
if (!shost->use_clustering)
q->limits.cluster = 0;
/*
* set a reasonable default alignment on word boundaries: the
* host and device may alter it using
* blk_queue_update_dma_alignment() later.
*/
blk_queue_dma_alignment(q, 0x03);
return q;
}

/**
* blk_init_queue - prepare a request queue for use with a block device
* @rfn: The function to be called to process requests that have been
* placed on the queue.
* @lock: Request queue spin lock
*
* Description:
* If a block device wishes to use the standard request handling procedures,
* which sorts requests and coalesces adjacent requests, then it must
* call blk_init_queue(). The function @rfn will be called when there
* are requests on the queue that need to be processed. If the device
* supports plugging, then @rfn may not be called immediately when requests
* are available on the queue, but may be called at some time later instead.
* Plugged queues are generally unplugged when a buffer belonging to one
* of the requests on the queue is needed, or due to memory pressure.
*
* @rfn is not required, or even expected, to remove all requests off the
* queue, but only as many as it can handle at a time. If it does leave
* requests on the queue, it is responsible for arranging that the requests
* get dealt with eventually.
*
* The queue spin lock must be held while manipulating the requests on the
* request queue; this lock will be taken also from interrupt context, so irq
* disabling is needed for it.
*
* Function returns a pointer to the initialized request queue, or %NULL if
* it didn't succeed.
*
* Note:
* blk_init_queue() must be paired with a blk_cleanup_queue() call
* when the block device is deactivated (such as at module unload).
**/
struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
}

blk_init_queue_node creates the request-processing queue:
struct request_queue *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
struct request_queue *uninit_q, *q;
uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
if (!uninit_q)
return NULL;
q = blk_init_allocated_queue(uninit_q, rfn, lock);
if (!q)
blk_cleanup_queue(uninit_q);
return q;
}

struct request_queue *
blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
spinlock_t *lock)
{
if (!q)
return NULL;
q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
if (!q->flush_rq)
return NULL;
if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
goto fail;
/* the request-processing function is set here */
q->request_fn = rfn;
q->prep_rq_fn = NULL;
q->queue_flags |= QUEUE_FLAG_DEFAULT;
/* Override internal queue lock with supplied lock pointer */
if (lock)
q->queue_lock = lock;
/* this looks like the important one: is this where bios get sent down to the device? */
/*
* This also sets hw/phys segments, boundary and size
*/
blk_queue_make_request(q, blk_queue_bio);
q->sg_reserved_size = INT_MAX;
/* Protect q->elevator from elevator_change */
mutex_lock(&q->sysfs_lock);
/* init elevator (initialize the I/O scheduler) */
if (elevator_init(q, NULL)) {
mutex_unlock(&q->sysfs_lock);
goto fail;
}
mutex_unlock(&q->sysfs_lock);
return q;
fail:
kfree(q->flush_rq);
return NULL;
}

elevator_init chooses the scheduler and initializes it:

int elevator_init(struct request_queue *q, char *name)
{
struct elevator_type *e = NULL;
int err;
/*
* q->sysfs_lock must be held to provide mutual exclusion between
* elevator_switch() and here.
*/
lockdep_assert_held(&q->sysfs_lock);
if (unlikely(q->elevator))
return 0;
/* initializing the elevator starts from an empty queue */
INIT_LIST_HEAD(&q->queue_head);
q->last_merge = NULL;
q->end_sector = 0;
q->boundary_rq = NULL;
if (name) {
e = elevator_get(name, true);
if (!e)
return -EINVAL;
}
/*
* Use the default elevator specified by config boot param or
* config option. Don't try to load modules as we could be running
* off async and request_module() isn't allowed from async.
*/
if (!e && *chosen_elevator) {
e = elevator_get(chosen_elevator, false);
if (!e)
printk(KERN_ERR "I/O scheduler %s not found\n",
chosen_elevator);
}
/* use the default scheduler; if it does not exist, fall back to noop */
if (!e) {
e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
if (!e) {
printk(KERN_ERR
"Default I/O scheduler not found. " "Using noop.\n");
e = elevator_get("noop", false);
}
}
/* call the scheduler's init function */
err = e->ops.elevator_init_fn(q, e);
return 0;
}

The noop scheduler's elevator_type wires up these operations:

static struct elevator_type elevator_noop = {
.ops = {
.elevator_merge_req_fn = noop_merged_requests,
.elevator_dispatch_fn = noop_dispatch,
.elevator_add_req_fn = noop_add_request,
.elevator_former_req_fn = noop_former_request,
.elevator_latter_req_fn = noop_latter_request,
.elevator_init_fn = noop_init_queue,
.elevator_exit_fn = noop_exit_queue,
},
.elevator_name = "noop",
.elevator_owner = THIS_MODULE,
};

static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct noop_data *nd;
struct elevator_queue *eq;
/* elevator_alloc assigns elevator_queue->type here */
eq = elevator_alloc(q, e);
if (!eq)
return -ENOMEM;
nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
if (!nd) {
kobject_put(&eq->kobj);
return -ENOMEM;
}
eq->elevator_data = nd;
INIT_LIST_HEAD(&nd->queue);
spin_lock_irq(q->queue_lock);
/* bind the elevator_queue to the request_queue */
q->elevator = eq;
spin_unlock_irq(q->queue_lock);
return 0;
}

struct noop_data {
struct list_head queue;
};

So the noop scheduler keeps a queue of its own, reached via struct request_queue -> struct elevator_queue -> void *elevator_data.
When noop adds a request, it appends it to the tail of this internal queue:
static void noop_add_request(struct request_queue *q, struct request *rq)
{
struct noop_data *nd = q->elevator->elevator_data;
list_add_tail(&rq->queuelist, &nd->queue);
}

Dispatching a request: noop takes one request from its internal queue and hands it to elv_dispatch_sort. Strictly speaking noop does no scheduling of its own; the internal queue is a plain FIFO.
static int noop_dispatch(struct request_queue *q, int force)
{
struct noop_data *nd = q->elevator->elevator_data;
if (!list_empty(&nd->queue)) {
struct request *rq;
rq = list_entry(nd->queue.next, struct request, queuelist);
list_del_init(&rq->queuelist);
elv_dispatch_sort(q, rq);
return 1;
}
return 0;
}
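That is the whole of noop's machinery. For completeness, the scheduler still has to be registered with the block core at module load; a sketch consistent with the kernel's block/noop-iosched.c, referring to the elevator_noop defined above:

#include <linux/elevator.h>
#include <linux/module.h>

static int __init noop_init(void)
{
	return elv_register(&elevator_noop);
}

static void __exit noop_exit(void)
{
	elv_unregister(&elevator_noop);
}

module_init(noop_init);
module_exit(noop_exit);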
Back to noop_dispatch: it takes a request from its internal queue, removes it from the list, and then calls elv_dispatch_sort, which inserts the rq into the request_queue. From creation to destruction, a request follows one of three flows:
set_req_fn ->
i. add_req_fn -> (merged_fn ->)* -> dispatch_fn -> activate_req_fn -> (deactivate_req_fn -> activate_req_fn ->)* -> completed_req_fn
ii. add_req_fn -> (merged_fn ->)* -> merge_req_fn
iii. [none]
-> put_req_fn
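The add/dispatch pair above is a plain FIFO. A self-contained user-space sketch of the same discipline, using minimal stand-ins for the kernel's list_head API (simplified, not the kernel's actual implementation):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

/* insert entry just before head, i.e. at the tail of the list */
static void list_add_tail(struct list_head *entry, struct list_head *head)
{
	entry->prev = head->prev;
	entry->next = head;
	head->prev->next = entry;
	head->prev = entry;
}

/* unlink entry and reinitialize it as an empty list */
static void list_del_init(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	INIT_LIST_HEAD(entry);
}

#define list_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct request { int sector; struct list_head queuelist; };

int main(void)
{
	struct list_head queue;
	struct request a = { .sector = 100 }, b = { .sector = 20 };

	INIT_LIST_HEAD(&queue);

	/* noop_add_request: append at the tail, no sorting */
	list_add_tail(&a.queuelist, &queue);
	list_add_tail(&b.queuelist, &queue);

	/* noop_dispatch: always take the head, i.e. strict FIFO order */
	while (queue.next != &queue) {
		struct request *rq =
			list_entry(queue.next, struct request, queuelist);
		list_del_init(&rq->queuelist);
		printf("dispatch sector %d\n", rq->sector);
	}
	return 0;
}

Run it and the requests come out in arrival order (sector 100, then 20) regardless of sector, which is exactly why noop suits devices where seek order does not matter.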
/*
* Function: scsi_request_fn()
*
* Purpose: Main strategy routine for SCSI.
*
* Arguments: q - Pointer to actual queue.
*
* Returns: Nothing
*
* Lock status: IO request lock assumed to be held when called.
*/
static void scsi_request_fn(struct request_queue *q)
{
struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost;
struct scsi_cmnd *cmd;
struct request *req;
if (!get_device(&sdev->sdev_gendev))
/* We must be tearing the block queue down already */
return;
/*
* To start with, we keep looping until the queue is empty, or until
* the host is no longer able to accept any more requests.
*/
shost = sdev->host;
for (;;) {
int rtn;
/*
* get next queueable request. We do this early to make sure
* that the request is fully prepared even if we cannot
* accept it.
*/
req = blk_peek_request(q);
if (!req || !scsi_dev_queue_ready(q, sdev))
break;
if (unlikely(!scsi_device_online(sdev))) {
sdev_printk(KERN_ERR, sdev,
"rejecting I/O to offline device\n");
scsi_kill_request(req, q);
continue;
}
/*
* Remove the request from the request list.
*/
if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
blk_start_request(req);
sdev->device_busy++;
spin_unlock(q->queue_lock);
cmd = req->special;
if (unlikely(cmd == NULL)) {
printk(KERN_CRIT "impossible request in %s.\n"
"please mail a stack trace to "
"linux-scsi@vger.kernel.org\n",
__func__);
blk_dump_rq_flags(req, "foo");
BUG();
}
spin_lock(shost->host_lock);
/*
* We hit this when the driver is using a host wide
* tag map. For device level tag maps the queue_depth check
* in the device ready fn would prevent us from trying
* to allocate a tag. Since the map is a shared host resource
* we add the dev to the starved list so it eventually gets
* a run when a tag is freed.
*/
if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
if (list_empty(&sdev->starved_entry))
list_add_tail(&sdev->starved_entry,
&shost->starved_list);
goto not_ready;
}
if (!scsi_target_queue_ready(shost, sdev))
goto not_ready;
if (!scsi_host_queue_ready(q, shost, sdev))
goto not_ready;
scsi_target(sdev)->target_busy++;
shost->host_busy++;
/*
* XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
* take the lock again.
*/
spin_unlock_irq(shost->host_lock);
/*
* Finally, initialize any error handling parameters, and set up
* the timers for timeouts.
*/
scsi_init_cmd_errh(cmd);
/*
* Dispatch the command to the low-level driver.
*/
rtn = scsi_dispatch_cmd(cmd);
spin_lock_irq(q->queue_lock);
if (rtn)
goto out_delay;
}
goto out;
not_ready:
spin_unlock_irq(shost->host_lock);
/*
* lock q, handle tag, requeue req, and decrement device_busy. We
* must return with queue_lock held.
*
* Decrementing device_busy without checking it is OK, as all such
* cases (host limits or settings) should run the queue at some
* later time.
*/
spin_lock_irq(q->queue_lock);
blk_requeue_request(q, req);
sdev->device_busy--;
out_delay:
if (sdev->device_busy == 0)
blk_delay_queue(q, SCSI_QUEUE_DELAY);
out:
/* must be careful here...if we trigger the ->remove() function
* we cannot be holding the q lock */
spin_unlock_irq(q->queue_lock);
put_device(&sdev->sdev_gendev);
spin_lock_irq(q->queue_lock);
}
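Closing the loop on the question raised earlier at blk_queue_make_request: requests enter this queue as bios, via submit_bio -> generic_make_request -> blk_queue_bio, and scsi_request_fn consumes them at the other end. A hedged sketch of a minimal submitter for a 3.14-era kernel; my_submit_one_page, my_end_io, bdev and page are all hypothetical:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* completion callback: just drop our reference to the bio */
static void my_end_io(struct bio *bio, int err)
{
	bio_put(bio);
}

/* read the first page of a device we already hold a reference to */
static int my_submit_one_page(struct block_device *bdev, struct page *page)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	if (!bio)
		return -ENOMEM;
	bio->bi_bdev = bdev;
	bio->bi_iter.bi_sector = 0;		/* start of the device */
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = my_end_io;
	submit_bio(READ, bio);	/* -> generic_make_request -> blk_queue_bio */
	return 0;
}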
Original post: http://9899672.blog.51cto.com/9889672/1611580