Tags: linux block bio io scheduler
A while ago I read through the Linux block layer code, mainly the implementation of NOOP, the simplest of Linux's I/O schedulers. Below I summarize how the block layer's work flow operates, and also fold in the SCSI device probing path.
Every block device has a request_queue, and each queue can choose a scheduler (elevator) to schedule its requests. Linux implements several schedulers, of which NOOP is the simplest. Let's look at how noop works.
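Before diving into the kernel code, note that the scheduler attached to a queue is visible from userspace through sysfs. A minimal sketch, assuming the machine has a disk named sda (the device name is an assumption; the sysfs path is standard):

#include <stdio.h>

/* Print the I/O schedulers available for sda; the active one is
 * shown in square brackets, e.g. "noop deadline [cfq]". */
int main(void)
{
	char buf[256];
	FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s", buf);
	fclose(f);
	return 0;
}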
First, the elevator data structures. Three structs are involved: struct elevator_ops, struct elevator_queue, and struct elevator_type.
/*
 * identifies an elevator type, such as AS or deadline
 */
struct elevator_type {
	/* managed by elevator core */
	struct kmem_cache *icq_cache;

	/* fields provided by elevator implementation */
	struct elevator_ops ops;
	size_t icq_size;	/* see iocontext.h */
	size_t icq_align;	/* ditto */
	struct elv_fs_entry *elevator_attrs;
	char elevator_name[ELV_NAME_MAX];
	struct module *elevator_owner;

	/* managed by elevator core */
	char icq_cache_name[ELV_NAME_MAX + 5];	/* elvname + "_io_cq" */
	struct list_head list;
};

/*
 * each queue has an elevator_queue associated with it
 */
struct elevator_queue {
	struct elevator_type *type;
	void *elevator_data;
	struct kobject kobj;
	struct mutex sysfs_lock;
	unsigned int registered:1;
	DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
};
elevator_ops is the key structure: it defines the operations a scheduler implements.
struct elevator_ops {
	elevator_merge_fn *elevator_merge_fn;
	elevator_merged_fn *elevator_merged_fn;
	elevator_merge_req_fn *elevator_merge_req_fn;
	elevator_allow_merge_fn *elevator_allow_merge_fn;
	elevator_bio_merged_fn *elevator_bio_merged_fn;

	elevator_dispatch_fn *elevator_dispatch_fn;
	elevator_add_req_fn *elevator_add_req_fn;
	elevator_activate_req_fn *elevator_activate_req_fn;
	elevator_deactivate_req_fn *elevator_deactivate_req_fn;

	elevator_completed_req_fn *elevator_completed_req_fn;

	elevator_request_list_fn *elevator_former_req_fn;
	elevator_request_list_fn *elevator_latter_req_fn;

	elevator_init_icq_fn *elevator_init_icq_fn;	/* see iocontext.h */
	elevator_exit_icq_fn *elevator_exit_icq_fn;	/* ditto */

	elevator_set_req_fn *elevator_set_req_fn;
	elevator_put_req_fn *elevator_put_req_fn;

	elevator_may_queue_fn *elevator_may_queue_fn;

	elevator_init_fn *elevator_init_fn;
	elevator_exit_fn *elevator_exit_fn;
};
elevator_merge_fn looks up a request into which a given bio can be merged.
elevator_merge_req_fn deletes the request left redundant after two requests have been merged.
elevator_dispatch_fn takes the element at the front of the scheduler's queue and dispatches it to the request list of the request_queue, where it waits to be serviced.
elevator_add_req_fn adds a new request to the scheduler's queue.
elevator_queue_empty_fn checks whether the scheduler's queue is empty (note that this hook no longer appears in the elevator_ops definition quoted above).
elevator_set_req_fn and elevator_put_req_fn are called when a new request is created and when a request's memory is released, respectively.
elevator_init_fn initializes a scheduler instance. (A runnable sketch of how the elevator core reaches these hooks follows below.)
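To make the indirection concrete, here is a small userspace mock, not kernel code: the struct names mirror request_queue, elevator_queue, elevator_type and elevator_ops from above, and the two hook implementations are stand-ins.

#include <stdio.h>

/* Userspace mock of the kernel's indirection chain:
 * request_queue -> elevator_queue -> elevator_type -> elevator_ops. */
struct request_queue;

typedef void (elevator_add_req_fn)(struct request_queue *q);
typedef int (elevator_dispatch_fn)(struct request_queue *q, int force);

struct elevator_ops {
	elevator_add_req_fn *elevator_add_req_fn;
	elevator_dispatch_fn *elevator_dispatch_fn;
};

struct elevator_type {
	struct elevator_ops ops;
	const char *elevator_name;
};

struct elevator_queue {
	struct elevator_type *type;
};

struct request_queue {
	struct elevator_queue *elevator;
};

static void mock_add_req(struct request_queue *q)
{
	printf("add_req hook called\n");
}

static int mock_dispatch(struct request_queue *q, int force)
{
	printf("dispatch hook called (force=%d)\n", force);
	return 0;
}

static struct elevator_type mock_elevator = {
	.ops = {
		.elevator_add_req_fn = mock_add_req,
		.elevator_dispatch_fn = mock_dispatch,
	},
	.elevator_name = "mock",
};

int main(void)
{
	struct elevator_queue eq = { .type = &mock_elevator };
	struct request_queue q = { .elevator = &eq };

	/* this is how the elevator core reaches a scheduler's hooks */
	q.elevator->type->ops.elevator_add_req_fn(&q);
	q.elevator->type->ops.elevator_dispatch_fn(&q, 0);
	return 0;
}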
typedef void (request_fn_proc) (struct request_queue *q);
The kernel uses this typedef style to simplify function-pointer declarations: request_fn_proc *request_fn is equivalent to void (*request_fn)(struct request_queue *q).
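As a standalone illustration of this typedef style (plain userspace C, nothing kernel-specific):

#include <stdio.h>

struct request_queue { int id; };

/* Same style as the kernel: name the function type once... */
typedef void (request_fn_proc)(struct request_queue *q);

static void my_request_fn(struct request_queue *q)
{
	printf("handling queue %d\n", q->id);
}

int main(void)
{
	/* ...then declare pointers as "request_fn_proc *fn" instead of
	 * spelling out "void (*fn)(struct request_queue *)" each time. */
	request_fn_proc *request_fn = my_request_fn;
	struct request_queue q = { .id = 0 };

	request_fn(&q);
	return 0;
}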
Now the SCSI side. Device probing follows the call chain scsi_add_device -> __scsi_add_device -> scsi_probe_and_add_lun, which allocates the scsi_device with scsi_alloc_sdev:

/**
 * scsi_alloc_sdev - allocate and setup a scsi_Device
 * @starget: which target to allocate a &scsi_device for
 * @lun: which lun
 * @hostdata: usually NULL and set by ->slave_alloc instead
 *
 * Description:
 *     Allocate, initialize for io, and return a pointer to a scsi_Device.
 *     Stores the @shost, @channel, @id, and @lun in the scsi_Device, and
 *     adds scsi_Device to the appropriate list.
 *
 * Return value:
 *     scsi_Device pointer, or NULL on failure.
 **/
static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
					   unsigned int lun, void *hostdata)
{
	struct scsi_device *sdev;
	int display_failure_msg = 1, ret;
	struct Scsi_Host *shost = dev_to_shost(starget->dev.parent);
	extern void scsi_evt_thread(struct work_struct *work);
	extern void scsi_requeue_run_queue(struct work_struct *work);

	sdev = kzalloc(sizeof(*sdev) + shost->transportt->device_size,
		       GFP_ATOMIC);
	if (!sdev)
		goto out;

	sdev->vendor = scsi_null_device_strs;
	sdev->model = scsi_null_device_strs;
	sdev->rev = scsi_null_device_strs;
	sdev->host = shost;
	sdev->queue_ramp_up_period = SCSI_DEFAULT_RAMP_UP_PERIOD;
	sdev->id = starget->id;
	sdev->lun = lun;
	sdev->channel = starget->channel;
	sdev->sdev_state = SDEV_CREATED;
	INIT_LIST_HEAD(&sdev->siblings);
	INIT_LIST_HEAD(&sdev->same_target_siblings);
	INIT_LIST_HEAD(&sdev->cmd_list);
	INIT_LIST_HEAD(&sdev->starved_entry);
	INIT_LIST_HEAD(&sdev->event_list);
	spin_lock_init(&sdev->list_lock);
	INIT_WORK(&sdev->event_work, scsi_evt_thread);
	INIT_WORK(&sdev->requeue_work, scsi_requeue_run_queue);

	sdev->sdev_gendev.parent = get_device(&starget->dev);
	sdev->sdev_target = starget;

	/* usually NULL and set by ->slave_alloc instead */
	sdev->hostdata = hostdata;

	/* if the device needs this changing, it may do so in the
	 * slave_configure function */
	sdev->max_device_blocked = SCSI_DEFAULT_DEVICE_BLOCKED;

	/*
	 * Some low level driver could use device->type
	 */
	sdev->type = -1;

	/*
	 * Assume that the device will have handshaking problems,
	 * and then fix this field later if it turns out it
	 * doesn't
	 */
	sdev->borken = 1;

	sdev->request_queue = scsi_alloc_queue(sdev);
	if (!sdev->request_queue) {
		/* release fn is set up in scsi_sysfs_device_initialise, so
		 * have to free and put manually here */
		put_device(&starget->dev);
		kfree(sdev);
		goto out;
	}
	WARN_ON_ONCE(!blk_get_queue(sdev->request_queue));
	sdev->request_queue->queuedata = sdev;
	scsi_adjust_queue_depth(sdev, 0, sdev->host->cmd_per_lun);

	scsi_sysfs_device_initialize(sdev);

	if (shost->hostt->slave_alloc) {
		ret = shost->hostt->slave_alloc(sdev);
		if (ret) {
			/*
			 * if LLDD reports slave not present, don't clutter
			 * console with alloc failure messages
			 */
			if (ret == -ENXIO)
				display_failure_msg = 0;
			goto out_device_destroy;
		}
	}

	return sdev;

out_device_destroy:
	__scsi_remove_device(sdev);
out:
	if (display_failure_msg)
		printk(ALLOC_FAILURE_MSG, __func__);
	return NULL;
}
scsi_alloc_sdev in turn calls scsi_alloc_queue, which plugs in SCSI's request handling function:

struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
{
	struct request_queue *q;

	/* scsi_request_fn is SCSI's request handling function */
	q = __scsi_alloc_queue(sdev->host, scsi_request_fn);
	if (!q)
		return NULL;

	blk_queue_prep_rq(q, scsi_prep_fn);
	blk_queue_softirq_done(q, scsi_softirq_done);
	blk_queue_rq_timed_out(q, scsi_times_out);
	blk_queue_lld_busy(q, scsi_lld_busy);
	return q;
}
struct request_queue *__scsi_alloc_queue(struct Scsi_Host *shost,
					 request_fn_proc *request_fn)
{
	struct request_queue *q;
	struct device *dev = shost->dma_dev;

	q = blk_init_queue(request_fn, NULL);
	if (!q)
		return NULL;

	/*
	 * this limit is imposed by hardware restrictions
	 */
	blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize,
					SCSI_MAX_SG_CHAIN_SEGMENTS));

	if (scsi_host_prot_dma(shost)) {
		shost->sg_prot_tablesize =
			min_not_zero(shost->sg_prot_tablesize,
				     (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS);
		BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize);
		blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize);
	}

	blk_queue_max_hw_sectors(q, shost->max_sectors);
	blk_queue_bounce_limit(q, scsi_calculate_bounce_limit(shost));
	blk_queue_segment_boundary(q, shost->dma_boundary);
	dma_set_seg_boundary(dev, shost->dma_boundary);

	blk_queue_max_segment_size(q, dma_get_max_seg_size(dev));

	if (!shost->use_clustering)
		q->limits.cluster = 0;

	/*
	 * set a reasonable default alignment on word boundaries: the
	 * host and device may alter it using
	 * blk_queue_update_dma_alignment() later.
	 */
	blk_queue_dma_alignment(q, 0x03);

	return q;
}
/**
 * blk_init_queue - prepare a request queue for use with a block device
 * @rfn: The function to be called to process requests that have been
 *       placed on the queue.
 * @lock: Request queue spin lock
 *
 * Description:
 *    If a block device wishes to use the standard request handling procedures,
 *    which sorts requests and coalesces adjacent requests, then it must
 *    call blk_init_queue().  The function @rfn will be called when there
 *    are requests on the queue that need to be processed.  If the device
 *    supports plugging, then @rfn may not be called immediately when requests
 *    are available on the queue, but may be called at some time later instead.
 *    Plugged queues are generally unplugged when a buffer belonging to one
 *    of the requests on the queue is needed, or due to memory pressure.
 *
 *    @rfn is not required, or even expected, to remove all requests off the
 *    queue, but only as many as it can handle at a time.  If it does leave
 *    requests on the queue, it is responsible for arranging that the requests
 *    get dealt with eventually.
 *
 *    The queue spin lock must be held while manipulating the requests on the
 *    request queue; this lock will be taken also from interrupt context, so irq
 *    disabling is needed for it.
 *
 *    Function returns a pointer to the initialized request queue, or %NULL if
 *    it didn't succeed.
 *
 * Note:
 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
 *    when the block device is deactivated (such as at module unload).
 **/
struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
	return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
}
blk_init_queue_node creates the request handling queue:

struct request_queue *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
	struct request_queue *uninit_q, *q;

	uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
	if (!uninit_q)
		return NULL;

	q = blk_init_allocated_queue(uninit_q, rfn, lock);
	if (!q)
		blk_cleanup_queue(uninit_q);

	return q;
}
struct request_queue *
blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
			 spinlock_t *lock)
{
	if (!q)
		return NULL;

	q->flush_rq = kzalloc(sizeof(struct request), GFP_KERNEL);
	if (!q->flush_rq)
		return NULL;

	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
		goto fail;

	/* the request handling function is installed here */
	q->request_fn = rfn;
	q->prep_rq_fn = NULL;
	q->queue_flags |= QUEUE_FLAG_DEFAULT;

	/* Override internal queue lock with supplied lock pointer */
	if (lock)
		q->queue_lock = lock;

	/*
	 * This also sets hw/phys segments, boundary and size.
	 * blk_queue_bio becomes the queue's make_request function: it is
	 * the entry point through which bios are submitted to this queue.
	 */
	blk_queue_make_request(q, blk_queue_bio);

	q->sg_reserved_size = INT_MAX;

	/* Protect q->elevator from elevator_change */
	mutex_lock(&q->sysfs_lock);

	/* init elevator (the I/O scheduler) */
	if (elevator_init(q, NULL)) {
		mutex_unlock(&q->sysfs_lock);
		goto fail;
	}

	mutex_unlock(&q->sysfs_lock);

	return q;

fail:
	kfree(q->flush_rq);
	return NULL;
}
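For context, the registered make_request function is what generic_make_request() ends up calling when a bio is submitted. A simplified sketch of that path, not the actual block/blk-core.c code:

/* Simplified sketch (NOT the real generic_make_request): every bio
 * submitted to a block device is routed to the queue's make_request
 * function, which blk_init_allocated_queue() above set to
 * blk_queue_bio(). */
static void sketch_generic_make_request(struct bio *bio)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);

	q->make_request_fn(q, bio);	/* i.e. blk_queue_bio(q, bio) */
}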
Back to the queue setup: elevator_init() chooses and initializes the I/O scheduler.

int elevator_init(struct request_queue *q, char *name)
{
	struct elevator_type *e = NULL;
	int err;

	/*
	 * q->sysfs_lock must be held to provide mutual exclusion between
	 * elevator_switch() and here.
	 */
	lockdep_assert_held(&q->sysfs_lock);

	if (unlikely(q->elevator))
		return 0;

	/* the dispatch queue starts out empty */
	INIT_LIST_HEAD(&q->queue_head);
	q->last_merge = NULL;
	q->end_sector = 0;
	q->boundary_rq = NULL;

	if (name) {
		e = elevator_get(name, true);
		if (!e)
			return -EINVAL;
	}

	/*
	 * Use the default elevator specified by config boot param or
	 * config option. Don't try to load modules as we could be running
	 * off async and request_module() isn't allowed from async.
	 */
	if (!e && *chosen_elevator) {
		e = elevator_get(chosen_elevator, false);
		if (!e)
			printk(KERN_ERR "I/O scheduler %s not found\n",
			       chosen_elevator);
	}

	/* fall back to the configured default scheduler; if it does not
	 * exist, fall back to noop */
	if (!e) {
		e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
		if (!e) {
			printk(KERN_ERR
				"Default I/O scheduler not found. "
				"Using noop.\n");
			e = elevator_get("noop", false);
		}
	}

	/* call the chosen scheduler's own init function */
	err = e->ops.elevator_init_fn(q, e);
	return 0;
}
static struct elevator_type elevator_noop = {
	.ops = {
		.elevator_merge_req_fn	= noop_merged_requests,
		.elevator_dispatch_fn	= noop_dispatch,
		.elevator_add_req_fn	= noop_add_request,
		.elevator_former_req_fn	= noop_former_request,
		.elevator_latter_req_fn	= noop_latter_request,
		.elevator_init_fn	= noop_init_queue,
		.elevator_exit_fn	= noop_exit_queue,
	},
	.elevator_name = "noop",
	.elevator_owner = THIS_MODULE,
};
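For completeness, here is the module boilerplate that registers this elevator_type with the elevator core; this is reproduced from block/noop-iosched.c from memory, so check your kernel tree for the exact version:

static int __init noop_init(void)
{
	/* make the "noop" elevator_type known to the elevator core */
	return elv_register(&elevator_noop);
}

static void __exit noop_exit(void)
{
	elv_unregister(&elevator_noop);
}

module_init(noop_init);
module_exit(noop_exit);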
noop_init_queue allocates noop's private data and attaches it to the queue:

static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct noop_data *nd;
	struct elevator_queue *eq;

	/* elevator_alloc() is where elevator_queue->type gets assigned */
	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
	if (!nd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = nd;

	INIT_LIST_HEAD(&nd->queue);

	spin_lock_irq(q->queue_lock);
	/* bind the elevator_queue to the request_queue */
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);
	return 0;
}
struct noop_data {
	struct list_head queue;
};
So the noop scheduler does keep a queue of its own, reachable via struct request_queue -> struct elevator_queue -> void *elevator_data.
When noop adds a request to its internal queue, it simply appends it to the tail:

static void noop_add_request(struct request_queue *q, struct request *rq)
{
	struct noop_data *nd = q->elevator->elevator_data;

	list_add_tail(&rq->queuelist, &nd->queue);
}
noop_dispatch sends a request downstream: it takes the first request from noop's internal queue, unlinks it from the list, and passes it to elv_dispatch_sort(), which inserts it into the request_queue's dispatch list. noop performs no reordering of its own; its internal queue is a plain FIFO.

static int noop_dispatch(struct request_queue *q, int force)
{
	struct noop_data *nd = q->elevator->elevator_data;

	if (!list_empty(&nd->queue)) {
		struct request *rq;

		rq = list_entry(nd->queue.next, struct request, queuelist);
		list_del_init(&rq->queuelist);
		elv_dispatch_sort(q, rq);
		return 1;
	}
	return 0;
}
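elv_dispatch_sort() itself lives in the elevator core, not in noop: it inserts the request into q->queue_head roughly sorted by start sector. Below is a heavily simplified rendering of its core loop; the real block/elevator.c version also handles barrier/started flags, discard requests, and the q->end_sector boundary:

/* Simplified sketch (NOT the real elv_dispatch_sort): walk the
 * dispatch list backwards and insert rq after the first request
 * whose start sector is <= rq's, keeping the list roughly sorted
 * so the disk services requests in ascending sector order. */
static void sketch_dispatch_sort(struct request_queue *q, struct request *rq)
{
	struct list_head *entry;

	list_for_each_prev(entry, &q->queue_head) {
		struct request *pos = list_entry(entry, struct request,
						 queuelist);

		if (blk_rq_pos(rq) >= blk_rq_pos(pos))
			break;
	}
	list_add(&rq->queuelist, entry);
}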
From creation to destruction, a request follows one of the following three flows:
set_req_fn ->
i. add_req_fn -> (merged_fn ->)* -> dispatch_fn -> activate_req_fn -> (deactivate_req_fn -> activate_req_fn ->)* -> completed_req_fn
ii. add_req_fn -> (merged_fn ->)* -> merge_req_fn
iii. [none]
-> put_req_fn
/*
 * Function:    scsi_request_fn()
 *
 * Purpose:     Main strategy routine for SCSI.
 *
 * Arguments:   q       - Pointer to actual queue.
 *
 * Returns:     Nothing
 *
 * Lock status: IO request lock assumed to be held when called.
 */
static void scsi_request_fn(struct request_queue *q)
{
	struct scsi_device *sdev = q->queuedata;
	struct Scsi_Host *shost;
	struct scsi_cmnd *cmd;
	struct request *req;

	if (!get_device(&sdev->sdev_gendev))
		/* We must be tearing the block queue down already */
		return;

	/*
	 * To start with, we keep looping until the queue is empty, or until
	 * the host is no longer able to accept any more requests.
	 */
	shost = sdev->host;
	for (;;) {
		int rtn;
		/*
		 * get next queueable request.  We do this early to make sure
		 * that the request is fully prepared even if we cannot
		 * accept it.
		 */
		req = blk_peek_request(q);
		if (!req || !scsi_dev_queue_ready(q, sdev))
			break;

		if (unlikely(!scsi_device_online(sdev))) {
			sdev_printk(KERN_ERR, sdev,
				    "rejecting I/O to offline device\n");
			scsi_kill_request(req, q);
			continue;
		}

		/*
		 * Remove the request from the request list.
		 */
		if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
			blk_start_request(req);
		sdev->device_busy++;

		spin_unlock(q->queue_lock);
		cmd = req->special;
		if (unlikely(cmd == NULL)) {
			printk(KERN_CRIT "impossible request in %s.\n"
					 "please mail a stack trace to "
					 "linux-scsi@vger.kernel.org\n",
					 __func__);
			blk_dump_rq_flags(req, "foo");
			BUG();
		}
		spin_lock(shost->host_lock);

		/*
		 * We hit this when the driver is using a host wide
		 * tag map. For device level tag maps the queue_depth check
		 * in the device ready fn would prevent us from trying
		 * to allocate a tag. Since the map is a shared host resource
		 * we add the dev to the starved list so it eventually gets
		 * a run when a tag is freed.
		 */
		if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
			if (list_empty(&sdev->starved_entry))
				list_add_tail(&sdev->starved_entry,
					      &shost->starved_list);
			goto not_ready;
		}

		if (!scsi_target_queue_ready(shost, sdev))
			goto not_ready;

		if (!scsi_host_queue_ready(q, shost, sdev))
			goto not_ready;

		scsi_target(sdev)->target_busy++;
		shost->host_busy++;

		/*
		 * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
		 *		take the lock again.
		 */
		spin_unlock_irq(shost->host_lock);

		/*
		 * Finally, initialize any error handling parameters, and set up
		 * the timers for timeouts.
		 */
		scsi_init_cmd_errh(cmd);

		/*
		 * Dispatch the command to the low-level driver.
		 */
		rtn = scsi_dispatch_cmd(cmd);
		spin_lock_irq(q->queue_lock);
		if (rtn)
			goto out_delay;
	}

	goto out;

 not_ready:
	spin_unlock_irq(shost->host_lock);

	/*
	 * lock q, handle tag, requeue req, and decrement device_busy. We
	 * must return with queue_lock held.
	 *
	 * Decrementing device_busy without checking it is OK, as all such
	 * cases (host limits or settings) should run the queue at some
	 * later time.
	 */
	spin_lock_irq(q->queue_lock);
	blk_requeue_request(q, req);
	sdev->device_busy--;
out_delay:
	if (sdev->device_busy == 0)
		blk_delay_queue(q, SCSI_QUEUE_DELAY);
out:
	/* must be careful here...if we trigger the ->remove() function
	 * we cannot be holding the q lock */
	spin_unlock_irq(q->queue_lock);
	put_device(&sdev->sdev_gendev);
	spin_lock_irq(q->queue_lock);
}
Original post: http://9899672.blog.51cto.com/9889672/1611580