/*
* For highmem pages, we can't trust "virtual" until
* after we have the lock.
*
* We cannot call this from interrupts, as it may block
*/
spin_lock(&kmap_lock); //protects the page tables from concurrent access on multiprocessor systems
vaddr = (unsigned long)page_address(page);
if (!vaddr)
vaddr = map_new_virtual(page);
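/*
 * pkmap_count[] semantics: 0 means the entry is unused; 1 means it is
 * unused but its TLB entry may still be stale; n >= 2 means the page
 * is mapped with n-1 active users. After the increment below the
 * counter must therefore be at least 2.
 */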
pkmap_count[PKMAP_NR(vaddr)]++;
if (pkmap_count[PKMAP_NR(vaddr)] < 2)
BUG();
spin_unlock(&kmap_lock);
return (void*) vaddr;
}
The kunmap() function undoes a permanent kernel mapping previously established by kmap():
void kunmap(struct page *page)
{
if (in_interrupt())
BUG();
if (!PageHighMem(page))
return;
kunmap_high(page);
}
If the page really does lie in high memory, the kunmap_high() function is called:
void fastcall kunmap_high(struct page *page)
{
unsigned long vaddr;
unsigned long nr;
int need_wakeup;
spin_lock(&kmap_lock);
vaddr = (unsigned long)page_address(page);
if (!vaddr)
BUG();
nr = PKMAP_NR(vaddr);
/*
* A count must never go down to zero
* without a TLB flush!
*/
need_wakeup = 0;
switch (--pkmap_count[nr]) {
case 0:
BUG();
case 1:
/*
* Avoid an unnecessary wake_up() function call.
* The common case is pkmap_count[] == 1, but
* no waiters.
* The tasks queued in the wait-queue are guarded
* by both the lock in the wait-queue-head and by
* the kmap_lock. As the kmap_lock is held here,
* no need for the wait-queue-head's lock. Simply
* test if the queue is empty.
*/
need_wakeup = waitqueue_active(&pkmap_map_wait);
}
spin_unlock(&kmap_lock);
/* do wake-up, if needed, race-free outside of the spin lock */
if (need_wakeup)
wake_up(&pkmap_map_wait);
}
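As a usage illustration (a minimal sketch, not kernel source; the helper name is hypothetical), process-context code pairs the two calls like this, remembering that kmap() may block and must never be used from interrupt context:
static void copy_from_high_page(struct page *page, void *dst, size_t len)
{
	void *src = kmap(page);	/* may sleep until a window becomes free */
	memcpy(dst, src, len);
	kunmap(page);	/* drop the reference; the window is recycled once pkmap_count falls back to 1 */
}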
Temporary kernel mappings
Temporary kernel mappings are simpler to implement than permanent kernel mappings. Moreover, they can be used inside interrupt handlers and deferrable functions, because they never block the current process.
Any page frame in high memory can be mapped into the kernel address space through a "window" (a page table entry reserved for this purpose). The number of windows left for temporary kernel mappings is quite small.
Each CPU has its own set of 13 windows, represented by the enum km_type data structure (the 80x86 definition is sketched below).
The kernel must ensure that the same window is never used by two different kernel control paths at the same time.
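For reference, the window indexes on the 80x86 architecture are defined in include/asm-i386/kmap_types.h roughly as follows in the 2.6 series (the exact list may differ between versions):
enum km_type {
	KM_BOUNCE_READ,
	KM_SKB_SUNRPC_DATA,
	KM_SKB_DATA_SOFTIRQ,
	KM_USER0,
	KM_USER1,
	KM_BIO_SRC_IRQ,
	KM_BIO_DST_IRQ,
	KM_PTE0,
	KM_PTE1,
	KM_IRQ0,
	KM_IRQ1,
	KM_SOFTIRQ0,
	KM_SOFTIRQ1,
	KM_TYPE_NR	/* 13 windows per CPU */
};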
To establish a temporary kernel mapping, the kernel invokes the kmap_atomic() function:
void *kmap_atomic(struct page *page, enum km_type type)
{
enum fixed_addresses idx;
unsigned long vaddr;
/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
inc_preempt_count();
if (!PageHighMem(page))
return page_address(page);
idx = type + KM_TYPE_NR*smp_processor_id();
vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
#ifdef CONFIG_DEBUG_HIGHMEM
if (!pte_none(*(kmap_pte-idx)))
BUG();
#endif
set_pte(kmap_pte-idx, mk_pte(page, kmap_prot));
__flush_tlb_one(vaddr);
return (void*) vaddr;
}
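The mapping is released with the counterpart kunmap_atomic(), which in 2.6 takes the virtual address and the same km_type index. A typical pairing, safe even inside an interrupt handler (a sketch; the function name is made up):
static void clear_high_page_in_irq(struct page *page)
{
	char *addr = kmap_atomic(page, KM_IRQ0);	/* never blocks */
	memset(addr, 0, PAGE_SIZE);
	kunmap_atomic(addr, KM_IRQ0);	/* frees the window and re-enables preemption */
}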
The zone allocator is the frontend of the kernel page frame allocator. This component must locate a memory zone that contains enough free page frames to satisfy the request. The zone allocator must satisfy several goals:
It should protect the pool of reserved page frames.
It should trigger the page frame reclaiming algorithm when memory is scarce and blocking the current process is allowed; once some page frames have been freed, the zone allocator tries the allocation again.
It should preserve the small, precious ZONE_DMA memory zone, if possible.
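Before walking through __alloc_pages() itself, here is how it is usually reached: callers use the alloc_pages() and __get_free_pages() wrappers, both of which funnel into __alloc_pages() (a minimal sketch of typical calls):
/* an order-2 (four page frame) allocation; GFP_KERNEL may block and trigger reclaim */
struct page *pages = alloc_pages(GFP_KERNEL, 2);
if (pages)
	__free_pages(pages, 2);
/* GFP_ATOMIC must not sleep, so only the watermark-based fast paths below apply */
unsigned long addr = __get_free_pages(GFP_ATOMIC, 0);
if (addr)
	free_pages(addr, 0);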
struct page * fastcall
__alloc_pages(unsigned int gfp_mask, unsigned int order,
struct zonelist *zonelist) //the core memory allocation routine
{
const int wait = gfp_mask & __GFP_WAIT;
struct zone **zones, *z;
struct page *page;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
int i;
int classzone_idx;
int do_retry;
int can_try_harder;
int did_some_progress;
might_sleep_if(wait);
/*
* The caller may dip into page reserves a bit more if the caller
* cannot run direct reclaim, or if the caller has realtime scheduling
* policy
*/
can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
if (unlikely(zones[0] == NULL)) {
/* Should this ever happen?? */
return NULL;
}
classzone_idx = zone_idx(zones[0]);
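/*
 * zone_watermark_ok() (mm/page_alloc.c) decides whether zone z can
 * satisfy a 2^order allocation. Paraphrased sketch of its 2.6 logic,
 * not verbatim source:
 *
 *   min = mark;                        // pages_low, pages_min or pages_high
 *   if (gfp_high)       min -= min/2;  // __GFP_HIGH may dip into reserves
 *   if (can_try_harder) min -= min/4;  // rt tasks and !wait try harder
 *   fail unless z->free_pages - 2^order + 1 > min + the lowmem reserve;
 *   then, for each order o < order, subtract the pages tied up in
 *   blocks of size 2^o, halve min, and fail if free pages <= min.
 *
 * The per-zone watermarks satisfy pages_min < pages_low < pages_high.
 */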
restart:
/* Go through the zonelist once, looking for a zone with enough free */
//first scan of the memory zones, looking for one whose free pages exceed the z->pages_low watermark
for (i = 0; (z = zones[i]) != NULL; i++) {
if (!zone_watermark_ok(z, order, z->pages_low,
classzone_idx, 0, 0))
continue;
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
goto got_pg;
}
//if the function has not returned by this point, little free memory is left: wake up the kswapd kernel threads to start reclaiming page frames asynchronously
for (i = 0; (z = zones[i]) != NULL; i++)
wakeup_kswapd(z, order);
/*
* Go through the zonelist again. Let __GFP_HIGH and allocations
* coming from realtime tasks to go deeper into reserves
*/
//second scan of the zones, this time passing z->pages_min as the watermark base
for (i = 0; (z = zones[i]) != NULL; i++) {
if (!zone_watermark_ok(z, order, z->pages_min,
classzone_idx, can_try_harder,
gfp_mask & __GFP_HIGH))
continue;
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
goto got_pg;
}
/* This allocation should allow future memory freeing. */
if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) {
/* go through the zonelist yet again, ignoring mins */
for (i = 0; (z = zones[i]) != NULL; i++) {
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
goto got_pg;
}
goto nopage;
}
/* Atomic allocations - we can't balance anything */
if (!wait)
goto nopage;
rebalance:
//check whether some other process needs the CPU
cond_resched();
/* We now go into synchronous reclaim */
//set current's PF_MEMALLOC flag to indicate that the process is ready to perform memory reclaiming
p->flags |= PF_MEMALLOC;
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
//look for some page frames to reclaim
did_some_progress = try_to_free_pages(zones, gfp_mask, order);
if (likely(did_some_progress)) {
/*
* Go through the zonelist yet one more time, keep
* very high watermark here, this is only to catch
* a parallel oom killing, we must fail if we're still
* under heavy pressure.
*/
for (i = 0; (z = zones[i]) != NULL; i++) {
if (!zone_watermark_ok(z, order, z->pages_min,
classzone_idx, can_try_harder,
gfp_mask & __GFP_HIGH))
continue;
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
goto got_pg;
}
} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
/*
* Go through the zonelist yet one more time, keep
* very high watermark here, this is only to catch
* a parallel oom killing, we must fail if we're still
* under heavy pressure.
*/
for (i = 0; (z = zones[i]) != NULL; i++) {
if (!zone_watermark_ok(z, order, z->pages_high,
classzone_idx, 0, 0))
continue;
page = buffered_rmqueue(z, order, gfp_mask);
if (page)
goto got_pg;
}
out_of_memory(gfp_mask);
goto restart;
}
/*
* Don't let big-order allocations loop unless the caller explicitly
* requests that. Wait for some write requests to complete then retry.
*
* In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
* <= 3, but that may not be true in other implementations.
*/
do_retry = 0;
if (!(gfp_mask & __GFP_NORETRY)) {
if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
do_retry = 1;
if (gfp_mask & __GFP_NOFAIL)
do_retry = 1;
}
if (do_retry) {
blk_congestion_wait(WRITE, HZ/50);
goto rebalance;
}