1. Introduction

When a user process takes a page fault, the Linux kernel has to allocate the required physical pages and set up the page table mappings to satisfy the process's memory needs. Relying on the buddy system alone for physical page allocation is relatively inefficient for small-order requests. Linux therefore maintains a per-CPU page list (percpu page list, "pageset" for short) to serve small-order allocation requests and speed up page allocation.

Below we look at how the pageset works, and how it is implemented and used in the Linux kernel.

2. pageset definition
struct zone {
	......
	struct pglist_data	*zone_pgdat;
	struct per_cpu_pages	__percpu *per_cpu_pageset;
	struct per_cpu_zonestat	__percpu *per_cpu_zonestats;
	/*
	 * the high and batch values are copied to individual pagesets for
	 * faster access
	 */
	int pageset_high;
	int pageset_batch;
	......
};

The pageset lives inside struct zone: each zone has a per_cpu_pageset member, used for fast allocation of small-order pages within that zone.
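The struct zone snippet above only holds a pointer; the per-CPU structure behind it is struct per_cpu_pages. For reference, in kernels of roughly the v6.1 vintage it looks like the sketch below (the field layout varies across versions, so treat this as orientation rather than a definitive definition):

struct per_cpu_pages {
	spinlock_t lock;	/* Protects lists field */
	int count;		/* number of pages in the list */
	int high;		/* high watermark, emptying needed */
	int batch;		/* chunk size for buddy add/remove */
	short free_factor;	/* batch scaling factor during free */
#ifdef CONFIG_NUMA
	short expire;		/* When 0, remote pagesets are drained */
#endif

	/* Lists of pages, one per migrate type stored on the pcp-lists */
	struct list_head lists[NR_PCP_LISTS];
} ____cacheline_aligned_in_smp;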
3. pageset initialization flow

Call flow:

start_kernel(void)
--- setup_per_cpu_pageset();

Starting from kernel boot, per_cpu_pageset initialization is done by calling setup_per_cpu_pageset():
/*
 * Allocate per cpu pagesets and initialize them.
 * Before this call only boot pagesets were available.
 */
void __init setup_per_cpu_pageset(void)
{
	struct pglist_data *pgdat;
	struct zone *zone;
	int __maybe_unused cpu;

	for_each_populated_zone(zone)		// walk every populated zone and set up its pageset
		setup_zone_pageset(zone);

#ifdef CONFIG_NUMA
	/*
	 * Unpopulated zones continue using the boot pagesets.
	 * The numa stats for these pagesets need to be reset.
	 * Otherwise, they will end up skewing the stats of
	 * the nodes these zones are associated with.
	 */
	for_each_possible_cpu(cpu) {
		struct per_cpu_zonestat *pzstats = &per_cpu(boot_zonestats, cpu);
		memset(pzstats->vm_numa_event, 0,
		       sizeof(pzstats->vm_numa_event));
	}
#endif

	for_each_online_pgdat(pgdat)
		pgdat->per_cpu_nodestats =
			alloc_percpu(struct per_cpu_nodestat);
}

void __meminit setup_zone_pageset(struct zone *zone)
{
	int cpu;

	/* Size may be 0 on !SMP && !NUMA */
	if (sizeof(struct per_cpu_zonestat) > 0)
		zone->per_cpu_zonestats = alloc_percpu(struct per_cpu_zonestat);

	zone->per_cpu_pageset = alloc_percpu(struct per_cpu_pages);	// allocate percpu memory for this zone's per_cpu_pageset
	for_each_possible_cpu(cpu) {					// walk all possible CPUs
		struct per_cpu_pages *pcp;
		struct per_cpu_zonestat *pzstats;

		pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
		per_cpu_pages_init(pcp, pzstats);			// initialize this CPU's per_cpu_pages
	}

	zone_set_pageset_high_and_batch(zone, 0);
}
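Once the pagesets are set up, each CPU's count/high/batch values can be observed at runtime through /proc/zoneinfo. An abridged, illustrative excerpt follows (the numbers are machine-dependent and the exact layout varies across kernel versions):

Node 0, zone   Normal
  ...
  pagesets
    cpu: 0
              count: 241
              high:  378
              batch: 63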
per_cpu_pages_init(), called above, relies on the following definitions:

/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service. That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3	// the largest order the pageset can serve: orders [0-3]

enum migratetype {
	MIGRATE_UNMOVABLE,
	MIGRATE_MOVABLE,
	MIGRATE_RECLAIMABLE,
	MIGRATE_PCPTYPES,	/* the number of types on the pcp lists */
	MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
	......
	MIGRATE_TYPES
};
/*
 * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional list
 * for THP which will usually be GFP_MOVABLE. Even if it is another type,
 * it should not contribute to serious fragmentation causing THP allocation
 * failures.
 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_PCP_THP 1
#else
#define NR_PCP_THP 0
#endif
#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
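Plugging in the values above: three pcp migratetypes times four low orders (0 through 3) gives 12 low-order lists, plus one THP list when CONFIG_TRANSPARENT_HUGEPAGE is enabled, for 13 lists per CPU per zone. A standalone sanity check (illustrative, not kernel code):

/* Standalone illustration of the pcp list count (not kernel code). */
#define MIGRATE_PCPTYPES        3   /* unmovable, movable, reclaimable */
#define PAGE_ALLOC_COSTLY_ORDER 3
#define NR_PCP_THP              1   /* assuming CONFIG_TRANSPARENT_HUGEPAGE=y */
#define NR_LOWORDER_PCP_LISTS   (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
#define NR_PCP_LISTS            (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)

_Static_assert(NR_LOWORDER_PCP_LISTS == 12, "3 migratetypes x 4 orders");
_Static_assert(NR_PCP_LISTS == 13, "12 low-order lists + 1 THP list");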
static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
{
	int pindex;

	memset(pcp, 0, sizeof(*pcp));
	memset(pzstats, 0, sizeof(*pzstats));

	spin_lock_init(&pcp->lock);
	for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)	// initialize the pcp lists that hold pages, one per (migratetype, order) combination
		INIT_LIST_HEAD(&pcp->lists[pindex]);

	/*
	 * Set batch and high values safe for a boot pageset. A true percpu
	 * pageset's initialization will update them subsequently. Here we don't
	 * need to be as careful as pageset_update() as nobody can access the
	 * pageset yet.
	 */
	pcp->high = BOOT_PAGESET_HIGH;
	pcp->batch = BOOT_PAGESET_BATCH;
	pcp->free_factor = 0;
}

4. pageset page allocation (serves orders in [0-3])
Call flow:
alloc_pages()
--- alloc_pages_node()
------- __alloc_pages_node()
---------- __alloc_pages()
------------- get_page_from_freelist()
----------------- rmqueue()

/*
 * Allocate a page from the given zone.
 * Use pcplists for THP or "cheap" high-order allocations.
 */

/*
 * Do not instrument rmqueue() with KMSAN. This function may call
 * __msan_poison_alloca() through a call to set_pfnblock_flags_mask().
 * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it
 * may call rmqueue() again, which will result in a deadlock.
 */
__no_sanitize_memory
static inline
struct page *rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, unsigned int alloc_flags,
			int migratetype)
{
	struct page *page;

	/*
	 * We most definitely don't want callers attempting to
	 * allocate greater than order-1 page units with __GFP_NOFAIL.
	 */
	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));

	if (likely(pcp_allowed_order(order))) {		// is the requested order allowed on the pcp lists?
		/*
		 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
		 * we need to skip it when CMA area isn't allowed.
		 */
		if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
				migratetype != MIGRATE_MOVABLE) {	// if the CMA constraints allow it, try the pageset first
			page = rmqueue_pcplist(preferred_zone, zone, order,	// allocate a page from the pageset
					migratetype, alloc_flags);
			if (likely(page))
				goto out;
		}
	}

	page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
							migratetype);

out:
	/* Separate test+clear to avoid unnecessary atomics */
	if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
	}

	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
	return page;
}
#define PAGE_ALLOC_COSTLY_ORDER 3

static inline bool pcp_allowed_order(unsigned int order)	// may this order be allocated from the pageset?
{
	if (order <= PAGE_ALLOC_COSTLY_ORDER)	// orders in [0-3] (see the definition above) are always eligible
		return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order == pageblock_order)
		return true;
#endif
	return false;
}
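As a concrete illustration (assuming x86-64 with 4 KiB pages, where pageblock_order is 9 when CONFIG_TRANSPARENT_HUGEPAGE is enabled):

/*
 * Illustration, assuming x86-64 with 4 KiB pages and THP enabled
 * (pageblock_order == 9, i.e. a 2 MiB huge page):
 *
 *   pcp_allowed_order(0..3) -> true    low-order fast path
 *   pcp_allowed_order(4..8) -> false   served directly by the buddy system
 *   pcp_allowed_order(9)    -> true    THP-sized, uses the dedicated pcp list
 */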
Next, let's look at how rmqueue_pcplist() allocates a page from the pageset:

/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			int migratetype, unsigned int alloc_flags)
{
	struct per_cpu_pages *pcp;
	struct list_head *list;
	struct page *page;
	unsigned long flags;
	unsigned long __maybe_unused UP_flags;

	/*
	 * spin_trylock may fail due to a parallel drain. In the future, the
	 * trylock will also protect against IRQ reentrancy.
	 */
	pcp_trylock_prepare(UP_flags);
	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);	// get this CPU's per_cpu_pages object
	if (!pcp) {
		pcp_trylock_finish(UP_flags);
		return NULL;
	}

	/*
	 * On allocation, reduce the number of pages that are batch freed.
	 * See nr_pcp_free() where free_factor is increased for subsequent
	 * frees.
	 */
	pcp->free_factor >>= 1;
	list = &pcp->lists[order_to_pindex(migratetype, order)];	// pick the list to take pages from, based on migratetype and order
	page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);	// take a page off that list
	pcp_spin_unlock_irqrestore(pcp, flags);
	pcp_trylock_finish(UP_flags);
	if (page) {	// on success, update the allocation statistics
		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
		zone_statistics(preferred_zone, zone, 1);
	}
	return page;	// return the page allocated from the pageset
}

order_to_pindex() computes, from the migratetype and the requested order, which page list pages are taken from. The index calculation mirrors the list layout set up during pageset initialization; flip back to section 3 if it looks unfamiliar:

static inline unsigned int order_to_pindex(int migratetype, int order)
{
	int base = order;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (order > PAGE_ALLOC_COSTLY_ORDER) {
		VM_BUG_ON(order != pageblock_order);
		return NR_LOWORDER_PCP_LISTS;
	}
#else
	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif

	return (MIGRATE_PCPTYPES * base) + migratetype;
}
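A couple of worked examples, using the enum and macro values quoted earlier:

/*
 * Worked examples (MIGRATE_PCPTYPES == 3, values from the definitions above):
 *   order = 0, migratetype = MIGRATE_UNMOVABLE   (0) -> pindex = 3 * 0 + 0 = 0
 *   order = 2, migratetype = MIGRATE_RECLAIMABLE (2) -> pindex = 3 * 2 + 2 = 8
 *   order = pageblock_order (THP, with CONFIG_TRANSPARENT_HUGEPAGE)
 *                                                    -> pindex = NR_LOWORDER_PCP_LISTS = 12
 */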
Next, let's see how __rmqueue_pcplist() is implemented internally:

/* Remove page from the per-cpu list, caller must protect the list */
static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
			int migratetype,
			unsigned int alloc_flags,
			struct per_cpu_pages *pcp,
			struct list_head *list)
{
	struct page *page;

	do {
		if (list_empty(list)) {		// the list is empty: refill it from the buddy system
			int batch = READ_ONCE(pcp->batch);
			int alloced;

			/*
			 * Scale batch relative to order if batch implies
			 * free pages can be stored on the PCP. Batch can
			 * be 1 for small zones or for boot pagesets which
			 * should never store free pages as the pages may
			 * belong to arbitrary zones.
			 */
			if (batch > 1)
				batch = max(batch >> order, 2);
			alloced = rmqueue_bulk(zone, order,	// bulk-allocate batch pages of this order and migratetype from buddy
					batch, list,
					migratetype, alloc_flags);

			pcp->count += alloced << order;
			if (unlikely(list_empty(list)))		// buddy could not supply any pages: return NULL
				return NULL;
		}

		page = list_first_entry(list, struct page, pcp_list);	// take the first page on the list
		list_del(&page->pcp_list);				// unlink it from the list
		pcp->count -= 1 << order;				// update the pcp page count
	} while (check_new_pcp(page, order));

	return page;
}
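The batch >> order scaling above means higher-order refills pull fewer, but larger, chunks from the buddy system. For instance, assuming a pageset batch of 63 (a hypothetical value you might see on a large zone):

/*
 * Refill-size illustration, assuming pcp->batch = 63 (hypothetical):
 *   order 0: batch = max(63 >> 0, 2) = 63 -> up to 63 single pages
 *   order 3: batch = max(63 >> 3, 2) = 7  -> up to 7 order-3 blocks (56 pages)
 */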
/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency. Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
			unsigned long count, struct list_head *list,
			int migratetype, unsigned int alloc_flags)
{
	int i, allocated = 0;

	/* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
	spin_lock(&zone->lock);
	for (i = 0; i < count; ++i) {	// repeat count times
		struct page *page = __rmqueue(zone, order, migratetype,	// grab one page of the requested order and migratetype from the zone's buddy system
								alloc_flags);
		if (unlikely(page == NULL))	// stop once the buddy system cannot supply more pages
			break;

		if (unlikely(check_pcp_refill(page, order)))
			continue;

		/*
		 * Split buddy pages returned by expand() are received here in
		 * physical page order. The page is added to the tail of
		 * caller's list. From the caller's perspective, the linked list
		 * is ordered by page number under some conditions. This is
		 * useful for IO devices that can forward direction from the
		 * head, thus also in the physical page order. This is useful
		 * for IO devices that can merge IO requests if the physical
		 * pages are ordered properly.
		 */
		list_add_tail(&page->pcp_list, list);	// hang the page on the pageset's list
		allocated++;				// one more page allocated
		if (is_migrate_cma(get_pcppage_migratetype(page)))
			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
					      -(1 << order));
	}

	/*
	 * i pages were removed from the buddy list even if some leak due
	 * to check_pcp_refill failing so adjust NR_FREE_PAGES based
	 * on i. Do not confuse with 'allocated' which is the number of
	 * pages added to the pcp list.
	 */
	__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
	spin_unlock(&zone->lock);
	return allocated;	// return the number of pages allocated
}
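Putting the allocation path together: any request with order <= PAGE_ALLOC_COSTLY_ORDER normally takes the pageset fast path. Below is a minimal, illustrative round trip using the standard kernel page allocation APIs (a sketch, not from any source tree quoted above):

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>

static int pcp_fastpath_demo(void)
{
	/* order-0, so pcp_allowed_order() is true: normally served from this CPU's pageset */
	struct page *page = alloc_pages(GFP_KERNEL, 0);

	if (!page)
		return -ENOMEM;

	memset(page_address(page), 0, PAGE_SIZE);	/* use the page */
	__free_pages(page, 0);	/* goes back onto the pcp list via free_the_page(), see section 5 */
	return 0;
}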
5. pageset page freeing
Call flow:
free_pages()
--- __free_pages()
------ free_the_page()

static inline void free_the_page(struct page *page, unsigned int order)
{
	if (pcp_allowed_order(order))		/* Via pcp? */	// is this order served by the pageset?
		free_unref_page(page, order);			// if so, free it back onto the pageset
	else
		__free_pages_ok(page, order, FPI_NONE);
}

/*
 * Free a pcp page
 */
void free_unref_page(struct page *page, unsigned int order)
{
	unsigned long flags;
	unsigned long __maybe_unused UP_flags;
	struct per_cpu_pages *pcp;
	struct zone *zone;
	unsigned long pfn = page_to_pfn(page);
	int migratetype;

	if (!free_unref_page_prepare(page, pfn, order))
		return;

	/*
	 * We only track unmovable, reclaimable and movable on pcp lists.
	 * Place ISOLATE pages on the isolated list because they are being
	 * offlined but treat HIGHATOMIC as movable pages so we can get those
	 * areas back if necessary. Otherwise, we may have to free
	 * excessively into the page allocator
	 */
	migratetype = get_pcppage_migratetype(page);
	if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
		if (unlikely(is_migrate_isolate(migratetype))) {
			free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
			return;
		}
		migratetype = MIGRATE_MOVABLE;
	}

	zone = page_zone(page);
	pcp_trylock_prepare(UP_flags);
	pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);	// get the current CPU's pageset object
	if (pcp) {
		free_unref_page_commit(zone, pcp, page, migratetype, order);	// release the page onto the pageset
		pcp_spin_unlock_irqrestore(pcp, flags);
	} else {
		free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
	}
	pcp_trylock_finish(UP_flags);
}

static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
				   struct page *page, int migratetype,
				   unsigned int order)
{
	int high;
	int pindex;
	bool free_high;

	__count_vm_events(PGFREE, 1 << order);
	pindex = order_to_pindex(migratetype, order);	// which pageset list does this migratetype and order map to?
	list_add(&page->pcp_list, &pcp->lists[pindex]);	// put the page back on that list for later allocations
	pcp->count += 1 << order;			// update the pageset page count

	/*
	 * As high-order pages other than THP's stored on PCP can contribute
	 * to fragmentation, limit the number stored when PCP is heavily
	 * freeing without allocation. The remainder after bulk freeing
	 * stops will be drained from vmstat refresh context.
	 */
	free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER);

	high = nr_pcp_high(pcp, zone, free_high);
	if (pcp->count >= high) {	// has the pageset accumulated more pages than the high watermark?
		int batch = READ_ONCE(pcp->batch);	// if so, a batch of pages must be returned to the buddy system

		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex);	// hand the excess back to buddy
	}
}
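To make the drain condition concrete, here is a hypothetical walk-through (high and batch are per-zone, per-CPU values; the numbers below are made up purely for illustration):

/*
 * Hypothetical drain walk-through, assuming high = 160 and batch = 63:
 *
 *   pcp->count = 158, free one order-0 page -> count = 159 < high: page stays cached.
 *   pcp->count = 159, free one order-0 page -> count = 160 >= high:
 *       free_pcppages_bulk() hands nr_pcp_free() pages (on the order of
 *       batch) back to the buddy system, pulling count back below high.
 */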
/*
 * Frees a number of pages from the PCP lists
 * Assumes all pages on list are in same zone.
 * count is the number of pages to free.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
					struct per_cpu_pages *pcp,
					int pindex)
{
	int min_pindex = 0;
	int max_pindex = NR_PCP_LISTS - 1;
	unsigned int order;
	bool isolated_pageblocks;
	struct page *page;

	/*
	 * Ensure proper count is passed which otherwise would stuck in the
	 * below while (list_empty(list)) loop.
	 */
	count = min(pcp->count, count);

	/* Ensure requested pindex is drained first. */
	pindex = pindex - 1;

	/* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
	spin_lock(&zone->lock);
	isolated_pageblocks = has_isolate_pageblock(zone);

	while (count > 0) {	// keep returning pages to the buddy system
		struct list_head *list;
		int nr_pages;

		/* Remove pages from lists in a round-robin fashion. */
		do {
			if (++pindex > max_pindex)
				pindex = min_pindex;
			list = &pcp->lists[pindex];	// candidate list for this pindex
			if (!list_empty(list))		// non-empty list found: drain from it
				break;

			if (pindex == max_pindex)
				max_pindex--;
			if (pindex == min_pindex)
				min_pindex++;
		} while (1);

		order = pindex_to_order(pindex);
		nr_pages = 1 << order;
		do {
			int mt;

			page = list_last_entry(list, struct page, pcp_list);	// take the last page on the list
			mt = get_pcppage_migratetype(page);			// fetch its migratetype

			/* must delete to avoid corrupting pcp list */
			list_del(&page->pcp_list);	// remove the page from the list
			count -= nr_pages;		// fewer pages left to free
			pcp->count -= nr_pages;		// update the pageset page count

			if (bulkfree_pcp_prepare(page))
				continue;

			/* MIGRATE_ISOLATE page should not go to pcplists */
			VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);

			/* Pageblock could have been isolated meanwhile */
			if (unlikely(isolated_pageblocks))
				mt = get_pageblock_migratetype(page);

			__free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);	// return the page to the buddy system
			trace_mm_page_pcpu_drain(page, order, mt);
		} while (count > 0 && !list_empty(list));
	}

	spin_unlock(&zone->lock);
}

This concludes our walk through how the Linux pageset is initialized and used. Thanks for reading!