From cfd1b6ca17bd13043a7c6b3da7b9eb1ca6a965c3 Mon Sep 17 00:00:00 2001 From: Juhyung Park Date: Tue, 26 Sep 2023 02:15:49 +0900 Subject: [PATCH] zsmalloc: backport from 5994eabf3bbb Backport zsmalloc from commit 5994eabf3bbb ("merge mm-hotfixes-stable into mm-stable to pick up depended-upon changes"). Signed-off-by: Juhyung Park --- include/linux/zsmalloc.h | 2 + mm/Kconfig | 30 +- mm/zsmalloc.c | 1119 ++++++++++++++++---------------------- 3 files changed, 500 insertions(+), 651 deletions(-) diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 2a430e713..a48cd0ffe 100755 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -55,5 +55,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle); unsigned long zs_get_total_pages(struct zs_pool *pool); unsigned long zs_compact(struct zs_pool *pool); +unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size); + void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats); #endif diff --git a/mm/Kconfig b/mm/Kconfig index 1b65fab65..dd0a66d83 100755 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -708,15 +708,12 @@ config Z3FOLD still there. config ZSMALLOC - tristate "Memory allocator for compressed pages" + tristate "N:1 compression allocator (zsmalloc)" depends on MMU help zsmalloc is a slab-based memory allocator designed to store - compressed RAM pages. zsmalloc uses virtual memory mapping - in order to reduce fragmentation. However, this results in a - non-standard allocator interface where a handle, not a pointer, is - returned by an alloc(). This handle must be mapped in order to - access the allocated space. + pages of various compression levels efficiently. It achieves + the highest storage density with the least amount of fragmentation. config ZSMALLOC_STAT bool "Export zsmalloc statistics" @@ -724,10 +721,29 @@ config ZSMALLOC_STAT select DEBUG_FS help This option enables code in the zsmalloc to collect various - statistics about whats happening in zsmalloc and exports that + statistics about what's happening in zsmalloc and exports that information to userspace via debugfs. If unsure, say N. +config ZSMALLOC_CHAIN_SIZE + int "Maximum number of physical pages per-zspage" + default 8 + range 4 16 + depends on ZSMALLOC + help + This option sets the upper limit on the number of physical pages + that a zmalloc page (zspage) can consist of. The optimal zspage + chain size is calculated for each size class during the + initialization of the pool. + + Changing this option can alter the characteristics of size classes, + such as the number of pages per zspage and the number of objects + per zspage. This can also result in different configurations of + the pool, as zsmalloc merges size classes with similar + characteristics. + + For more information, see zsmalloc documentation. + config GENERIC_EARLY_IOREMAP bool diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 16f34d0e7..f5140b252 100755 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -17,10 +17,10 @@ * * Usage of struct page fields: * page->private: points to zspage - * page->freelist(index): links together all component pages of a zspage + * page->index: links together all component pages of a zspage * For the huge page, this is always 0, so we use this field * to store handle. - * page->units: first object offset in a subpage of zspage + * page->page_type: first object offset in a subpage of zspage * * Usage of struct page flags: * PG_private: identifies the first component page @@ -30,6 +30,13 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +/* + * lock ordering: + * page_lock + * pool->lock + * zspage->lock + */ + #include #include #include @@ -59,24 +66,18 @@ #include #include #include +#include #define ZSPAGE_MAGIC 0x58 /* - * This must be power of 2 and greater than of equal to sizeof(link_free). + * This must be power of 2 and greater than or equal to sizeof(link_free). * These two conditions ensure that any 'struct link_free' itself doesn't * span more than 1 page which avoids complex case of mapping 2 pages simply * to restore link_free pointer values. */ #define ZS_ALIGN 8 -/* - * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) - * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. - */ -#define ZS_MAX_ZSPAGE_ORDER 2 -#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) - #define ZS_HANDLE_SIZE (sizeof(unsigned long)) /* @@ -102,15 +103,6 @@ #define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) -/* - * Memory for allocating for handle keeps object position by - * encoding and the encoded value has a room - * in least bit(ie, look at obj_to_location). - * We use the bit to synchronize between object access by - * user and migration. - */ -#define HANDLE_PIN_BIT 0 - /* * Head in allocated object should have OBJ_ALLOCATED_TAG * to identify the object was allocated or not. @@ -119,16 +111,23 @@ * have room for two bit at least. */ #define OBJ_ALLOCATED_TAG 1 -#define OBJ_TAG_BITS 1 + +#define OBJ_TAG_BITS 1 +#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG + #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) -#define FULLNESS_BITS 2 +#define HUGE_BITS 1 +#define FULLNESS_BITS 4 #define CLASS_BITS 8 -#define ISOLATED_BITS 3 +#define ISOLATED_BITS 5 #define MAGIC_VAL_BITS 8 #define MAX(a, b) ((a) >= (b) ? (a) : (b)) + +#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL)) + /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) @@ -152,56 +151,44 @@ #define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \ ZS_SIZE_CLASS_DELTA) + 1) +/* + * Pages are distinguished by the ratio of used memory (that is the ratio + * of ->inuse objects to all objects that page can store). For example, + * INUSE_RATIO_10 means that the ratio of used objects is > 0% and <= 10%. + * + * The number of fullness groups is not random. It allows us to keep + * difference between the least busy page in the group (minimum permitted + * number of ->inuse objects) and the most busy page (maximum permitted + * number of ->inuse objects) at a reasonable value. + */ enum fullness_group { - ZS_EMPTY, - ZS_ALMOST_EMPTY, - ZS_ALMOST_FULL, - ZS_FULL, - NR_ZS_FULLNESS, + ZS_INUSE_RATIO_0, + ZS_INUSE_RATIO_10, + /* NOTE: 8 more fullness groups here */ + ZS_INUSE_RATIO_99 = 10, + ZS_INUSE_RATIO_100, + NR_FULLNESS_GROUPS, }; -enum zs_stat_type { - CLASS_EMPTY, - CLASS_ALMOST_EMPTY, - CLASS_ALMOST_FULL, - CLASS_FULL, - OBJ_ALLOCATED, - OBJ_USED, - NR_ZS_STAT_TYPE, +enum class_stat_type { + /* NOTE: stats for 12 fullness groups here: from inuse 0 to 100 */ + ZS_OBJS_ALLOCATED = NR_FULLNESS_GROUPS, + ZS_OBJS_INUSE, + NR_CLASS_STAT_TYPES, }; struct zs_size_stat { - unsigned long objs[NR_ZS_STAT_TYPE]; + unsigned long objs[NR_CLASS_STAT_TYPES]; }; #ifdef CONFIG_ZSMALLOC_STAT static struct dentry *zs_stat_root; #endif -#ifdef CONFIG_COMPACTION -static struct vfsmount *zsmalloc_mnt; -#endif - -/* - * We assign a page to ZS_ALMOST_EMPTY fullness group when: - * n <= N / f, where - * n = number of allocated objects - * N = total number of objects zspage can store - * f = fullness_threshold_frac - * - * Similarly, we assign zspage to: - * ZS_ALMOST_FULL when n > N / f - * ZS_EMPTY when n == 0 - * ZS_FULL when n == N - * - * (see: fix_fullness_group()) - */ -static const int fullness_threshold_frac = 4; static size_t huge_class_size; struct size_class { - spinlock_t lock; - struct list_head fullness_list[NR_ZS_FULLNESS]; + struct list_head fullness_list[NR_FULLNESS_GROUPS]; /* * Size of objects stored in this class. Must be multiple * of ZS_ALIGN. @@ -215,21 +202,9 @@ struct size_class { struct zs_size_stat stats; }; -/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ -static void SetPageHugeObject(struct page *page) -{ - SetPageOwnerPriv1(page); -} - -static void ClearPageHugeObject(struct page *page) -{ - ClearPageOwnerPriv1(page); -} - -static int PageHugeObject(struct page *page) -{ - return PageOwnerPriv1(page); -} +#ifdef CONFIG_COMPACTION +static struct vfsmount *zsmalloc_mnt; +#endif /* * Placed within free objects to form a singly linked list. @@ -271,15 +246,14 @@ struct zs_pool { #ifdef CONFIG_COMPACTION struct inode *inode; struct work_struct free_work; - /* A wait queue for when migration races with async_free_zspage() */ - struct wait_queue_head migration_wait; - atomic_long_t isolated_pages; - bool destroying; #endif + spinlock_t lock; + atomic_t compaction_in_progress; }; struct zspage { struct { + unsigned int huge:HUGE_BITS; unsigned int fullness:FULLNESS_BITS; unsigned int class:CLASS_BITS + 1; unsigned int isolated:ISOLATED_BITS; @@ -289,23 +263,37 @@ struct zspage { unsigned int freeobj; struct page *first_page; struct list_head list; /* fullness list */ -#ifdef CONFIG_COMPACTION rwlock_t lock; -#endif }; struct mapping_area { + local_lock_t lock; char *vm_buf; /* copy buffer for objects that span pages */ char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ }; -#ifdef CONFIG_COMPACTION -static int zs_register_migration(struct zs_pool *pool); -static void zs_unregister_migration(struct zs_pool *pool); +/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ +static void SetZsHugePage(struct zspage *zspage) +{ + zspage->huge = 1; +} + +static bool ZsHugePage(struct zspage *zspage) +{ + return zspage->huge; +} + static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); + +#ifdef CONFIG_COMPACTION +static int zs_register_migration(struct zs_pool *pool); +static void zs_unregister_migration(struct zs_pool *pool); +static void migrate_write_lock(struct zspage *zspage); +static void migrate_write_lock_nested(struct zspage *zspage); +static void migrate_write_unlock(struct zspage *zspage); static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); @@ -314,9 +302,9 @@ static int zsmalloc_mount(void) { return 0; } static void zsmalloc_unmount(void) {} static int zs_register_migration(struct zs_pool *pool) { return 0; } static void zs_unregister_migration(struct zs_pool *pool) {} -static void migrate_lock_init(struct zspage *zspage) {} -static void migrate_read_lock(struct zspage *zspage) {} -static void migrate_read_unlock(struct zspage *zspage) {} +static void migrate_write_lock(struct zspage *zspage) {} +static void migrate_write_lock_nested(struct zspage *zspage) {} +static void migrate_write_unlock(struct zspage *zspage) {} static void kick_deferred_free(struct zs_pool *pool) {} static void init_deferred_free(struct zs_pool *pool) {} static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} @@ -349,7 +337,7 @@ static void destroy_cache(struct zs_pool *pool) static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE|__GFP_CMA)); + gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); } static void cache_free_handle(struct zs_pool *pool, unsigned long handle) @@ -359,8 +347,8 @@ static void cache_free_handle(struct zs_pool *pool, unsigned long handle) static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) { - return kmem_cache_alloc(pool->zspage_cachep, - flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE|__GFP_CMA)); + return kmem_cache_zalloc(pool->zspage_cachep, + flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); } static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) @@ -368,23 +356,17 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) kmem_cache_free(pool->zspage_cachep, zspage); } +/* pool->lock(which owns the handle) synchronizes races */ static void record_obj(unsigned long handle, unsigned long obj) { - /* - * lsb of @obj represents handle lock while other bits - * represent object value the handle is pointing so - * updating shouldn't do store tearing. - */ - WRITE_ONCE(*(unsigned long *)handle, obj); + *(unsigned long *)handle = obj; } /* zpool driver */ #ifdef CONFIG_ZPOOL -static void *zs_zpool_create(const char *name, gfp_t gfp, - const struct zpool_ops *zpool_ops, - struct zpool *zpool) +static void *zs_zpool_create(const char *name, gfp_t gfp) { /* * Ignore global gfp flags: zs_malloc() may be invoked from @@ -403,7 +385,10 @@ static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { *handle = zs_malloc(pool, size, gfp); - return *handle ? 0 : -1; + + if (IS_ERR_VALUE(*handle)) + return PTR_ERR((void *)*handle); + return 0; } static void zs_zpool_free(void *pool, unsigned long handle) { @@ -457,19 +442,16 @@ MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ -static DEFINE_PER_CPU(struct mapping_area, zs_map_area); - -static bool is_zspage_isolated(struct zspage *zspage) -{ - return zspage->isolated; -} +static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static __maybe_unused int is_first_page(struct page *page) { return PagePrivate(page); } -/* Protected by class->lock */ +/* Protected by pool->lock */ static inline int get_zspage_inuse(struct zspage *zspage) { return zspage->inuse; @@ -489,14 +471,14 @@ static inline struct page *get_first_page(struct zspage *zspage) return first_page; } -static inline int get_first_obj_offset(struct page *page) +static inline unsigned int get_first_obj_offset(struct page *page) { - return page->units; + return page->page_type; } -static inline void set_first_obj_offset(struct page *page, int offset) +static inline void set_first_obj_offset(struct page *page, unsigned int offset) { - page->units = offset; + page->page_type = offset; } static inline unsigned int get_freeobj(struct zspage *zspage) @@ -510,8 +492,8 @@ static inline void set_freeobj(struct zspage *zspage, unsigned int obj) } static void get_zspage_mapping(struct zspage *zspage, - unsigned int *class_idx, - enum fullness_group *fullness) + unsigned int *class_idx, + int *fullness) { BUG_ON(zspage->magic != ZSPAGE_MAGIC); @@ -519,9 +501,15 @@ static void get_zspage_mapping(struct zspage *zspage, *class_idx = zspage->class; } +static struct size_class *zspage_class(struct zs_pool *pool, + struct zspage *zspage) +{ + return pool->size_class[zspage->class]; +} + static void set_zspage_mapping(struct zspage *zspage, - unsigned int class_idx, - enum fullness_group fullness) + unsigned int class_idx, + int fullness) { zspage->class = class_idx; zspage->fullness = fullness; @@ -532,7 +520,7 @@ static void set_zspage_mapping(struct zspage *zspage, * class maintains a list of zspages where each zspage is divided * into equal sized chunks. Each allocation falls into one of these * classes depending on its size. This function returns index of the - * size class which has chunk size big enough to hold the give size. + * size class which has chunk size big enough to hold the given size. */ static int get_size_class_index(int size) { @@ -545,23 +533,19 @@ static int get_size_class_index(int size) return min_t(int, ZS_SIZE_CLASSES - 1, idx); } -/* type can be of enum type zs_stat_type or fullness_group */ -static inline void zs_stat_inc(struct size_class *class, +static inline void class_stat_inc(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] += cnt; } -/* type can be of enum type zs_stat_type or fullness_group */ -static inline void zs_stat_dec(struct size_class *class, +static inline void class_stat_dec(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] -= cnt; } -/* type can be of enum type zs_stat_type or fullness_group */ -static inline unsigned long zs_stat_get(struct size_class *class, - int type) +static inline unsigned long zs_stat_get(struct size_class *class, int type) { return class->stats.objs[type]; } @@ -587,47 +571,49 @@ static unsigned long zs_can_compact(struct size_class *class); static int zs_stats_size_show(struct seq_file *s, void *v) { - int i; + int i, fg; struct zs_pool *pool = s->private; struct size_class *class; int objs_per_zspage; - unsigned long class_almost_full, class_almost_empty; unsigned long obj_allocated, obj_used, pages_used, freeable; - unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; unsigned long total_freeable = 0; + unsigned long inuse_totals[NR_FULLNESS_GROUPS] = {0, }; - seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n", - "class", "size", "almost_full", "almost_empty", + seq_printf(s, " %5s %5s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %13s %10s %10s %16s %8s\n", + "class", "size", "10%", "20%", "30%", "40%", + "50%", "60%", "70%", "80%", "90%", "99%", "100%", "obj_allocated", "obj_used", "pages_used", "pages_per_zspage", "freeable"); for (i = 0; i < ZS_SIZE_CLASSES; i++) { + class = pool->size_class[i]; if (class->index != i) continue; - spin_lock(&class->lock); - class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); - class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); - obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); - obj_used = zs_stat_get(class, OBJ_USED); + spin_lock(&pool->lock); + + seq_printf(s, " %5u %5u ", i, class->size); + for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) { + inuse_totals[fg] += zs_stat_get(class, fg); + seq_printf(s, "%9lu ", zs_stat_get(class, fg)); + } + + obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED); + obj_used = zs_stat_get(class, ZS_OBJS_INUSE); freeable = zs_can_compact(class); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); objs_per_zspage = class->objs_per_zspage; pages_used = obj_allocated / objs_per_zspage * class->pages_per_zspage; - seq_printf(s, " %5u %5u %11lu %12lu %13lu" - " %10lu %10lu %16d %8lu\n", - i, class->size, class_almost_full, class_almost_empty, - obj_allocated, obj_used, pages_used, - class->pages_per_zspage, freeable); + seq_printf(s, "%13lu %10lu %10lu %16d %8lu\n", + obj_allocated, obj_used, pages_used, + class->pages_per_zspage, freeable); - total_class_almost_full += class_almost_full; - total_class_almost_empty += class_almost_empty; total_objs += obj_allocated; total_used_objs += obj_used; total_pages += pages_used; @@ -635,10 +621,14 @@ static int zs_stats_size_show(struct seq_file *s, void *v) } seq_puts(s, "\n"); - seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n", - "Total", "", total_class_almost_full, - total_class_almost_empty, total_objs, - total_used_objs, total_pages, "", total_freeable); + seq_printf(s, " %5s %5s ", "Total", ""); + + for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) + seq_printf(s, "%9lu ", inuse_totals[fg]); + + seq_printf(s, "%13lu %10lu %10lu %16s %8lu\n", + total_objs, total_used_objs, total_pages, "", + total_freeable); return 0; } @@ -683,30 +673,28 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) /* * For each size class, zspages are divided into different groups - * depending on how "full" they are. This was done so that we could - * easily find empty or nearly empty zspages when we try to shrink - * the pool (not yet implemented). This function returns fullness + * depending on their usage ratio. This function returns fullness * status of the given page. */ -static enum fullness_group get_fullness_group(struct size_class *class, - struct zspage *zspage) +static int get_fullness_group(struct size_class *class, struct zspage *zspage) { - int inuse, objs_per_zspage; - enum fullness_group fg; + int inuse, objs_per_zspage, ratio; inuse = get_zspage_inuse(zspage); objs_per_zspage = class->objs_per_zspage; if (inuse == 0) - fg = ZS_EMPTY; - else if (inuse == objs_per_zspage) - fg = ZS_FULL; - else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac) - fg = ZS_ALMOST_EMPTY; - else - fg = ZS_ALMOST_FULL; + return ZS_INUSE_RATIO_0; + if (inuse == objs_per_zspage) + return ZS_INUSE_RATIO_100; - return fg; + ratio = 100 * inuse / objs_per_zspage; + /* + * Take integer division into consideration: a page with one inuse + * object out of 127 possible, will end up having 0 usage ratio, + * which is wrong as it belongs in ZS_INUSE_RATIO_10 fullness group. + */ + return ratio / 10 + 1; } /* @@ -717,23 +705,9 @@ static enum fullness_group get_fullness_group(struct size_class *class, */ static void insert_zspage(struct size_class *class, struct zspage *zspage, - enum fullness_group fullness) + int fullness) { - struct zspage *head; - - zs_stat_inc(class, fullness, 1); - head = list_first_entry_or_null(&class->fullness_list[fullness], - struct zspage, list); - /* - * We want to see more ZS_FULL pages and less almost empty/full. - * Put pages with higher ->inuse first. - */ - if (head) { - if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) { - list_add(&zspage->list, &head->list); - return; - } - } + class_stat_inc(class, fullness, 1); list_add(&zspage->list, &class->fullness_list[fullness]); } @@ -743,85 +717,43 @@ static void insert_zspage(struct size_class *class, */ static void remove_zspage(struct size_class *class, struct zspage *zspage, - enum fullness_group fullness) + int fullness) { VM_BUG_ON(list_empty(&class->fullness_list[fullness])); - VM_BUG_ON(is_zspage_isolated(zspage)); list_del_init(&zspage->list); - zs_stat_dec(class, fullness, 1); + class_stat_dec(class, fullness, 1); } /* * Each size class maintains zspages in different fullness groups depending * on the number of live objects they contain. When allocating or freeing - * objects, the fullness status of the page can change, say, from ALMOST_FULL - * to ALMOST_EMPTY when freeing an object. This function checks if such - * a status change has occurred for the given page and accordingly moves the - * page from the freelist of the old fullness group to that of the new - * fullness group. + * objects, the fullness status of the page can change, for instance, from + * INUSE_RATIO_80 to INUSE_RATIO_70 when freeing an object. This function + * checks if such a status change has occurred for the given page and + * accordingly moves the page from the list of the old fullness group to that + * of the new fullness group. */ -static enum fullness_group fix_fullness_group(struct size_class *class, - struct zspage *zspage) +static int fix_fullness_group(struct size_class *class, struct zspage *zspage) { int class_idx; - enum fullness_group currfg, newfg; + int currfg, newfg; get_zspage_mapping(zspage, &class_idx, &currfg); newfg = get_fullness_group(class, zspage); if (newfg == currfg) goto out; - if (!is_zspage_isolated(zspage)) { - remove_zspage(class, zspage, currfg); - insert_zspage(class, zspage, newfg); - } - + remove_zspage(class, zspage, currfg); + insert_zspage(class, zspage, newfg); set_zspage_mapping(zspage, class_idx, newfg); - out: return newfg; } -/* - * We have to decide on how many pages to link together - * to form a zspage for each size class. This is important - * to reduce wastage due to unusable space left at end of - * each zspage which is given as: - * wastage = Zp % class_size - * usage = Zp - wastage - * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... - * - * For example, for size class of 3/8 * PAGE_SIZE, we should - * link together 3 PAGE_SIZE sized pages to form a zspage - * since then we can perfectly fit in 8 such objects. - */ -static int get_pages_per_zspage(int class_size) -{ - int i, max_usedpc = 0; - /* zspage order which gives maximum used size per KB */ - int max_usedpc_order = 1; - - for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { - int zspage_size; - int waste, usedpc; - - zspage_size = i * PAGE_SIZE; - waste = zspage_size % class_size; - usedpc = (zspage_size - waste) * 100 / zspage_size; - - if (usedpc > max_usedpc) { - max_usedpc = usedpc; - max_usedpc_order = i; - } - } - - return max_usedpc_order; -} - static struct zspage *get_zspage(struct page *page) { - struct zspage *zspage = (struct zspage *)page->private; + struct zspage *zspage = (struct zspage *)page_private(page); BUG_ON(zspage->magic != ZSPAGE_MAGIC); return zspage; @@ -829,10 +761,12 @@ static struct zspage *get_zspage(struct page *page) static struct page *get_next_page(struct page *page) { - if (unlikely(PageHugeObject(page))) + struct zspage *zspage = get_zspage(page); + + if (unlikely(ZsHugePage(zspage))) return NULL; - return page->freelist; + return (struct page *)page->index; } /** @@ -849,6 +783,12 @@ static void obj_to_location(unsigned long obj, struct page **page, *obj_idx = (obj & OBJ_INDEX_MASK); } +static void obj_to_page(unsigned long obj, struct page **page) +{ + obj >>= OBJ_TAG_BITS; + *page = pfn_to_page(obj >> OBJ_INDEX_BITS); +} + /** * location_to_obj - get obj value encoded from (, ) * @page: page object resides in zspage @@ -870,33 +810,24 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static unsigned long obj_to_head(struct page *page, void *obj) +static inline bool obj_allocated(struct page *page, void *obj, + unsigned long *phandle) { - if (unlikely(PageHugeObject(page))) { + unsigned long handle; + struct zspage *zspage = get_zspage(page); + + if (unlikely(ZsHugePage(zspage))) { VM_BUG_ON_PAGE(!is_first_page(page), page); - return page->index; + handle = page->index; } else - return *(unsigned long *)obj; -} + handle = *(unsigned long *)obj; -static inline int testpin_tag(unsigned long handle) -{ - return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); -} + if (!(handle & OBJ_ALLOCATED_TAG)) + return false; -static inline int trypin_tag(unsigned long handle) -{ - return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); -} - -static void pin_tag(unsigned long handle) __acquires(bitlock) -{ - bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); -} - -static void unpin_tag(unsigned long handle) __releases(bitlock) -{ - bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); + /* Clear all tags before returning the handle */ + *phandle = handle & ~OBJ_TAG_MASK; + return true; } static void reset_page(struct page *page) @@ -905,8 +836,7 @@ static void reset_page(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); page_mapcount_reset(page); - ClearPageHugeObject(page); - page->freelist = NULL; + page->index = 0; } static int trylock_zspage(struct zspage *zspage) @@ -934,15 +864,15 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { struct page *page, *next; - enum fullness_group fg; + int fg; unsigned int class_idx; get_zspage_mapping(zspage, &class_idx, &fg); - assert_spin_locked(&class->lock); + assert_spin_locked(&pool->lock); VM_BUG_ON(get_zspage_inuse(zspage)); - VM_BUG_ON(fg != ZS_EMPTY); + VM_BUG_ON(fg != ZS_INUSE_RATIO_0); next = page = get_first_page(zspage); do { @@ -957,9 +887,8 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, cache_free_zspage(pool, zspage); - zs_stat_dec(class, OBJ_ALLOCATED, class->objs_per_zspage); - atomic_long_sub(class->pages_per_zspage, - &pool->pages_allocated); + class_stat_dec(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage); + atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); } static void free_zspage(struct zs_pool *pool, struct size_class *class, @@ -968,12 +897,17 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class, VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(list_empty(&zspage->list)); + /* + * Since zs_free couldn't be sleepable, this function cannot call + * lock_page. The page locks trylock_zspage got will be released + * by __free_zspage. + */ if (!trylock_zspage(zspage)) { kick_deferred_free(pool); return; } - remove_zspage(class, zspage, ZS_EMPTY); + remove_zspage(class, zspage, ZS_INUSE_RATIO_0); __free_zspage(pool, class, zspage); } @@ -1032,7 +966,7 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, /* * Allocate individual pages and link them together as: - * 1. all pages are linked together using page->freelist + * 1. all pages are linked together using page->index * 2. each sub-page point to zspage using page->private * * we set PG_private to identify the first page (i.e. no other sub-page @@ -1041,15 +975,15 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, for (i = 0; i < nr_pages; i++) { page = pages[i]; set_page_private(page, (unsigned long)zspage); - page->freelist = NULL; + page->index = 0; if (i == 0) { zspage->first_page = page; SetPagePrivate(page); if (unlikely(class->objs_per_zspage == 1 && class->pages_per_zspage == 1)) - SetPageHugeObject(page); + SetZsHugePage(zspage); } else { - prev_page->freelist = page; + prev_page->index = (unsigned long)page; } prev_page = page; } @@ -1069,7 +1003,6 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, if (!zspage) return NULL; - memset(zspage, 0, sizeof(struct zspage)); zspage->magic = ZSPAGE_MAGIC; migrate_lock_init(zspage); @@ -1101,9 +1034,9 @@ static struct zspage *find_get_zspage(struct size_class *class) int i; struct zspage *zspage; - for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) { + for (i = ZS_INUSE_RATIO_99; i >= ZS_INUSE_RATIO_0; i--) { zspage = list_first_entry_or_null(&class->fullness_list[i], - struct zspage, list); + struct zspage, list); if (zspage) break; } @@ -1223,6 +1156,32 @@ static bool zspage_full(struct size_class *class, struct zspage *zspage) return get_zspage_inuse(zspage) == class->objs_per_zspage; } +static bool zspage_empty(struct zspage *zspage) +{ + return get_zspage_inuse(zspage) == 0; +} + +/** + * zs_lookup_class_index() - Returns index of the zsmalloc &size_class + * that hold objects of the provided size. + * @pool: zsmalloc pool to use + * @size: object size + * + * Context: Any context. + * + * Return: the index of the zsmalloc &size_class that hold objects of the + * provided size. + */ +unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size) +{ + struct size_class *class; + + class = pool->size_class[get_size_class_index(size)]; + + return class->index; +} +EXPORT_SYMBOL_GPL(zs_lookup_class_index); + unsigned long zs_get_total_pages(struct zs_pool *pool) { return atomic_long_read(&pool->pages_allocated); @@ -1233,7 +1192,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages); * zs_map_object - get address of allocated object from handle. * @pool: pool from which the object was allocated * @handle: handle returned from zs_malloc - * @mm: maping mode to use + * @mm: mapping mode to use * * Before using an object allocated from zs_malloc, it must be mapped using * this function. When done with the object, it must be unmapped using @@ -1252,8 +1211,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, unsigned long obj, off; unsigned int obj_idx; - unsigned int class_idx; - enum fullness_group fg; struct size_class *class; struct mapping_area *area; struct page *pages[2]; @@ -1266,21 +1223,26 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, */ BUG_ON(in_interrupt()); - /* From now on, migration cannot move the object */ - pin_tag(handle); - + /* It guarantees it can get zspage from handle safely */ + spin_lock(&pool->lock); obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); - /* migration cannot move any subpage in this zspage */ + /* + * migration cannot move any zpages in this zspage. Here, pool->lock + * is too heavy since callers would take some time until they calls + * zs_unmap_object API so delegate the locking from class to zspage + * which is smaller granularity. + */ migrate_read_lock(zspage); + spin_unlock(&pool->lock); - get_zspage_mapping(zspage, &class_idx, &fg); - class = pool->size_class[class_idx]; - off = (class->size * obj_idx) & ~PAGE_MASK; + class = zspage_class(pool, zspage); + off = offset_in_page(class->size * obj_idx); - area = &get_cpu_var(zs_map_area); + local_lock(&zs_map_area.lock); + area = this_cpu_ptr(&zs_map_area); area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ @@ -1296,7 +1258,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, ret = __zs_map_object(area, pages, off, class->size); out: - if (likely(!PageHugeObject(page))) + if (likely(!ZsHugePage(zspage))) ret += ZS_HANDLE_SIZE; return ret; @@ -1310,17 +1272,14 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) unsigned long obj, off; unsigned int obj_idx; - unsigned int class_idx; - enum fullness_group fg; struct size_class *class; struct mapping_area *area; obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); - get_zspage_mapping(zspage, &class_idx, &fg); - class = pool->size_class[class_idx]; - off = (class->size * obj_idx) & ~PAGE_MASK; + class = zspage_class(pool, zspage); + off = offset_in_page(class->size * obj_idx); area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) @@ -1334,10 +1293,9 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) __zs_unmap_object(area, pages, off, class->size); } - put_cpu_var(zs_map_area); + local_unlock(&zs_map_area.lock); migrate_read_unlock(zspage); - unpin_tag(handle); } EXPORT_SYMBOL_GPL(zs_unmap_object); @@ -1360,23 +1318,25 @@ size_t zs_huge_class_size(struct zs_pool *pool) } EXPORT_SYMBOL_GPL(zs_huge_class_size); -static unsigned long obj_malloc(struct size_class *class, +static unsigned long obj_malloc(struct zs_pool *pool, struct zspage *zspage, unsigned long handle) { int i, nr_page, offset; unsigned long obj; struct link_free *link; + struct size_class *class; struct page *m_page; unsigned long m_offset; void *vaddr; + class = pool->size_class[zspage->class]; handle |= OBJ_ALLOCATED_TAG; obj = get_freeobj(zspage); offset = obj * class->size; nr_page = offset >> PAGE_SHIFT; - m_offset = offset & ~PAGE_MASK; + m_offset = offset_in_page(offset); m_page = get_first_page(zspage); for (i = 0; i < nr_page; i++) @@ -1385,7 +1345,7 @@ static unsigned long obj_malloc(struct size_class *class, vaddr = kmap_atomic(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); - if (likely(!PageHugeObject(m_page))) + if (likely(!ZsHugePage(zspage))) /* record handle in the header of allocated chunk */ link->handle = handle; else @@ -1394,7 +1354,6 @@ static unsigned long obj_malloc(struct size_class *class, kunmap_atomic(vaddr); mod_zspage_inuse(zspage, 1); - zs_stat_inc(class, OBJ_USED, 1); obj = location_to_obj(m_page, obj); @@ -1409,66 +1368,68 @@ static unsigned long obj_malloc(struct size_class *class, * @gfp: gfp flags when allocating object * * On success, handle to the allocated object is returned, - * otherwise 0. + * otherwise an ERR_PTR(). * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) { unsigned long handle, obj; struct size_class *class; - enum fullness_group newfg; + int newfg; struct zspage *zspage; if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) - return 0; + return (unsigned long)ERR_PTR(-EINVAL); handle = cache_alloc_handle(pool, gfp); if (!handle) - return 0; + return (unsigned long)ERR_PTR(-ENOMEM); /* extra space in chunk to keep the handle */ size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; - spin_lock(&class->lock); + /* pool->lock effectively protects the zpage migration */ + spin_lock(&pool->lock); zspage = find_get_zspage(class); if (likely(zspage)) { - obj = obj_malloc(class, zspage, handle); + obj = obj_malloc(pool, zspage, handle); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(class, zspage); record_obj(handle, obj); - spin_unlock(&class->lock); + class_stat_inc(class, ZS_OBJS_INUSE, 1); - return handle; + goto out; } - spin_unlock(&class->lock); + spin_unlock(&pool->lock); zspage = alloc_zspage(pool, class, gfp); if (!zspage) { cache_free_handle(pool, handle); - return 0; + return (unsigned long)ERR_PTR(-ENOMEM); } - spin_lock(&class->lock); - obj = obj_malloc(class, zspage, handle); + spin_lock(&pool->lock); + obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); set_zspage_mapping(zspage, class->index, newfg); record_obj(handle, obj); - atomic_long_add(class->pages_per_zspage, - &pool->pages_allocated); - zs_stat_inc(class, OBJ_ALLOCATED, class->objs_per_zspage); + atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); + class_stat_inc(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage); + class_stat_inc(class, ZS_OBJS_INUSE, 1); /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); - spin_unlock(&class->lock); +out: + spin_unlock(&pool->lock); return handle; } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(struct size_class *class, unsigned long obj) +static void obj_free(int class_size, unsigned long obj) { struct link_free *link; struct zspage *zspage; @@ -1477,20 +1438,22 @@ static void obj_free(struct size_class *class, unsigned long obj) unsigned int f_objidx; void *vaddr; - obj &= ~OBJ_ALLOCATED_TAG; obj_to_location(obj, &f_page, &f_objidx); - f_offset = (class->size * f_objidx) & ~PAGE_MASK; + f_offset = offset_in_page(class_size * f_objidx); zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); + link = (struct link_free *)(vaddr + f_offset); /* Insert this object in containing zspage's freelist */ - link = (struct link_free *)(vaddr + f_offset); - link->next = get_freeobj(zspage) << OBJ_TAG_BITS; - kunmap_atomic(vaddr); + if (likely(!ZsHugePage(zspage))) + link->next = get_freeobj(zspage) << OBJ_TAG_BITS; + else + f_page->index = 0; set_freeobj(zspage, f_objidx); + + kunmap_atomic(vaddr); mod_zspage_inuse(zspage, -1); - zs_stat_dec(class, OBJ_USED, 1); } void zs_free(struct zs_pool *pool, unsigned long handle) @@ -1498,42 +1461,30 @@ void zs_free(struct zs_pool *pool, unsigned long handle) struct zspage *zspage; struct page *f_page; unsigned long obj; - unsigned int f_objidx; - int class_idx; struct size_class *class; - enum fullness_group fullness; - bool isolated; + int fullness; - if (unlikely(!handle)) + if (IS_ERR_OR_NULL((void *)handle)) return; - pin_tag(handle); + /* + * The pool->lock protects the race with zpage's migration + * so it's safe to get the page from handle. + */ + spin_lock(&pool->lock); obj = handle_to_obj(handle); - obj_to_location(obj, &f_page, &f_objidx); + obj_to_page(obj, &f_page); zspage = get_zspage(f_page); + class = zspage_class(pool, zspage); - migrate_read_lock(zspage); + class_stat_dec(class, ZS_OBJS_INUSE, 1); + obj_free(class->size, obj); - get_zspage_mapping(zspage, &class_idx, &fullness); - class = pool->size_class[class_idx]; - - spin_lock(&class->lock); - obj_free(class, obj); fullness = fix_fullness_group(class, zspage); - if (fullness != ZS_EMPTY) { - migrate_read_unlock(zspage); - goto out; - } - - isolated = is_zspage_isolated(zspage); - migrate_read_unlock(zspage); - /* If zspage is isolated, zs_page_putback will free the zspage */ - if (likely(!isolated)) + if (fullness == ZS_INUSE_RATIO_0) free_zspage(pool, class, zspage); -out: - spin_unlock(&class->lock); - unpin_tag(handle); + spin_unlock(&pool->lock); cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); @@ -1553,8 +1504,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, obj_to_location(src, &s_page, &s_objidx); obj_to_location(dst, &d_page, &d_objidx); - s_off = (class->size * s_objidx) & ~PAGE_MASK; - d_off = (class->size * d_objidx) & ~PAGE_MASK; + s_off = offset_in_page(class->size * s_objidx); + d_off = offset_in_page(class->size * d_objidx); if (s_off + class->size > PAGE_SIZE) s_size = PAGE_SIZE - s_off; @@ -1578,6 +1529,13 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, d_off += size; d_size -= size; + /* + * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic() + * calls must occurs in reverse order of calls to kmap_atomic(). + * So, to call kunmap_atomic(s_addr) we should first call + * kunmap_atomic(d_addr). For more details see + * Documentation/mm/highmem.rst. + */ if (s_off >= PAGE_SIZE) { kunmap_atomic(d_addr); kunmap_atomic(s_addr); @@ -1606,10 +1564,9 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, * return handle. */ static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int *obj_idx) + struct page *page, int *obj_idx) { - unsigned long head; - int offset = 0; + unsigned int offset; int index = *obj_idx; unsigned long handle = 0; void *addr = kmap_atomic(page); @@ -1618,13 +1575,8 @@ static unsigned long find_alloced_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - head = obj_to_head(page, addr + offset); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; - if (trypin_tag(handle)) - break; - handle = 0; - } + if (obj_allocated(page, addr + offset, &handle)) + break; offset += class->size; index++; @@ -1637,26 +1589,14 @@ static unsigned long find_alloced_obj(struct size_class *class, return handle; } -struct zs_compact_control { - /* Source spage for migration which could be a subpage of zspage */ - struct page *s_page; - /* Destination page for migration which should be a first page - * of zspage. */ - struct page *d_page; - /* Starting object index within @s_page which used for live object - * in the subpage. */ - int obj_idx; -}; - -static int migrate_zspage(struct zs_pool *pool, struct size_class *class, - struct zs_compact_control *cc) +static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, + struct zspage *dst_zspage) { unsigned long used_obj, free_obj; unsigned long handle; - struct page *s_page = cc->s_page; - struct page *d_page = cc->d_page; - int obj_idx = cc->obj_idx; - int ret = 0; + int obj_idx = 0; + struct page *s_page = get_first_page(src_zspage); + struct size_class *class = pool->size_class[src_zspage->class]; while (1) { handle = find_alloced_obj(class, s_page, &obj_idx); @@ -1668,53 +1608,50 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, continue; } - /* Stop if there is no more space */ - if (zspage_full(class, get_zspage(d_page))) { - unpin_tag(handle); - ret = -ENOMEM; - break; - } - used_obj = handle_to_obj(handle); - free_obj = obj_malloc(class, get_zspage(d_page), handle); + free_obj = obj_malloc(pool, dst_zspage, handle); zs_object_copy(class, free_obj, used_obj); obj_idx++; - /* - * record_obj updates handle's value to free_obj and it will - * invalidate lock bit(ie, HANDLE_PIN_BIT) of handle, which - * breaks synchronization using pin_tag(e,g, zs_free) so - * let's keep the lock bit. - */ - free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); - unpin_tag(handle); - obj_free(class, used_obj); + obj_free(class->size, used_obj); + + /* Stop if there is no more space */ + if (zspage_full(class, dst_zspage)) + break; + + /* Stop if there are no more objects to migrate */ + if (zspage_empty(src_zspage)) + break; } - - /* Remember last position in this iteration */ - cc->s_page = s_page; - cc->obj_idx = obj_idx; - - return ret; } -static struct zspage *isolate_zspage(struct size_class *class, bool source) +static struct zspage *isolate_src_zspage(struct size_class *class) { - int i; struct zspage *zspage; - enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL}; + int fg; - if (!source) { - fg[0] = ZS_ALMOST_FULL; - fg[1] = ZS_ALMOST_EMPTY; + for (fg = ZS_INUSE_RATIO_10; fg <= ZS_INUSE_RATIO_99; fg++) { + zspage = list_first_entry_or_null(&class->fullness_list[fg], + struct zspage, list); + if (zspage) { + remove_zspage(class, zspage, fg); + return zspage; + } } - for (i = 0; i < 2; i++) { - zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], - struct zspage, list); + return zspage; +} + +static struct zspage *isolate_dst_zspage(struct size_class *class) +{ + struct zspage *zspage; + int fg; + + for (fg = ZS_INUSE_RATIO_99; fg >= ZS_INUSE_RATIO_10; fg--) { + zspage = list_first_entry_or_null(&class->fullness_list[fg], + struct zspage, list); if (zspage) { - VM_BUG_ON(is_zspage_isolated(zspage)); - remove_zspage(class, zspage, fg[i]); + remove_zspage(class, zspage, fg); return zspage; } } @@ -1727,14 +1664,11 @@ static struct zspage *isolate_zspage(struct size_class *class, bool source) * @class: destination class * @zspage: target page * - * Return @zspage's fullness_group + * Return @zspage's fullness status */ -static enum fullness_group putback_zspage(struct size_class *class, - struct zspage *zspage) +static int putback_zspage(struct size_class *class, struct zspage *zspage) { - enum fullness_group fullness; - - VM_BUG_ON(is_zspage_isolated(zspage)); + int fullness; fullness = get_fullness_group(class, zspage); insert_zspage(class, zspage, fullness); @@ -1785,6 +1719,7 @@ static void lock_zspage(struct zspage *zspage) } migrate_read_unlock(zspage); } +#endif /* CONFIG_COMPACTION */ static int zs_init_fs_context(struct fs_context *fc) { @@ -1828,11 +1763,17 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) read_unlock(&zspage->lock); } +#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage) { write_lock(&zspage->lock); } +static void migrate_write_lock_nested(struct zspage *zspage) +{ + write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING); +} + static void migrate_write_unlock(struct zspage *zspage) { write_unlock(&zspage->lock); @@ -1846,35 +1787,10 @@ static void inc_zspage_isolation(struct zspage *zspage) static void dec_zspage_isolation(struct zspage *zspage) { + VM_BUG_ON(zspage->isolated == 0); zspage->isolated--; } -static void putback_zspage_deferred(struct zs_pool *pool, - struct size_class *class, - struct zspage *zspage) -{ - enum fullness_group fg; - - fg = putback_zspage(class, zspage); - if (fg == ZS_EMPTY) - schedule_work(&pool->free_work); - -} - -static inline void zs_pool_dec_isolated(struct zs_pool *pool) -{ - VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); - atomic_long_dec(&pool->isolated_pages); - /* - * Checking pool->destroying must happen after atomic_long_dec() - * for pool->isolated_pages above. Paired with the smp_mb() in - * zs_unregister_migration(). - */ - smp_mb__after_atomic(); - if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) - wake_up_all(&pool->migration_wait); -} - static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -1893,7 +1809,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, create_page_chain(class, zspage, pages); set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); - if (unlikely(PageHugeObject(oldpage))) + if (unlikely(ZsHugePage(zspage))) newpage->index = oldpage->index; __SetPageMovable(newpage, page_mapping(oldpage)); } @@ -1901,9 +1817,6 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { struct zs_pool *pool; - struct size_class *class; - int class_idx; - enum fullness_group fullness; struct zspage *zspage; struct address_space *mapping; @@ -1911,45 +1824,14 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) * Page is locked so zspage couldn't be destroyed. For detail, look at * lock_zspage in free_zspage. */ - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); zspage = get_zspage(page); - - /* - * Without class lock, fullness could be stale while class_idx is okay - * because class_idx is constant unless page is freed so we should get - * fullness again under class lock. - */ - get_zspage_mapping(zspage, &class_idx, &fullness); mapping = page_mapping(page); pool = mapping->private_data; - class = pool->size_class[class_idx]; - - spin_lock(&class->lock); - if (get_zspage_inuse(zspage) == 0) { - spin_unlock(&class->lock); - return false; - } - - /* zspage is isolated for object migration */ - if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { - spin_unlock(&class->lock); - return false; - } - - /* - * If this is first time isolation for the zspage, isolate zspage from - * size_class to prevent further object allocation from the zspage. - */ - if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { - get_zspage_mapping(zspage, &class_idx, &fullness); - atomic_long_inc(&pool->isolated_pages); - remove_zspage(class, zspage, fullness); - } - + spin_lock(&pool->lock); inc_zspage_isolation(zspage); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); return true; } @@ -1959,16 +1841,13 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, { struct zs_pool *pool; struct size_class *class; - int class_idx; - enum fullness_group fullness; struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; - int offset, pos; - unsigned long handle, head; + unsigned int offset; + unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; - int ret = -EAGAIN; /* * We cannot support the _NO_COPY case here, because copy needs to @@ -1978,38 +1857,23 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, if (mode == MIGRATE_SYNC_NO_COPY) return -EINVAL; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); - zspage = get_zspage(page); - - /* Concurrent compactor cannot migrate any subpage in zspage */ - migrate_write_lock(zspage); - get_zspage_mapping(zspage, &class_idx, &fullness); pool = mapping->private_data; - class = pool->size_class[class_idx]; + + /* + * The pool's lock protects the race between zpage migration + * and zs_free. + */ + spin_lock(&pool->lock); + zspage = get_zspage(page); + class = zspage_class(pool, zspage); + + /* the migrate_write_lock protects zpage access via zs_map_object */ + migrate_write_lock(zspage); + offset = get_first_obj_offset(page); - - spin_lock(&class->lock); - if (!get_zspage_inuse(zspage)) { - /* - * Set "offset" to end of the page so that every loops - * skips unnecessary object scanning. - */ - offset = PAGE_SIZE; - } - - pos = offset; s_addr = kmap_atomic(page); - while (pos < PAGE_SIZE) { - head = obj_to_head(page, s_addr + pos); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; - if (!trypin_tag(handle)) - goto unpin_objects; - } - pos += class->size; - } /* * Here, any user cannot access all objects in the zspage so let's move. @@ -2018,43 +1882,29 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, copy_page(d_addr, s_addr); kunmap_atomic(d_addr); - for (addr = s_addr + offset; addr < s_addr + pos; + for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { - head = obj_to_head(page, addr); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; - if (!testpin_tag(handle)) - BUG(); + if (obj_allocated(page, addr, &handle)) { old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); new_obj = (unsigned long)location_to_obj(newpage, obj_idx); - new_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, new_obj); } } + kunmap_atomic(s_addr); replace_sub_page(class, zspage, newpage, page); - get_page(newpage); - dec_zspage_isolation(zspage); - /* - * Page migration is done so let's putback isolated zspage to - * the list if @page is final isolated subpage in the zspage. + * Since we complete the data copy and set up new zspage structure, + * it's okay to release the pool's lock. */ - if (!is_zspage_isolated(zspage)) { - /* - * We cannot race with zs_destroy_pool() here because we wait - * for isolation to hit zero before we start destroying. - * Also, we ensure that everyone can see pool->destroying before - * we start waiting. - */ - putback_zspage_deferred(pool, class, zspage); - zs_pool_dec_isolated(pool); - } + spin_unlock(&pool->lock); + migrate_write_unlock(zspage); + get_page(newpage); if (page_zone(newpage) != page_zone(page)) { dec_zone_page_state(page, NR_ZSPAGES); inc_zone_page_state(newpage, NR_ZSPAGES); @@ -2062,56 +1912,24 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, reset_page(page); put_page(page); - page = newpage; - ret = MIGRATEPAGE_SUCCESS; -unpin_objects: - for (addr = s_addr + offset; addr < s_addr + pos; - addr += class->size) { - head = obj_to_head(page, addr); - if (head & OBJ_ALLOCATED_TAG) { - handle = head & ~OBJ_ALLOCATED_TAG; - if (!testpin_tag(handle)) - BUG(); - unpin_tag(handle); - } - } - kunmap_atomic(s_addr); - spin_unlock(&class->lock); - migrate_write_unlock(zspage); - - return ret; + return MIGRATEPAGE_SUCCESS; } static void zs_page_putback(struct page *page) { struct zs_pool *pool; - struct size_class *class; - int class_idx; - enum fullness_group fg; struct address_space *mapping; struct zspage *zspage; - VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); zspage = get_zspage(page); - get_zspage_mapping(zspage, &class_idx, &fg); mapping = page_mapping(page); pool = mapping->private_data; - class = pool->size_class[class_idx]; - - spin_lock(&class->lock); + spin_lock(&pool->lock); dec_zspage_isolation(zspage); - if (!is_zspage_isolated(zspage)) { - /* - * Due to page_lock, we cannot free zspage immediately - * so let's defer. - */ - putback_zspage_deferred(pool, class, zspage); - zs_pool_dec_isolated(pool); - } - spin_unlock(&class->lock); + spin_unlock(&pool->lock); } static const struct address_space_operations zsmalloc_aops = { @@ -2133,36 +1951,8 @@ static int zs_register_migration(struct zs_pool *pool) return 0; } -static bool pool_isolated_are_drained(struct zs_pool *pool) -{ - return atomic_long_read(&pool->isolated_pages) == 0; -} - -/* Function for resolving migration */ -static void wait_for_isolated_drain(struct zs_pool *pool) -{ - - /* - * We're in the process of destroying the pool, so there are no - * active allocations. zs_page_isolate() fails for completely free - * zspages, so we need only wait for the zs_pool's isolated - * count to hit zero. - */ - wait_event(pool->migration_wait, - pool_isolated_are_drained(pool)); -} - static void zs_unregister_migration(struct zs_pool *pool) { - pool->destroying = true; - /* - * We need a memory barrier here to ensure global visibility of - * pool->destroying. Thus pool->isolated pages will either be 0 in which - * case we don't care, or it will be > 0 and pool->destroying will - * ensure that we wake up once isolation hits 0. - */ - smp_mb(); - wait_for_isolated_drain(pool); /* This can block */ flush_work(&pool->free_work); iput(pool->inode); } @@ -2176,7 +1966,7 @@ static void async_free_zspage(struct work_struct *work) int i; struct size_class *class; unsigned int class_idx; - enum fullness_group fullness; + int fullness; struct zspage *zspage, *tmp; LIST_HEAD(free_pages); struct zs_pool *pool = container_of(work, struct zs_pool, @@ -2187,22 +1977,22 @@ static void async_free_zspage(struct work_struct *work) if (class->index != i) continue; - spin_lock(&class->lock); - list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages); - spin_unlock(&class->lock); + spin_lock(&pool->lock); + list_splice_init(&class->fullness_list[ZS_INUSE_RATIO_0], + &free_pages); + spin_unlock(&pool->lock); } - list_for_each_entry_safe(zspage, tmp, &free_pages, list) { list_del(&zspage->list); lock_zspage(zspage); get_zspage_mapping(zspage, &class_idx, &fullness); - VM_BUG_ON(fullness != ZS_EMPTY); + VM_BUG_ON(fullness != ZS_INUSE_RATIO_0); class = pool->size_class[class_idx]; - spin_lock(&class->lock); - __free_zspage(pool, pool->size_class[class_idx], zspage); - spin_unlock(&class->lock); + spin_lock(&pool->lock); + __free_zspage(pool, class, zspage); + spin_unlock(&pool->lock); } }; @@ -2236,8 +2026,8 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) static unsigned long zs_can_compact(struct size_class *class) { unsigned long obj_wasted; - unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); - unsigned long obj_used = zs_stat_get(class, OBJ_USED); + unsigned long obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED); + unsigned long obj_used = zs_stat_get(class, ZS_OBJS_INUSE); if (obj_allocated <= obj_used) return 0; @@ -2251,50 +2041,63 @@ static unsigned long zs_can_compact(struct size_class *class) static unsigned long __zs_compact(struct zs_pool *pool, struct size_class *class) { - struct zs_compact_control cc; - struct zspage *src_zspage; + struct zspage *src_zspage = NULL; struct zspage *dst_zspage = NULL; unsigned long pages_freed = 0; - spin_lock(&class->lock); - while ((src_zspage = isolate_zspage(class, true))) { + /* + * protect the race between zpage migration and zs_free + * as well as zpage allocation/free + */ + spin_lock(&pool->lock); + while (zs_can_compact(class)) { + int fg; - if (!zs_can_compact(class)) - break; - - cc.obj_idx = 0; - cc.s_page = get_first_page(src_zspage); - - while ((dst_zspage = isolate_zspage(class, false))) { - cc.d_page = get_first_page(dst_zspage); - /* - * If there is no more space in dst_page, resched - * and see if anyone had allocated another zspage. - */ - if (!migrate_zspage(pool, class, &cc)) + if (!dst_zspage) { + dst_zspage = isolate_dst_zspage(class); + if (!dst_zspage) break; - - putback_zspage(class, dst_zspage); + migrate_write_lock(dst_zspage); } - /* Stop if we couldn't find slot */ - if (dst_zspage == NULL) + src_zspage = isolate_src_zspage(class); + if (!src_zspage) break; - putback_zspage(class, dst_zspage); - if (putback_zspage(class, src_zspage) == ZS_EMPTY) { + migrate_write_lock_nested(src_zspage); + + migrate_zspage(pool, src_zspage, dst_zspage); + fg = putback_zspage(class, src_zspage); + migrate_write_unlock(src_zspage); + + if (fg == ZS_INUSE_RATIO_0) { free_zspage(pool, class, src_zspage); pages_freed += class->pages_per_zspage; } - spin_unlock(&class->lock); - cond_resched(); - spin_lock(&class->lock); + src_zspage = NULL; + + if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100 + || spin_is_contended(&pool->lock)) { + putback_zspage(class, dst_zspage); + migrate_write_unlock(dst_zspage); + dst_zspage = NULL; + + spin_unlock(&pool->lock); + cond_resched(); + spin_lock(&pool->lock); + } } - if (src_zspage) + if (src_zspage) { putback_zspage(class, src_zspage); + migrate_write_unlock(src_zspage); + } - spin_unlock(&class->lock); + if (dst_zspage) { + putback_zspage(class, dst_zspage); + migrate_write_unlock(dst_zspage); + } + spin_unlock(&pool->lock); return pages_freed; } @@ -2305,15 +2108,23 @@ unsigned long zs_compact(struct zs_pool *pool) struct size_class *class; unsigned long pages_freed = 0; + /* + * Pool compaction is performed under pool->lock so it is basically + * single-threaded. Having more than one thread in __zs_compact() + * will increase pool->lock contention, which will impact other + * zsmalloc operations that need pool->lock. + */ + if (atomic_xchg(&pool->compaction_in_progress, 1)) + return 0; + for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; - if (!class) - continue; if (class->index != i) continue; pages_freed += __zs_compact(pool, class); } atomic_long_add(pages_freed, &pool->stats.pages_compacted); + atomic_set(&pool->compaction_in_progress, 0); return pages_freed; } @@ -2360,8 +2171,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; - if (!class) - continue; if (class->index != i) continue; @@ -2391,6 +2200,27 @@ static int zs_register_shrinker(struct zs_pool *pool) return register_shrinker(&pool->shrinker); } +static int calculate_zspage_chain_size(int class_size) +{ + int i, min_waste = INT_MAX; + int chain_size = 1; + + if (is_power_of_2(class_size)) + return chain_size; + + for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { + int waste; + + waste = (i * PAGE_SIZE) % class_size; + if (waste < min_waste) { + min_waste = waste; + chain_size = i; + } + } + + return chain_size; +} + /** * zs_create_pool - Creates an allocation pool to work from. * @name: pool name to be created @@ -2412,15 +2242,13 @@ struct zs_pool *zs_create_pool(const char *name) return NULL; init_deferred_free(pool); + spin_lock_init(&pool->lock); + atomic_set(&pool->compaction_in_progress, 0); pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) goto err; -#ifdef CONFIG_COMPACTION - init_waitqueue_head(&pool->migration_wait); -#endif - if (create_cache(pool)) goto err; @@ -2433,12 +2261,12 @@ struct zs_pool *zs_create_pool(const char *name) int pages_per_zspage; int objs_per_zspage; struct size_class *class; - int fullness = 0; + int fullness; size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; if (size > ZS_MAX_ALLOC_SIZE) size = ZS_MAX_ALLOC_SIZE; - pages_per_zspage = get_pages_per_zspage(size); + pages_per_zspage = calculate_zspage_chain_size(size); objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; /* @@ -2486,11 +2314,13 @@ struct zs_pool *zs_create_pool(const char *name) class->index = i; class->pages_per_zspage = pages_per_zspage; class->objs_per_zspage = objs_per_zspage; - spin_lock_init(&class->lock); pool->size_class[i] = class; - for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; - fullness++) + + fullness = ZS_INUSE_RATIO_0; + while (fullness < NR_FULLNESS_GROUPS) { INIT_LIST_HEAD(&class->fullness_list[fullness]); + fullness++; + } prev_class = class; } @@ -2535,11 +2365,12 @@ void zs_destroy_pool(struct zs_pool *pool) if (class->index != i) continue; - for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) { - if (!list_empty(&class->fullness_list[fg])) { - pr_info("Freeing non-empty class with size %db, fullness group %d\n", - class->size, fg); - } + for (fg = ZS_INUSE_RATIO_0; fg < NR_FULLNESS_GROUPS; fg++) { + if (list_empty(&class->fullness_list[fg])) + continue; + + pr_err("Class-%d fullness group %d is not empty\n", + class->size, fg); } kfree(class); }