Commit 7e2ab150 authored by Christoph Lameter, committed by Linus Torvalds

[PATCH] Direct Migration V9: upgrade MPOL_MF_MOVE and sys_migrate_pages()

Modify policy layer to support direct page migration

- Add migrate_pages_to() allowing the migration of a list of pages to a
  specified node or to vma with a specific allocation policy in sets of
  MIGRATE_CHUNK_SIZE pages

- Modify do_migrate_pages() to do a staged move of pages from the source
  nodes to the target nodes.
Signed-off-by: Paul Jackson <>
Signed-off-by: Christoph Lameter <>
Signed-off-by: Andrew Morton <>
Signed-off-by: Linus Torvalds <>
parent a3351e52
......@@ -95,6 +95,9 @@
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
/* The number of pages to migrate per call to migrate_pages() */
static kmem_cache_t *policy_cache;
static kmem_cache_t *sn_cache;
......@@ -543,24 +546,91 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
static int swap_pages(struct list_head *pagelist)
* Migrate the list 'pagelist' of pages to a certain destination.
* Specify destination with either non-NULL vma or dest_node >= 0
* Return the number of pages not migrated or error code
static int migrate_pages_to(struct list_head *pagelist,
struct vm_area_struct *vma, int dest)
int n;
int err = 0;
int nr_pages;
struct page *page;
struct list_head *p;
n = migrate_pages(pagelist, NULL, &moved, &failed);
nr_pages = 0;
list_for_each(p, pagelist) {
if (vma)
page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
return n;
if (!page) {
err = -ENOMEM;
goto out;
list_add(&page->lru, &newlist);
if (nr_pages > MIGRATE_CHUNK_SIZE);
err = migrate_pages(pagelist, &newlist, &moved, &failed);
putback_lru_pages(&moved); /* Call release pages instead ?? */
if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
goto redo;
/* Return leftover allocated pages */
while (!list_empty(&newlist)) {
page = list_entry(, struct page, lru);
list_splice(&failed, pagelist);
if (err < 0)
return err;
/* Calculate number of leftover pages */
nr_pages = 0;
list_for_each(p, pagelist)
return nr_pages;
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
nodemask_t nmask;
int err = 0;
node_set(source, nmask);
check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
err = migrate_pages_to(&pagelist, NULL, dest);
if (!list_empty(&pagelist))
return err;
* For now migrate_pages simply swaps out the pages from nodes that are in
* the source set but not in the target set. In the future, we would
* want a function that moves pages between the two nodesets in such
* a way as to preserve the physical layout as much as possible.
* Move pages between the two nodesets so as to preserve the physical
* layout as much as possible.
* Returns the number of pages that could not be moved.
......@@ -568,22 +638,76 @@ int do_migrate_pages(struct mm_struct *mm,
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
int count = 0;
nodemask_t nodes;
int busy = 0;
int err = 0;
nodemask_t tmp;
nodes_andnot(nodes, *from_nodes, *to_nodes);
check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
* bit in 'tmp', and return that <source, dest> pair for migration.
* The pair of nodemasks 'to' and 'from' define the map.
* If no pair of bits is found that way, fallback to picking some
* pair of 'source' and 'dest' bits that are not the same. If the
* 'source' and 'dest' bits are the same, this represents a node
* that will be migrating to itself, so no pages need move.
* If no bits are left in 'tmp', or if all remaining bits left
* in 'tmp' correspond to the same bit in 'to', return false
* (nothing left to migrate).
* This lets us pick a pair of nodes to migrate between, such that
* if possible the dest node is not already occupied by some other
* source node, minimizing the risk of overloading the memory on a
* node that would happen if we migrated incoming memory to a node
* before migrating outgoing memory from that same node.
* A single scan of tmp is sufficient. As we go, we remember the
* most recent <s, d> pair that moved (s != d). If we find a pair
* that not only moved, but what's better, moved to an empty slot
* (d is not set in tmp), then we break out then, with that pair.
* Otherwise when we finish scanning tmp, we at least have the
* most recent <s, d> pair that moved. If we get all the way through
* the scan of tmp without finding any node that moved, much less
* moved to an empty node, then there is nothing left worth migrating.
if (!list_empty(&pagelist)) {
count = swap_pages(&pagelist);
tmp = *from_nodes;
while (!nodes_empty(tmp)) {
int s,d;
int source = -1;
int dest = 0;
for_each_node_mask(s, tmp) {
d = node_remap(s, *from_nodes, *to_nodes);
if (s == d)
source = s; /* Node moved. Memorize */
dest = d;
/* dest not in remaining from nodes? */
if (!node_isset(dest, tmp))
if (source == -1)
node_clear(source, tmp);
err = migrate_to_node(mm, source, dest, flags);
if (err > 0)
busy += err;
if (err < 0)
return count;
if (err < 0)
return err;
return busy;
long do_mbind(unsigned long start, unsigned long len,
......@@ -643,8 +767,9 @@ long do_mbind(unsigned long start, unsigned long len,
int nr_failed = 0;
err = mbind_range(vma, start, end, new);
if (!list_empty(&pagelist))
nr_failed = swap_pages(&pagelist);
nr_failed = migrate_pages_to(&pagelist, vma, -1);
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
