• Robert Ho's avatar
    mm, proc: fix region lost in /proc/self/smaps · 855af072
    Robert Ho authored
    Recently, Red Hat reported that the nvml test suite failed on QEMU/KVM;
    for more details, please refer to:
    
       https://bugzilla.redhat.com/show_bug.cgi?id=1365721
    
    Actually, this bug is not specific to NVDIMM/DAX; it affects any other
    file system as well.  This simple test case, abstracted from nvml, can
    easily reproduce the bug in a common environment:
    
    -------------------------- testcase.c -----------------------------
    
    /*
     * is_pmem_proc -- check the range [addr, addr + len) against /proc/self/smaps
     *
     * Walks the smaps file line by line.  Each "lo-hi" range line that contains
     * the current position consumes part of the requested length; the VmFlags
     * line that follows such a range must contain " wr".  Progress is reported
     * with printf().
     *
     * Returns 1 once the whole length has been matched and the last matching
     * range carried the flag; returns 0 otherwise, including when the smaps
     * file cannot be opened.
     */
    int
    is_pmem_proc(const void *addr, size_t len)
    {
            static const char flags_tag[] = "VmFlags:";
            static const char wanted_flag[] = " wr";
            const size_t tag_len = sizeof(flags_tag) - 1;
    
            const char *cursor = addr;      /* first byte not yet matched */
    
            FILE *smaps = fopen("/proc/self/smaps", "r");
            if (smaps == NULL) {
                    printf("!/proc/self/smaps");
                    return 0;
            }
    
            int found = 0;                  /* result; stays 0 unless proven */
            int flag_pending = 0;           /* a range matched, flag not seen yet */
            char *range_lo = NULL;          /* start of the range just parsed */
            char *range_hi = NULL;          /* end of the range just parsed */
            char buf[PROCMAXLEN];           /* one line of smaps */
    
            while (fgets(buf, PROCMAXLEN, smaps) != NULL) {
                    if (sscanf(buf, "%p-%p", &range_lo, &range_hi) != 2) {
                            /* not a range line: only VmFlags of a match matters */
                            if (!flag_pending ||
                                strncmp(buf, flags_tag, tag_len) != 0)
                                    continue;
    
                            if (strstr(buf + tag_len, wanted_flag) == NULL) {
                                    /* flag missing for some or all of the range */
                                    printf("range has no mm flag.\n");
                                    break;
                            }
    
                            printf("mm flag found.\n");
                            if (len == 0) {
                                    /* nothing left to match: success */
                                    found = 1;
                                    break;
                            }
                            flag_pending = 0;
                            continue;
                    }
    
                    /* range line */
                    if (flag_pending) {
                            /* previous match never showed its VmFlags line */
                            printf("never found mm flag.\n");
                            break;
                    }
                    if (cursor < range_lo) {
                            /* ranges have moved past the address: no match */
                            printf("#######no match for addr %p.\n", cursor);
                            break;
                    }
                    if (cursor >= range_hi)
                            continue;       /* range lies below the address */
    
                    /* the current position falls inside this range */
                    size_t in_range = (size_t)(range_hi - cursor);
                    flag_pending = 1;
    
                    if (len <= in_range) {
                            len = 0;
                            printf("matched all bytes in range "
                                            "%p-%p.\n", range_lo, range_hi);
                            continue;
                    }
    
                    /* part of the request spills into a later range */
                    len -= in_range;
                    cursor += in_range;
                    printf("matched %zu bytes in range "
                            "%p-%p, %zu left over.\n",
                                    in_range, range_lo, range_hi, len);
            }
    
            fclose(smaps);
    
            printf("returning %d.\n", found);
            return found;
    }
    
    /* start of the file mapping under test; assigned from mmap() in main() */
    void *Addr;
    /* length of that mapping in bytes; assigned from stbuf.st_size in main() */
    size_t Size;
    
    /*
     * worker -- thread entry point: run the smaps lookup on the shared mapping
     *
     * arg points at the int slot that receives is_pmem_proc()'s result for
     * this thread; the thread itself always returns NULL.
     */
    static void *
    worker(void *arg)
    {
            int *result_slot = arg;
    
            result_slot[0] = is_pmem_proc(Addr, Size);
            return NULL;
    }
    
    /*
     * main -- map the file named by argv[1] and let NTHREAD threads look the
     * mapping up in /proc/self/smaps concurrently
     *
     * Prints the first thread's result and returns 0 when the test ran.
     * Returns -1 on a usage error or when open(), fstat(), mmap() or
     * pthread_create() fails, so the workers never see an invalid mapping
     * and no uninitialized result is compared.
     */
    int main(int argc, char *argv[])
    {
            if (argc <  2 || argc > 3) {
                    printf("usage: %s file [env].\n", argv[0]);
                    return -1;
            }
    
            int fd = open(argv[1], O_RDWR);
            if (fd < 0) {
                    perror("open");
                    return -1;
            }
    
            struct stat stbuf;
            if (fstat(fd, &stbuf) != 0) {
                    perror("fstat");
                    close(fd);
                    return -1;
            }
    
            Size = stbuf.st_size;
            Addr = mmap(0, stbuf.st_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
            if (Addr == MAP_FAILED) {
                    /* report before close() gets a chance to change errno */
                    perror("mmap");
                    close(fd);
                    return -1;
            }
    
            /* the descriptor is no longer used once the mapping exists */
            close(fd);
    
            pthread_t threads[NTHREAD];
            int ret[NTHREAD];
            int nstarted = 0;       /* threads that were actually created */
            int status = 0;
    
            /* kick off NTHREAD threads */
            for (int i = 0; i < NTHREAD; i++) {
                    int err = pthread_create(&threads[i], NULL, worker, &ret[i]);
                    if (err != 0) {
                            printf("pthread_create failed: error %d.\n", err);
                            status = -1;
                            break;
                    }
                    nstarted++;
            }
    
            /* wait for every thread that was started */
            for (int i = 0; i < nstarted; i++)
                    pthread_join(threads[i], NULL);
    
            /* without all NTHREAD results the comparison below is meaningless */
            if (status != 0)
                    return status;
    
            /* verify that all the threads return the same value */
            for (int i = 1; i < NTHREAD; i++) {
                    if (ret[0] != ret[i]) {
                            printf("Error i %d ret[0] = %d ret[i] = %d.\n", i,
                                    ret[0], ret[i]);
                    }
            }
    
            printf("%d", ret[0]);
            return 0;
    }
    
    It failed because some threads cannot find the memory region in
    "/proc/self/smaps" that was allocated in the main process.
    
    This is caused by proc fs, which uses 'file->version' to record the last
    VMA that has already been handled by the read() system call.  When the
    next read() is issued, it uses 'version' to find that VMA, and the VMA
    following it is the one we want to handle next.  The related code is as
    follows:
    
            if (last_addr) {
                    vma = find_vma(mm, last_addr);
                    if (vma && (vma = m_next_vma(priv, vma)))
                            return vma;
            }
    
    However, a VMA will be lost if the last handled VMA is gone, e.g.:
    
    The process VMA list is A->B->C->D
    
    CPU 0                                  CPU 1
    read() system call
       handle VMA B
       version = B
    return to userspace
    
                                       unmap VMA B
    
    issue read() again to continue to get
    the region info
       find_vma(version) will get VMA C
       m_next_vma(C) will get VMA D
       handle D
       !!! VMA C is lost !!!
    
    In order to fix this bug, we make 'file->version' indicate the end address
    of the current VMA.  m_start will then look up a vma with vma_start
    < last_vm_end and move on to the next vma if it finds the same or an
    overlapping vma.  This guarantees that we will not miss an exclusive
    vma, but we can still miss one if the previous vma was shrunk.  This is
    acceptable because guaranteeing "never miss a vma" is simply not feasible.
    The user has to cope with some inconsistencies if the file is not read in
    one go.
    
    [mhocko@suse.com: changelog fixes]
    Link: http://lkml.kernel.org/r/1475296958-27652-1-git-send-email-robert.hu@intel.com
    Acked-by: Dave Hansen <dave.hansen@intel.com>
    Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
    Signed-off-by: Robert Hu <robert.hu@intel.com>
    Acked-by: Michal Hocko <mhocko@suse.com>
    Acked-by: Oleg Nesterov <oleg@redhat.com>
    Cc: Paolo Bonzini <pbonzini@redhat.com>
    Cc: Dan Williams <dan.j.williams@intel.com>
    Cc: Gleb Natapov <gleb@kernel.org>
    Cc: Marcelo Tosatti <mtosatti@redhat.com>
    Cc: Stefan Hajnoczi <stefanha@redhat.com>
    Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
    855af072
Name
Last commit
Last update
..
9p Loading commit data...
adfs Loading commit data...
affs Loading commit data...
afs Loading commit data...
autofs4 Loading commit data...
befs Loading commit data...
bfs Loading commit data...
btrfs Loading commit data...
cachefiles Loading commit data...
ceph Loading commit data...
cifs Loading commit data...
coda Loading commit data...
configfs Loading commit data...
cramfs Loading commit data...
crypto Loading commit data...
debugfs Loading commit data...
devpts Loading commit data...
dlm Loading commit data...
ecryptfs Loading commit data...
efivarfs Loading commit data...
efs Loading commit data...
exofs Loading commit data...
exportfs Loading commit data...
ext2 Loading commit data...
ext4 Loading commit data...
f2fs Loading commit data...
fat Loading commit data...
freevxfs Loading commit data...
fscache Loading commit data...
fuse Loading commit data...
gfs2 Loading commit data...
hfs Loading commit data...
hfsplus Loading commit data...
hostfs Loading commit data...
hpfs Loading commit data...
hugetlbfs Loading commit data...
isofs Loading commit data...
jbd2 Loading commit data...
jffs2 Loading commit data...
jfs Loading commit data...
kernfs Loading commit data...
lockd Loading commit data...
logfs Loading commit data...
minix Loading commit data...
ncpfs Loading commit data...
nfs Loading commit data...
nfs_common Loading commit data...
nfsd Loading commit data...
nilfs2 Loading commit data...
nls Loading commit data...
notify Loading commit data...
ntfs Loading commit data...
ocfs2 Loading commit data...
omfs Loading commit data...
openpromfs Loading commit data...
orangefs Loading commit data...
overlayfs Loading commit data...
proc Loading commit data...
pstore Loading commit data...
qnx4 Loading commit data...
qnx6 Loading commit data...
quota Loading commit data...
ramfs Loading commit data...
reiserfs Loading commit data...
romfs Loading commit data...
squashfs Loading commit data...
sysfs Loading commit data...
sysv Loading commit data...
tracefs Loading commit data...
ubifs Loading commit data...
udf Loading commit data...
ufs Loading commit data...
xfs Loading commit data...
Kconfig Loading commit data...
Kconfig.binfmt Loading commit data...
Makefile Loading commit data...
aio.c Loading commit data...
anon_inodes.c Loading commit data...
attr.c Loading commit data...
bad_inode.c Loading commit data...
binfmt_aout.c Loading commit data...
binfmt_elf.c Loading commit data...
binfmt_elf_fdpic.c Loading commit data...
binfmt_em86.c Loading commit data...
binfmt_flat.c Loading commit data...
binfmt_misc.c Loading commit data...
binfmt_script.c Loading commit data...
block_dev.c Loading commit data...
buffer.c Loading commit data...
char_dev.c Loading commit data...
compat.c Loading commit data...
compat_binfmt_elf.c Loading commit data...
compat_ioctl.c Loading commit data...
coredump.c Loading commit data...
dax.c Loading commit data...
dcache.c Loading commit data...
dcookies.c Loading commit data...
direct-io.c Loading commit data...
drop_caches.c Loading commit data...
eventfd.c Loading commit data...
eventpoll.c Loading commit data...
exec.c Loading commit data...
fcntl.c Loading commit data...
fhandle.c Loading commit data...
file.c Loading commit data...
file_table.c Loading commit data...
filesystems.c Loading commit data...
fs-writeback.c Loading commit data...
fs_pin.c Loading commit data...
fs_struct.c Loading commit data...
inode.c Loading commit data...
internal.h Loading commit data...
ioctl.c Loading commit data...
iomap.c Loading commit data...
libfs.c Loading commit data...
locks.c Loading commit data...
mbcache.c Loading commit data...
mount.h Loading commit data...
mpage.c Loading commit data...
namei.c Loading commit data...
namespace.c Loading commit data...
no-block.c Loading commit data...
nsfs.c Loading commit data...
open.c Loading commit data...
pipe.c Loading commit data...
pnode.c Loading commit data...
pnode.h Loading commit data...
posix_acl.c Loading commit data...
proc_namespace.c Loading commit data...
read_write.c Loading commit data...
readdir.c Loading commit data...
select.c Loading commit data...
seq_file.c Loading commit data...
signalfd.c Loading commit data...
splice.c Loading commit data...
stack.c Loading commit data...
stat.c Loading commit data...
statfs.c Loading commit data...
super.c Loading commit data...
sync.c Loading commit data...
timerfd.c Loading commit data...
userfaultfd.c Loading commit data...
utimes.c Loading commit data...
xattr.c Loading commit data...