Commit 99ccbd22 authored by Milosz Tanski

ceph: use fscache as a local persistent cache

Add support for fscache to the Ceph filesystem. This brings it on par
with some of the other network filesystems in Linux (such as NFS and AFS).

In order to mount the filesystem with fscache the 'fsc' mount option must be
passed.
Signed-off-by: Milosz Tanski <milosz@adfin.com>
Signed-off-by: Sage Weil <sage@inktank.com>
parent cd0a2df6
......@@ -16,3 +16,12 @@ config CEPH_FS
If unsure, say N.
if CEPH_FS
config CEPH_FSCACHE
bool "Enable Ceph client caching support"
depends on CEPH_FS=m && FSCACHE || CEPH_FS=y && FSCACHE=y
help
Choose Y here to enable persistent, read-only local
caching support for Ceph clients using FS-Cache
endif
......@@ -9,3 +9,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
mds_client.o mdsmap.o strings.o ceph_frag.o \
debugfs.o
ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
......@@ -11,6 +11,7 @@
#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include <linux/ceph/osd_client.h>
/*
......@@ -144,6 +145,11 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
return;
}
ceph_invalidate_fscache_page(inode, page);
if (!PagePrivate(page))
return;
/*
* We can get non-dirty pages here due to races between
* set_page_dirty and truncate_complete_page; just spit out a
......@@ -163,14 +169,17 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
ClearPagePrivate(page);
}
/* just a sanity check */
static int ceph_releasepage(struct page *page, gfp_t g)
{
struct inode *inode = page->mapping ? page->mapping->host : NULL;
dout("%p releasepage %p idx %lu\n", inode, page, page->index);
WARN_ON(PageDirty(page));
WARN_ON(PagePrivate(page));
return 0;
/* Can we release the page from the cache? */
if (!ceph_release_fscache_page(page, g))
return 0;
return !PagePrivate(page);
}
/*
......@@ -180,11 +189,16 @@ static int readpage_nounlock(struct file *filp, struct page *page)
{
struct inode *inode = file_inode(filp);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_client *osdc =
struct ceph_osd_client *osdc =
&ceph_inode_to_client(inode)->client->osdc;
int err = 0;
u64 len = PAGE_CACHE_SIZE;
err = ceph_readpage_from_fscache(inode, page);
if (err == 0)
goto out;
dout("readpage inode %p file %p page %p index %lu\n",
inode, filp, page, page->index);
err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
......@@ -202,6 +216,9 @@ static int readpage_nounlock(struct file *filp, struct page *page)
}
SetPageUptodate(page);
if (err == 0)
ceph_readpage_to_fscache(inode, page);
out:
return err < 0 ? err : 0;
}
......@@ -244,6 +261,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
page->index);
flush_dcache_page(page);
SetPageUptodate(page);
ceph_readpage_to_fscache(inode, page);
unlock_page(page);
page_cache_release(page);
bytes -= PAGE_CACHE_SIZE;
......@@ -313,7 +331,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
page = list_entry(page_list->prev, struct page, lru);
BUG_ON(PageLocked(page));
list_del(&page->lru);
dout("start_read %p adding %p idx %lu\n", inode, page,
page->index);
if (add_to_page_cache_lru(page, &inode->i_data, page->index,
......@@ -360,6 +378,12 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
int rc = 0;
int max = 0;
rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
&nr_pages);
if (rc == 0)
goto out;
if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
>> PAGE_SHIFT;
......@@ -479,6 +503,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
ceph_readpage_to_fscache(inode, page);
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
&ci->i_layout, snapc,
......@@ -534,7 +560,6 @@ static void ceph_release_pages(struct page **pages, int num)
pagevec_release(&pvec);
}
/*
* async writeback completion handler.
*
......
/*
* Ceph cache definitions.
*
* Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
* Written by Milosz Tanski (milosz@adfin.com)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to:
* Free Software Foundation
* 51 Franklin Street, Fifth Floor
* Boston, MA 02111-1301 USA
*
*/
#include <linux/fscache.h>
#include "super.h"
#include "cache.h"
/*
 * Auxiliary data recorded for each cached inode.  It is filled from the
 * inode's mtime and size by ceph_fscache_inode_get_aux() and compared
 * byte-for-byte (memcmp) by ceph_fscache_inode_check_aux(), so the field
 * order and layout must not change.
 */
struct ceph_aux_inode {
/* copy of inode->i_mtime */
struct timespec mtime;
/* copy of inode->i_size */
loff_t size;
};
/*
 * Netfs descriptor handed to fscache_register_netfs() and
 * fscache_unregister_netfs(); its primary_index is the parent of the
 * per-filesystem cookie acquired in ceph_fscache_register_fs().
 */
struct fscache_netfs ceph_cache_netfs = {
.name = "ceph",
.version = 0,
};
/*
 * Key callback for the per-filesystem index cookie.
 *
 * Copies the client's fsid into @buffer and returns the number of bytes
 * written, or 0 when the fsid does not fit in @maxbuf bytes.
 */
static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
					     void *buffer, uint16_t maxbuf)
{
	const struct ceph_fs_client *fs_client = cookie_netfs_data;
	const uint16_t fsid_len = sizeof(fs_client->client->fsid);

	if (fsid_len <= maxbuf) {
		memcpy(buffer, &fs_client->client->fsid, fsid_len);
		return fsid_len;
	}

	return 0;
}
/*
 * Cookie definition for the per-filesystem index, keyed by fsid via
 * ceph_fscache_session_get_key().
 */
static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
.name = "CEPH.fsid",
.type = FSCACHE_COOKIE_TYPE_INDEX,
.get_key = ceph_fscache_session_get_key,
};
/*
 * Register the "ceph" netfs with FS-Cache.
 *
 * Returns whatever fscache_register_netfs() returns.
 */
int ceph_fscache_register(void)
{
	return fscache_register_netfs(&ceph_cache_netfs);
}
/* Unregister the "ceph" netfs from FS-Cache. */
void ceph_fscache_unregister(void)
{
	fscache_unregister_netfs(&ceph_cache_netfs);
}
/*
 * Acquire the per-filesystem fscache index cookie (keyed by fsid) and
 * create the workqueue used to revalidate inode cookies.
 *
 * Failing to obtain a cookie is not treated as an error: a message is
 * logged and 0 is returned with fsc->fscache left NULL (the workqueue is
 * not created in that case).  Returns -ENOMEM if the workqueue cannot be
 * allocated, 0 otherwise.
 */
int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
{
	fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
					      &ceph_fscache_fsid_object_def,
					      fsc);

	if (fsc->fscache == NULL) {
		pr_err("Unable to register fsid: %p fscache cookie\n", fsc);
		return 0;
	}

	fsc->revalidate_wq = alloc_workqueue("ceph-revalidate", 0, 1);
	if (fsc->revalidate_wq == NULL)
		return -ENOMEM;

	return 0;
}
/*
 * Key callback for inode cookies.
 *
 * The key is the ceph virtual inode (ci->i_vino).  Returns the number of
 * bytes copied into @buffer, or 0 when the key does not fit in @maxbuf.
 */
static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
					   void *buffer, uint16_t maxbuf)
{
	const struct ceph_inode_info *info = cookie_netfs_data;
	const uint16_t vino_len = sizeof(info->i_vino);

	if (vino_len > maxbuf)
		return 0;

	memcpy(buffer, &info->i_vino, vino_len);
	return vino_len;
}
/*
 * Auxiliary-data callback for inode cookies.
 *
 * Writes a struct ceph_aux_inode (mtime + size of the inode) into @buffer
 * and returns the number of bytes written.  If @bufmax is too small to
 * hold the structure nothing is written and 0 is returned, mirroring the
 * key callbacks in this file.
 */
static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
					   void *buffer, uint16_t bufmax)
{
	struct ceph_aux_inode aux;
	const struct ceph_inode_info* ci = cookie_netfs_data;
	const struct inode* inode = &ci->vfs_inode;

	/* Never write past the buffer supplied by the cache. */
	if (sizeof(aux) > bufmax)
		return 0;

	/* Zero first so padding bytes compare equal in check_aux(). */
	memset(&aux, 0, sizeof(aux));
	aux.mtime = inode->i_mtime;
	aux.size = inode->i_size;

	memcpy(buffer, &aux, sizeof(aux));

	return sizeof(aux);
}
/* Attribute callback for inode cookies: report the inode's i_size in *@size. */
static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
					uint64_t *size)
{
	const struct ceph_inode_info *info = cookie_netfs_data;

	*size = info->vfs_inode.i_size;
}
/*
 * Validate the auxiliary data stored in the cache against the inode.
 *
 * Rebuilds a struct ceph_aux_inode from the inode's current mtime and size
 * and compares it byte-for-byte with @data.  Returns
 * FSCACHE_CHECKAUX_OBSOLETE on a length or content mismatch and
 * FSCACHE_CHECKAUX_OKAY otherwise.
 */
static enum fscache_checkaux ceph_fscache_inode_check_aux(
	void *cookie_netfs_data, const void *data, uint16_t dlen)
{
	struct ceph_aux_inode aux;
	struct ceph_inode_info* ci = cookie_netfs_data;
	struct inode* inode = &ci->vfs_inode;

	if (dlen != sizeof(aux))
		return FSCACHE_CHECKAUX_OBSOLETE;

	/* Zero first so padding bytes match what get_aux() stored. */
	memset(&aux, 0, sizeof(aux));
	aux.mtime = inode->i_mtime;
	aux.size = inode->i_size;

	if (memcmp(data, &aux, sizeof(aux)) != 0)
		return FSCACHE_CHECKAUX_OBSOLETE;

	dout("ceph inode 0x%p cached okay\n", ci);
	return FSCACHE_CHECKAUX_OKAY;
}
/*
 * "Now uncached" callback for inode cookies.
 *
 * Walks every page currently in the inode's mapping, one pagevec at a
 * time, and clears the PG_fscache flag on each of them.
 */
static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data)
{
	struct ceph_inode_info* ci = cookie_netfs_data;
	struct pagevec pvec;
	pgoff_t first;
	int loop, nr_pages;

	pagevec_init(&pvec, 0);
	first = 0;

	dout("ceph inode 0x%p now uncached\n", ci);

	while (1) {
		nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first,
					  PAGEVEC_SIZE - pagevec_count(&pvec));

		if (!nr_pages)
			break;

		for (loop = 0; loop < nr_pages; loop++)
			ClearPageFsCache(pvec.pages[loop]);

		/* Resume the next lookup just past the last page seen. */
		first = pvec.pages[nr_pages - 1]->index + 1;

		pvec.nr = nr_pages;
		pagevec_release(&pvec);
		cond_resched();
	}
}
/*
 * Cookie definition for per-inode data files; wires up the key, attribute,
 * auxiliary-data and now-uncached callbacks defined above.
 */
static const struct fscache_cookie_def ceph_fscache_inode_object_def = {
.name = "CEPH.inode",
.type = FSCACHE_COOKIE_TYPE_DATAFILE,
.get_key = ceph_fscache_inode_get_key,
.get_attr = ceph_fscache_inode_get_attr,
.get_aux = ceph_fscache_inode_get_aux,
.check_aux = ceph_fscache_inode_check_aux,
.now_uncached = ceph_fscache_inode_now_uncached,
};
/*
 * Acquire an fscache cookie for @ci under the filesystem's index cookie.
 *
 * Does nothing when the filesystem has no fscache cookie, when the inode
 * is not a regular file, or when the inode already has a cookie.  The
 * inode's i_mutex serializes concurrent callers so that only one cookie
 * is ever acquired per inode.
 */
void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
					struct ceph_inode_info* ci)
{
	struct inode* inode = &ci->vfs_inode;

	/* No caching for filesystem */
	if (fsc->fscache == NULL)
		return;

	/*
	 * Only cache regular files.  S_ISREG() compares the whole file-type
	 * field; testing the S_IFREG bit alone would also accept symlinks
	 * and sockets, whose type values share that bit.
	 */
	if (!S_ISREG(inode->i_mode))
		return;

	/* Avoid multiple racing open requests */
	mutex_lock(&inode->i_mutex);
	if (!ci->fscache)
		ci->fscache = fscache_acquire_cookie(fsc->fscache,
					&ceph_fscache_inode_object_def,
					ci);
	mutex_unlock(&inode->i_mutex);
}
/*
 * Drop the inode's fscache cookie, if it has one.
 *
 * ci->fscache is cleared before the inode's pages are uncached and the
 * cookie is relinquished (without retiring it).
 */
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
{
	struct fscache_cookie *old = ci->fscache;

	if (old == NULL)
		return;

	ci->fscache = NULL;

	fscache_uncache_all_inode_pages(old, &ci->vfs_inode);
	fscache_relinquish_cookie(old, 0);
}
/*
 * Completion callback for single-page cache reads: mark the page up to
 * date on success.  The page is left locked.
 */
static void ceph_vfs_readpage_complete(struct page *page, void *data, int error)
{
	if (error)
		return;

	SetPageUptodate(page);
}
/*
 * Completion callback for multi-page cache reads: mark the page up to
 * date on success and unlock it in every case.
 */
static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error)
{
	if (error == 0)
		SetPageUptodate(page);

	unlock_page(page);
}
/*
 * Return non-zero when the inode's fscache contents may be used: the
 * FILE_CACHE cap is currently issued and the fscache generation matches
 * the read-cache generation.
 */
static inline int cache_valid(struct ceph_inode_info *ci)
{
	if (!(ceph_caps_issued(ci) & CEPH_CAP_FILE_CACHE))
		return 0;

	return ci->i_fscache_gen == ci->i_rdcache_gen;
}
/*
 * Attempt to read a single page from the fscache.
 *
 * This function is called from the readpage_nounlock context. DO NOT attempt
 * to unlock the page here (or in the callback).
 *
 * Returns 0 when a read was submitted to the cache, -ENOBUFS when the
 * cache is not valid for this inode, or the error reported by
 * fscache_read_or_alloc_page().
 */
int ceph_readpage_from_fscache(struct inode *inode, struct page *page)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int ret;

	if (!cache_valid(ci))
		return -ENOBUFS;

	ret = fscache_read_or_alloc_page(ci->fscache, page,
					 ceph_vfs_readpage_complete, NULL,
					 GFP_KERNEL);

	if (ret == 0)
		/* Page found */
		dout("page read submitted\n");
	else if (ret == -ENOBUFS || ret == -ENODATA)
		/* Page was not found (and, for -ENOBUFS, cannot be) */
		dout("page/inode not in cache\n");
	else
		dout("%s: unknown error ret = %i\n", __func__, ret);

	return ret;
}
/*
 * Attempt to read a list of pages from the fscache.
 *
 * Returns 0 when reads for all pages were submitted, -ENOBUFS when the
 * cache is not valid for this inode, or the error reported by
 * fscache_read_or_alloc_pages().  Pages handled by the cache are unlocked
 * by the completion callback.
 */
int ceph_readpages_from_fscache(struct inode *inode,
				struct address_space *mapping,
				struct list_head *pages,
				unsigned *nr_pages)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int ret;

	if (!cache_valid(ci))
		return -ENOBUFS;

	ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages,
					  ceph_vfs_readpage_complete_unlock,
					  NULL, mapping_gfp_mask(mapping));

	if (ret == 0)
		/* All pages found */
		dout("all-page read submitted\n");
	else if (ret == -ENOBUFS || ret == -ENODATA)
		/* Some pages were not found (and, for -ENOBUFS, cannot be) */
		dout("page/inode not in cache\n");
	else
		dout("%s: unknown error ret = %i\n", __func__, ret);

	return ret;
}
/*
 * Store a page in the fscache.
 *
 * Does nothing when the cache is not valid for this inode.  If the write
 * cannot be queued the page is uncached again.
 */
void ceph_readpage_to_fscache(struct inode *inode, struct page *page)
{
	struct ceph_inode_info *ci = ceph_inode(inode);

	if (!cache_valid(ci))
		return;

	if (fscache_write_page(ci->fscache, page, GFP_KERNEL) != 0)
		fscache_uncache_page(ci->fscache, page);
}
/*
 * Remove a page from the fscache: wait for any write of the page to the
 * cache to finish, then uncache it.
 */
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
{
	struct ceph_inode_info *info = ceph_inode(inode);

	fscache_wait_on_page_write(info->fscache, page);
	fscache_uncache_page(info->fscache, page);
}
/*
 * Tear down per-filesystem fscache state: destroy the revalidate workqueue
 * if one was created, then relinquish the filesystem cookie (without
 * retiring it) and clear fsc->fscache.
 */
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
	if (fsc->revalidate_wq != NULL)
		destroy_workqueue(fsc->revalidate_wq);

	fscache_relinquish_cookie(fsc->fscache, 0);
	fsc->fscache = NULL;
}
/*
 * Workqueue handler that revalidates an inode's fscache cookie.
 *
 * Samples the issued caps and i_rdcache_gen under i_ceph_lock.  If the
 * FILE_CACHE cap has been lost in the meantime nothing is done.  Otherwise
 * the cookie is checked for consistency and invalidated when the check
 * does not succeed; afterwards i_fscache_gen is advanced to the sampled
 * read-cache generation so that cache_valid() accepts the cache again.
 *
 * Always drops the inode reference taken by ceph_queue_revalidate().
 */
static void ceph_revalidate_work(struct work_struct *work)
{
	int issued;
	u32 orig_gen;
	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
						  i_revalidate_work);
	struct inode *inode = &ci->vfs_inode;

	spin_lock(&ci->i_ceph_lock);
	issued = __ceph_caps_issued(ci, NULL);
	orig_gen = ci->i_rdcache_gen;
	spin_unlock(&ci->i_ceph_lock);

	if (!(issued & CEPH_CAP_FILE_CACHE)) {
		dout("revalidate_work lost cache before validation %p\n",
		     inode);
		goto out;
	}

	/*
	 * Per the FS-Cache netfs API, fscache_check_consistency() returns 0
	 * when the cached object is consistent and a negative errno
	 * (-ESTALE, -ENOMEM, ...) otherwise, so invalidate on non-zero.
	 */
	if (fscache_check_consistency(ci->fscache))
		fscache_invalidate(ci->fscache);

	spin_lock(&ci->i_ceph_lock);
	/* Update the new valid generation (backwards sanity check too) */
	if (orig_gen > ci->i_fscache_gen) {
		ci->i_fscache_gen = orig_gen;
	}
	spin_unlock(&ci->i_ceph_lock);

out:
	iput(&ci->vfs_inode);
}
/*
 * Queue an fscache revalidation of @inode on the filesystem's revalidate
 * workqueue.
 *
 * An inode reference is taken before queueing; it is released by
 * ceph_revalidate_work(), or here if the work item was already queued.
 */
void ceph_queue_revalidate(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);

	ihold(inode);

	if (queue_work(ceph_sb_to_client(inode->i_sb)->revalidate_wq,
		       &ci->i_revalidate_work)) {
		dout("ceph_queue_revalidate %p\n", inode);
	} else {
		dout("ceph_queue_revalidate %p failed\n", inode);
		iput(inode);
	}
}
/*
 * Initialize the fscache-related fields of a ceph inode: no cookie yet,
 * fscache generation 1, and the revalidate work item bound to
 * ceph_revalidate_work().
 */
void ceph_fscache_inode_init(struct ceph_inode_info *ci)
{
	INIT_WORK(&ci->i_revalidate_work, ceph_revalidate_work);

	/* The first load is verified at cookie open time */
	ci->i_fscache_gen = 1;
	ci->fscache = NULL;
}
/*
* Ceph cache definitions.
*
* Copyright (C) 2013 by Adfin Solutions, Inc. All Rights Reserved.
* Written by Milosz Tanski (milosz@adfin.com)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to:
* Free Software Foundation
* 51 Franklin Street, Fifth Floor
* Boston, MA 02111-1301 USA
*
*/
#ifndef _CEPH_CACHE_H
#define _CEPH_CACHE_H
#ifdef CONFIG_CEPH_FSCACHE
/* Netfs registration with FS-Cache. */
int ceph_fscache_register(void);
void ceph_fscache_unregister(void);
/* Per-filesystem cookie and revalidate workqueue setup/teardown. */
int ceph_fscache_register_fs(struct ceph_fs_client* fsc);
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc);
/* Per-inode fscache state and cookie management. */
void ceph_fscache_inode_init(struct ceph_inode_info *ci);
void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
struct ceph_inode_info* ci);
void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
/* Page read/write/invalidate helpers. */
int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
int ceph_readpages_from_fscache(struct inode *inode,
struct address_space *mapping,
struct list_head *pages,
unsigned *nr_pages);
void ceph_readpage_to_fscache(struct inode *inode, struct page *page);
void ceph_invalidate_fscache_page(struct inode* inode, struct page *page);
/* Queue asynchronous revalidation of an inode's cookie. */
void ceph_queue_revalidate(struct inode *inode);
static inline void ceph_fscache_invalidate(struct inode *inode)
{
fscache_invalidate(ceph_inode(inode)->fscache);
}
/*
 * Ask fscache whether @page may be released; returns the result of
 * fscache_maybe_release_page() for the owning inode's cookie.
 */
static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
{
	struct ceph_inode_info *info = ceph_inode(page->mapping->host);

	return fscache_maybe_release_page(info->fscache, page, gfp);
}
#else
/*
 * Stubs used when CONFIG_CEPH_FSCACHE is not set.  Registration helpers
 * report success, read helpers report -ENOBUFS so callers fall back to
 * reading from the OSDs, page release is always allowed, and everything
 * else is a no-op.
 */
static inline int ceph_fscache_register(void)
{
return 0;
}
static inline void ceph_fscache_unregister(void)
{
}
static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
{
return 0;
}
static inline void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{
}
static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
{
}
static inline void ceph_fscache_register_inode_cookie(struct ceph_fs_client* parent_fsc,
struct ceph_inode_info* ci)
{
}
static inline void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci)
{
}
/* No cache: always report "not available". */
static inline int ceph_readpage_from_fscache(struct inode* inode,
struct page *page)
{
return -ENOBUFS;
}
/* No cache: always report "not available". */
static inline int ceph_readpages_from_fscache(struct inode *inode,
struct address_space *mapping,
struct list_head *pages,
unsigned *nr_pages)
{
return -ENOBUFS;
}
static inline void ceph_readpage_to_fscache(struct inode *inode,
struct page *page)
{
}
static inline void ceph_fscache_invalidate(struct inode *inode)
{
}
static inline void ceph_invalidate_fscache_page(struct inode *inode,
struct page *page)
{
}
/* No cache holds the page, so it may always be released. */
static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp)
{
return 1;
}
static inline void ceph_fscache_readpages_cancel(struct inode *inode,
struct list_head *pages)
{
}
static inline void ceph_queue_revalidate(struct inode *inode)
{
}
#endif
#endif
......@@ -10,6 +10,7 @@
#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include <linux/ceph/decode.h>
#include <linux/ceph/messenger.h>
......@@ -479,8 +480,9 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
* i_rdcache_gen.
*/
if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
(had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
(had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
ci->i_rdcache_gen++;
}
/*
* if we are newly issued FILE_SHARED, mark dir not complete; we
......@@ -2395,6 +2397,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
int writeback = 0;
int queue_invalidate = 0;
int deleted_inode = 0;
int queue_revalidate = 0;
dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
inode, cap, mds, seq, ceph_cap_string(newcaps));
......@@ -2417,6 +2420,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ci->i_rdcache_revoking = ci->i_rdcache_gen;
}
}
ceph_fscache_invalidate(inode);
}
/* side effects now are allowed */
......@@ -2458,6 +2463,11 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
}
}
/* Do we need to revalidate our fscache cookie. Don't bother on the
* first cache cap as we already validate at cookie creation time. */
if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
queue_revalidate = 1;
/* size/ctime/mtime/atime? */
ceph_fill_file_size(inode, issued,
le32_to_cpu(grant->truncate_seq),
......@@ -2542,6 +2552,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
BUG_ON(cap->issued & ~cap->implemented);
spin_unlock(&ci->i_ceph_lock);
if (writeback)
/*
* queue inode for writeback: we can't actually call
......@@ -2553,6 +2564,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
ceph_queue_invalidate(inode);
if (deleted_inode)
invalidate_aliases(inode);
if (queue_revalidate)
ceph_queue_revalidate(inode);
if (wake)
wake_up_all(&ci->i_cap_wq);
......@@ -2709,8 +2722,10 @@ static void handle_cap_trunc(struct inode *inode,
truncate_seq, truncate_size, size);
spin_unlock(&ci->i_ceph_lock);
if (queue_trunc)
if (queue_trunc) {
ceph_queue_vmtruncate(inode);
ceph_fscache_invalidate(inode);
}
}
/*
......
......@@ -12,6 +12,7 @@
#include "super.h"
#include "mds_client.h"
#include "cache.h"
/*
* Ceph file operations
......@@ -69,9 +70,23 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
struct ceph_file_info *cf;
int ret = 0;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
/* First file open request creates the cookie, we want to keep
* this cookie around for the lifetime of the inode so as not to
* have to worry about fscache register / revoke / operation
* races.
<