Commit 26c5eaa1 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'ceph-for-4.12-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "The two main items are support for disabling automatic rbd exclusive
  lock transfers from myself and the long awaited -ENOSPC handling
  series from Jeff.

  The former will allow rbd users to take advantage of exclusive lock's
  built-in blacklist/break-lock functionality while staying in control
  of who owns the lock. With the latter in place, we will abort
  filesystem writes on -ENOSPC instead of having them block
  indefinitely.

  Beyond that we've got the usual pile of filesystem fixes from Zheng,
  some refcount_t conversion patches from Elena and a patch for an
  ancient open() flags handling bug from Alexander"

* tag 'ceph-for-4.12-rc1' of git://github.com/ceph/ceph-client: (31 commits)
  ceph: fix memory leak in __ceph_setxattr()
  ceph: fix file open flags on ppc64
  ceph: choose readdir frag based on previous readdir reply
  rbd: exclusive map option
  rbd: return ResponseMessage result from rbd_handle_request_lock()
  rbd: kill rbd_is_lock_supported()
  rbd: support updating the lock cookie without releasing the lock
  rbd: store lock cookie
  rbd: ignore unlock errors
  rbd: fix error handling around rbd_init_disk()
  rbd: move rbd_unregister_watch() call into rbd_dev_image_release()
  rbd: move rbd_dev_destroy() call out of rbd_dev_image_release()
  ceph: when seeing write errors on an inode, switch to sync writes
  Revert "ceph: SetPageError() for writeback pages if writepages fails"
  ceph: handle epoch barriers in cap messages
  libceph: add an epoch_barrier field to struct ceph_osd_client
  libceph: abort already submitted but abortable requests when map or pool goes full
  libceph: allow requests to return immediately on full conditions if caller wishes
  libceph: remove req->r_replay_version
  ceph: make seeky readdir more efficient
  ...
parents 1176032c eeca958d
This diff is collapsed.
...@@ -670,8 +670,12 @@ static void writepages_finish(struct ceph_osd_request *req) ...@@ -670,8 +670,12 @@ static void writepages_finish(struct ceph_osd_request *req)
bool remove_page; bool remove_page;
dout("writepages_finish %p rc %d\n", inode, rc); dout("writepages_finish %p rc %d\n", inode, rc);
if (rc < 0) if (rc < 0) {
mapping_set_error(mapping, rc); mapping_set_error(mapping, rc);
ceph_set_error_write(ci);
} else {
ceph_clear_error_write(ci);
}
/* /*
* We lost the cache cap, need to truncate the page before * We lost the cache cap, need to truncate the page before
...@@ -703,9 +707,6 @@ static void writepages_finish(struct ceph_osd_request *req) ...@@ -703,9 +707,6 @@ static void writepages_finish(struct ceph_osd_request *req)
clear_bdi_congested(inode_to_bdi(inode), clear_bdi_congested(inode_to_bdi(inode),
BLK_RW_ASYNC); BLK_RW_ASYNC);
if (rc < 0)
SetPageError(page);
ceph_put_snap_context(page_snap_context(page)); ceph_put_snap_context(page_snap_context(page));
page->private = 0; page->private = 0;
ClearPagePrivate(page); ClearPagePrivate(page);
...@@ -1892,6 +1893,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, ...@@ -1892,6 +1893,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
wr_req->r_mtime = ci->vfs_inode.i_mtime; wr_req->r_mtime = ci->vfs_inode.i_mtime;
wr_req->r_abort_on_full = true;
err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
if (!err) if (!err)
......
...@@ -1015,6 +1015,7 @@ static int send_cap_msg(struct cap_msg_args *arg) ...@@ -1015,6 +1015,7 @@ static int send_cap_msg(struct cap_msg_args *arg)
void *p; void *p;
size_t extra_len; size_t extra_len;
struct timespec zerotime = {0}; struct timespec zerotime = {0};
struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
" seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
...@@ -1076,8 +1077,12 @@ static int send_cap_msg(struct cap_msg_args *arg) ...@@ -1076,8 +1077,12 @@ static int send_cap_msg(struct cap_msg_args *arg)
ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
/* inline data size */ /* inline data size */
ceph_encode_32(&p, 0); ceph_encode_32(&p, 0);
/* osd_epoch_barrier (version 5) */ /*
ceph_encode_32(&p, 0); * osd_epoch_barrier (version 5)
* The epoch_barrier is protected osdc->lock, so READ_ONCE here in
* case it was recently changed
*/
ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
/* oldest_flush_tid (version 6) */ /* oldest_flush_tid (version 6) */
ceph_encode_64(&p, arg->oldest_flush_tid); ceph_encode_64(&p, arg->oldest_flush_tid);
...@@ -1389,7 +1394,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, ...@@ -1389,7 +1394,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
first_tid = cf->tid + 1; first_tid = cf->tid + 1;
capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
atomic_inc(&capsnap->nref); refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
dout("__flush_snaps %p capsnap %p tid %llu %s\n", dout("__flush_snaps %p capsnap %p tid %llu %s\n",
...@@ -2202,7 +2207,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, ...@@ -2202,7 +2207,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
inode, capsnap, cf->tid, inode, capsnap, cf->tid,
ceph_cap_string(capsnap->dirty)); ceph_cap_string(capsnap->dirty));
atomic_inc(&capsnap->nref); refcount_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
ret = __send_flush_snap(inode, session, capsnap, cap->mseq, ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
...@@ -3633,13 +3638,19 @@ void ceph_handle_caps(struct ceph_mds_session *session, ...@@ -3633,13 +3638,19 @@ void ceph_handle_caps(struct ceph_mds_session *session,
p += inline_len; p += inline_len;
} }
if (le16_to_cpu(msg->hdr.version) >= 5) {
struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
u32 epoch_barrier;
ceph_decode_32_safe(&p, end, epoch_barrier, bad);
ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
}
if (le16_to_cpu(msg->hdr.version) >= 8) { if (le16_to_cpu(msg->hdr.version) >= 8) {
u64 flush_tid; u64 flush_tid;
u32 caller_uid, caller_gid; u32 caller_uid, caller_gid;
u32 osd_epoch_barrier;
u32 pool_ns_len; u32 pool_ns_len;
/* version >= 5 */
ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
/* version >= 6 */ /* version >= 6 */
ceph_decode_64_safe(&p, end, flush_tid, bad); ceph_decode_64_safe(&p, end, flush_tid, bad);
/* version >= 7 */ /* version >= 7 */
......
...@@ -22,20 +22,19 @@ static int mdsmap_show(struct seq_file *s, void *p) ...@@ -22,20 +22,19 @@ static int mdsmap_show(struct seq_file *s, void *p)
{ {
int i; int i;
struct ceph_fs_client *fsc = s->private; struct ceph_fs_client *fsc = s->private;
struct ceph_mdsmap *mdsmap;
if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
return 0; return 0;
seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); mdsmap = fsc->mdsc->mdsmap;
seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); seq_printf(s, "epoch %d\n", mdsmap->m_epoch);
seq_printf(s, "session_timeout %d\n", seq_printf(s, "root %d\n", mdsmap->m_root);
fsc->mdsc->mdsmap->m_session_timeout); seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds);
seq_printf(s, "session_autoclose %d\n", seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout);
fsc->mdsc->mdsmap->m_session_autoclose); seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose);
for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { for (i = 0; i < mdsmap->m_num_mds; i++) {
struct ceph_entity_addr *addr = struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
&fsc->mdsc->mdsmap->m_info[i].addr; int state = mdsmap->m_info[i].state;
int state = fsc->mdsc->mdsmap->m_info[i].state;
seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
ceph_pr_addr(&addr->in_addr), ceph_pr_addr(&addr->in_addr),
ceph_mds_state_name(state)); ceph_mds_state_name(state));
......
...@@ -294,7 +294,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -294,7 +294,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_client *mdsc = fsc->mdsc;
int i; int i;
int err; int err;
u32 ftype; unsigned frag = -1;
struct ceph_mds_reply_info_parsed *rinfo; struct ceph_mds_reply_info_parsed *rinfo;
dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos);
...@@ -341,7 +341,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -341,7 +341,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* do we have the correct frag content buffered? */ /* do we have the correct frag content buffered? */
if (need_send_readdir(fi, ctx->pos)) { if (need_send_readdir(fi, ctx->pos)) {
struct ceph_mds_request *req; struct ceph_mds_request *req;
unsigned frag;
int op = ceph_snap(inode) == CEPH_SNAPDIR ? int op = ceph_snap(inode) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
...@@ -352,8 +351,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -352,8 +351,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
} }
if (is_hash_order(ctx->pos)) { if (is_hash_order(ctx->pos)) {
frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), /* fragtree isn't always accurate. choose frag
NULL, NULL); * based on previous reply when possible. */
if (frag == (unsigned)-1)
frag = ceph_choose_frag(ci, fpos_hash(ctx->pos),
NULL, NULL);
} else { } else {
frag = fpos_frag(ctx->pos); frag = fpos_frag(ctx->pos);
} }
...@@ -378,7 +380,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -378,7 +380,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
return -ENOMEM; return -ENOMEM;
} }
} else if (is_hash_order(ctx->pos)) {
req->r_args.readdir.offset_hash =
cpu_to_le32(fpos_hash(ctx->pos));
} }
req->r_dir_release_cnt = fi->dir_release_count; req->r_dir_release_cnt = fi->dir_release_count;
req->r_dir_ordered_cnt = fi->dir_ordered_count; req->r_dir_ordered_cnt = fi->dir_ordered_count;
req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_cache_idx = fi->readdir_cache_idx;
...@@ -476,6 +482,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -476,6 +482,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
struct ceph_vino vino; struct ceph_vino vino;
ino_t ino; ino_t ino;
u32 ftype;
BUG_ON(rde->offset < ctx->pos); BUG_ON(rde->offset < ctx->pos);
...@@ -498,15 +505,17 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -498,15 +505,17 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
ctx->pos++; ctx->pos++;
} }
ceph_mdsc_put_request(fi->last_readdir);
fi->last_readdir = NULL;
if (fi->next_offset > 2) { if (fi->next_offset > 2) {
ceph_mdsc_put_request(fi->last_readdir); frag = fi->frag;
fi->last_readdir = NULL;
goto more; goto more;
} }
/* more frags? */ /* more frags? */
if (!ceph_frag_is_rightmost(fi->frag)) { if (!ceph_frag_is_rightmost(fi->frag)) {
unsigned frag = ceph_frag_next(fi->frag); frag = ceph_frag_next(fi->frag);
if (is_hash_order(ctx->pos)) { if (is_hash_order(ctx->pos)) {
loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag),
fi->next_offset, true); fi->next_offset, true);
......
...@@ -13,6 +13,38 @@ ...@@ -13,6 +13,38 @@
#include "mds_client.h" #include "mds_client.h"
#include "cache.h" #include "cache.h"
/*
 * Translate kernel open(2) flag bits into the ceph wire-protocol flag
 * encoding (CEPH_O_*).  Needed because the numeric values of several O_*
 * flags are architecture-dependent (e.g. O_DIRECTORY/O_NOFOLLOW differ on
 * ppc64), while the on-wire encoding must be identical everywhere.
 *
 * Returns the wire flags as little-endian, ready to be stored in the
 * request payload.  Flag bits with no wire equivalent are logged (debug
 * build only) and dropped.
 */
static __le32 ceph_flags_sys2wire(u32 flags)
{
	u32 wire_flags = 0;

	/* access mode is a 2-bit field, not individual bits: map it first */
	switch (flags & O_ACCMODE) {
	case O_RDONLY:
		wire_flags |= CEPH_O_RDONLY;
		break;
	case O_WRONLY:
		wire_flags |= CEPH_O_WRONLY;
		break;
	case O_RDWR:
		wire_flags |= CEPH_O_RDWR;
		break;
	}

	/*
	 * Map each supported flag and clear it from 'flags' so whatever is
	 * left over can be reported below.  do { } while (0) plus
	 * parenthesized argument uses keep the macro safe in any statement
	 * context (no dangling-else hazard, no precedence surprises).
	 */
#define ceph_sys2wire(a) do {			\
	if (flags & (a)) {			\
		wire_flags |= CEPH_##a;		\
		flags &= ~(a);			\
	}					\
} while (0)

	ceph_sys2wire(O_CREAT);
	ceph_sys2wire(O_EXCL);
	ceph_sys2wire(O_TRUNC);
	ceph_sys2wire(O_DIRECTORY);
	ceph_sys2wire(O_NOFOLLOW);

#undef ceph_sys2wire

	if (flags)
		dout("unused open flags: %x\n", flags);

	return cpu_to_le32(wire_flags);
}
/* /*
* Ceph file operations * Ceph file operations
* *
...@@ -120,7 +152,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) ...@@ -120,7 +152,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode)
if (IS_ERR(req)) if (IS_ERR(req))
goto out; goto out;
req->r_fmode = ceph_flags_to_mode(flags); req->r_fmode = ceph_flags_to_mode(flags);
req->r_args.open.flags = cpu_to_le32(flags); req->r_args.open.flags = ceph_flags_sys2wire(flags);
req->r_args.open.mode = cpu_to_le32(create_mode); req->r_args.open.mode = cpu_to_le32(create_mode);
out: out:
return req; return req;
...@@ -189,7 +221,7 @@ int ceph_renew_caps(struct inode *inode) ...@@ -189,7 +221,7 @@ int ceph_renew_caps(struct inode *inode)
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
wanted = __ceph_caps_file_wanted(ci); wanted = __ceph_caps_file_wanted(ci);
if (__ceph_is_any_real_caps(ci) && if (__ceph_is_any_real_caps(ci) &&
(!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) { (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) {
int issued = __ceph_caps_issued(ci, NULL); int issued = __ceph_caps_issued(ci, NULL);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
dout("renew caps %p want %s issued %s updating mds_wanted\n", dout("renew caps %p want %s issued %s updating mds_wanted\n",
...@@ -778,6 +810,7 @@ static void ceph_aio_retry_work(struct work_struct *work) ...@@ -778,6 +810,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
req->r_callback = ceph_aio_complete_req; req->r_callback = ceph_aio_complete_req;
req->r_inode = inode; req->r_inode = inode;
req->r_priv = aio_req; req->r_priv = aio_req;
req->r_abort_on_full = true;
ret = ceph_osdc_start_request(req->r_osdc, req, false); ret = ceph_osdc_start_request(req->r_osdc, req, false);
out: out:
...@@ -1085,19 +1118,22 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, ...@@ -1085,19 +1118,22 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
out: out:
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
if (ret == 0) { if (ret != 0) {
pos += len; ceph_set_error_write(ci);
written += len;
if (pos > i_size_read(inode)) {
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
ceph_check_caps(ceph_inode(inode),
CHECK_CAPS_AUTHONLY,
NULL);
}
} else
break; break;
}
ceph_clear_error_write(ci);
pos += len;
written += len;
if (pos > i_size_read(inode)) {
check_caps = ceph_inode_set_size(inode, pos);
if (check_caps)
ceph_check_caps(ceph_inode(inode),
CHECK_CAPS_AUTHONLY,
NULL);
}
} }
if (ret != -EOLDSNAPC && written > 0) { if (ret != -EOLDSNAPC && written > 0) {
...@@ -1303,6 +1339,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -1303,6 +1339,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
} }
retry_snap: retry_snap:
/* FIXME: not complete since it doesn't account for being at quota */
if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) {
err = -ENOSPC; err = -ENOSPC;
goto out; goto out;
...@@ -1324,7 +1361,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -1324,7 +1361,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));
if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
(iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) ||
(ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
struct ceph_snap_context *snapc; struct ceph_snap_context *snapc;
struct iov_iter data; struct iov_iter data;
inode_unlock(inode); inode_unlock(inode);
......
...@@ -1482,10 +1482,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1482,10 +1482,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
return readdir_prepopulate_inodes_only(req, session); return readdir_prepopulate_inodes_only(req, session);
if (rinfo->hash_order && req->r_path2) { if (rinfo->hash_order) {
last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, if (req->r_path2) {
req->r_path2, strlen(req->r_path2)); last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash,
last_hash = ceph_frag_value(last_hash); req->r_path2,
strlen(req->r_path2));
last_hash = ceph_frag_value(last_hash);
} else if (rinfo->offset_hash) {
/* mds understands offset_hash */
WARN_ON_ONCE(req->r_readdir_offset != 2);
last_hash = le32_to_cpu(rhead->args.readdir.offset_hash);
}
} }
if (rinfo->dir_dir && if (rinfo->dir_dir &&
...@@ -1510,7 +1517,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1510,7 +1517,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
} }
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 &&
!(rinfo->hash_order && req->r_path2)) { !(rinfo->hash_order && last_hash)) {
/* note dir version at start of readdir so we can tell /* note dir version at start of readdir so we can tell
* if any dentries get dropped */ * if any dentries get dropped */
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
......
...@@ -189,6 +189,7 @@ static int parse_reply_info_dir(void **p, void *end, ...@@ -189,6 +189,7 @@ static int parse_reply_info_dir(void **p, void *end,
info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); info->dir_end = !!(flags & CEPH_READDIR_FRAG_END);
info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE);
info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER);
info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH);
} }
if (num == 0) if (num == 0)
goto done; goto done;
...@@ -378,9 +379,9 @@ const char *ceph_session_state_name(int s) ...@@ -378,9 +379,9 @@ const char *ceph_session_state_name(int s)
static struct ceph_mds_session *get_session(struct ceph_mds_session *s) static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
{ {
if (atomic_inc_not_zero(&s->s_ref)) { if (refcount_inc_not_zero(&s->s_ref)) {
dout("mdsc get_session %p %d -> %d\n", s, dout("mdsc get_session %p %d -> %d\n", s,
atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref)); refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref));
return s; return s;
} else { } else {
dout("mdsc get_session %p 0 -- FAIL", s); dout("mdsc get_session %p 0 -- FAIL", s);
...@@ -391,8 +392,8 @@ static struct ceph_mds_session *get_session(struct ceph_mds_session *s) ...@@ -391,8 +392,8 @@ static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
void ceph_put_mds_session(struct ceph_mds_session *s) void ceph_put_mds_session(struct ceph_mds_session *s)
{ {
dout("mdsc put_session %p %d -> %d\n", s, dout("mdsc put_session %p %d -> %d\n", s,
atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1);
if (atomic_dec_and_test(&s->s_ref)) { if (refcount_dec_and_test(&s->s_ref)) {
if (s->s_auth.authorizer) if (s->s_auth.authorizer)
ceph_auth_destroy_authorizer(s->s_auth.authorizer); ceph_auth_destroy_authorizer(s->s_auth.authorizer);
kfree(s); kfree(s);
...@@ -411,7 +412,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, ...@@ -411,7 +412,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
return NULL; return NULL;
session = mdsc->sessions[mds]; session = mdsc->sessions[mds];
dout("lookup_mds_session %p %d\n", session, dout("lookup_mds_session %p %d\n", session,
atomic_read(&session->s_ref)); refcount_read(&session->s_ref));
get_session(session); get_session(session);
return session; return session;
} }
...@@ -441,7 +442,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, ...@@ -441,7 +442,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
{ {
struct ceph_mds_session *s; struct ceph_mds_session *s;
if (mds >= mdsc->mdsmap->m_max_mds) if (mds >= mdsc->mdsmap->m_num_mds)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
s = kzalloc(sizeof(*s), GFP_NOFS); s = kzalloc(sizeof(*s), GFP_NOFS);
...@@ -466,7 +467,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, ...@@ -466,7 +467,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
INIT_LIST_HEAD(&s->s_caps); INIT_LIST_HEAD(&s->s_caps);
s->s_nr_caps = 0; s->s_nr_caps = 0;
s->s_trim_caps = 0; s->s_trim_caps = 0;
atomic_set(&s->s_ref, 1); refcount_set(&s->s_ref, 1);
INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_waiting);
INIT_LIST_HEAD(&s->s_unsafe); INIT_LIST_HEAD(&s->s_unsafe);
s->s_num_cap_releases = 0; s->s_num_cap_releases = 0;
...@@ -494,7 +495,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, ...@@ -494,7 +495,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
} }
mdsc->sessions[mds] = s; mdsc->sessions[mds] = s;
atomic_inc(&mdsc->num_sessions); atomic_inc(&mdsc->num_sessions);
atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */
ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
...@@ -1004,7 +1005,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, ...@@ -1004,7 +1005,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
struct ceph_mds_session *ts; struct ceph_mds_session *ts;
int i, mds = session->s_mds; int i, mds = session->s_mds;
if (mds >= mdsc->mdsmap->m_max_mds) if (mds >= mdsc->mdsmap->m_num_mds)
return; return;
mi = &mdsc->mdsmap->m_info[mds]; mi = &mdsc->mdsmap->m_info[mds];
...@@ -1551,9 +1552,15 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc, ...@@ -1551,9 +1552,15 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_msg *msg = NULL; struct ceph_msg *msg = NULL;
struct ceph_mds_cap_release *head; struct ceph_mds_cap_release *head;
struct ceph_mds_cap_item *item; struct ceph_mds_cap_item *item;
struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
struct ceph_cap *cap; struct ceph_cap *cap;
LIST_HEAD(tmp_list); LIST_HEAD(tmp_list);
int num_cap_releases; int num_cap_releases;
__le32 barrier, *cap_barrier;
down_read(&osdc->lock);
barrier = cpu_to_le32(osdc->epoch_barrier);
up_read(&osdc->lock);
spin_lock(&session->s_cap_lock); spin_lock(&session->s_cap_lock);
again: again:
...@@ -1571,7 +1578,11 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc, ...@@ -1571,7 +1578,11 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
head = msg->front.iov_base; head = msg->front.iov_base;
head->num = cpu_to_le32(0); head->num = cpu_to_le32(0);
msg->front.iov_len = sizeof(*head); msg->front.iov_len = sizeof(*head);
msg->hdr.version = cpu_to_le16(2);
msg->hdr.compat_version = cpu_to_le16(1);
} }
cap = list_first_entry(&tmp_list, struct ceph_cap, cap = list_first_entry(&tmp_list, struct ceph_cap,
session_caps); session_caps);
list_del(&cap->session_caps); list_del(&cap->session_caps);
...@@ -1589,6 +1600,11 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc, ...@@ -1589,6 +1600,11 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
ceph_put_cap(mdsc, cap); ceph_put_cap(mdsc, cap);
if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
// Append cap_barrier field
cap_barrier = msg->front.iov_base + msg->front.iov_len;
*cap_barrier = barrier;
msg->front.iov_len += sizeof(*cap_barrier);
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
dout("send_cap_releases mds%d %p\n", session->s_mds, msg); dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg); ceph_con_send(&session->s_con, msg);
...@@ -1604,6 +1620,11 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc, ...@@ -1604,6 +1620,11 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
spin_unlock(&session->s_cap_lock); spin_unlock(&session->s_cap_lock);
if (msg) { if (msg) {
// Append cap_barrier field
cap_barrier = msg->front.iov_base + msg->front.iov_len;
*cap_barrier = barrier;
msg->front.iov_len += sizeof(*cap_barrier);
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
dout("send_cap_releases mds%d %p\n", session->s_mds, msg); dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
ceph_con_send(&session->s_con, msg); ceph_con_send(&session->s_con, msg);
...@@ -1993,7 +2014,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ...@@ -1993,7 +2014,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_pagelist) { if (req->r_pagelist) {
struct ceph_pagelist *pagelist = req->r_pagelist; struct ceph_pagelist *pagelist = req->r_pagelist;
atomic_inc(&pagelist->refcnt); refcount_inc(&pagelist->refcnt);
ceph_msg_data_add_pagelist(msg, pagelist); ceph_msg_data_add_pagelist(msg, pagelist);
msg->hdr.data_len = cpu_to_le32(pagelist->length); msg->hdr.data_len = cpu_to_le32(pagelist->length);
} else { } else {
...@@ -2640,8 +2661,10 @@ static void handle_session(struct ceph_mds_session *session, ...@@ -2640,8 +2661,10 @@ static void handle_session(struct ceph_mds_session *session,
seq = le64_to_cpu(h->seq); seq = le64_to_cpu(h->seq);
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
if (op == CEPH_SESSION_CLOSE) if (op == CEPH_SESSION_CLOSE) {
get_session(session);
__unregister_session(mdsc, session); __unregister_session(mdsc, session);
}
/* FIXME: this ttl calculation is generous */ /* FIXME: this ttl calculation is generous */
session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
...@@ -2730,6 +2753,8 @@ static void handle_session(struct ceph_mds_session *session, ...@@ -2730,6 +2753,8 @@ static void handle_session(struct ceph_mds_session *session,
kick_requests(mdsc, mds); kick_requests(mdsc, mds);
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
} }
if (op == CEPH_SESSION_CLOSE)
ceph_put_mds_session(session);
return; return;
bad: bad:
...@@ -3109,7 +3134,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, ...@@ -3109,7 +3134,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
dout("check_new_map new %u old %u\n", dout("check_new_map new %u old %u\n",
newmap->m_epoch, oldmap->m_epoch); newmap->m_epoch, oldmap->m_epoch);
for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) {
if (mdsc->sessions[i] == NULL) if (mdsc->sessions[i] == NULL)
continue; continue;
s = mdsc->sessions[i]; s = mdsc->sessions[i];
...@@ -3123,15 +3148,33 @@ static void check_new_map(struct ceph_mds_client *mdsc, ...@@ -3123,15 +3148,33 @@ static void check_new_map(struct ceph_mds_client *mdsc,
ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
ceph_session_state_name(s->s_state)); ceph_session_state_name(s->s_state));
if (i >= newmap->m_max_mds || if (i >= newmap->m_num_mds ||
memcmp(ceph_mdsmap_get_addr(oldmap, i), memcmp(ceph_mdsmap_get_addr(oldmap, i),
ceph_mdsmap_get_addr(newmap, i), ceph_mdsmap_get_addr(newmap, i),
sizeof(struct ceph_entity_addr))) { sizeof(struct ceph_entity_addr))) {
if (s->s_state == CEPH_MDS_SESSION_OPENING) { if (s->s_state == CEPH_MDS_SESSION_OPENING) {
/* the session never opened, just close it /* the session never opened, just close it