nbd.c 30.5 KB
Newer Older
1
/*
 *  Copyright (C) 2005  Anthony Liguori <anthony@codemonkey.ws>
 *
 *  Network Block Device
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; under version 2 of the License.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 */
18

19 20
#include "block/nbd.h"
#include "block/block.h"
21

22
#include "block/coroutine.h"
23

24 25
#include <errno.h>
#include <string.h>
26
#ifndef _WIN32
27
#include <sys/ioctl.h>
28
#endif
29
#if defined(__sun__) || defined(__HAIKU__)
30 31
#include <sys/ioccom.h>
#endif
32 33
#include <ctype.h>
#include <inttypes.h>
34

35 36 37 38
#ifdef __linux__
#include <linux/fs.h>
#endif

39 40
#include "qemu/sockets.h"
#include "qemu/queue.h"
41
#include "qemu/main-loop.h"
42 43 44 45

//#define DEBUG_NBD

/* Unconditional diagnostic: prints file, function and line, then the
 * formatted message and a newline, to stderr.
 */
#define LOG(msg, ...) do { \
    fprintf(stderr, "%s:%s():L%d: " msg "\n", \
            __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__); \
} while(0)

/* Verbose tracing: forwards to LOG() when DEBUG_NBD is defined and
 * expands to an empty statement otherwise.
 */
#ifndef DEBUG_NBD
#define TRACE(msg, ...) \
    do { } while (0)
#else
#define TRACE(msg, ...) do { \
    LOG(msg, ## __VA_ARGS__); \
} while(0)
#endif

/* This is all part of the "official" NBD API */

Paolo Bonzini's avatar
Paolo Bonzini committed
61
/* Wire sizes, in bytes, of the fixed request and reply headers:
 *   request = magic(4) + type(4) + handle(8) + from(8) + len(4)
 *   reply   = magic(4) + error(4) + handle(8)
 */
#define NBD_REQUEST_SIZE        (4 + 4 + 8 + 8 + 4)
#define NBD_REPLY_SIZE          (4 + 4 + 8)

/* Magic numbers carried in request/reply headers and in negotiation */
#define NBD_REQUEST_MAGIC       0x25609513
#define NBD_REPLY_MAGIC         0x67446698
#define NBD_OPTS_MAGIC          0x49484156454F5054LL
#define NBD_CLIENT_MAGIC        0x0000420281861253LL

/* ioctl request codes issued on the NBD device node */
#define NBD_SET_SOCK            _IO(0xab, 0)
#define NBD_SET_BLKSIZE         _IO(0xab, 1)
#define NBD_SET_SIZE            _IO(0xab, 2)
#define NBD_DO_IT               _IO(0xab, 3)
#define NBD_CLEAR_SOCK          _IO(0xab, 4)
#define NBD_CLEAR_QUE           _IO(0xab, 5)
#define NBD_PRINT_DEBUG         _IO(0xab, 6)
#define NBD_SET_SIZE_BLOCKS     _IO(0xab, 7)
#define NBD_DISCONNECT          _IO(0xab, 8)
#define NBD_SET_TIMEOUT         _IO(0xab, 9)
#define NBD_SET_FLAGS           _IO(0xab, 10)

/* Option code exchanged during negotiation to select an export by name */
#define NBD_OPT_EXPORT_NAME     (1 << 0)
81

82 83 84 85 86 87 88 89 90 91 92
/* Definitions for opaque data types */

typedef struct NBDRequest NBDRequest;

/* Per-request state for one in-flight request on a client connection. */
struct NBDRequest {
    QSIMPLEQ_ENTRY(NBDRequest) entry;   /* queue linkage */
    NBDClient *client;                  /* owning client; nbd_request_get() takes a
                                         * reference, nbd_request_put() drops it */
    uint8_t *data;                      /* payload buffer obtained with
                                         * qemu_blockalign() for READ/WRITE; released
                                         * with qemu_vfree() in nbd_request_put() */
};

struct NBDExport {
93
    int refcount;
94 95
    void (*close)(NBDExport *exp);

96
    BlockDriverState *bs;
97
    char *name;
98 99 100
    off_t dev_offset;
    off_t size;
    uint32_t nbdflags;
101
    QTAILQ_HEAD(, NBDClient) clients;
102
    QTAILQ_ENTRY(NBDExport) next;
103 104
};

105 106
/* All named exports; entries are inserted and removed by
 * nbd_export_set_name() and looked up by nbd_export_find().
 */
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);

107 108 109 110 111 112 113 114 115 116 117 118
/* State for one client connection served by this process. */
struct NBDClient {
    /* Reference count, managed by nbd_client_get()/nbd_client_put() */
    int refcount;
    /* Optional callback invoked once from nbd_client_close() */
    void (*close)(NBDClient *client);

    /* Export this client is attached to; may be NULL until the client
     * has selected one during option negotiation */
    NBDExport *exp;
    /* Connected socket descriptor; closed by the final nbd_client_put() */
    int sock;

    /* Coroutine currently receiving a request, or NULL */
    Coroutine *recv_coroutine;

    /* Serializes reply transmission in nbd_co_send_reply() */
    CoMutex send_lock;
    /* Coroutine currently sending a reply, or NULL */
    Coroutine *send_coroutine;

    /* Linkage in exp->clients */
    QTAILQ_ENTRY(NBDClient) next;
    /* Number of requests in flight (at most MAX_NBD_REQUESTS) */
    int nb_requests;
    /* Set by nbd_client_close(); no new requests are started afterwards */
    bool closing;
};

124 125
/* That's all folks */

126
/*
 * Transfer up to @size bytes between @fd and @buffer, reading when
 * @do_read is true and writing otherwise.
 *
 * Inside a coroutine the whole transfer is delegated to qemu_co_recv() /
 * qemu_co_send().  Otherwise the call loops until @size bytes were moved,
 * end of file is hit, or an unrecoverable error occurs.
 *
 * Returns the number of bytes transferred (possibly short on EOF), or a
 * negative errno value.  -EAGAIN is only returned when nothing has been
 * transferred yet.
 */
ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
{
    size_t done = 0;

    if (qemu_in_coroutine()) {
        return do_read ? qemu_co_recv(fd, buffer, size)
                       : qemu_co_send(fd, buffer, size);
    }

    while (done < size) {
        ssize_t chunk;
        int err;

        chunk = do_read ? qemu_recv(fd, buffer + done, size - done, 0)
                        : send(fd, buffer + done, size - done, 0);

        if (chunk < 0) {
            err = socket_error();

            /* Retry on interruption, and on EAGAIN once the transfer is
             * under way; anything else is fatal.
             */
            if (err == EINTR || (done > 0 && err == EAGAIN)) {
                continue;
            }
            return -err;
        }

        if (chunk == 0) {
            /* eof */
            break;
        }

        done += chunk;
    }

    return done;
}

171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
/*
 * Read @size bytes from @fd into @buffer via nbd_wr_sync().
 *
 * The socket is blocking while negotiation is in progress.  Afterwards a
 * socket with nothing to read just means another thread consumed our
 * request/reply; recv_coroutine provides the synchronization that makes
 * this coroutine-safe, so -EAGAIN is passed straight back to the caller.
 */
static ssize_t read_sync(int fd, void *buffer, size_t size)
{
    ssize_t got = nbd_wr_sync(fd, buffer, size, true);

    return got;
}

/*
 * Write @size bytes from @buffer to @fd via nbd_wr_sync(), retrying for as
 * long as the socket reports -EAGAIN before anything was written.
 *
 * Returns the number of bytes written or a negative errno value.
 */
static ssize_t write_sync(int fd, void *buffer, size_t size)
{
    /* ssize_t, not int: keep the full width of nbd_wr_sync()'s result */
    ssize_t ret;

    do {
        /* For writes, we do expect the socket to be writable.  */
        ret = nbd_wr_sync(fd, buffer, size, false);
    } while (ret == -EAGAIN);
    return ret;
}

191 192
/*
 * Format @address and @port into @buf (at most @len bytes, always
 * NUL-terminated by snprintf when @len > 0) as "host:port".  An address
 * that itself contains a colon is treated as an IPv6 literal and wrapped
 * in square brackets.
 */
static void combine_addr(char *buf, size_t len, const char* address,
                         uint16_t port)
{
    if (strchr(address, ':') == NULL) {
        snprintf(buf, len, "%s:%u", address, port);
        return;
    }

    /* IPv6 literal: needs [] around the address part */
    snprintf(buf, len, "[%s]:%u", address, port);
}

202 203 204 205 206 207 208 209 210 211 212 213
/*
 * Open an outgoing TCP connection described by @opts.  Any error set by
 * inet_connect_opts() is reported and freed here.
 *
 * Returns whatever inet_connect_opts() returned.
 */
int tcp_socket_outgoing_opts(QemuOpts *opts)
{
    Error *err = NULL;
    int sock;

    sock = inet_connect_opts(opts, &err, NULL, NULL);
    if (err) {
        qerror_report_err(err);
        error_free(err);
    }

    return sock;
}

214
/*
 * Create a listening TCP socket on @address and @port by building an
 * "address:port" spec and handing it to tcp_socket_incoming_spec().
 */
int tcp_socket_incoming(const char *address, uint16_t port)
{
    char spec[128];

    combine_addr(spec, sizeof(spec), address, port);
    return tcp_socket_incoming_spec(spec);
}
220

221 222
int tcp_socket_incoming_spec(const char *address_and_port)
{
223 224 225 226 227 228 229 230
    Error *local_err = NULL;
    int fd = inet_listen(address_and_port, NULL, 0, SOCK_STREAM, 0, &local_err);

    if (local_err != NULL) {
        qerror_report_err(local_err);
        error_free(local_err);
    }
    return fd;
231
}
232

233 234
int unix_socket_incoming(const char *path)
{
235 236
    Error *local_err = NULL;
    int fd = unix_listen(path, NULL, 0, &local_err);
237

238 239 240 241 242
    if (local_err != NULL) {
        qerror_report_err(local_err);
        error_free(local_err);
    }
    return fd;
243 244
}

245 246
int unix_socket_outgoing(const char *path)
{
247 248 249 250 251 252 253 254
    Error *local_err = NULL;
    int fd = unix_connect(path, &local_err);

    if (local_err != NULL) {
        qerror_report_err(local_err);
        error_free(local_err);
    }
    return fd;
255
}
256

257
/* Basic flow for negotiation

   Server         Client
   Negotiate

   or

   Server         Client
   Negotiate #1
                  Option
   Negotiate #2

   ----

   followed by

   Server         Client
                  Request
   Response
                  Request
   Response
                  ...
   ...
                  Request (type == 2)

*/

284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361
/*
 * Read the option block a client sends after the first part of the
 * negotiation header, look up the export it names and attach @client
 * to it.
 *
 * On success client->exp is set, the client is linked into the export's
 * client list, a reference on the export is taken, and 0 is returned.
 * Any read failure, malformed field, over-long name or unknown export
 * yields -EINVAL.
 */
static int nbd_receive_options(NBDClient *client)
{
    int csock = client->sock;
    char name[256];
    uint32_t tmp, length;
    uint64_t magic;
    int rc;

    /* Client sends:
        [ 0 ..   3]   reserved (0)
        [ 4 ..  11]   NBD_OPTS_MAGIC
        [12 ..  15]   NBD_OPT_EXPORT_NAME
        [16 ..  19]   length
        [20 ..  xx]   export name (length bytes)
     */

    rc = -EINVAL;
    if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
        LOG("read failed");
        goto fail;
    }
    TRACE("Checking reserved");
    if (tmp != 0) {
        LOG("Bad reserved received");
        goto fail;
    }

    if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
        LOG("read failed");
        goto fail;
    }
    /* Was a copy-paste of "Checking reserved"; this step checks the magic */
    TRACE("Checking magic");
    if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
        LOG("Bad magic received");
        goto fail;
    }

    if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
        LOG("read failed");
        goto fail;
    }
    TRACE("Checking option");
    if (tmp != be32_to_cpu(NBD_OPT_EXPORT_NAME)) {
        LOG("Bad option received");
        goto fail;
    }

    if (read_sync(csock, &length, sizeof(length)) != sizeof(length)) {
        LOG("read failed");
        goto fail;
    }
    TRACE("Checking length");
    length = be32_to_cpu(length);
    /* name[] holds at most 255 bytes plus the terminating NUL */
    if (length > 255) {
        LOG("Bad length received");
        goto fail;
    }
    if (read_sync(csock, name, length) != length) {
        LOG("read failed");
        goto fail;
    }
    name[length] = '\0';

    client->exp = nbd_export_find(name);
    if (!client->exp) {
        LOG("export not found");
        goto fail;
    }

    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    nbd_export_get(client->exp);

    TRACE("Option negotiation succeeded.");
    rc = 0;
fail:
    return rc;
}

362
/*
 * Server side of the negotiation.  If @client already has an export, the
 * complete header without options is sent in one go; otherwise the first
 * part is sent, the client's option block is read with
 * nbd_receive_options(), and the second part is sent.
 *
 * The socket is switched to blocking mode for the duration of the call
 * and back to non-blocking mode before returning.
 *
 * Returns 0 on success and a negative errno value on failure.
 */
static int nbd_send_negotiate(NBDClient *client)
{
    int csock = client->sock;
    char buf[8 + 8 + 8 + 128];
    uint64_t be64;
    uint16_t be16;
    int rc;
    const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
                         NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);

    /* Negotiation header without options:
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  25]   server flags (0)
        [26 ..  27]   export flags
        [28 .. 151]   reserved     (0)

       Negotiation header with options, part 1:
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
        [16 ..  17]   server flags (0)

       part 2 (after options are sent):
        [18 ..  25]   size
        [26 ..  27]   export flags
        [28 .. 151]   reserved     (0)
     */

    qemu_set_block(csock);
    rc = -EINVAL;

    TRACE("Beginning negotiation.");
    memset(buf, 0, sizeof(buf));
    memcpy(buf, "NBDMAGIC", 8);

    /* Fields are byte-swapped into aligned locals and then copied into
     * place: buf is a char array and offsets such as 18 are not suitably
     * aligned for a direct 64-bit store.
     */
    if (client->exp) {
        assert ((client->exp->nbdflags & ~65535) == 0);
        cpu_to_be64w(&be64, NBD_CLIENT_MAGIC);
        memcpy(buf + 8, &be64, sizeof(be64));
        cpu_to_be64w(&be64, client->exp->size);
        memcpy(buf + 16, &be64, sizeof(be64));
        cpu_to_be16w(&be16, client->exp->nbdflags | myflags);
        memcpy(buf + 26, &be16, sizeof(be16));
    } else {
        cpu_to_be64w(&be64, NBD_OPTS_MAGIC);
        memcpy(buf + 8, &be64, sizeof(be64));
    }

    if (client->exp) {
        if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
            LOG("write failed");
            goto fail;
        }
    } else {
        if (write_sync(csock, buf, 18) != 18) {
            LOG("write failed");
            goto fail;
        }
        rc = nbd_receive_options(client);
        if (rc < 0) {
            LOG("option negotiation failed");
            goto fail;
        }

        /* nbd_receive_options() left rc == 0; restore the failure default
         * so that a failed write below is not reported as success.
         */
        rc = -EINVAL;

        assert ((client->exp->nbdflags & ~65535) == 0);
        cpu_to_be64w(&be64, client->exp->size);
        memcpy(buf + 18, &be64, sizeof(be64));
        cpu_to_be16w(&be16, client->exp->nbdflags | myflags);
        memcpy(buf + 26, &be16, sizeof(be16));
        if (write_sync(csock, buf + 18, sizeof(buf) - 18) != sizeof(buf) - 18) {
            LOG("write failed");
            goto fail;
        }
    }

    TRACE("Negotiation succeeded.");
    rc = 0;
fail:
    qemu_set_nonblock(csock);
    return rc;
}

436 437
int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
                          off_t *size, size_t *blocksize)
438
{
439 440 441
    char buf[256];
    uint64_t magic, s;
    uint16_t tmp;
442
    int rc;
443

444
    TRACE("Receiving negotiation.");
445

446 447
    rc = -EINVAL;

448 449
    if (read_sync(csock, buf, 8) != 8) {
        LOG("read failed");
450
        goto fail;
451 452 453 454 455
    }

    buf[8] = '\0';
    if (strlen(buf) == 0) {
        LOG("server connection closed");
456
        goto fail;
457 458 459 460 461 462 463 464 465 466 467 468 469 470
    }

    TRACE("Magic is %c%c%c%c%c%c%c%c",
          qemu_isprint(buf[0]) ? buf[0] : '.',
          qemu_isprint(buf[1]) ? buf[1] : '.',
          qemu_isprint(buf[2]) ? buf[2] : '.',
          qemu_isprint(buf[3]) ? buf[3] : '.',
          qemu_isprint(buf[4]) ? buf[4] : '.',
          qemu_isprint(buf[5]) ? buf[5] : '.',
          qemu_isprint(buf[6]) ? buf[6] : '.',
          qemu_isprint(buf[7]) ? buf[7] : '.');

    if (memcmp(buf, "NBDMAGIC", 8) != 0) {
        LOG("Invalid magic received");
471
        goto fail;
472 473 474 475
    }

    if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
        LOG("read failed");
476
        goto fail;
477 478 479 480 481 482 483 484 485 486
    }
    magic = be64_to_cpu(magic);
    TRACE("Magic is 0x%" PRIx64, magic);

    if (name) {
        uint32_t reserved = 0;
        uint32_t opt;
        uint32_t namesize;

        TRACE("Checking magic (opts_magic)");
Paolo Bonzini's avatar
Paolo Bonzini committed
487
        if (magic != NBD_OPTS_MAGIC) {
488
            LOG("Bad magic received");
489
            goto fail;
490 491 492
        }
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
            LOG("flags read failed");
493
            goto fail;
494 495 496 497 498 499
        }
        *flags = be16_to_cpu(tmp) << 16;
        /* reserved for future use */
        if (write_sync(csock, &reserved, sizeof(reserved)) !=
            sizeof(reserved)) {
            LOG("write failed (reserved)");
500
            goto fail;
501 502 503 504 505
        }
        /* write the export name */
        magic = cpu_to_be64(magic);
        if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
            LOG("write failed (magic)");
506
            goto fail;
507 508 509 510
        }
        opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
        if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
            LOG("write failed (opt)");
511
            goto fail;
512 513 514 515 516
        }
        namesize = cpu_to_be32(strlen(name));
        if (write_sync(csock, &namesize, sizeof(namesize)) !=
            sizeof(namesize)) {
            LOG("write failed (namesize)");
517
            goto fail;
518 519 520
        }
        if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
            LOG("write failed (name)");
521
            goto fail;
522 523 524 525
        }
    } else {
        TRACE("Checking magic (cli_magic)");

Paolo Bonzini's avatar
Paolo Bonzini committed
526
        if (magic != NBD_CLIENT_MAGIC) {
527
            LOG("Bad magic received");
528
            goto fail;
529 530 531 532 533
        }
    }

    if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
        LOG("read failed");
534
        goto fail;
535 536 537 538 539 540 541 542
    }
    *size = be64_to_cpu(s);
    *blocksize = 1024;
    TRACE("Size is %" PRIu64, *size);

    if (!name) {
        if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
            LOG("read failed (flags)");
543
            goto fail;
544 545 546 547 548
        }
        *flags = be32_to_cpup(flags);
    } else {
        if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
            LOG("read failed (tmp)");
549
            goto fail;
550 551 552 553 554
        }
        *flags |= be32_to_cpu(tmp);
    }
    if (read_sync(csock, &buf, 124) != 124) {
        LOG("read failed (buf)");
555
        goto fail;
556
    }
557 558 559 560
    rc = 0;

fail:
    return rc;
561
}
562

563 564
#ifdef __linux__
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
565
{
566 567
    TRACE("Setting NBD socket");

568
    if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
569 570
        int serrno = errno;
        LOG("Failed to set NBD socket");
571
        return -serrno;
572 573
    }

574
    TRACE("Setting block size to %lu", (unsigned long)blocksize);
575

576
    if (ioctl(fd, NBD_SET_BLKSIZE, blocksize) < 0) {
577 578
        int serrno = errno;
        LOG("Failed setting NBD block size");
579
        return -serrno;
580
    }
581

582
        TRACE("Setting size to %zd block(s)", (size_t)(size / blocksize));
583

584
    if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / blocksize) < 0) {
585 586
        int serrno = errno;
        LOG("Failed setting size (in blocks)");
587
        return -serrno;
588
    }
589

590 591 592 593 594 595 596 597 598 599 600
    if (ioctl(fd, NBD_SET_FLAGS, flags) < 0) {
        if (errno == ENOTTY) {
            int read_only = (flags & NBD_FLAG_READ_ONLY) != 0;
            TRACE("Setting readonly attribute");

            if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
                int serrno = errno;
                LOG("Failed setting read-only attribute");
                return -serrno;
            }
        } else {
601
            int serrno = errno;
602
            LOG("Failed setting flags");
603
            return -serrno;
604 605 606
        }
    }

607
    TRACE("Negotiation ended");
608

609
    return 0;
610 611 612 613
}

int nbd_disconnect(int fd)
{
614 615 616 617
    ioctl(fd, NBD_CLEAR_QUE);
    ioctl(fd, NBD_DISCONNECT);
    ioctl(fd, NBD_CLEAR_SOCK);
    return 0;
618 619
}

620
int nbd_client(int fd)
621
{
622 623
    int ret;
    int serrno;
624

625
    TRACE("Doing NBD loop");
626

627
    ret = ioctl(fd, NBD_DO_IT);
628
    if (ret < 0 && errno == EPIPE) {
629 630 631 632 633 634
        /* NBD_DO_IT normally returns EPIPE when someone has disconnected
         * the socket via NBD_DISCONNECT.  We do not want to return 1 in
         * that case.
         */
        ret = 0;
    }
635
    serrno = errno;
636

637
    TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
638

639 640
    TRACE("Clearing NBD queue");
    ioctl(fd, NBD_CLEAR_QUE);
641

642 643
    TRACE("Clearing NBD socket");
    ioctl(fd, NBD_CLEAR_SOCK);
644

645 646
    errno = serrno;
    return ret;
647
}
648
#else
649
/* Fallback for hosts without the Linux NBD driver: always -ENOTSUP. */
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
{
    (void)fd;
    (void)csock;
    (void)flags;
    (void)size;
    (void)blocksize;

    return -ENOTSUP;
}

/* Fallback for hosts without the Linux NBD driver: always -ENOTSUP. */
int nbd_disconnect(int fd)
{
    (void)fd;

    return -ENOTSUP;
}

659
/* Fallback for hosts without the Linux NBD driver: always -ENOTSUP. */
int nbd_client(int fd)
{
    (void)fd;

    return -ENOTSUP;
}
#endif
664

665
ssize_t nbd_send_request(int csock, struct nbd_request *request)
666
{
Paolo Bonzini's avatar
Paolo Bonzini committed
667
    uint8_t buf[NBD_REQUEST_SIZE];
668
    ssize_t ret;
669 670 671 672 673 674

    cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
    cpu_to_be32w((uint32_t*)(buf + 4), request->type);
    cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
    cpu_to_be64w((uint64_t*)(buf + 16), request->from);
    cpu_to_be32w((uint32_t*)(buf + 24), request->len);
675

676 677 678 679
    TRACE("Sending request to client: "
          "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
          request->from, request->len, request->handle, request->type);

680 681 682 683 684 685
    ret = write_sync(csock, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
686
        LOG("writing to socket failed");
687
        return -EINVAL;
688 689 690
    }
    return 0;
}
691

692
static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
693
{
Paolo Bonzini's avatar
Paolo Bonzini committed
694
    uint8_t buf[NBD_REQUEST_SIZE];
695
    uint32_t magic;
696
    ssize_t ret;
697

698 699 700 701 702 703
    ret = read_sync(csock, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
704
        LOG("read failed");
705
        return -EINVAL;
706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727
    }

    /* Request
       [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
       [ 4 ..  7]   type    (0 == READ, 1 == WRITE)
       [ 8 .. 15]   handle
       [16 .. 23]   from
       [24 .. 27]   len
     */

    magic = be32_to_cpup((uint32_t*)buf);
    request->type  = be32_to_cpup((uint32_t*)(buf + 4));
    request->handle = be64_to_cpup((uint64_t*)(buf + 8));
    request->from  = be64_to_cpup((uint64_t*)(buf + 16));
    request->len   = be32_to_cpup((uint32_t*)(buf + 24));

    TRACE("Got request: "
          "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
          magic, request->type, request->from, request->len);

    if (magic != NBD_REQUEST_MAGIC) {
        LOG("invalid magic (got 0x%x)", magic);
728
        return -EINVAL;
729 730
    }
    return 0;
731 732
}

733
ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
734
{
735 736
    uint8_t buf[NBD_REPLY_SIZE];
    uint32_t magic;
737
    ssize_t ret;
738

739 740 741 742 743 744
    ret = read_sync(csock, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
745
        LOG("read failed");
746
        return -EINVAL;
747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764
    }

    /* Reply
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
       [ 4 ..  7]    error   (0 == no error)
       [ 7 .. 15]    handle
     */

    magic = be32_to_cpup((uint32_t*)buf);
    reply->error  = be32_to_cpup((uint32_t*)(buf + 4));
    reply->handle = be64_to_cpup((uint64_t*)(buf + 8));

    TRACE("Got reply: "
          "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
          magic, reply->error, reply->handle);

    if (magic != NBD_REPLY_MAGIC) {
        LOG("invalid magic (got 0x%x)", magic);
765
        return -EINVAL;
766 767
    }
    return 0;
768 769
}

770
static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
771
{
Paolo Bonzini's avatar
Paolo Bonzini committed
772
    uint8_t buf[NBD_REPLY_SIZE];
773
    ssize_t ret;
774 775 776 777 778 779 780 781 782 783 784 785

    /* Reply
       [ 0 ..  3]    magic   (NBD_REPLY_MAGIC)
       [ 4 ..  7]    error   (0 == no error)
       [ 7 .. 15]    handle
     */
    cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
    cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
    cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);

    TRACE("Sending response to client");

786 787 788 789 790 791
    ret = write_sync(csock, buf, sizeof(buf));
    if (ret < 0) {
        return ret;
    }

    if (ret != sizeof(buf)) {
792
        LOG("writing to socket failed");
793
        return -EINVAL;
794 795
    }
    return 0;
796
}
797

798 799
#define MAX_NBD_REQUESTS 16

800
/* Take an additional reference on @client. */
void nbd_client_get(NBDClient *client)
{
    client->refcount += 1;
}

805
/*
 * Drop one reference on @client.  When the count reaches zero the fd
 * handlers are removed, the socket is closed, the client is unlinked from
 * its export (dropping the export reference it held) and freed.
 */
void nbd_client_put(NBDClient *client)
{
    client->refcount--;
    if (client->refcount != 0) {
        return;
    }

    /* The last reference should be dropped by client->close,
     * which is called by nbd_client_close.
     */
    assert(client->closing);

    qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
    close(client->sock);
    client->sock = -1;

    if (client->exp != NULL) {
        QTAILQ_REMOVE(&client->exp->clients, client, next);
        nbd_export_put(client->exp);
    }

    g_free(client);
}

824
/*
 * Begin closing @client.  Only the first call has any effect: it marks
 * the client as closing, shuts the socket down in both directions and
 * invokes the client's close callback, if one is set.
 */
void nbd_client_close(NBDClient *client)
{
    if (!client->closing) {
        client->closing = true;

        /* Force requests to finish.  They will drop their own references,
         * then we'll close the socket and free the NBDClient.
         */
        shutdown(client->sock, 2);

        /* Also tell the client, so that they release their reference.  */
        if (client->close != NULL) {
            client->close(client);
        }
    }
}

843
/*
 * Allocate a zeroed request for @client, counting it against the
 * client's in-flight limit and taking a reference on the client.
 */
static NBDRequest *nbd_request_get(NBDClient *client)
{
    NBDRequest *request;

    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
    client->nb_requests += 1;

    request = g_slice_new0(NBDRequest);
    nbd_client_get(client);
    request->client = client;

    return request;
}

856
/*
 * Release @req: free its data buffer and the request itself, decrement
 * the owning client's in-flight count, and drop the client reference the
 * request held.  If the client was at its request limit, the main loop is
 * notified.
 */
static void nbd_request_put(NBDRequest *req)
{
    NBDClient *owner = req->client;
    bool was_at_limit;

    if (req->data != NULL) {
        qemu_vfree(req->data);
    }
    g_slice_free(NBDRequest, req);

    was_at_limit = (owner->nb_requests == MAX_NBD_REQUESTS);
    owner->nb_requests--;
    if (was_at_limit) {
        qemu_notify_event();
    }

    nbd_client_put(owner);
}

871
/*
 * Create an export of @bs starting at byte offset @dev_offset.  A @size
 * of -1 means "use bdrv_getlength(bs)".  @close, if non-NULL, is stored
 * as the export's close callback.
 *
 * The new export starts with one reference and holds a reference on @bs.
 */
NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
                          off_t size, uint32_t nbdflags,
                          void (*close)(NBDExport *))
{
    NBDExport *export = g_malloc0(sizeof(NBDExport));

    export->refcount = 1;
    QTAILQ_INIT(&export->clients);
    export->bs = bs;
    export->dev_offset = dev_offset;
    export->nbdflags = nbdflags;
    if (size == -1) {
        export->size = bdrv_getlength(bs);
    } else {
        export->size = size;
    }
    export->close = close;

    bdrv_ref(bs);
    return export;
}

887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919
/*
 * Look up a named export.  Returns the first export in the global list
 * whose name equals @name, or NULL if there is none.  No reference is
 * taken.
 */
NBDExport *nbd_export_find(const char *name)
{
    NBDExport *cur;
    NBDExport *found = NULL;

    QTAILQ_FOREACH(cur, &exports, next) {
        if (!strcmp(name, cur->name)) {
            found = cur;
            break;
        }
    }

    return found;
}

/*
 * Change the name of @exp.  A named export is a member of the global
 * export list and that membership holds one reference.  Passing NULL
 * removes the name (and the list membership); passing a string stores a
 * copy of it.  Nothing happens when @name is the very pointer already
 * stored.
 */
void nbd_export_set_name(NBDExport *exp, const char *name)
{
    if (name == exp->name) {
        return;
    }

    /* Keep the export alive while the list reference is being swapped */
    nbd_export_get(exp);

    if (exp->name) {
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
        nbd_export_put(exp);
    }

    if (name) {
        nbd_export_get(exp);
        exp->name = g_strdup(name);
        QTAILQ_INSERT_TAIL(&exports, exp, next);
    }

    nbd_export_put(exp);
}

920 921
/*
 * Shut down @exp: close every attached client, remove the export's name
 * (and with it its membership in the global export list), and release the
 * reference on the backing block device.
 *
 * A temporary reference is held while this runs.  The block device is
 * released *before* that temporary reference is dropped, because dropping
 * it may free @exp (when the name reference was the only other one), and
 * @exp must not be touched afterwards.
 */
void nbd_export_close(NBDExport *exp)
{
    NBDClient *client, *next;

    nbd_export_get(exp);
    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
        nbd_client_close(client);
    }
    nbd_export_set_name(exp, NULL);
    if (exp->bs) {
        bdrv_unref(exp->bs);
        exp->bs = NULL;
    }
    /* Must stay last: exp may be freed by this call */
    nbd_export_put(exp);
}

/* Take an additional reference on @exp, which must still be alive. */
void nbd_export_get(NBDExport *exp)
{
    assert(exp->refcount > 0);
    exp->refcount += 1;
}

/*
 * Drop one reference on @exp.  If this is the last reference, the export
 * is first shut down with nbd_export_close(); once the count reaches zero
 * the close callback (if any) runs and the export is freed.
 */
void nbd_export_put(NBDExport *exp)
{
    assert(exp->refcount > 0);

    /* About to drop the last reference: shut the export down first */
    if (exp->refcount == 1) {
        nbd_export_close(exp);
    }

    exp->refcount--;
    if (exp->refcount != 0) {
        return;
    }

    assert(exp->name == NULL);

    if (exp->close != NULL) {
        exp->close(exp);
    }

    g_free(exp);
}

960 961 962 963 964
/* Return the block device backing @exp (NULL once it has been released). */
BlockDriverState *nbd_export_get_blockdev(NBDExport *exp)
{
    BlockDriverState *backing = exp->bs;

    return backing;
}

965 966 967 968 969 970 971 972 973
void nbd_export_close_all(void)
{
    NBDExport *exp, *next;

    QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
        nbd_export_close(exp);
    }
}

974
static int nbd_can_read(void *opaque);
975 976 977
static void nbd_read(void *opaque);
static void nbd_restart_write(void *opaque);

978 979
/*
 * Send @reply, followed by @len bytes of req->data when @len is non-zero,
 * to the client that owns @req.
 *
 * The client's send_lock is held for the duration.  While sending, the
 * socket's write handler is set to nbd_restart_write and
 * client->send_coroutine points at the running coroutine; both are reset
 * before the lock is released.  When a payload follows, the socket is
 * corked around header and payload.
 *
 * Returns the result of nbd_send_reply(), or -EIO if the header went out
 * but the payload was not sent completely.
 */
static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
                                 int len)
{
    NBDClient *client = req->client;
    int csock = client->sock;
    bool has_payload = (len != 0);
    ssize_t rc;

    qemu_co_mutex_lock(&client->send_lock);
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
                         nbd_restart_write, client);
    client->send_coroutine = qemu_coroutine_self();

    if (has_payload) {
        socket_set_cork(csock, 1);
    }

    rc = nbd_send_reply(csock, reply);

    if (has_payload) {
        if (rc >= 0) {
            ssize_t sent = qemu_co_send(csock, req->data, len);

            if (sent != len) {
                rc = -EIO;
            }
        }
        socket_set_cork(csock, 0);
    }

    client->send_coroutine = NULL;
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
    qemu_co_mutex_unlock(&client->send_lock);
    return rc;
}

1010
/*
 * Receive the next request header for @req's client into @request and,
 * for a WRITE, its payload as well.  A data buffer of request->len bytes
 * is allocated in req->data for READ and WRITE commands.
 *
 * client->recv_coroutine points at the running coroutine while the
 * function executes and is cleared on every exit path.
 *
 * Returns 0 on success; -EAGAIN if the header read reported it; -EIO for
 * any other header read failure or a short payload read; -EINVAL if the
 * length exceeds NBD_MAX_BUFFER_SIZE or offset + length wraps around.
 */
static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
{
    NBDClient *client = req->client;
    int csock = client->sock;
    uint32_t cmd;
    ssize_t rc;

    client->recv_coroutine = qemu_coroutine_self();

    rc = nbd_receive_request(csock, request);
    if (rc < 0) {
        /* Everything except "try again" is reported as an I/O error */
        rc = (rc == -EAGAIN) ? -EAGAIN : -EIO;
        goto out;
    }

    if (request->len > NBD_MAX_BUFFER_SIZE) {
        LOG("len (%u) is larger than max len (%u)",
            request->len, NBD_MAX_BUFFER_SIZE);
        rc = -EINVAL;
        goto out;
    }

    if ((request->from + request->len) < request->from) {
        LOG("integer overflow detected! "
            "you're probably being attacked");
        rc = -EINVAL;
        goto out;
    }

    TRACE("Decoding type");

    cmd = request->type & NBD_CMD_MASK_COMMAND;
    if (cmd == NBD_CMD_READ || cmd == NBD_CMD_WRITE) {
        req->data = qemu_blockalign(client->exp->bs, request->len);
    }

    if (cmd == NBD_CMD_WRITE) {
        TRACE("Reading %u byte(s)", request->len);

        if (qemu_co_recv(csock, req->data, request->len) != request->len) {
            LOG("reading from socket failed");
            rc = -EIO;
            goto out;
        }
    }

    rc = 0;

out:
    client->recv_coroutine = NULL;
    return rc;
}

1062
static void nbd_trip(void *opaque)
1063
{
1064
    NBDClient *client = opaque;
1065
    NBDExport *exp = client->exp;
1066
    NBDRequest *req;
1067 1068
    struct nbd_request request;
    struct nbd_reply reply;
1069
    ssize_t ret;
1070 1071

    TRACE("Reading request.");
1072 1073 1074
    if (client->closing) {
        return;
    }
1075

1076
    req = nbd_request_get(client);
1077
    ret = nbd_co_receive_request(req, &request);
1078 1079 1080
    if (ret == -EAGAIN) {
        goto done;
    }
1081
    if (ret == -EIO) {
1082
        goto out;
1083
    }
1084

1085 1086 1087
    reply.handle = request.handle;
    reply.error = 0;

1088 1089 1090
    if (ret < 0) {
        reply.error = -ret;
        goto error_reply;
1091 1092
    }

1093
    if ((request.from + request.len) > exp->size) {
1094 1095
            LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
            ", Offset: %" PRIu64 "\n",
1096
                    request.from, request.len,
1097
                    (uint64_t)exp->size, (uint64_t)exp->dev_offset);
1098
        LOG("requested operation past EOF--bad client?");
1099
        goto invalid_request;
1100 1101
    }

1102
    switch (request.type & NBD_CMD_MASK_COMMAND) {
1103 1104 1105
    case NBD_CMD_READ:
        TRACE("Request type is READ");

Paolo Bonzini's avatar
Paolo Bonzini committed
1106 1107 1108 1109 1110 1111 1112 1113 1114
        if (request.type & NBD_CMD_FLAG_FUA) {
            ret = bdrv_co_flush(exp->bs);
            if (ret < 0) {
                LOG("flush failed");
                reply.error = -ret;
                goto error_reply;
            }
        }

1115
        ret = bdrv_read(exp->bs, (request.from + exp->dev_offset) / 512,
1116
                        req->data, request.len / 512);
1117
        if (ret < 0) {
1118
            LOG("reading from file failed");
1119
            reply.error = -ret;
1120
            goto error_reply;
1121 1122 1123
        }

        TRACE("Read %u byte(s)", request.len);
1124
        if (nbd_co_send_reply(req, &reply, request.len) < 0)
1125
            goto out;
1126 1127 1128 1129
        break;
    case NBD_CMD_WRITE:
        TRACE("Request type is WRITE");

1130
        if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
1131
            TRACE("Server is read-only, return error");
1132 1133 1134 1135 1136 1137
            reply.error = EROFS;
            goto error_reply;
        }

        TRACE("Writing to device");

1138
        ret = bdrv_write(exp->bs, (request.from + exp->dev_offset) / 512,
1139
                         req->data, request.len / 512);
1140 1141 1142 1143 1144
        if (ret < 0) {
            LOG("writing to file failed");
            reply.error = -ret;
            goto error_reply;
        }
1145

1146
        if (request.type & NBD_CMD_FLAG_FUA) {
1147
            ret = bdrv_co_flush(exp->bs);
1148
            if (ret < 0) {
1149
                LOG("flush failed");
1150
                reply.error = -ret;
1151
                goto error_reply;
1152
            }
1153 1154
        }

1155
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
1156
            goto out;
1157
        }
1158 1159 1160 1161
        break;
    case NBD_CMD_DISC:
        TRACE("Request type is DISCONNECT");
        errno = 0;
1162
        goto out;
1163 1164 1165
    case NBD_CMD_FLUSH:
        TRACE("Request type is FLUSH");

1166
        ret = bdrv_co_flush(exp->bs);
1167 1168 1169 1170
        if (ret < 0) {
            LOG("flush failed");
            reply.error = -ret;
        }
1171
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
1172
            goto out;
1173
        }
1174 1175 1176
        break;
    case NBD_CMD_TRIM:
        TRACE("Request type is TRIM");
1177 1178
        ret = bdrv_co_discard(exp->bs, (request.from + exp->dev_offset) / 512,
                              request.len / 512);
1179 1180 1181 1182
        if (ret < 0) {
            LOG("discard failed");
            reply.error = -ret;
        }
1183
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
1184
            goto out;
1185
        }
1186
        break;
1187 1188
    default:
        LOG("invalid request type (%u) received", request.type);
1189 1190 1191
    invalid_request:
        reply.error = -EINVAL;
    error_reply:
1192
        if (nbd_co_send_reply(req, &reply, 0) < 0) {
1193
            goto out;
1194
        }
1195
        break;
1196 1197 1198 1199
    }

    TRACE("Request/Reply complete");

1200
done:
1201 1202 1203
    nbd_request_put(req);
    return;

1204
out:
1205
    nbd_request_put(req);
1206
    nbd_client_close(client);
1207
}
1208

1209 1210 1211 1212 1213 1214 1215
/*
 * Predicate registered together with nbd_read() via qemu_set_fd_handler2().
 *
 * Returns non-zero when a receive coroutine is already pending for the
 * client, or when the client has fewer than MAX_NBD_REQUESTS requests
 * outstanding; returns 0 otherwise.
 */
static int nbd_can_read(void *opaque)
{
    NBDClient *c = opaque;

    /* A pending receive coroutine always wants more data. */
    if (c->recv_coroutine) {
        return 1;
    }

    return c->nb_requests < MAX_NBD_REQUESTS;
}

1216 1217 1218 1219
static void nbd_read(void *opaque)
{
    NBDClient *client = opaque;

1220 1221 1222 1223
    if (client->recv_coroutine) {
        qemu_coroutine_enter(client->recv_coroutine, NULL);
    } else {
        qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
1224 1225 1226
    }
}

1227 1228 1229 1230 1231 1232 1233
/*
 * Callback that re-enters the coroutine stored in client->send_coroutine.
 * @opaque is the NBDClient whose send coroutine should continue.
 */
static void nbd_restart_write(void *opaque)
{
    NBDClient *c = opaque;

    qemu_coroutine_enter(c->send_coroutine, NULL);
}

1234 1235
/*
 * Create a client for socket @csock and run the NBD negotiation on it.
 *
 * @exp:   export to serve; may be NULL, in which case the client is not
 *         linked into any export's client list here
 * @csock: connected client socket
 * @close: callback stored in client->close
 *
 * The new client starts with a reference count of 1.  When @exp is
 * non-NULL the client is appended to exp->clients and a reference on the
 * export is taken with nbd_export_get().
 *
 * Returns the new client, or NULL if nbd_send_negotiate() fails.  On
 * failure the client structure is freed; this function does not close
 * @csock.
 */
NBDClient *nbd_client_new(NBDExport *exp, int csock,
                          void (*close)(NBDClient *))
{
    NBDClient *client;

    client = g_malloc0(sizeof(*client));
    client->refcount = 1;
    client->exp = exp;
    client->sock = csock;
    if (nbd_send_negotiate(client) < 0) {
        g_free(client);
        return NULL;
    }
    client->close = close;
    qemu_co_mutex_init(&client->send_lock);
    qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);

    if (exp) {
        QTAILQ_INSERT_TAIL(&exp->clients, client, next);
        nbd_export_get(exp);
    }
    return client;
}