Merge branch 'alex'
[nbd.git] / nbd-server.c
index cd584f0..6d734b3 100644 (file)
@@ -133,15 +133,9 @@ int dontfork = 0;
 /* Debugging macros */
 //#define DODBG
 #ifdef DODBG
-#define DEBUG( a ) printf( a )
-#define DEBUG2( a,b ) printf( a,b )
-#define DEBUG3( a,b,c ) printf( a,b,c )
-#define DEBUG4( a,b,c,d ) printf( a,b,c,d )
+#define DEBUG(...) printf(__VA_ARGS__)
 #else
-#define DEBUG( a )
-#define DEBUG2( a,b ) 
-#define DEBUG3( a,b,c ) 
-#define DEBUG4( a,b,c,d ) 
+#define DEBUG(...)
 #endif
 #ifndef PACKAGE_VERSION
 #define PACKAGE_VERSION ""
@@ -163,6 +157,9 @@ int dontfork = 0;
 #define F_SPARSE 16      /**< flag to tell us copyronwrite should use a sparse file */
 #define F_SDP 32         /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */
 #define F_SYNC 64        /**< Whether to fsync() after a write */
+#define F_FLUSH 128      /**< Whether server wants FLUSH to be sent by the client */
+#define F_FUA 256        /**< Whether server wants FUA to be sent by the client */
+#define F_ROTATIONAL 512  /**< Whether server wants the client to implement the elevator algorithm */
 GHashTable *children;
 char pidfname[256]; /**< name of our PID file */
 char pidftemplate[256]; /**< template to be used for the filename of the PID file */
@@ -208,6 +205,7 @@ typedef struct {
                                  disconnects */
        gchar* servename;    /**< name of the export as selected by nbd-client */
        int max_connections; /**< maximum number of opened connections */
+       gchar* transactionlog;/**< filename for transaction log */
 } SERVER;
 
 /**
@@ -234,6 +232,7 @@ typedef struct {
        u32 difffilelen;     /**< number of pages in difffile */
        u32 *difmap;         /**< see comment on the global difmap for this one */
        gboolean modern;     /**< client was negotiated using modern negotiation protocol */
+       int transactionlogfd;/**< fd for transaction log */
 } CLIENT;
 
 /**
@@ -262,6 +261,22 @@ typedef struct {
                                  is PARAM_BOOL. */
 } PARAM;
 
+static inline const char * getcommandname(uint64_t command) {
+       switch (command) {
+       case NBD_CMD_READ:
+               return "NBD_CMD_READ";
+       case NBD_CMD_WRITE:
+               return "NBD_CMD_WRITE";
+       case NBD_CMD_DISC:
+               return "NBD_CMD_DISC";
+       case NBD_CMD_FLUSH:
+               return "NBD_CMD_FLUSH";
+       default:
+               break;
+       }
+       return "UNKNOWN";
+}
+
 /**
  * Check whether a client is allowed to connect. Works with an authorization
  * file which contains one line per machine, no wildcards.
@@ -577,6 +592,8 @@ void remove_server(gpointer s) {
                g_free(server->prerun);
        if(server->postrun)
                g_free(server->postrun);
+       if(server->transactionlog)
+               g_free(server->transactionlog);
        g_free(server);
 }
 
@@ -616,6 +633,9 @@ SERVER* dup_serve(SERVER *s) {
 
        if(s->postrun)
                serve->postrun = g_strdup(s->postrun);
+
+       if(s->transactionlog)
+               serve->transactionlog = g_strdup(s->transactionlog);
        
        if(s->servename)
                serve->servename = g_strdup(s->servename);
@@ -714,12 +734,16 @@ GArray* parse_cfile(gchar* f, GError** e) {
                { "virtstyle",  FALSE,  PARAM_STRING,   &(virtstyle),           0 },
                { "prerun",     FALSE,  PARAM_STRING,   &(s.prerun),            0 },
                { "postrun",    FALSE,  PARAM_STRING,   &(s.postrun),           0 },
+               { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog),   0 },
                { "readonly",   FALSE,  PARAM_BOOL,     &(s.flags),             F_READONLY },
                { "multifile",  FALSE,  PARAM_BOOL,     &(s.flags),             F_MULTIFILE },
                { "copyonwrite", FALSE, PARAM_BOOL,     &(s.flags),             F_COPYONWRITE },
                { "sparse_cow", FALSE,  PARAM_BOOL,     &(s.flags),             F_SPARSE },
                { "sdp",        FALSE,  PARAM_BOOL,     &(s.flags),             F_SDP },
                { "sync",       FALSE,  PARAM_BOOL,     &(s.flags),             F_SYNC },
+               { "flush",      FALSE,  PARAM_BOOL,     &(s.flags),             F_FLUSH },
+               { "fua",        FALSE,  PARAM_BOOL,     &(s.flags),             F_FUA },
+               { "rotational", FALSE,  PARAM_BOOL,     &(s.flags),             F_ROTATIONAL },
                { "listenaddr", FALSE,  PARAM_STRING,   &(s.listenaddr),        0 },
                { "maxconnections", FALSE, PARAM_INT,   &(s.max_connections),   0 },
        };
@@ -898,7 +922,7 @@ void sigchld_handler(int s) {
                if(!i) {
                        msg3(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
                } else {
-                       DEBUG2("Removing %d from the list of children", pid);
+                       DEBUG("Removing %d from the list of children", pid);
                        g_hash_table_remove(children, &pid);
                }
        }
@@ -976,7 +1000,7 @@ off_t size_autodetect(int fhandle) {
        if (es > ((off_t)0)) {
                return es;
         } else {
-                DEBUG2("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4)));
+                DEBUG("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4)));
         }
 
        err("Could not find size of exported block device: %m");
@@ -1055,7 +1079,7 @@ void myseek(int handle,off_t a) {
  * @param client The client we're serving for
  * @return The number of bytes actually written, or -1 in case of an error
  **/
-ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) {
+ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
        int fhandle;
        off_t foffset;
        size_t maxbytes;
@@ -1066,12 +1090,51 @@ ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) {
        if(maxbytes && len > maxbytes)
                len = maxbytes;
 
-       DEBUG4("(WRITE to fd %d offset %llu len %u), ", fhandle, foffset, len);
+       DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
 
        myseek(fhandle, foffset);
        retval = write(fhandle, buf, len);
        if(client->server->flags & F_SYNC) {
                fsync(fhandle);
+       } else if (fua) {
+
+         /* This is where we would do the following
+          *   #ifdef USE_SYNC_FILE_RANGE
+          * However, we don't, for the reasons set out below
+          * by Christoph Hellwig <hch@infradead.org>
+          *
+          * [BEGINS] 
+          * fdatasync is equivalent to fsync except that it does not flush
+          * non-essential metadata (basically just timestamps in practice), but it
+          * does flush metadata requried to find the data again, e.g. allocation
+          * information and extent maps.  sync_file_range does nothing but flush
+          * out pagecache content - it means you basically won't get your data
+          * back in case of a crash if you either:
+          * 
+          *  a) have a volatile write cache in your disk (e.g. any normal SATA disk)
+          *  b) are using a sparse file on a filesystem
+          *  c) are using a fallocate-preallocated file on a filesystem
+          *  d) use any file on a COW filesystem like btrfs
+          * 
+          * e.g. it only does anything useful for you if you do not have a volatile
+          * write cache, and either use a raw block device node, or just overwrite
+          * an already fully allocated (and not preallocated) file on a non-COW
+          * filesystem.
+          * [ENDS]
+          *
+          * What we should do is open a second FD with O_DSYNC set, then write to
+          * that when appropriate. However, with a Linux client, every REQ_FUA
+          * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
+          * problems.
+          *
+          */
+#if 0
+               sync_file_range(fhandle, foffset, len,
+                               SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
+                               SYNC_FILE_RANGE_WAIT_AFTER);
+#else
+               fdatasync(fhandle);
+#endif
        }
        return retval;
 }
@@ -1080,10 +1143,10 @@ ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) {
  * Call rawexpwrite repeatedly until all data has been written.
  * @return 0 on success, nonzero on failure
  **/
-int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client) {
+int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
        ssize_t ret=0;
 
-       while(len > 0 && (ret=rawexpwrite(a, buf, len, client)) > 0 ) {
+       while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
                a += ret;
                buf += ret;
                len -= ret;
@@ -1112,7 +1175,7 @@ ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
        if(maxbytes && len > maxbytes)
                len = maxbytes;
 
-       DEBUG4("(READ from fd %d offset %llu len %u), ", fhandle, foffset, len);
+       DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
 
        myseek(fhandle, foffset);
        return read(fhandle, buf, len);
@@ -1149,7 +1212,7 @@ int expread(off_t a, char *buf, size_t len, CLIENT *client) {
 
        if (!(client->server->flags & F_COPYONWRITE))
                return(rawexpread_fully(a, buf, len, client));
-       DEBUG3("Asked to read %d bytes at %llu.\n", len, (unsigned long long)a);
+       DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
 
        mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
 
@@ -1159,12 +1222,12 @@ int expread(off_t a, char *buf, size_t len, CLIENT *client) {
                rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
                        len : (size_t)DIFFPAGESIZE-offset;
                if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
-                       DEBUG3("Page %llu is at %lu\n", (unsigned long long)mapcnt,
+                       DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
                               (unsigned long)(client->difmap[mapcnt]));
                        myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset);
                        if (read(client->difffile, buf, rdlen) != rdlen) return -1;
                } else { /* the block is not there */
-                       DEBUG2("Page %llu is not here, we read the original one\n",
+                       DEBUG("Page %llu is not here, we read the original one\n",
                               (unsigned long long)mapcnt);
                        if(rawexpread_fully(a, buf, rdlen, client)) return -1;
                }
@@ -1184,7 +1247,7 @@ int expread(off_t a, char *buf, size_t len, CLIENT *client) {
  * @param client The client we're going to write for.
  * @return 0 on success, nonzero on failure
  **/
-int expwrite(off_t a, char *buf, size_t len, CLIENT *client) {
+int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
        char pagebuf[DIFFPAGESIZE];
        off_t mapcnt,mapl,maph;
        off_t wrlen,rdlen; 
@@ -1192,8 +1255,8 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) {
        off_t offset;
 
        if (!(client->server->flags & F_COPYONWRITE))
-               return(rawexpwrite_fully(a, buf, len, client)); 
-       DEBUG3("Asked to write %d bytes at %llu.\n", len, (unsigned long long)a);
+               return(rawexpwrite_fully(a, buf, len, client, fua)); 
+       DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
 
        mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
 
@@ -1204,7 +1267,7 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) {
                        len : (size_t)DIFFPAGESIZE-offset;
 
                if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
-                       DEBUG3("Page %llu is at %lu\n", (unsigned long long)mapcnt,
+                       DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
                               (unsigned long)(client->difmap[mapcnt])) ;
                        myseek(client->difffile,
                                        client->difmap[mapcnt]*DIFFPAGESIZE+offset);
@@ -1212,7 +1275,7 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) {
                } else { /* the block is not there */
                        myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ;
                        client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
-                       DEBUG3("Page %llu is not here, we put it at %lu\n",
+                       DEBUG("Page %llu is not here, we put it at %lu\n",
                               (unsigned long long)mapcnt,
                               (unsigned long)(client->difmap[mapcnt]));
                        rdlen=DIFFPAGESIZE ;
@@ -1225,6 +1288,30 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) {
                }                                                   
                len-=wrlen ; a+=wrlen ; buf+=wrlen ;
        }
+       if (client->server->flags & F_SYNC) {
+               fsync(client->difffile);
+       } else if (fua) {
+               /* open question: would it be cheaper to do multiple sync_file_ranges?
+                  as we iterate through the above?
+                */
+               fdatasync(client->difffile);
+       }
+       return 0;
+}
+
+int expflush(CLIENT *client) {
+       gint i;
+
+        if (client->server->flags & F_COPYONWRITE) {
+               return fsync(client->difffile);
+       }
+       
+       for (i = 0; i < client->export->len; i++) {
+               FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
+               if (fsync(fi.fhandle) < 0)
+                       return -1;
+       }
+       
        return 0;
 }
 
@@ -1304,6 +1391,7 @@ CLIENT* negotiate(int net, CLIENT *client, GArray* servers) {
                                client->exportsize = OFFT_MAX;
                                client->net = net;
                                client->modern = TRUE;
+                               client->transactionlogfd = -1;
                                free(name);
                                return client;
                        }
@@ -1317,6 +1405,12 @@ CLIENT* negotiate(int net, CLIENT *client, GArray* servers) {
                err("Negotiation failed: %m");
        if (client->server->flags & F_READONLY)
                flags |= NBD_FLAG_READ_ONLY;
+       if (client->server->flags & F_FLUSH)
+               flags |= NBD_FLAG_SEND_FLUSH;
+       if (client->server->flags & F_FUA)
+               flags |= NBD_FLAG_SEND_FUA;
+       if (client->server->flags & F_ROTATIONAL)
+               flags |= NBD_FLAG_ROTATIONAL;
        if (!client->modern) {
                /* oldstyle */
                flags = htonl(flags);
@@ -1337,7 +1431,9 @@ CLIENT* negotiate(int net, CLIENT *client, GArray* servers) {
 }
 
 /** sending macro. */
-#define SEND(net,reply) writeit( net, &reply, sizeof( reply ));
+#define SEND(net,reply) { writeit( net, &reply, sizeof( reply )); \
+       if (client->transactionlogfd != -1) \
+               writeit(client->transactionlogfd, &reply, sizeof(reply)); }
 /** error macro. */
 #define ERROR(client,reply,errcode) { reply.error = htonl(errcode); SEND(client->net,reply); reply.error = 0; }
 /**
@@ -1366,15 +1462,52 @@ int mainloop(CLIENT *client) {
                size_t len;
                size_t currlen;
                size_t writelen;
+               uint16_t command;
 #ifdef DODBG
                i++;
                printf("%d: ", i);
 #endif
                readit(client->net, &request, sizeof(request));
+               if (client->transactionlogfd != -1)
+                       writeit(client->transactionlogfd, &request, sizeof(request));
+
                request.from = ntohll(request.from);
                request.type = ntohl(request.type);
+               command = request.type & NBD_CMD_MASK_COMMAND;
+               len = ntohl(request.len);
+
+               DEBUG("%s from %llu (%llu) len %d, ", getcommandname(command),
+                               (unsigned long long)request.from,
+                               (unsigned long long)request.from / 512, (unsigned int)len);
 
-               if (request.type==NBD_CMD_DISC) {
+               if (request.magic != htonl(NBD_REQUEST_MAGIC))
+                       err("Not enough magic.");
+
+               memcpy(reply.handle, request.handle, sizeof(reply.handle));
+
+               if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) {
+                       if ((request.from + len) > (OFFT_MAX)) {
+                               DEBUG("[Number too large!]");
+                               ERROR(client, reply, EINVAL);
+                               continue;
+                       }
+
+                       if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
+                               DEBUG("[RANGE!]");
+                               ERROR(client, reply, EINVAL);
+                               continue;
+                       }
+
+                       currlen = len;
+                       if (currlen > BUFSIZE - sizeof(struct nbd_reply)) {
+                               currlen = BUFSIZE - sizeof(struct nbd_reply);
+                               msg2(LOG_INFO, "oversized request (this is not a problem)");
+                       }
+               }
+
+               switch (command) {
+
+               case NBD_CMD_DISC:
                        msg2(LOG_INFO, "Disconnect request received.");
                        if (client->server->flags & F_COPYONWRITE) { 
                                if (client->difmap) g_free(client->difmap) ;
@@ -1384,37 +1517,8 @@ int mainloop(CLIENT *client) {
                        }
                        go_on=FALSE;
                        continue;
-               }
 
-               len = ntohl(request.len);
-
-               if (request.magic != htonl(NBD_REQUEST_MAGIC))
-                       err("Not enough magic.");
-               if (len > BUFSIZE - sizeof(struct nbd_reply)) {
-                       currlen = BUFSIZE - sizeof(struct nbd_reply);
-                       msg2(LOG_INFO, "oversized request (this is not a problem)");
-               } else {
-                       currlen = len;
-               }
-#ifdef DODBG
-               printf("%s from %llu (%llu) len %d, ", request.type ? "WRITE" :
-                               "READ", (unsigned long long)request.from,
-                               (unsigned long long)request.from / 512, len);
-#endif
-               memcpy(reply.handle, request.handle, sizeof(reply.handle));
-               if ((request.from + len) > (OFFT_MAX)) {
-                       DEBUG("[Number too large!]");
-                       ERROR(client, reply, EINVAL);
-                       continue;
-               }
-
-               if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
-                       DEBUG("[RANGE!]");
-                       ERROR(client, reply, EINVAL);
-                       continue;
-               }
-
-               if (request.type==NBD_CMD_WRITE) {
+               case NBD_CMD_WRITE:
                        DEBUG("wr: net->buf, ");
                        while(len > 0) {
                                readit(client->net, buf, currlen);
@@ -1425,39 +1529,59 @@ int mainloop(CLIENT *client) {
                                        ERROR(client, reply, EPERM);
                                        continue;
                                }
-                               if (expwrite(request.from, buf, len, client)) {
+                               if (expwrite(request.from, buf, len, client,
+                                            request.type & NBD_CMD_FLAG_FUA)) {
                                        DEBUG("Write failed: %m" );
                                        ERROR(client, reply, errno);
                                        continue;
                                }
-                               SEND(client->net, reply);
-                               DEBUG("OK!\n");
                                len -= currlen;
                                currlen = (len < BUFSIZE) ? len : BUFSIZE;
                        }
+                       SEND(client->net, reply);
+                       DEBUG("OK!\n");
                        continue;
-               }
-               /* READ */
-
-               DEBUG("exp->buf, ");
-               memcpy(buf, &reply, sizeof(struct nbd_reply));
-               p = buf + sizeof(struct nbd_reply);
-               writelen = currlen + sizeof(struct nbd_reply);
-               while(len > 0) {
-                       if (expread(request.from, p, currlen, client)) {
-                               DEBUG("Read failed: %m");
+
+               case NBD_CMD_FLUSH:
+                       DEBUG("fl: ");
+                       if (expflush(client)) {
+                               DEBUG("Flush failed: %m");
                                ERROR(client, reply, errno);
                                continue;
                        }
+                       SEND(client->net, reply);
+                       DEBUG("OK!\n");
+                       continue;
 
-                       DEBUG("buf->net, ");
-                       writeit(client->net, buf, writelen);
-                       len -= currlen;
-                       currlen = (len < BUFSIZE) ? len : BUFSIZE;
-                       p = buf;
-                       writelen = currlen;
+               case NBD_CMD_READ:
+                       DEBUG("exp->buf, ");
+                       memcpy(buf, &reply, sizeof(struct nbd_reply));
+                       if (client->transactionlogfd != -1)
+                               writeit(client->transactionlogfd, &reply, sizeof(reply));
+                       p = buf + sizeof(struct nbd_reply);
+                       writelen = currlen + sizeof(struct nbd_reply);
+                       while(len > 0) {
+                               if (expread(request.from, p, currlen, client)) {
+                                       DEBUG("Read failed: %m");
+                                       ERROR(client, reply, errno);
+                                       continue;
+                               }
+                               
+                               DEBUG("buf->net, ");
+                               writeit(client->net, buf, writelen);
+                               len -= currlen;
+                               request.from += currlen;
+                               currlen = (len < BUFSIZE) ? len : BUFSIZE;
+                               p = buf;
+                               writelen = currlen;
+                       }
+                       DEBUG("OK!\n");
+                       continue;
+
+               default:
+                       DEBUG ("Ignoring unknown command\n");
+                       continue;
                }
-               DEBUG("OK!\n");
        }
        return 0;
 }
@@ -1488,7 +1612,7 @@ void setupexport(CLIENT* client) {
                } else {
                        tmpname=g_strdup(client->exportname);
                }
-               DEBUG2( "Opening %s\n", tmpname );
+               DEBUG( "Opening %s\n", tmpname );
                fi.fhandle = open(tmpname, mode);
                if(fi.fhandle == -1 && mode == O_RDWR) {
                        /* Try again because maybe media was read-only */
@@ -1587,6 +1711,15 @@ int do_run(gchar* command, gchar* file) {
  * @param client a connected client
  **/
 void serveconnection(CLIENT *client) {
+       if (client->server->transactionlog && (client->transactionlogfd == -1))
+       {
+               if (-1 == (client->transactionlogfd = open(client->server->transactionlog,
+                                                          O_WRONLY | O_CREAT,
+                                                          S_IRUSR | S_IWUSR)))
+                       g_warning("Could not open transaction log %s",
+                                 client->server->transactionlog);
+       }
+
        if(do_run(client->server->prerun, client->exportname)) {
                exit(EXIT_FAILURE);
        }
@@ -1600,6 +1733,12 @@ void serveconnection(CLIENT *client) {
 
        mainloop(client);
        do_run(client->server->postrun, client->exportname);
+
+       if (-1 != client->transactionlogfd)
+       {
+               close(client->transactionlogfd);
+               client->transactionlogfd = -1;
+       }
 }
 
 /**
@@ -1785,6 +1924,7 @@ int serveloop(GArray* servers) {
                                        client->server=serve;
                                        client->exportsize=OFFT_MAX;
                                        client->net=net;
+                                       client->transactionlogfd = -1;
                                }
                                set_peername(net, client);
                                if (!authorized_client(client)) {