copy handle on all requests
[nbd.git] / nbd-server.c
index 51b725c..4122cef 100644 (file)
@@ -116,6 +116,9 @@ gchar* rungroup=NULL;
 /** whether to export using the old negotiation protocol (port-based) */
 gboolean do_oldstyle=FALSE;
 
+/* Whether we should avoid forking */
+int dontfork = 0;
+
 /** Logging macros, now nothing goes to syslog unless you say ISSERVER */
 #ifdef ISSERVER
 #define msg2(a,b) syslog(a,b)
@@ -134,11 +137,13 @@ gboolean do_oldstyle=FALSE;
 #define DEBUG2( a,b ) printf( a,b )
 #define DEBUG3( a,b,c ) printf( a,b,c )
 #define DEBUG4( a,b,c,d ) printf( a,b,c,d )
+#define DEBUG5( a,b,c,d,e ) printf( a,b,c,d,e )
 #else
 #define DEBUG( a )
 #define DEBUG2( a,b ) 
 #define DEBUG3( a,b,c ) 
 #define DEBUG4( a,b,c,d ) 
+#define DEBUG5( a,b,c,d,e ) 
 #endif
 #ifndef PACKAGE_VERSION
 #define PACKAGE_VERSION ""
@@ -160,6 +165,9 @@ gboolean do_oldstyle=FALSE;
 #define F_SPARSE 16      /**< flag to tell us copyronwrite should use a sparse file */
 #define F_SDP 32         /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */
 #define F_SYNC 64        /**< Whether to fsync() after a write */
+#define F_FLUSH 128      /**< Whether server wants FLUSH to be sent by the client */
+#define F_FUA 256        /**< Whether server wants FUA to be sent by the client */
+#define F_ROTATIONAL 512  /**< Whether server wants the client to implement the elevator algorithm */
 GHashTable *children;
 char pidfname[256]; /**< name of our PID file */
 char pidftemplate[256]; /**< template to be used for the filename of the PID file */
@@ -320,7 +328,7 @@ int authorized_client(CLIENT *opts) {
  * @param buf a buffer
  * @param len the number of bytes to be read
  **/
-inline void readit(int f, void *buf, size_t len) {
+static inline void readit(int f, void *buf, size_t len) {
        ssize_t res;
        while (len > 0) {
                DEBUG("*");
@@ -342,7 +350,7 @@ inline void readit(int f, void *buf, size_t len) {
  * @param buf a buffer containing data
  * @param len the number of bytes to be written
  **/
-inline void writeit(int f, void *buf, size_t len) {
+static inline void writeit(int f, void *buf, size_t len) {
        ssize_t res;
        while (len > 0) {
                DEBUG("+");
@@ -413,6 +421,7 @@ SERVER* cmdline(int argc, char *argv[]) {
                {"read-only", no_argument, NULL, 'r'},
                {"multi-file", no_argument, NULL, 'm'},
                {"copy-on-write", no_argument, NULL, 'c'},
+               {"dont-fork", no_argument, NULL, 'd'},
                {"authorize-file", required_argument, NULL, 'l'},
                {"config-file", required_argument, NULL, 'C'},
                {"pid-file", required_argument, NULL, 'p'},
@@ -434,7 +443,7 @@ SERVER* cmdline(int argc, char *argv[]) {
        serve=g_new0(SERVER, 1);
        serve->authname = g_strdup(default_authname);
        serve->virtstyle=VIRT_IPLIT;
-       while((c=getopt_long(argc, argv, "-C:cl:mo:rp:M:", long_options, &i))>=0) {
+       while((c=getopt_long(argc, argv, "-C:cdl:mo:rp:M:", long_options, &i))>=0) {
                switch (c) {
                case 1:
                        /* non-option argument */
@@ -503,6 +512,9 @@ SERVER* cmdline(int argc, char *argv[]) {
                case 'c': 
                        serve->flags |=F_COPYONWRITE;
                        break;
+               case 'd': 
+                       dontfork = 1;
+                       break;
                case 'C':
                        g_free(config_file_pos);
                        config_file_pos=g_strdup(optarg);
@@ -599,8 +611,9 @@ SERVER* dup_serve(SERVER *s) {
                serve->authname = strdup(s->authname);
 
        serve->flags = s->flags;
-       serve->socket = serve->socket;
-       serve->socket_family = serve->socket_family;
+       serve->socket = s->socket;
+       serve->socket_family = s->socket_family;
+       serve->virtstyle = s->virtstyle;
        serve->cidrlen = s->cidrlen;
 
        if(s->prerun)
@@ -699,21 +712,24 @@ GArray* parse_cfile(gchar* f, GError** e) {
        SERVER s;
        gchar *virtstyle=NULL;
        PARAM lp[] = {
-               { "exportname", TRUE,   PARAM_STRING,   NULL, 0 },
-               { "port",       TRUE,   PARAM_INT,      NULL, 0 },
-               { "authfile",   FALSE,  PARAM_STRING,   NULL, 0 },
-               { "filesize",   FALSE,  PARAM_INT,      NULL, 0 },
-               { "virtstyle",  FALSE,  PARAM_STRING,   NULL, 0 },
-               { "prerun",     FALSE,  PARAM_STRING,   NULL, 0 },
-               { "postrun",    FALSE,  PARAM_STRING,   NULL, 0 },
-               { "readonly",   FALSE,  PARAM_BOOL,     NULL, F_READONLY },
-               { "multifile",  FALSE,  PARAM_BOOL,     NULL, F_MULTIFILE },
-               { "copyonwrite", FALSE, PARAM_BOOL,     NULL, F_COPYONWRITE },
-               { "sparse_cow", FALSE,  PARAM_BOOL,     NULL, F_SPARSE },
-               { "sdp",        FALSE,  PARAM_BOOL,     NULL, F_SDP },
-               { "sync",       FALSE,  PARAM_BOOL,     NULL, F_SYNC },
-               { "listenaddr", FALSE,  PARAM_STRING,   NULL, 0 },
-               { "maxconnections", FALSE, PARAM_INT,   NULL, 0 },
+               { "exportname", TRUE,   PARAM_STRING,   &(s.exportname),        0 },
+               { "port",       TRUE,   PARAM_INT,      &(s.port),              0 },
+               { "authfile",   FALSE,  PARAM_STRING,   &(s.authname),          0 },
+               { "filesize",   FALSE,  PARAM_INT,      &(s.expected_size),     0 },
+               { "virtstyle",  FALSE,  PARAM_STRING,   &(virtstyle),           0 },
+               { "prerun",     FALSE,  PARAM_STRING,   &(s.prerun),            0 },
+               { "postrun",    FALSE,  PARAM_STRING,   &(s.postrun),           0 },
+               { "readonly",   FALSE,  PARAM_BOOL,     &(s.flags),             F_READONLY },
+               { "multifile",  FALSE,  PARAM_BOOL,     &(s.flags),             F_MULTIFILE },
+               { "copyonwrite", FALSE, PARAM_BOOL,     &(s.flags),             F_COPYONWRITE },
+               { "sparse_cow", FALSE,  PARAM_BOOL,     &(s.flags),             F_SPARSE },
+               { "sdp",        FALSE,  PARAM_BOOL,     &(s.flags),             F_SDP },
+               { "sync",       FALSE,  PARAM_BOOL,     &(s.flags),             F_SYNC },
+               { "flush",      FALSE,  PARAM_BOOL,     &(s.flags),             F_FLUSH },
+               { "fua",        FALSE,  PARAM_BOOL,     &(s.flags),             F_FUA },
+               { "rotational", FALSE,  PARAM_BOOL,     &(s.flags),             F_ROTATIONAL },
+               { "listenaddr", FALSE,  PARAM_STRING,   &(s.listenaddr),        0 },
+               { "maxconnections", FALSE, PARAM_INT,   &(s.max_connections),   0 },
        };
        const int lp_size=sizeof(lp)/sizeof(PARAM);
        PARAM gp[] = {
@@ -753,18 +769,6 @@ GArray* parse_cfile(gchar* f, GError** e) {
        groups = g_key_file_get_groups(cfile, NULL);
        for(i=0;groups[i];i++) {
                memset(&s, '\0', sizeof(SERVER));
-               lp[0].target=&(s.exportname);
-               lp[1].target=&(s.port);
-               lp[2].target=&(s.authname);
-               lp[3].target=&(s.expected_size);
-               lp[4].target=&(virtstyle);
-               lp[5].target=&(s.prerun);
-               lp[6].target=&(s.postrun);
-               lp[7].target=lp[8].target=lp[9].target=
-                               lp[10].target=lp[11].target=
-                               lp[12].target=&(s.flags);
-               lp[13].target=&(s.listenaddr);
-               lp[14].target=&(s.max_connections);
 
                /* After the [generic] group, start parsing exports */
                if(i==1) {
@@ -1059,7 +1063,7 @@ void myseek(int handle,off_t a) {
  * @param client The client we're serving for
  * @return The number of bytes actually written, or -1 in case of an error
  **/
-ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) {
+ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
        int fhandle;
        off_t foffset;
        size_t maxbytes;
@@ -1070,12 +1074,20 @@ ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) {
        if(maxbytes && len > maxbytes)
                len = maxbytes;
 
-       DEBUG4("(WRITE to fd %d offset %llu len %u), ", fhandle, foffset, len);
+       DEBUG5("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, foffset, len, fua);
 
        myseek(fhandle, foffset);
        retval = write(fhandle, buf, len);
        if(client->server->flags & F_SYNC) {
                fsync(fhandle);
+       } else if (fua) {
+#ifdef USE_SYNC_FILE_RANGE
+               sync_file_range(fhandle, foffset, len,
+                               SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
+                               SYNC_FILE_RANGE_WAIT_AFTER);
+#else
+               fdatasync(fhandle);
+#endif
        }
        return retval;
 }
@@ -1084,10 +1096,10 @@ ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) {
  * Call rawexpwrite repeatedly until all data has been written.
  * @return 0 on success, nonzero on failure
  **/
-int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client) {
+int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
        ssize_t ret=0;
 
-       while(len > 0 && (ret=rawexpwrite(a, buf, len, client)) > 0 ) {
+       while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
                a += ret;
                buf += ret;
                len -= ret;
@@ -1188,7 +1200,7 @@ int expread(off_t a, char *buf, size_t len, CLIENT *client) {
  * @param client The client we're going to write for.
  * @return 0 on success, nonzero on failure
  **/
-int expwrite(off_t a, char *buf, size_t len, CLIENT *client) {
+int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
        char pagebuf[DIFFPAGESIZE];
        off_t mapcnt,mapl,maph;
        off_t wrlen,rdlen; 
@@ -1196,7 +1208,7 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) {
        off_t offset;
 
        if (!(client->server->flags & F_COPYONWRITE))
-               return(rawexpwrite_fully(a, buf, len, client)); 
+               return(rawexpwrite_fully(a, buf, len, client, fua)); 
        DEBUG3("Asked to write %d bytes at %llu.\n", len, (unsigned long long)a);
 
        mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
@@ -1229,6 +1241,33 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) {
                }                                                   
                len-=wrlen ; a+=wrlen ; buf+=wrlen ;
        }
+       if (client->server->flags & F_SYNC) {
+               fsync(client->difffile);
+       } else if (fua) {
+               /* Open question: would it be cheaper to call sync_file_range()
+                  once per iteration of the loop above instead?
+                */
+               fdatasync(client->difffile);
+       }
+       return 0;
+}
+
+/**
+ * Sync a client's export to disk: the diff file for copy-on-write exports,
+ * otherwise each file of the export. Returns 0 on success, nonzero on error.
+ **/
+int expflush(CLIENT *client) {
+       guint i; /* unsigned, to match GArray's len field */
+
+       if (client->server->flags & F_COPYONWRITE) {
+               return fsync(client->difffile);
+       }
+
+       for (i = 0; i < client->export->len; i++) {
+               FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
+               if (fsync(fi.fhandle) < 0)
+                       return -1;
+       }
        return 0;
 }
 
@@ -1308,9 +1347,11 @@ CLIENT* negotiate(int net, CLIENT *client, GArray* servers) {
                                client->exportsize = OFFT_MAX;
                                client->net = net;
                                client->modern = TRUE;
+                               free(name);
                                return client;
                        }
                }
+               free(name);
                return NULL;
        }
        /* common */
@@ -1319,6 +1360,12 @@ CLIENT* negotiate(int net, CLIENT *client, GArray* servers) {
                err("Negotiation failed: %m");
        if (client->server->flags & F_READONLY)
                flags |= NBD_FLAG_READ_ONLY;
+       if (client->server->flags & F_FLUSH)
+               flags |= NBD_FLAG_SEND_FLUSH;
+       if (client->server->flags & F_FUA)
+               flags |= NBD_FLAG_SEND_FUA;
+       if (client->server->flags & F_ROTATIONAL)
+               flags |= NBD_FLAG_ROTATIONAL;
        if (!client->modern) {
                /* oldstyle */
                flags = htonl(flags);
@@ -1368,6 +1415,7 @@ int mainloop(CLIENT *client) {
                size_t len;
                size_t currlen;
                size_t writelen;
+               uint16_t command;
 #ifdef DODBG
                i++;
                printf("%d: ", i);
@@ -1375,8 +1423,9 @@ int mainloop(CLIENT *client) {
                readit(client->net, &request, sizeof(request));
                request.from = ntohll(request.from);
                request.type = ntohl(request.type);
+               command = request.type & NBD_CMD_MASK_COMMAND;
 
-               if (request.type==NBD_CMD_DISC) {
+               if (command==NBD_CMD_DISC) {
                        msg2(LOG_INFO, "Disconnect request received.");
                        if (client->server->flags & F_COPYONWRITE) { 
                                if (client->difmap) g_free(client->difmap) ;
@@ -1394,29 +1443,32 @@ int mainloop(CLIENT *client) {
                        err("Not enough magic.");
                if (len > BUFSIZE - sizeof(struct nbd_reply)) {
                        currlen = BUFSIZE - sizeof(struct nbd_reply);
-                       msg("INFO: oversized request (this is not a problem)");
+                       msg2(LOG_INFO, "oversized request (this is not a problem)");
                } else {
                        currlen = len;
                }
 #ifdef DODBG
-               printf("%s from %llu (%llu) len %d, ", request.type ? "WRITE" :
+               printf("%s from %llu (%llu) len %d, ", command ? "WRITE" :
                                "READ", (unsigned long long)request.from,
                                (unsigned long long)request.from / 512, len);
 #endif
                memcpy(reply.handle, request.handle, sizeof(reply.handle));
-               if ((request.from + len) > (OFFT_MAX)) {
-                       DEBUG("[Number too large!]");
-                       ERROR(client, reply, EINVAL);
-                       continue;
-               }
 
-               if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
-                       DEBUG("[RANGE!]");
-                       ERROR(client, reply, EINVAL);
-                       continue;
+               if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) {
+                       if ((request.from + len) > (OFFT_MAX)) {
+                               DEBUG("[Number too large!]");
+                               ERROR(client, reply, EINVAL);
+                               continue;
+                       }
+
+                       if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
+                               DEBUG("[RANGE!]");
+                               ERROR(client, reply, EINVAL);
+                               continue;
+                       }
                }
 
-               if (request.type==NBD_CMD_WRITE) {
+               if (command==NBD_CMD_WRITE) {
                        DEBUG("wr: net->buf, ");
                        while(len > 0) {
                                readit(client->net, buf, currlen);
@@ -1427,7 +1479,8 @@ int mainloop(CLIENT *client) {
                                        ERROR(client, reply, EPERM);
                                        continue;
                                }
-                               if (expwrite(request.from, buf, len, client)) {
+                               if (expwrite(request.from, buf, len, client,
+                                            request.type & NBD_CMD_FLAG_FUA)) {
                                        DEBUG("Write failed: %m" );
                                        ERROR(client, reply, errno);
                                        continue;
@@ -1439,27 +1492,43 @@ int mainloop(CLIENT *client) {
                        }
                        continue;
                }
-               /* READ */
-
-               DEBUG("exp->buf, ");
-               memcpy(buf, &reply, sizeof(struct nbd_reply));
-               p = buf + sizeof(struct nbd_reply);
-               writelen = currlen + sizeof(struct nbd_reply);
-               while(len > 0) {
-                       if (expread(request.from, p, currlen, client)) {
-                               DEBUG("Read failed: %m");
+
+               if (command==NBD_CMD_FLUSH) {
+                       DEBUG("fl: ");
+                       if (expflush(client)) {
+                               DEBUG("Flush failed: %m");
                                ERROR(client, reply, errno);
                                continue;
                        }
+                       SEND(client->net, reply);
+                       DEBUG("OK!\n");
+                       continue;
+               }
 
-                       DEBUG("buf->net, ");
-                       writeit(client->net, buf, writelen);
-                       len -= currlen;
-                       currlen = (len < BUFSIZE) ? len : BUFSIZE;
-                       p = buf;
-                       writelen = currlen;
+               if (command==NBD_CMD_READ) {
+                       DEBUG("exp->buf, ");
+                       memcpy(buf, &reply, sizeof(struct nbd_reply));
+                       p = buf + sizeof(struct nbd_reply);
+                       writelen = currlen + sizeof(struct nbd_reply);
+                       while(len > 0) {
+                               if (expread(request.from, p, currlen, client)) {
+                                       DEBUG("Read failed: %m");
+                                       ERROR(client, reply, errno);
+                                       continue;
+                               }
+                               
+                               DEBUG("buf->net, ");
+                               writeit(client->net, buf, writelen);
+                               len -= currlen;
+                               currlen = (len < BUFSIZE) ? len : BUFSIZE;
+                               p = buf;
+                               writelen = currlen;
+                       }
+                       DEBUG("OK!\n");
+                       continue;
                }
-               DEBUG("OK!\n");
+
+               DEBUG ("Ignoring unknown command\n");
        }
        return 0;
 }
@@ -1745,7 +1814,7 @@ int serveloop(GArray* servers) {
                memcpy(&rset, &mset, sizeof(fd_set));
                if(select(max+1, &rset, NULL, NULL, NULL)>0) {
                        int net = 0;
-                       SERVER* serve;
+                       SERVER* serve=NULL;
 
                        DEBUG("accept, ");
                        if(FD_ISSET(modernsock, &rset)) {
@@ -1756,7 +1825,9 @@ int serveloop(GArray* servers) {
                                        err_nonfatal("negotiation failed");
                                        close(net);
                                        net=0;
+                                       continue;
                                }
+                               serve = client->server;
                        }
                        for(i=0;i<servers->len && !net;i++) {
                                serve=&(g_array_index(servers, SERVER, i));
@@ -1794,31 +1865,33 @@ int serveloop(GArray* servers) {
                                }
                                msg2(LOG_INFO,"Authorized client") ;
                                pid=g_malloc(sizeof(pid_t));
-#ifndef NOFORK
-                               if ((*pid=fork())<0) {
-                                       msg3(LOG_INFO,"Could not fork (%s)",strerror(errno)) ;
-                                       close(net);
-                                       continue;
-                               }
-                               if (*pid>0) { /* parent */
-                                       close(net);
-                                       g_hash_table_insert(children, pid, pid);
-                                       continue;
-                               }
-                               /* child */
-                               g_hash_table_destroy(children);
-                               for(i=0;i<servers->len;i++) {
-                                       serve=&g_array_index(servers, SERVER, i);
-                                       close(serve->socket);
+
+                               if (!dontfork) {
+                                       if ((*pid=fork())<0) {
+                                               msg3(LOG_INFO,"Could not fork (%s)",strerror(errno)) ;
+                                               close(net);
+                                               continue;
+                                       }
+                                       if (*pid>0) { /* parent */
+                                               close(net);
+                                               g_hash_table_insert(children, pid, pid);
+                                               continue;
+                                       }
+                                       /* child */
+                                       g_hash_table_destroy(children);
+                                       for(i=0;i<servers->len;i++) {
+                                               serve=&g_array_index(servers, SERVER, i);
+                                               close(serve->socket);
+                                       }
+                                       /* FALSE does not free the
+                                          actual data. This is required,
+                                          because the client has a
+                                          direct reference into that
+                                          data, and otherwise we get a
+                                          segfault... */
+                                       g_array_free(servers, FALSE);
                                }
-                               /* FALSE does not free the
-                               actual data. This is required,
-                               because the client has a
-                               direct reference into that
-                               data, and otherwise we get a
-                               segfault... */
-                               g_array_free(servers, FALSE);
-#endif // NOFORK
+
                                msg2(LOG_INFO,"Starting to serve");
                                serveconnection(client);
                                exit(EXIT_SUCCESS);
@@ -1985,7 +2058,7 @@ void setup_servers(GArray* servers) {
  *     is only used to create a PID file of the form
  *     /var/run/nbd-server.&lt;port&gt;.pid; it's not modified in any way.
  **/
-#if !defined(NODAEMON) && !defined(NOFORK)
+#if !defined(NODAEMON)
 void daemonize(SERVER* serve) {
        FILE*pidf;
 
@@ -2014,7 +2087,7 @@ void daemonize(SERVER* serve) {
 }
 #else
 #define daemonize(serve)
-#endif /* !defined(NODAEMON) && !defined(NOFORK) */
+#endif /* !defined(NODAEMON) */
 
 /*
  * Everything beyond this point (in the file) is run in non-daemon mode.
@@ -2153,7 +2226,8 @@ int main(int argc, char *argv[]) {
                g_message("No configured exports; quitting.");
                exit(EXIT_FAILURE);
        }
-       daemonize(serve);
+       if (!dontfork)
+               daemonize(serve);
        setup_servers(servers);
        dousers();
        serveloop(servers);