X-Git-Url: http://git.alex.org.uk diff --git a/nbd-server.c b/nbd-server.c index cd584f0..6d734b3 100644 --- a/nbd-server.c +++ b/nbd-server.c @@ -133,15 +133,9 @@ int dontfork = 0; /* Debugging macros */ //#define DODBG #ifdef DODBG -#define DEBUG( a ) printf( a ) -#define DEBUG2( a,b ) printf( a,b ) -#define DEBUG3( a,b,c ) printf( a,b,c ) -#define DEBUG4( a,b,c,d ) printf( a,b,c,d ) +#define DEBUG(...) printf(__VA_ARGS__) #else -#define DEBUG( a ) -#define DEBUG2( a,b ) -#define DEBUG3( a,b,c ) -#define DEBUG4( a,b,c,d ) +#define DEBUG(...) #endif #ifndef PACKAGE_VERSION #define PACKAGE_VERSION "" @@ -163,6 +157,9 @@ int dontfork = 0; #define F_SPARSE 16 /**< flag to tell us copyronwrite should use a sparse file */ #define F_SDP 32 /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */ #define F_SYNC 64 /**< Whether to fsync() after a write */ +#define F_FLUSH 128 /**< Whether server wants FLUSH to be sent by the client */ +#define F_FUA 256 /**< Whether server wants FUA to be sent by the client */ +#define F_ROTATIONAL 512 /**< Whether server wants the client to implement the elevator algorithm */ GHashTable *children; char pidfname[256]; /**< name of our PID file */ char pidftemplate[256]; /**< template to be used for the filename of the PID file */ @@ -208,6 +205,7 @@ typedef struct { disconnects */ gchar* servename; /**< name of the export as selected by nbd-client */ int max_connections; /**< maximum number of opened connections */ + gchar* transactionlog;/**< filename for transaction log */ } SERVER; /** @@ -234,6 +232,7 @@ typedef struct { u32 difffilelen; /**< number of pages in difffile */ u32 *difmap; /**< see comment on the global difmap for this one */ gboolean modern; /**< client was negotiated using modern negotiation protocol */ + int transactionlogfd;/**< fd for transaction log */ } CLIENT; /** @@ -262,6 +261,22 @@ typedef struct { is PARAM_BOOL. 
*/ } PARAM; +static inline const char * getcommandname(uint64_t command) { + switch (command) { + case NBD_CMD_READ: + return "NBD_CMD_READ"; + case NBD_CMD_WRITE: + return "NBD_CMD_WRITE"; + case NBD_CMD_DISC: + return "NBD_CMD_DISC"; + case NBD_CMD_FLUSH: + return "NBD_CMD_FLUSH"; + default: + break; + } + return "UNKNOWN"; +} + /** * Check whether a client is allowed to connect. Works with an authorization * file which contains one line per machine, no wildcards. @@ -577,6 +592,8 @@ void remove_server(gpointer s) { g_free(server->prerun); if(server->postrun) g_free(server->postrun); + if(server->transactionlog) + g_free(server->transactionlog); g_free(server); } @@ -616,6 +633,9 @@ SERVER* dup_serve(SERVER *s) { if(s->postrun) serve->postrun = g_strdup(s->postrun); + + if(s->transactionlog) + serve->transactionlog = g_strdup(s->transactionlog); if(s->servename) serve->servename = g_strdup(s->servename); @@ -714,12 +734,16 @@ GArray* parse_cfile(gchar* f, GError** e) { { "virtstyle", FALSE, PARAM_STRING, &(virtstyle), 0 }, { "prerun", FALSE, PARAM_STRING, &(s.prerun), 0 }, { "postrun", FALSE, PARAM_STRING, &(s.postrun), 0 }, + { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog), 0 }, { "readonly", FALSE, PARAM_BOOL, &(s.flags), F_READONLY }, { "multifile", FALSE, PARAM_BOOL, &(s.flags), F_MULTIFILE }, { "copyonwrite", FALSE, PARAM_BOOL, &(s.flags), F_COPYONWRITE }, { "sparse_cow", FALSE, PARAM_BOOL, &(s.flags), F_SPARSE }, { "sdp", FALSE, PARAM_BOOL, &(s.flags), F_SDP }, { "sync", FALSE, PARAM_BOOL, &(s.flags), F_SYNC }, + { "flush", FALSE, PARAM_BOOL, &(s.flags), F_FLUSH }, + { "fua", FALSE, PARAM_BOOL, &(s.flags), F_FUA }, + { "rotational", FALSE, PARAM_BOOL, &(s.flags), F_ROTATIONAL }, { "listenaddr", FALSE, PARAM_STRING, &(s.listenaddr), 0 }, { "maxconnections", FALSE, PARAM_INT, &(s.max_connections), 0 }, }; @@ -898,7 +922,7 @@ void sigchld_handler(int s) { if(!i) { msg3(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid); } 
else { - DEBUG2("Removing %d from the list of children", pid); + DEBUG("Removing %d from the list of children", pid); g_hash_table_remove(children, &pid); } } @@ -976,7 +1000,7 @@ off_t size_autodetect(int fhandle) { if (es > ((off_t)0)) { return es; } else { - DEBUG2("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4))); + DEBUG("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4))); } err("Could not find size of exported block device: %m"); @@ -1055,7 +1079,7 @@ void myseek(int handle,off_t a) { * @param client The client we're serving for * @return The number of bytes actually written, or -1 in case of an error **/ -ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) { +ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) { int fhandle; off_t foffset; size_t maxbytes; @@ -1066,12 +1090,51 @@ ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) { if(maxbytes && len > maxbytes) len = maxbytes; - DEBUG4("(WRITE to fd %d offset %llu len %u), ", fhandle, foffset, len); + DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua); myseek(fhandle, foffset); retval = write(fhandle, buf, len); if(client->server->flags & F_SYNC) { fsync(fhandle); + } else if (fua) { + + /* This is where we would do the following + * #ifdef USE_SYNC_FILE_RANGE + * However, we don't, for the reasons set out below + * by Christoph Hellwig + * + * [BEGINS] + * fdatasync is equivalent to fsync except that it does not flush + * non-essential metadata (basically just timestamps in practice), but it + * does flush metadata required to find the data again, e.g. allocation + * information and extent maps. sync_file_range does nothing but flush + * out pagecache content - it means you basically won't get your data + * back in case of a crash if you either: + * + * a) have a volatile write cache in your disk (e.g. 
any normal SATA disk) + * b) are using a sparse file on a filesystem + * c) are using a fallocate-preallocated file on a filesystem + * d) use any file on a COW filesystem like btrfs + * + * e.g. it only does anything useful for you if you do not have a volatile + * write cache, and either use a raw block device node, or just overwrite + * an already fully allocated (and not preallocated) file on a non-COW + * filesystem. + * [ENDS] + * + * What we should do is open a second FD with O_DSYNC set, then write to + * that when appropriate. However, with a Linux client, every REQ_FUA + * immediately follows a REQ_FLUSH, so fdatasync does not cause performance + * problems. + * + */ +#if 0 + sync_file_range(fhandle, foffset, len, + SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | + SYNC_FILE_RANGE_WAIT_AFTER); +#else + fdatasync(fhandle); +#endif } return retval; } @@ -1080,10 +1143,10 @@ ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) { * Call rawexpwrite repeatedly until all data has been written. 
* @return 0 on success, nonzero on failure **/ -int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client) { +int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) { ssize_t ret=0; - while(len > 0 && (ret=rawexpwrite(a, buf, len, client)) > 0 ) { + while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) { a += ret; buf += ret; len -= ret; @@ -1112,7 +1175,7 @@ ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) { if(maxbytes && len > maxbytes) len = maxbytes; - DEBUG4("(READ from fd %d offset %llu len %u), ", fhandle, foffset, len); + DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len); myseek(fhandle, foffset); return read(fhandle, buf, len); @@ -1149,7 +1212,7 @@ int expread(off_t a, char *buf, size_t len, CLIENT *client) { if (!(client->server->flags & F_COPYONWRITE)) return(rawexpread_fully(a, buf, len, client)); - DEBUG3("Asked to read %d bytes at %llu.\n", len, (unsigned long long)a); + DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a); mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE; @@ -1159,12 +1222,12 @@ int expread(off_t a, char *buf, size_t len, CLIENT *client) { rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ? len : (size_t)DIFFPAGESIZE-offset; if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */ - DEBUG3("Page %llu is at %lu\n", (unsigned long long)mapcnt, + DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt, (unsigned long)(client->difmap[mapcnt])); myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset); if (read(client->difffile, buf, rdlen) != rdlen) return -1; } else { /* the block is not there */ - DEBUG2("Page %llu is not here, we read the original one\n", + DEBUG("Page %llu is not here, we read the original one\n", (unsigned long long)mapcnt); if(rawexpread_fully(a, buf, rdlen, client)) return -1; } @@ -1184,7 +1247,7 @@ int expread(off_t a, char *buf, size_t len, CLIENT *client) { * @param client The client we're going to write for. 
* @return 0 on success, nonzero on failure **/ -int expwrite(off_t a, char *buf, size_t len, CLIENT *client) { +int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) { char pagebuf[DIFFPAGESIZE]; off_t mapcnt,mapl,maph; off_t wrlen,rdlen; @@ -1192,8 +1255,8 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) { off_t offset; if (!(client->server->flags & F_COPYONWRITE)) - return(rawexpwrite_fully(a, buf, len, client)); - DEBUG3("Asked to write %d bytes at %llu.\n", len, (unsigned long long)a); + return(rawexpwrite_fully(a, buf, len, client, fua)); + DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a); mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ; @@ -1204,7 +1267,7 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) { len : (size_t)DIFFPAGESIZE-offset; if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */ - DEBUG3("Page %llu is at %lu\n", (unsigned long long)mapcnt, + DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt, (unsigned long)(client->difmap[mapcnt])) ; myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset); @@ -1212,7 +1275,7 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) { } else { /* the block is not there */ myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ; client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++; - DEBUG3("Page %llu is not here, we put it at %lu\n", + DEBUG("Page %llu is not here, we put it at %lu\n", (unsigned long long)mapcnt, (unsigned long)(client->difmap[mapcnt])); rdlen=DIFFPAGESIZE ; @@ -1225,6 +1288,30 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) { } len-=wrlen ; a+=wrlen ; buf+=wrlen ; } + if (client->server->flags & F_SYNC) { + fsync(client->difffile); + } else if (fua) { + /* open question: would it be cheaper to do multiple sync_file_ranges? + as we iterate through the above? 
+ */ + fdatasync(client->difffile); + } + return 0; +} + +int expflush(CLIENT *client) { + gint i; + + if (client->server->flags & F_COPYONWRITE) { + return fsync(client->difffile); + } + + for (i = 0; i < client->export->len; i++) { + FILE_INFO fi = g_array_index(client->export, FILE_INFO, i); + if (fsync(fi.fhandle) < 0) + return -1; + } + return 0; } @@ -1304,6 +1391,7 @@ CLIENT* negotiate(int net, CLIENT *client, GArray* servers) { client->exportsize = OFFT_MAX; client->net = net; client->modern = TRUE; + client->transactionlogfd = -1; free(name); return client; } @@ -1317,6 +1405,12 @@ CLIENT* negotiate(int net, CLIENT *client, GArray* servers) { err("Negotiation failed: %m"); if (client->server->flags & F_READONLY) flags |= NBD_FLAG_READ_ONLY; + if (client->server->flags & F_FLUSH) + flags |= NBD_FLAG_SEND_FLUSH; + if (client->server->flags & F_FUA) + flags |= NBD_FLAG_SEND_FUA; + if (client->server->flags & F_ROTATIONAL) + flags |= NBD_FLAG_ROTATIONAL; if (!client->modern) { /* oldstyle */ flags = htonl(flags); @@ -1337,7 +1431,9 @@ CLIENT* negotiate(int net, CLIENT *client, GArray* servers) { } /** sending macro. */ -#define SEND(net,reply) writeit( net, &reply, sizeof( reply )); +#define SEND(net,reply) { writeit( net, &reply, sizeof( reply )); \ + if (client->transactionlogfd != -1) \ + writeit(client->transactionlogfd, &reply, sizeof(reply)); } /** error macro. 
*/ #define ERROR(client,reply,errcode) { reply.error = htonl(errcode); SEND(client->net,reply); reply.error = 0; } /** @@ -1366,15 +1462,52 @@ int mainloop(CLIENT *client) { size_t len; size_t currlen; size_t writelen; + uint16_t command; #ifdef DODBG i++; printf("%d: ", i); #endif readit(client->net, &request, sizeof(request)); + if (client->transactionlogfd != -1) + writeit(client->transactionlogfd, &request, sizeof(request)); + request.from = ntohll(request.from); request.type = ntohl(request.type); + command = request.type & NBD_CMD_MASK_COMMAND; + len = ntohl(request.len); + + DEBUG("%s from %llu (%llu) len %d, ", getcommandname(command), + (unsigned long long)request.from, + (unsigned long long)request.from / 512, (unsigned int)len); - if (request.type==NBD_CMD_DISC) { + if (request.magic != htonl(NBD_REQUEST_MAGIC)) + err("Not enough magic."); + + memcpy(reply.handle, request.handle, sizeof(reply.handle)); + + if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) { + if ((request.from + len) > (OFFT_MAX)) { + DEBUG("[Number too large!]"); + ERROR(client, reply, EINVAL); + continue; + } + + if (((ssize_t)((off_t)request.from + len) > client->exportsize)) { + DEBUG("[RANGE!]"); + ERROR(client, reply, EINVAL); + continue; + } + + currlen = len; + if (currlen > BUFSIZE - sizeof(struct nbd_reply)) { + currlen = BUFSIZE - sizeof(struct nbd_reply); + msg2(LOG_INFO, "oversized request (this is not a problem)"); + } + } + + switch (command) { + + case NBD_CMD_DISC: msg2(LOG_INFO, "Disconnect request received."); if (client->server->flags & F_COPYONWRITE) { if (client->difmap) g_free(client->difmap) ; @@ -1384,37 +1517,8 @@ int mainloop(CLIENT *client) { } go_on=FALSE; continue; - } - len = ntohl(request.len); - - if (request.magic != htonl(NBD_REQUEST_MAGIC)) - err("Not enough magic."); - if (len > BUFSIZE - sizeof(struct nbd_reply)) { - currlen = BUFSIZE - sizeof(struct nbd_reply); - msg2(LOG_INFO, "oversized request (this is not a problem)"); - } else { - 
currlen = len; - } -#ifdef DODBG - printf("%s from %llu (%llu) len %d, ", request.type ? "WRITE" : - "READ", (unsigned long long)request.from, - (unsigned long long)request.from / 512, len); -#endif - memcpy(reply.handle, request.handle, sizeof(reply.handle)); - if ((request.from + len) > (OFFT_MAX)) { - DEBUG("[Number too large!]"); - ERROR(client, reply, EINVAL); - continue; - } - - if (((ssize_t)((off_t)request.from + len) > client->exportsize)) { - DEBUG("[RANGE!]"); - ERROR(client, reply, EINVAL); - continue; - } - - if (request.type==NBD_CMD_WRITE) { + case NBD_CMD_WRITE: DEBUG("wr: net->buf, "); while(len > 0) { readit(client->net, buf, currlen); @@ -1425,39 +1529,59 @@ int mainloop(CLIENT *client) { ERROR(client, reply, EPERM); continue; } - if (expwrite(request.from, buf, len, client)) { + if (expwrite(request.from, buf, len, client, + request.type & NBD_CMD_FLAG_FUA)) { DEBUG("Write failed: %m" ); ERROR(client, reply, errno); continue; } - SEND(client->net, reply); - DEBUG("OK!\n"); len -= currlen; currlen = (len < BUFSIZE) ? len : BUFSIZE; } + SEND(client->net, reply); + DEBUG("OK!\n"); continue; - } - /* READ */ - - DEBUG("exp->buf, "); - memcpy(buf, &reply, sizeof(struct nbd_reply)); - p = buf + sizeof(struct nbd_reply); - writelen = currlen + sizeof(struct nbd_reply); - while(len > 0) { - if (expread(request.from, p, currlen, client)) { - DEBUG("Read failed: %m"); + + case NBD_CMD_FLUSH: + DEBUG("fl: "); + if (expflush(client)) { + DEBUG("Flush failed: %m"); ERROR(client, reply, errno); continue; } + SEND(client->net, reply); + DEBUG("OK!\n"); + continue; - DEBUG("buf->net, "); - writeit(client->net, buf, writelen); - len -= currlen; - currlen = (len < BUFSIZE) ? 
len : BUFSIZE; - p = buf; - writelen = currlen; + case NBD_CMD_READ: + DEBUG("exp->buf, "); + memcpy(buf, &reply, sizeof(struct nbd_reply)); + if (client->transactionlogfd != -1) + writeit(client->transactionlogfd, &reply, sizeof(reply)); + p = buf + sizeof(struct nbd_reply); + writelen = currlen + sizeof(struct nbd_reply); + while(len > 0) { + if (expread(request.from, p, currlen, client)) { + DEBUG("Read failed: %m"); + ERROR(client, reply, errno); + continue; + } + + DEBUG("buf->net, "); + writeit(client->net, buf, writelen); + len -= currlen; + request.from += currlen; + currlen = (len < BUFSIZE) ? len : BUFSIZE; + p = buf; + writelen = currlen; + } + DEBUG("OK!\n"); + continue; + + default: + DEBUG ("Ignoring unknown command\n"); + continue; } - DEBUG("OK!\n"); } return 0; } @@ -1488,7 +1612,7 @@ void setupexport(CLIENT* client) { } else { tmpname=g_strdup(client->exportname); } - DEBUG2( "Opening %s\n", tmpname ); + DEBUG( "Opening %s\n", tmpname ); fi.fhandle = open(tmpname, mode); if(fi.fhandle == -1 && mode == O_RDWR) { /* Try again because maybe media was read-only */ @@ -1587,6 +1711,15 @@ int do_run(gchar* command, gchar* file) { * @param client a connected client **/ void serveconnection(CLIENT *client) { + if (client->server->transactionlog && (client->transactionlogfd == -1)) + { + if (-1 == (client->transactionlogfd = open(client->server->transactionlog, + O_WRONLY | O_CREAT, + S_IRUSR | S_IWUSR))) + g_warning("Could not open transaction log %s", + client->server->transactionlog); + } + if(do_run(client->server->prerun, client->exportname)) { exit(EXIT_FAILURE); } @@ -1600,6 +1733,12 @@ void serveconnection(CLIENT *client) { mainloop(client); do_run(client->server->postrun, client->exportname); + + if (-1 != client->transactionlogfd) + { + close(client->transactionlogfd); + client->transactionlogfd = -1; + } } /** @@ -1785,6 +1924,7 @@ int serveloop(GArray* servers) { client->server=serve; client->exportsize=OFFT_MAX; client->net=net; + 
client->transactionlogfd = -1; } set_peername(net, client); if (!authorized_client(client)) {