X-Git-Url: http://git.alex.org.uk diff --git a/nbd-server.c b/nbd-server.c index 521facc..3343d9d 100644 --- a/nbd-server.c +++ b/nbd-server.c @@ -96,6 +96,10 @@ #define MY_NAME "nbd_server" #include "cliserv.h" +#ifdef WITH_SDP +#include +#endif + /** Default position of the config file */ #ifndef SYSCONFDIR #define SYSCONFDIR "/etc" @@ -109,6 +113,11 @@ gchar* config_file_pos; gchar* runuser=NULL; /** What group we're running as */ gchar* rungroup=NULL; +/** whether to export using the old negotiation protocol (port-based) */ +gboolean do_oldstyle=FALSE; + +/* Whether we should avoid forking */ +int dontfork = 0; /** Logging macros, now nothing goes to syslog unless you say ISSERVER */ #ifdef ISSERVER @@ -124,15 +133,9 @@ gchar* rungroup=NULL; /* Debugging macros */ //#define DODBG #ifdef DODBG -#define DEBUG( a ) printf( a ) -#define DEBUG2( a,b ) printf( a,b ) -#define DEBUG3( a,b,c ) printf( a,b,c ) -#define DEBUG4( a,b,c,d ) printf( a,b,c,d ) +#define DEBUG(...) printf(__VA_ARGS__) #else -#define DEBUG( a ) -#define DEBUG2( a,b ) -#define DEBUG3( a,b,c ) -#define DEBUG4( a,b,c,d ) +#define DEBUG(...) 
#endif #ifndef PACKAGE_VERSION #define PACKAGE_VERSION "" @@ -144,7 +147,7 @@ gchar* rungroup=NULL; #define OFFT_MAX ~((off_t)1<<(sizeof(off_t)*8-1)) #define LINELEN 256 /**< Size of static buffer used to read the authorization file (yuck) */ -#define BUFSIZE (1024*1024) /**< Size of buffer that can hold requests */ +#define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */ #define DIFFPAGESIZE 4096 /**< diff file uses those chunks */ #define F_READONLY 1 /**< flag to tell us a file is readonly */ #define F_MULTIFILE 2 /**< flag to tell us a file is exported using -m */ @@ -154,11 +157,21 @@ gchar* rungroup=NULL; #define F_SPARSE 16 /**< flag to tell us copyronwrite should use a sparse file */ #define F_SDP 32 /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */ #define F_SYNC 64 /**< Whether to fsync() after a write */ +#define F_FLUSH 128 /**< Whether server wants FLUSH to be sent by the client */ +#define F_FUA 256 /**< Whether server wants FUA to be sent by the client */ +#define F_ROTATIONAL 512 /**< Whether server wants the client to implement the elevator algorithm */ GHashTable *children; char pidfname[256]; /**< name of our PID file */ char pidftemplate[256]; /**< template to be used for the filename of the PID file */ char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */ +int modernsock=0; /**< Socket for the modern handler. 
Not used + if a client was only specified on the + command line; only port used if + oldstyle is set to false (and then the + command-line client isn't used, gna gna) */ +char* modern_listen; /**< listenaddr value for modernsock */ + /** * Types of virtuatlization **/ @@ -190,6 +203,9 @@ typedef struct { but before starting to serve */ gchar* postrun; /**< command that will be ran after the client disconnects */ + gchar* servename; /**< name of the export as selected by nbd-client */ + int max_connections; /**< maximum number of opened connections */ + gchar* transactionlog;/**< filename for transaction log */ } SERVER; /** @@ -215,6 +231,8 @@ typedef struct { make -m and -c mutually exclusive */ u32 difffilelen; /**< number of pages in difffile */ u32 *difmap; /**< see comment on the global difmap for this one */ + gboolean modern; /**< client was negotiated using modern negotiation protocol */ + int transactionlogfd;/**< fd for transaction log */ } CLIENT; /** @@ -274,7 +292,7 @@ int authorized_client(CLIENT *opts) { return 0; } *(tmp++)=0; - if(inet_aton(line,&addr)) { + if(!inet_aton(line,&addr)) { msg4(LOG_CRIT, ERRMSG, line, opts->server->authname); return 0; } @@ -304,7 +322,7 @@ int authorized_client(CLIENT *opts) { * @param buf a buffer * @param len the number of bytes to be read **/ -inline void readit(int f, void *buf, size_t len) { +static inline void readit(int f, void *buf, size_t len) { ssize_t res; while (len > 0) { DEBUG("*"); @@ -326,7 +344,7 @@ inline void readit(int f, void *buf, size_t len) { * @param buf a buffer containing data * @param len the number of bytes to be written **/ -inline void writeit(int f, void *buf, size_t len) { +static inline void writeit(int f, void *buf, size_t len) { ssize_t res; while (len > 0) { DEBUG("+"); @@ -343,14 +361,15 @@ inline void writeit(int f, void *buf, size_t len) { */ void usage() { printf("This is nbd-server version " VERSION "\n"); - printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l 
authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name]\n" + printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections]\n" "\t-r|--read-only\t\tread only\n" "\t-m|--multi-file\t\tmultiple file\n" "\t-c|--copy-on-write\tcopy on write\n" "\t-C|--config-file\tspecify an alternate configuration file\n" "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n" "\t-p|--pid-file\t\tspecify a filename to write our PID to\n" - "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n\n" + "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n" + "\t-M|--max-connections\tspecify the maximum number of opened connections\n\n" "\tif port is set to 0, stdin is used (for running from inetd)\n" "\tif file_to_export contains '%%s', it is substituted with the IP\n" "\t\taddress of the machine trying to connect\n" @@ -396,10 +415,12 @@ SERVER* cmdline(int argc, char *argv[]) { {"read-only", no_argument, NULL, 'r'}, {"multi-file", no_argument, NULL, 'm'}, {"copy-on-write", no_argument, NULL, 'c'}, + {"dont-fork", no_argument, NULL, 'd'}, {"authorize-file", required_argument, NULL, 'l'}, {"config-file", required_argument, NULL, 'C'}, {"pid-file", required_argument, NULL, 'p'}, {"output-config", required_argument, NULL, 'o'}, + {"max-connection", required_argument, NULL, 'M'}, {0,0,0,0} }; SERVER *serve; @@ -416,7 +437,7 @@ SERVER* cmdline(int argc, char *argv[]) { serve=g_new0(SERVER, 1); serve->authname = g_strdup(default_authname); serve->virtstyle=VIRT_IPLIT; - while((c=getopt_long(argc, argv, "-C:cl:mo:rp:", long_options, &i))>=0) { + while((c=getopt_long(argc, argv, "-C:cdl:mo:rp:M:", long_options, &i))>=0) { switch (c) { 
case 1: /* non-option argument */ @@ -485,6 +506,9 @@ SERVER* cmdline(int argc, char *argv[]) { case 'c': serve->flags |=F_COPYONWRITE; break; + case 'd': + dontfork = 1; + break; case 'C': g_free(config_file_pos); config_file_pos=g_strdup(optarg); @@ -493,6 +517,9 @@ SERVER* cmdline(int argc, char *argv[]) { g_free(serve->authname); serve->authname=g_strdup(optarg); break; + case 'M': + serve->max_connections = strtol(optarg, NULL, 0); + break; default: usage(); exit(EXIT_FAILURE); @@ -504,6 +531,8 @@ SERVER* cmdline(int argc, char *argv[]) { if(nonspecial<2) { g_free(serve); serve=NULL; + } else { + do_oldstyle = TRUE; } if(do_output) { if(!serve) { @@ -525,8 +554,10 @@ typedef enum { CFILE_VALUE_INVALID, /**< A value is syntactically invalid */ CFILE_VALUE_UNSUPPORTED,/**< A value is not supported in this build */ CFILE_PROGERR, /**< Programmer error */ - CFILE_NO_EXPORTS /**< A config file was specified that does not + CFILE_NO_EXPORTS, /**< A config file was specified that does not define any exports */ + CFILE_INCORRECT_PORT, /**< The reserved port was specified for an + old-style export. 
*/ } CFILE_ERRORS; /** @@ -539,6 +570,14 @@ void remove_server(gpointer s) { g_free(server->exportname); if(server->authname) g_free(server->authname); + if(server->listenaddr) + g_free(server->listenaddr); + if(server->prerun) + g_free(server->prerun); + if(server->postrun) + g_free(server->postrun); + if(server->transactionlog) + g_free(server->transactionlog); g_free(server); } @@ -551,33 +590,42 @@ SERVER* dup_serve(SERVER *s) { SERVER *serve = NULL; serve=g_new0(SERVER, 1); - if (serve == NULL) + if(serve == NULL) return NULL; - if (s->exportname) + if(s->exportname) serve->exportname = g_strdup(s->exportname); serve->expected_size = s->expected_size; - if (s->listenaddr) + if(s->listenaddr) serve->listenaddr = g_strdup(s->listenaddr); serve->port = s->port; - if (s->authname) + if(s->authname) serve->authname = strdup(s->authname); serve->flags = s->flags; - serve->socket = serve->socket; - serve->socket_family = serve->socket_family; + serve->socket = s->socket; + serve->socket_family = s->socket_family; + serve->virtstyle = s->virtstyle; serve->cidrlen = s->cidrlen; - if (s->prerun) + if(s->prerun) serve->prerun = g_strdup(s->prerun); - if (s->postrun) + if(s->postrun) serve->postrun = g_strdup(s->postrun); + if(s->transactionlog) + serve->transactionlog = g_strdup(s->transactionlog); + + if(s->servename) + serve->servename = g_strdup(s->servename); + + serve->max_connections = s->max_connections; + return serve; } @@ -587,8 +635,7 @@ SERVER* dup_serve(SERVER *s) { * @param a server array * @return 0 success, -1 error */ -int append_serve(SERVER *s, GArray *a) -{ +int append_serve(SERVER *s, GArray *a) { SERVER *ns = NULL; struct addrinfo hints; struct addrinfo *ai = NULL; @@ -664,25 +711,32 @@ GArray* parse_cfile(gchar* f, GError** e) { SERVER s; gchar *virtstyle=NULL; PARAM lp[] = { - { "exportname", TRUE, PARAM_STRING, NULL, 0 }, - { "port", TRUE, PARAM_INT, NULL, 0 }, - { "authfile", FALSE, PARAM_STRING, NULL, 0 }, - { "filesize", FALSE, PARAM_INT, 
NULL, 0 }, - { "virtstyle", FALSE, PARAM_STRING, NULL, 0 }, - { "prerun", FALSE, PARAM_STRING, NULL, 0 }, - { "postrun", FALSE, PARAM_STRING, NULL, 0 }, - { "readonly", FALSE, PARAM_BOOL, NULL, F_READONLY }, - { "multifile", FALSE, PARAM_BOOL, NULL, F_MULTIFILE }, - { "copyonwrite", FALSE, PARAM_BOOL, NULL, F_COPYONWRITE }, - { "sparse_cow", FALSE, PARAM_BOOL, NULL, F_SPARSE }, - { "sdp", FALSE, PARAM_BOOL, NULL, F_SDP }, - { "sync", FALSE, PARAM_BOOL, NULL, F_SYNC }, - { "listenaddr", FALSE, PARAM_STRING, NULL, 0 }, + { "exportname", TRUE, PARAM_STRING, &(s.exportname), 0 }, + { "port", TRUE, PARAM_INT, &(s.port), 0 }, + { "authfile", FALSE, PARAM_STRING, &(s.authname), 0 }, + { "filesize", FALSE, PARAM_INT, &(s.expected_size), 0 }, + { "virtstyle", FALSE, PARAM_STRING, &(virtstyle), 0 }, + { "prerun", FALSE, PARAM_STRING, &(s.prerun), 0 }, + { "postrun", FALSE, PARAM_STRING, &(s.postrun), 0 }, + { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog), 0 }, + { "readonly", FALSE, PARAM_BOOL, &(s.flags), F_READONLY }, + { "multifile", FALSE, PARAM_BOOL, &(s.flags), F_MULTIFILE }, + { "copyonwrite", FALSE, PARAM_BOOL, &(s.flags), F_COPYONWRITE }, + { "sparse_cow", FALSE, PARAM_BOOL, &(s.flags), F_SPARSE }, + { "sdp", FALSE, PARAM_BOOL, &(s.flags), F_SDP }, + { "sync", FALSE, PARAM_BOOL, &(s.flags), F_SYNC }, + { "flush", FALSE, PARAM_BOOL, &(s.flags), F_FLUSH }, + { "fua", FALSE, PARAM_BOOL, &(s.flags), F_FUA }, + { "rotational", FALSE, PARAM_BOOL, &(s.flags), F_ROTATIONAL }, + { "listenaddr", FALSE, PARAM_STRING, &(s.listenaddr), 0 }, + { "maxconnections", FALSE, PARAM_INT, &(s.max_connections), 0 }, }; const int lp_size=sizeof(lp)/sizeof(PARAM); PARAM gp[] = { { "user", FALSE, PARAM_STRING, &runuser, 0 }, { "group", FALSE, PARAM_STRING, &rungroup, 0 }, + { "oldstyle", FALSE, PARAM_BOOL, &do_oldstyle, 1 }, + { "listenaddr", FALSE, PARAM_STRING, &modern_listen, 0 }, }; PARAM* p=gp; int p_size=sizeof(gp)/sizeof(PARAM); @@ -715,17 +769,6 @@ GArray* 
parse_cfile(gchar* f, GError** e) { groups = g_key_file_get_groups(cfile, NULL); for(i=0;groups[i];i++) { memset(&s, '\0', sizeof(SERVER)); - lp[0].target=&(s.exportname); - lp[1].target=&(s.port); - lp[2].target=&(s.authname); - lp[3].target=&(s.expected_size); - lp[4].target=&(virtstyle); - lp[5].target=&(s.prerun); - lp[6].target=&(s.postrun); - lp[7].target=lp[8].target=lp[9].target= - lp[10].target=lp[11].target= - lp[12].target=&(s.flags); - lp[13].target=&(s.listenaddr); /* After the [generic] group, start parsing exports */ if(i==1) { @@ -763,6 +806,11 @@ GArray* parse_cfile(gchar* f, GError** e) { } break; } + if(!strcmp(p[j].paramname, "port") && !strcmp(p[j].target, NBD_DEFAULT_PORT)) { + g_set_error(e, errdomain, CFILE_INCORRECT_PORT, "Config file specifies default port for oldstyle export"); + g_key_file_free(cfile); + return NULL; + } if(err) { if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) { if(!p[j].required) { @@ -804,6 +852,10 @@ GArray* parse_cfile(gchar* f, GError** e) { g_key_file_free(cfile); return NULL; } + if(s.port && !do_oldstyle) { + g_warning("A port was specified, but oldstyle exports were not requested. 
This may not do what you expect."); + g_warning("Please read 'man 5 nbd-server' and search for oldstyle for more info"); + } } else { s.virtstyle=VIRT_IPLIT; } @@ -812,8 +864,13 @@ GArray* parse_cfile(gchar* f, GError** e) { /* Don't append values for the [generic] group */ if(i>0) { s.socket_family = AF_UNSPEC; + s.servename = groups[i]; append_serve(&s, retval); + } else { + if(!do_oldstyle) { + lp[1].required = 0; + } } #ifndef WITH_SDP if(s.flags & F_SDP) { @@ -849,7 +906,7 @@ void sigchld_handler(int s) { if(!i) { msg3(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid); } else { - DEBUG2("Removing %d from the list of children", pid); + DEBUG("Removing %d from the list of children", pid); g_hash_table_remove(children, &pid); } } @@ -905,7 +962,7 @@ off_t size_autodetect(int fhandle) { #ifdef HAVE_SYS_IOCTL_H #ifdef BLKGETSIZE64 DEBUG("looking for export size with ioctl BLKGETSIZE64\n"); - if (!ioctl(fhandle, BLKGETSIZE64, bytes) && bytes) { + if (!ioctl(fhandle, BLKGETSIZE64, &bytes) && bytes) { return (off_t)bytes; } #endif /* BLKGETSIZE64 */ @@ -927,7 +984,7 @@ off_t size_autodetect(int fhandle) { if (es > ((off_t)0)) { return es; } else { - DEBUG2("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4))); + DEBUG("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4))); } err("Could not find size of exported block device: %m"); @@ -1006,7 +1063,7 @@ void myseek(int handle,off_t a) { * @param client The client we're serving for * @return The number of bytes actually written, or -1 in case of an error **/ -ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) { +ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) { int fhandle; off_t foffset; size_t maxbytes; @@ -1017,12 +1074,51 @@ ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) { if(maxbytes && len > maxbytes) len = maxbytes; - DEBUG4("(WRITE to fd %d offset %llu len %u), ", fhandle, foffset, 
len); + DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua); myseek(fhandle, foffset); retval = write(fhandle, buf, len); if(client->server->flags & F_SYNC) { fsync(fhandle); + } else if (fua) { + + /* This is where we would do the following + * #ifdef USE_SYNC_FILE_RANGE + * However, we don't, for the reasons set out below + * by Christoph Hellwig + * + * [BEGINS] + * fdatasync is equivalent to fsync except that it does not flush + * non-essential metadata (basically just timestamps in practice), but it + * does flush metadata requried to find the data again, e.g. allocation + * information and extent maps. sync_file_range does nothing but flush + * out pagecache content - it means you basically won't get your data + * back in case of a crash if you either: + * + * a) have a volatile write cache in your disk (e.g. any normal SATA disk) + * b) are using a sparse file on a filesystem + * c) are using a fallocate-preallocated file on a filesystem + * d) use any file on a COW filesystem like btrfs + * + * e.g. it only does anything useful for you if you do not have a volatile + * write cache, and either use a raw block device node, or just overwrite + * an already fully allocated (and not preallocated) file on a non-COW + * filesystem. + * [ENDS] + * + * What we should do is open a second FD with O_DSYNC set, then write to + * that when appropriate. However, with a Linux client, every REQ_FUA + * immediately follows a REQ_FLUSH, so fdatasync does not cause performance + * problems. + * + */ +#if 0 + sync_file_range(fhandle, foffset, len, + SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | + SYNC_FILE_RANGE_WAIT_AFTER); +#else + fdatasync(fhandle); +#endif } return retval; } @@ -1031,10 +1127,10 @@ ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) { * Call rawexpwrite repeatedly until all data has been written. 
* @return 0 on success, nonzero on failure **/ -int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client) { +int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) { ssize_t ret=0; - while(len > 0 && (ret=rawexpwrite(a, buf, len, client)) > 0 ) { + while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) { a += ret; buf += ret; len -= ret; @@ -1063,7 +1159,7 @@ ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) { if(maxbytes && len > maxbytes) len = maxbytes; - DEBUG4("(READ from fd %d offset %llu len %u), ", fhandle, foffset, len); + DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len); myseek(fhandle, foffset); return read(fhandle, buf, len); @@ -1100,7 +1196,7 @@ int expread(off_t a, char *buf, size_t len, CLIENT *client) { if (!(client->server->flags & F_COPYONWRITE)) return(rawexpread_fully(a, buf, len, client)); - DEBUG3("Asked to read %d bytes at %llu.\n", len, (unsigned long long)a); + DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a); mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE; @@ -1110,12 +1206,12 @@ int expread(off_t a, char *buf, size_t len, CLIENT *client) { rdlen=(0difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */ - DEBUG3("Page %llu is at %lu\n", (unsigned long long)mapcnt, + DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt, (unsigned long)(client->difmap[mapcnt])); myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset); if (read(client->difffile, buf, rdlen) != rdlen) return -1; } else { /* the block is not there */ - DEBUG2("Page %llu is not here, we read the original one\n", + DEBUG("Page %llu is not here, we read the original one\n", (unsigned long long)mapcnt); if(rawexpread_fully(a, buf, rdlen, client)) return -1; } @@ -1135,7 +1231,7 @@ int expread(off_t a, char *buf, size_t len, CLIENT *client) { * @param client The client we're going to write for. 
* @return 0 on success, nonzero on failure **/ -int expwrite(off_t a, char *buf, size_t len, CLIENT *client) { +int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) { char pagebuf[DIFFPAGESIZE]; off_t mapcnt,mapl,maph; off_t wrlen,rdlen; @@ -1143,8 +1239,8 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) { off_t offset; if (!(client->server->flags & F_COPYONWRITE)) - return(rawexpwrite_fully(a, buf, len, client)); - DEBUG3("Asked to write %d bytes at %llu.\n", len, (unsigned long long)a); + return(rawexpwrite_fully(a, buf, len, client, fua)); + DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a); mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ; @@ -1155,7 +1251,7 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) { len : (size_t)DIFFPAGESIZE-offset; if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */ - DEBUG3("Page %llu is at %lu\n", (unsigned long long)mapcnt, + DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt, (unsigned long)(client->difmap[mapcnt])) ; myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset); @@ -1163,7 +1259,7 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) { } else { /* the block is not there */ myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ; client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++; - DEBUG3("Page %llu is not here, we put it at %lu\n", + DEBUG("Page %llu is not here, we put it at %lu\n", (unsigned long long)mapcnt, (unsigned long)(client->difmap[mapcnt])); rdlen=DIFFPAGESIZE ; @@ -1176,6 +1272,30 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) { } len-=wrlen ; a+=wrlen ; buf+=wrlen ; } + if (client->server->flags & F_SYNC) { + fsync(client->difffile); + } else if (fua) { + /* open question: would it be cheaper to do multiple sync_file_ranges? + as we iterate through the above? 
+ */ + fdatasync(client->difffile); + } + return 0; +} + +int expflush(CLIENT *client) { + gint i; + + if (client->server->flags & F_COPYONWRITE) { + return fsync(client->difffile); + } + + for (i = 0; i < client->export->len; i++) { + FILE_INFO fi = g_array_index(client->export, FILE_INFO, i); + if (fsync(fi.fhandle) < 0) + return -1; + } + return 0; } @@ -1184,31 +1304,120 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) { * * @param client The client we're negotiating with. **/ -void negotiate(CLIENT *client) { +CLIENT* negotiate(int net, CLIENT *client, GArray* servers) { char zeros[128]; - u64 size_host; - u32 flags = NBD_FLAG_HAS_FLAGS; + uint64_t size_host; + uint32_t flags = NBD_FLAG_HAS_FLAGS; + uint16_t smallflags = 0; + uint64_t magic; memset(zeros, '\0', sizeof(zeros)); - if (write(client->net, INIT_PASSWD, 8) < 0) - err("Negotiation failed: %m"); - cliserv_magic = htonll(cliserv_magic); - if (write(client->net, &cliserv_magic, sizeof(cliserv_magic)) < 0) - err("Negotiation failed: %m"); + if(!client || !client->modern) { + /* common */ + if (write(net, INIT_PASSWD, 8) < 0) { + err_nonfatal("Negotiation failed: %m"); + if(client) + exit(EXIT_FAILURE); + } + if(!client || client->modern) { + /* modern */ + magic = htonll(opts_magic); + } else { + /* oldstyle */ + magic = htonll(cliserv_magic); + } + if (write(net, &magic, sizeof(magic)) < 0) { + err_nonfatal("Negotiation failed: %m"); + if(client) + exit(EXIT_FAILURE); + } + } + if(!client) { + /* modern */ + uint32_t reserved; + uint32_t opt; + uint32_t namelen; + char* name; + int i; + + if(!servers) + err("programmer error"); + if (write(net, &smallflags, sizeof(uint16_t)) < 0) + err("Negotiation failed: %m"); + if (read(net, &reserved, sizeof(reserved)) < 0) + err("Negotiation failed: %m"); + if (read(net, &magic, sizeof(magic)) < 0) + err("Negotiation failed: %m"); + magic = ntohll(magic); + if(magic != opts_magic) { + close(net); + return NULL; + } + if (read(net, &opt, sizeof(opt)) 
< 0) + err("Negotiation failed: %m"); + opt = ntohl(opt); + if(opt != NBD_OPT_EXPORT_NAME) { + close(net); + return NULL; + } + if (read(net, &namelen, sizeof(namelen)) < 0) + err("Negotiation failed: %m"); + namelen = ntohl(namelen); + name = malloc(namelen+1); + name[namelen]=0; + if (read(net, name, namelen) < 0) + err("Negotiation failed: %m"); + for(i=0; ilen; i++) { + SERVER* serve = &(g_array_index(servers, SERVER, i)); + if(!strcmp(serve->servename, name)) { + CLIENT* client = g_new0(CLIENT, 1); + client->server = serve; + client->exportsize = OFFT_MAX; + client->net = net; + client->modern = TRUE; + client->transactionlogfd = -1; + free(name); + return client; + } + } + free(name); + return NULL; + } + /* common */ size_host = htonll((u64)(client->exportsize)); - if (write(client->net, &size_host, 8) < 0) + if (write(net, &size_host, 8) < 0) err("Negotiation failed: %m"); if (client->server->flags & F_READONLY) flags |= NBD_FLAG_READ_ONLY; - flags = htonl(flags); - if (write(client->net, &flags, 4) < 0) - err("Negotiation failed: %m"); + if (client->server->flags & F_FLUSH) + flags |= NBD_FLAG_SEND_FLUSH; + if (client->server->flags & F_FUA) + flags |= NBD_FLAG_SEND_FUA; + if (client->server->flags & F_ROTATIONAL) + flags |= NBD_FLAG_ROTATIONAL; + if (!client->modern) { + /* oldstyle */ + flags = htonl(flags); + if (write(client->net, &flags, 4) < 0) + err("Negotiation failed: %m"); + } else { + /* modern */ + smallflags = (uint16_t)(flags & ~((uint16_t)0)); + smallflags = htons(smallflags); + if (write(client->net, &smallflags, sizeof(smallflags)) < 0) { + err("Negotiation failed: %m"); + } + } + /* common */ if (write(client->net, zeros, 124) < 0) err("Negotiation failed: %m"); + return NULL; } /** sending macro. 
*/ -#define SEND(net,reply) writeit( net, &reply, sizeof( reply )); +#define SEND(net,reply) { writeit( net, &reply, sizeof( reply )); \ + if (client->transactionlogfd != -1) \ + writeit(client->transactionlogfd, &reply, sizeof(reply)); } /** error macro. */ #define ERROR(client,reply,errcode) { reply.error = htonl(errcode); SEND(client->net,reply); reply.error = 0; } /** @@ -1227,22 +1436,30 @@ int mainloop(CLIENT *client) { #ifdef DODBG int i = 0; #endif - negotiate(client); + negotiate(client->net, client, NULL); DEBUG("Entering request loop!\n"); reply.magic = htonl(NBD_REPLY_MAGIC); reply.error = 0; while (go_on) { char buf[BUFSIZE]; + char* p; size_t len; + size_t currlen; + size_t writelen; + uint16_t command; #ifdef DODBG i++; printf("%d: ", i); #endif readit(client->net, &request, sizeof(request)); + if (client->transactionlogfd != -1) + writeit(client->transactionlogfd, &request, sizeof(request)); + request.from = ntohll(request.from); request.type = ntohl(request.type); + command = request.type & NBD_CMD_MASK_COMMAND; - if (request.type==NBD_CMD_DISC) { + if (command==NBD_CMD_DISC) { msg2(LOG_INFO, "Disconnect request received."); if (client->server->flags & F_COPYONWRITE) { if (client->difmap) g_free(client->difmap) ; @@ -1258,38 +1475,60 @@ int mainloop(CLIENT *client) { if (request.magic != htonl(NBD_REQUEST_MAGIC)) err("Not enough magic."); - if (len > BUFSIZE + sizeof(struct nbd_reply)) - err("Request too big!"); -#ifdef DODBG - printf("%s from %llu (%llu) len %d, ", request.type ? "WRITE" : + if (len > BUFSIZE - sizeof(struct nbd_reply)) { + currlen = BUFSIZE - sizeof(struct nbd_reply); + msg2(LOG_INFO, "oversized request (this is not a problem)"); + } else { + currlen = len; + } + DEBUG("%s from %llu (%llu) len %d, ", command ? 
"WRITE" : "READ", (unsigned long long)request.from, - (unsigned long long)request.from / 512, len); -#endif + (unsigned long long)request.from / 512, (unsigned int)len); memcpy(reply.handle, request.handle, sizeof(reply.handle)); - if ((request.from + len) > (OFFT_MAX)) { - DEBUG("[Number too large!]"); - ERROR(client, reply, EINVAL); - continue; - } - if (((ssize_t)((off_t)request.from + len) > client->exportsize)) { - DEBUG("[RANGE!]"); - ERROR(client, reply, EINVAL); - continue; + if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) { + if ((request.from + len) > (OFFT_MAX)) { + DEBUG("[Number too large!]"); + ERROR(client, reply, EINVAL); + continue; + } + + if (((ssize_t)((off_t)request.from + len) > client->exportsize)) { + DEBUG("[RANGE!]"); + ERROR(client, reply, EINVAL); + continue; + } } - if (request.type==NBD_CMD_WRITE) { + if (command==NBD_CMD_WRITE) { DEBUG("wr: net->buf, "); - readit(client->net, buf, len); - DEBUG("buf->exp, "); - if ((client->server->flags & F_READONLY) || - (client->server->flags & F_AUTOREADONLY)) { - DEBUG("[WRITE to READONLY!]"); - ERROR(client, reply, EPERM); - continue; + while(len > 0) { + readit(client->net, buf, currlen); + DEBUG("buf->exp, "); + if ((client->server->flags & F_READONLY) || + (client->server->flags & F_AUTOREADONLY)) { + DEBUG("[WRITE to READONLY!]"); + ERROR(client, reply, EPERM); + continue; + } + if (expwrite(request.from, buf, len, client, + request.type & NBD_CMD_FLAG_FUA)) { + DEBUG("Write failed: %m" ); + ERROR(client, reply, errno); + continue; + } + SEND(client->net, reply); + DEBUG("OK!\n"); + len -= currlen; + currlen = (len < BUFSIZE) ? 
len : BUFSIZE; } - if (expwrite(request.from, buf, len, client)) { - DEBUG("Write failed: %m" ); + continue; + } + + if (command==NBD_CMD_FLUSH) { + DEBUG("fl: "); + if (expflush(client)) { + DEBUG("Flush failed: %m"); ERROR(client, reply, errno); continue; } @@ -1297,19 +1536,34 @@ int mainloop(CLIENT *client) { DEBUG("OK!\n"); continue; } - /* READ */ - DEBUG("exp->buf, "); - if (expread(request.from, buf + sizeof(struct nbd_reply), len, client)) { - DEBUG("Read failed: %m"); - ERROR(client, reply, errno); + if (command==NBD_CMD_READ) { + DEBUG("exp->buf, "); + memcpy(buf, &reply, sizeof(struct nbd_reply)); + if (client->transactionlogfd != -1) + writeit(client->transactionlogfd, &reply, sizeof(reply)); + p = buf + sizeof(struct nbd_reply); + writelen = currlen + sizeof(struct nbd_reply); + while(len > 0) { + if (expread(request.from, p, currlen, client)) { + DEBUG("Read failed: %m"); + ERROR(client, reply, errno); + continue; + } + + DEBUG("buf->net, "); + writeit(client->net, buf, writelen); + len -= currlen; + request.from += currlen; + currlen = (len < BUFSIZE) ? 
len : BUFSIZE; + p = buf; + writelen = currlen; + } + DEBUG("OK!\n"); continue; } - DEBUG("buf->net, "); - memcpy(buf, &reply, sizeof(struct nbd_reply)); - writeit(client->net, buf, len + sizeof(struct nbd_reply)); - DEBUG("OK!\n"); + DEBUG ("Ignoring unknown command\n"); } return 0; } @@ -1340,7 +1594,7 @@ void setupexport(CLIENT* client) { } else { tmpname=g_strdup(client->exportname); } - DEBUG2( "Opening %s\n", tmpname ); + DEBUG( "Opening %s\n", tmpname ); fi.fhandle = open(tmpname, mode); if(fi.fhandle == -1 && mode == O_RDWR) { /* Try again because maybe media was read-only */ @@ -1439,6 +1693,15 @@ int do_run(gchar* command, gchar* file) { * @param client a connected client **/ void serveconnection(CLIENT *client) { + if (client->server->transactionlog && (client->transactionlogfd == -1)) + { + if (-1 == (client->transactionlogfd = open(client->server->transactionlog, + O_WRONLY | O_CREAT, + S_IRUSR | S_IWUSR))) + g_warning("Could not open transaction log %s", + client->server->transactionlog); + } + if(do_run(client->server->prerun, client->exportname)) { exit(EXIT_FAILURE); } @@ -1452,6 +1715,12 @@ void serveconnection(CLIENT *client) { mainloop(client); do_run(client->server->postrun, client->exportname); + + if (-1 != client->transactionlogfd) + { + close(client->transactionlogfd); + client->transactionlogfd = -1; + } } /** @@ -1563,7 +1832,6 @@ void destroy_pid_t(gpointer data) { int serveloop(GArray* servers) { struct sockaddr_storage addrin; socklen_t addrinlen=sizeof(addrin); - SERVER *serve; int i; int max; int sock; @@ -1580,44 +1848,76 @@ int serveloop(GArray* servers) { max=0; FD_ZERO(&mset); for(i=0;ilen;i++) { - sock=(g_array_index(servers, SERVER, i)).socket; - FD_SET(sock, &mset); - max=sock>max?sock:max; + if((sock=(g_array_index(servers, SERVER, i)).socket)) { + FD_SET(sock, &mset); + max=sock>max?sock:max; + } + } + if(modernsock) { + FD_SET(modernsock, &mset); + max=modernsock>max?modernsock:max; } for(;;) { - CLIENT *client; - int net; 
+ CLIENT *client = NULL; pid_t *pid; memcpy(&rset, &mset, sizeof(fd_set)); if(select(max+1, &rset, NULL, NULL, NULL)>0) { + int net = 0; + SERVER* serve=NULL; + DEBUG("accept, "); - for(i=0;ilen;i++) { + if(FD_ISSET(modernsock, &rset)) { + if((net=accept(modernsock, (struct sockaddr *) &addrin, &addrinlen)) < 0) + err("accept: %m"); + client = negotiate(net, NULL, servers); + if(!client) { + err_nonfatal("negotiation failed"); + close(net); + net=0; + continue; + } + serve = client->server; + } + for(i=0;ilen && !net;i++) { serve=&(g_array_index(servers, SERVER, i)); if(FD_ISSET(serve->socket, &rset)) { - int sock_flags; if ((net=accept(serve->socket, (struct sockaddr *) &addrin, &addrinlen)) < 0) err("accept: %m"); - - if((sock_flags = fcntl(net, F_GETFL, 0))==-1) { - err("fcntl F_GETFL"); - } - if(fcntl(net, F_SETFL, sock_flags &~O_NONBLOCK)==-1) { - err("fcntl F_SETFL ~O_NONBLOCK"); - } - client = g_malloc(sizeof(CLIENT)); + } + } + if(net) { + int sock_flags; + + if(serve->max_connections > 0 && + g_hash_table_size(children) >= serve->max_connections) { + msg2(LOG_INFO, "Max connections reached"); + close(net); + continue; + } + if((sock_flags = fcntl(net, F_GETFL, 0))==-1) { + err("fcntl F_GETFL"); + } + if(fcntl(net, F_SETFL, sock_flags &~O_NONBLOCK)==-1) { + err("fcntl F_SETFL ~O_NONBLOCK"); + } + if(!client) { + client = g_new0(CLIENT, 1); client->server=serve; client->exportsize=OFFT_MAX; client->net=net; - set_peername(net, client); - if (!authorized_client(client)) { - msg2(LOG_INFO,"Unauthorized client") ; - close(net); - continue; - } - msg2(LOG_INFO,"Authorized client") ; - pid=g_malloc(sizeof(pid_t)); -#ifndef NOFORK + client->transactionlogfd = -1; + } + set_peername(net, client); + if (!authorized_client(client)) { + msg2(LOG_INFO,"Unauthorized client") ; + close(net); + continue; + } + msg2(LOG_INFO,"Authorized client") ; + pid=g_malloc(sizeof(pid_t)); + + if (!dontfork) { if ((*pid=fork())<0) { msg3(LOG_INFO,"Could not fork (%s)",strerror(errno)) 
; close(net); @@ -1635,42 +1935,61 @@ int serveloop(GArray* servers) { close(serve->socket); } /* FALSE does not free the - actual data. This is required, - because the client has a - direct reference into that - data, and otherwise we get a - segfault... */ + actual data. This is required, + because the client has a + direct reference into that + data, and otherwise we get a + segfault... */ g_array_free(servers, FALSE); -#endif // NOFORK - msg2(LOG_INFO,"Starting to serve"); - serveconnection(client); - exit(EXIT_SUCCESS); } + + msg2(LOG_INFO,"Starting to serve"); + serveconnection(client); + exit(EXIT_SUCCESS); } } } } +void dosockopts(int socket) { +#ifndef sun + int yes=1; +#else + char yes='1'; +#endif /* sun */ + int sock_flags; + + /* lose the pesky "Address already in use" error message */ + if (setsockopt(socket,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(int)) == -1) { + err("setsockopt SO_REUSEADDR"); + } + if (setsockopt(socket,SOL_SOCKET,SO_KEEPALIVE,&yes,sizeof(int)) == -1) { + err("setsockopt SO_KEEPALIVE"); + } + + /* make the listening socket non-blocking */ + if ((sock_flags = fcntl(socket, F_GETFL, 0)) == -1) { + err("fcntl F_GETFL"); + } + if (fcntl(socket, F_SETFL, sock_flags | O_NONBLOCK) == -1) { + err("fcntl F_SETFL O_NONBLOCK"); + } +} + /** * Connect a server's socket. * * @param serve the server we want to connect. **/ -void setup_serve(SERVER *serve) { - struct sockaddr_storage addrin; +int setup_serve(SERVER *serve) { struct addrinfo hints; struct addrinfo *ai = NULL; - struct sigaction sa; - int addrinlen = sizeof(addrin); - int sock_flags; -#ifndef sun - int yes=1; -#else - char yes='1'; -#endif /* sun */ gchar *port = NULL; int e; + if(!do_oldstyle) { + return serve->servename ? 
1 : 0; + } memset(&hints,'\0',sizeof(hints)); hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG | AI_NUMERICSERV; hints.ai_socktype = SOCK_STREAM; @@ -1678,7 +1997,7 @@ void setup_serve(SERVER *serve) { port = g_strdup_printf ("%d", serve->port); if (port == NULL) - return; + return 0; e = getaddrinfo(serve->listenaddr,port,&hints,&ai); @@ -1705,21 +2024,7 @@ void setup_serve(SERVER *serve) { if ((serve->socket = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) < 0) err("socket: %m"); - /* lose the pesky "Address already in use" error message */ - if (setsockopt(serve->socket,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(int)) == -1) { - err("setsockopt SO_REUSEADDR"); - } - if (setsockopt(serve->socket,SOL_SOCKET,SO_KEEPALIVE,&yes,sizeof(int)) == -1) { - err("setsockopt SO_KEEPALIVE"); - } - - /* make the listening socket non-blocking */ - if ((sock_flags = fcntl(serve->socket, F_GETFL, 0)) == -1) { - err("fcntl F_GETFL"); - } - if (fcntl(serve->socket, F_SETFL, sock_flags | O_NONBLOCK) == -1) { - err("fcntl F_SETFL O_NONBLOCK"); - } + dosockopts(serve->socket); DEBUG("Waiting for connections... 
bind, "); e = bind(serve->socket, ai->ai_addr, ai->ai_addrlen); @@ -1730,17 +2035,43 @@ void setup_serve(SERVER *serve) { err("listen: %m"); freeaddrinfo (ai); + if(serve->servename) { + return 1; + } else { + return 0; + } +} - sa.sa_handler = sigchld_handler; - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_RESTART; - if(sigaction(SIGCHLD, &sa, NULL) == -1) - err("sigaction: %m"); - sa.sa_handler = sigterm_handler; - sigemptyset(&sa.sa_mask); - sa.sa_flags = SA_RESTART; - if(sigaction(SIGTERM, &sa, NULL) == -1) - err("sigaction: %m"); +void open_modern(void) { + struct addrinfo hints; + struct addrinfo* ai = NULL; + struct sock_flags; + int e; + + memset(&hints, '\0', sizeof(hints)); + hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG; + hints.ai_socktype = SOCK_STREAM; + hints.ai_family = AF_UNSPEC; + hints.ai_protocol = IPPROTO_TCP; + e = getaddrinfo(modern_listen, NBD_DEFAULT_PORT, &hints, &ai); + if(e != 0) { + fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e)); + exit(EXIT_FAILURE); + } + if((modernsock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol))<0) { + err("socket: %m"); + } + + dosockopts(modernsock); + + if(bind(modernsock, ai->ai_addr, ai->ai_addrlen)) { + err("bind: %m"); + } + if(listen(modernsock, 10) <0) { + err("listen: %m"); + } + + freeaddrinfo(ai); } /** @@ -1748,11 +2079,27 @@ void setup_serve(SERVER *serve) { **/ void setup_servers(GArray* servers) { int i; + struct sigaction sa; + int want_modern=0; for(i=0;i<servers->len;i++) { - setup_serve(&(g_array_index(servers, SERVER, i))); + want_modern |= setup_serve(&(g_array_index(servers, SERVER, i))); + } + if(want_modern) { + open_modern(); } children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t); + + sa.sa_handler = sigchld_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_RESTART; + if(sigaction(SIGCHLD, &sa, NULL) == -1) + err("sigaction: %m"); + sa.sa_handler = sigterm_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_RESTART; + if(sigaction(SIGTERM, 
&sa, NULL) == -1) + err("sigaction: %m"); } /** @@ -1762,7 +2109,7 @@ void setup_servers(GArray* servers) { * is only used to create a PID file of the form * /var/run/nbd-server.<port>.pid; it's not modified in any way. **/ -#if !defined(NODAEMON) && !defined(NOFORK) +#if !defined(NODAEMON) void daemonize(SERVER* serve) { FILE*pidf; @@ -1791,7 +2138,7 @@ void daemonize(SERVER* serve) { } #else #define daemonize(serve) -#endif /* !defined(NODAEMON) && !defined(NOFORK) */ +#endif /* !defined(NODAEMON) */ /* * Everything beyond this point (in the file) is run in non-daemon mode. @@ -1862,7 +2209,7 @@ void glib_message_syslog_redirect(const gchar *log_domain, default: level=LOG_ERR; } - syslog(level, message); + syslog(level, "%s", message); } #endif @@ -1914,16 +2261,24 @@ int main(int argc, char *argv[]) { } } - if(!servers || !servers->len) { - g_warning("Could not parse config file: %s", - err ? err->message : "Unknown error"); + if(!servers || !servers->len) { + if(err && !(err->domain == g_quark_from_string("parse_cfile") + && err->code == CFILE_NOTFOUND)) { + g_warning("Could not parse config file: %s", + err ? err->message : "Unknown error"); + } } - + if(serve) { + g_warning("Specifying an export on the command line is deprecated."); + g_warning("Please use a configuration file instead."); + } + if((!serve) && (!servers||!servers->len)) { - g_message("Nothing to do! Bye!"); + g_message("No configured exports; quitting."); exit(EXIT_FAILURE); } - daemonize(serve); + if (!dontfork) + daemonize(serve); setup_servers(servers); dousers(); serveloop(servers);