r231: Make client sockets non-blocking, too.
[nbd.git] / nbd-server.c
index c21e3c6..281a14f 100644 (file)
@@ -58,6 +58,7 @@
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
+#include <sys/select.h>                /* select */
 #include <sys/wait.h>          /* wait */
 #ifdef HAVE_SYS_IOCTL_H
 #include <sys/ioctl.h>
@@ -81,6 +82,8 @@
 #include <dirent.h>
 #include <unistd.h>
 #include <getopt.h>
+#include <pwd.h>
+#include <grp.h>
 
 #include <glib.h>
 
 /** Where our config file actually is */
 gchar* config_file_pos;
 
-/** how much space for child PIDs we have by default. Dynamically
-   allocated, and will be realloc()ed if out of space, so this should
-   probably be fair for most situations. */
-#define DEFAULT_CHILD_ARRAY 256
+/** What user we're running as */
+gchar* runuser=NULL;
+/** What group we're running as */
+gchar* rungroup=NULL;
 
 /** Logging macros, now nothing goes to syslog unless you say ISSERVER */
 #ifdef ISSERVER
@@ -119,49 +122,52 @@ gchar* config_file_pos;
 #define DEBUG( a ) printf( a )
 #define DEBUG2( a,b ) printf( a,b )
 #define DEBUG3( a,b,c ) printf( a,b,c )
+#define DEBUG4( a,b,c,d ) printf( a,b,c,d )
 #else
 #define DEBUG( a )
 #define DEBUG2( a,b ) 
 #define DEBUG3( a,b,c ) 
+#define DEBUG4( a,b,c,d ) 
 #endif
 #ifndef PACKAGE_VERSION
 #define PACKAGE_VERSION ""
 #endif
 /**
- * The highest value a variable of type off_t can reach.
+ * The highest value a variable of type off_t can reach. This is a signed
+ * integer, so set all bits except for the leftmost one.
  **/
-/* This is starting to get ugly. If someone knows a better way to find
- * the maximum value of a signed type *without* relying on overflow
- * (doing so breaks on 64bit architectures), that would be nice.
- *
- * Actually, do we need this at all? Can't we just say '0 is autodetect', and
- * live with it? Or better yet, use an extra flag, or so?
- * Answer: yes, we need it, as the hunksize is defined to this when the
- * multiple file thingy isn't used.
- */
-#define OFFT_MAX (((((off_t)1)<<((sizeof(off_t)-1)*8))-1)<<7)+127
+#define OFFT_MAX ~((off_t)1<<(sizeof(off_t)*8-1))
 #define LINELEN 256      /**< Size of static buffer used to read the
                            authorization file (yuck) */
 #define BUFSIZE (1024*1024) /**< Size of buffer that can hold requests */
-#define GIGA (1*1024*1024*1024) /**< 1 Gigabyte. Used as hunksize when doing
-                                 the multiple file thingy. @todo: make this a
-                                 configuration option. */
 #define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
 #define F_READONLY 1      /**< flag to tell us a file is readonly */
 #define F_MULTIFILE 2    /**< flag to tell us a file is exported using -m */
 #define F_COPYONWRITE 4          /**< flag to tell us a file is exported using
                            copyonwrite */
 #define F_AUTOREADONLY 8  /**< flag to tell us a file is set to autoreadonly */
+#define F_SPARSE 16
 GHashTable *children;
 char pidfname[256]; /**< name of our PID file */
-char default_authname[] = "/etc/nbd_server.allow"; /**< default name of allow file */
+char pidftemplate[256]; /**< template to be used for the filename of the PID file */
+char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
+
+/**
+ * Types of virtualization (chosen by the "virtstyle" configuration value)
+ **/
+typedef enum {
+       VIRT_NONE=0,    /**< No virtualization */
+       VIRT_IPLIT,     /**< Literal IP address as part of the filename */
+       VIRT_IPHASH,    /**< Replacing all dots in an ip address by a / before
+                            doing the same as in IPLIT */
+       VIRT_CIDR,      /**< Every subnet in its own directory */
+} VIRT_STYLE;
 
 /**
  * Variables associated with a server.
  **/
 typedef struct {
        gchar* exportname;    /**< (unprocessed) filename of the file we're exporting */
-       off_t hunksize;      /**< size of a hunk of an exported file */
        off_t expected_size; /**< size of the exported file as it was told to
                               us through configuration */
        unsigned int port;   /**< port we're exporting this file at */
@@ -170,17 +176,25 @@ typedef struct {
        unsigned int timeout;/**< how long a connection may be idle
                               (0=forever) */
        int socket;          /**< The socket of this server. */
+       VIRT_STYLE virtstyle;/**< The style of virtualization, if any */
+       uint8_t cidrlen;     /**< The length of the mask when we use
+                                 CIDR-style virtualization */
 } SERVER;
 
 /**
  * Variables associated with a client socket.
  **/
 typedef struct {
+       int fhandle;      /**< file descriptor of one exported file */
+       off_t startoff;   /**< offset within the export where this file starts */
+} FILE_INFO;
+
+typedef struct {
        off_t exportsize;    /**< size of the file we're exporting */
        char *clientname;    /**< peer */
        char *exportname;    /**< (processed) filename of the file we're exporting */
-       GArray *export;    /**< array of filedescriptors of exported files;
-                              only the first is actually used unless we're
+       GArray *export;    /**< array of FILE_INFO of exported files;
+                              array size is always 1 unless we're
                               doing the multiple file option */
        int net;             /**< The actual client socket */
        SERVER *server;      /**< The server this client is getting data from */
@@ -200,6 +214,7 @@ typedef enum {
        PARAM_STRING,           /**< This parameter is a string */
        PARAM_BOOL,             /**< This parameter is a boolean */
 } PARAM_TYPE;
+
 /**
  * Configuration file values
  **/
@@ -225,9 +240,14 @@ typedef struct {
  * @return 0 - authorization refused, 1 - OK
  **/
 int authorized_client(CLIENT *opts) {
+       const char *ERRMSG="Invalid entry '%s' in authfile '%s', so, refusing all connections.";
        FILE *f ;
-   
        char line[LINELEN]; 
+       char *tmp;
+       struct in_addr addr;
+       struct in_addr client;
+       struct in_addr cltemp;
+       int len;
 
        if ((f=fopen(opts->server->authname,"r"))==NULL) {
                msg4(LOG_INFO,"Can't open authorization file %s (%s).",
@@ -235,14 +255,35 @@ int authorized_client(CLIENT *opts) {
                return 1 ; 
        }
   
+       inet_aton(opts->clientname, &client);
        while (fgets(line,LINELEN,f)!=NULL) {
+               if((tmp=index(line, '/'))) {
+                       if(strlen(line)<=tmp-line) {
+                               msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
+                               return 0;
+                       }
+                       *(tmp++)=0;
+                       if(inet_aton(line,&addr)) {
+                               msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
+                               return 0;
+                       }
+                       len=strtol(tmp, NULL, 0);
+                       addr.s_addr>>=32-len;
+                       addr.s_addr<<=32-len;
+                       memcpy(&cltemp,&client,sizeof(client));
+                       cltemp.s_addr>>=32-len;
+                       cltemp.s_addr<<=32-len;
+                       if(addr.s_addr == cltemp.s_addr) {
+                               return 1;
+                       }
+               }
                if (strncmp(line,opts->clientname,strlen(opts->clientname))==0) {
                        fclose(f);
                        return 1;
                }
        }
-       fclose(f) ;
-       return 0 ;
+       fclose(f);
+       return 0;
 }
 
 /**
@@ -254,12 +295,33 @@ int authorized_client(CLIENT *opts) {
  **/
 inline void readit(int f, void *buf, size_t len) {
        ssize_t res;
+       gboolean tried = FALSE; /* TRUE after one select() wait; cleared by the next successful read */
+
        while (len > 0) {
                DEBUG("*");
-               if ((res = read(f, buf, len)) <= 0)
-                       err("Read failed: %m");
-               len -= res;
-               buf += res;
+               if ((res = read(f, buf, len)) <= 0) { /* error, or 0 bytes read */
+                       if(!tried && errno==EAGAIN) { /* first EAGAIN since the last progress */
+                               /* Assume the connection will work some time in
+                                * the future, but don't run away with CPU time
+                                * in case it doesn't */
+                               fd_set set;
+                               struct timeval tv;
+
+                               DEBUG("Read failed, trying again");
+                               tried=TRUE;
+                               FD_ZERO(&set);
+                               FD_SET(f, &set);
+                               tv.tv_sec=30; /* wait at most 30 seconds */
+                               tv.tv_usec=0;
+                               select(f+1, &set, NULL, NULL, &tv); /* wait for f to become readable; result is not checked, read() is simply retried */
+                       } else { /* already waited once, or errno is not EAGAIN */
+                               err("Read failed: %m");
+                       }
+               } else {
+                       len -= res;
+                       buf += res;
+                       tried=FALSE;
+               }
        }
 }
 
@@ -272,12 +334,33 @@ inline void readit(int f, void *buf, size_t len) {
  **/
 inline void writeit(int f, void *buf, size_t len) {
        ssize_t res;
+       gboolean tried=FALSE; /* TRUE after one select() wait; cleared by the next successful write */
+
        while (len > 0) {
                DEBUG("+");
-               if ((res = write(f, buf, len)) <= 0)
-                       err("Send failed: %m");
-               len -= res;
-               buf += res;
+               if ((res = write(f, buf, len)) <= 0) { /* error, or 0 bytes written */
+                       if(!tried && errno==EAGAIN) { /* first EAGAIN since the last progress */
+                               /* Assume the connection will work some time in
+                                * the future, but don't run away with CPU time
+                                * in case it doesn't */
+                               fd_set set;
+                               struct timeval tv;
+
+                               DEBUG("Write failed, trying again");
+                               tried=TRUE;
+                               FD_ZERO(&set);
+                               FD_SET(f, &set);
+                               tv.tv_sec=30; /* wait at most 30 seconds */
+                               tv.tv_usec=0;
+                               select(f+1, NULL, &set, NULL, &tv); /* wait for f to become writable; result is not checked, write() is simply retried */
+                       } else { /* already waited once, or errno is not EAGAIN */
+                               err("Send failed: %m");
+                       }
+               } else {
+                       len -= res;
+                       buf += res;
+                       tried=FALSE;
+               }
        }
 }
 
@@ -287,13 +370,14 @@ inline void writeit(int f, void *buf, size_t len) {
  */
 void usage() {
        printf("This is nbd-server version " VERSION "\n");
-       printf("Usage: port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-a timeout_sec] [-C configuration file]\n"
+       printf("Usage: port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-a timeout_sec] [-C configuration file] [-p PID file name]\n"
               "\t-r|--read-only\t\tread only\n"
               "\t-m|--multi-file\t\tmultiple file\n"
               "\t-c|--copy-on-write\tcopy on write\n"
-              "\t-C|--config-file\tspecify an alternat configuration file\n"
+              "\t-C|--config-file\tspecify an alternate configuration file\n"
               "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
-              "\t-a|--idle-time\t\tmaximum idle seconds; server terminates when\n\t\t\t\tidle time exceeded\n\n"
+              "\t-a|--idle-time\t\tmaximum idle seconds; server terminates when\n\t\t\t\tidle time exceeded\n"
+              "\t-p|--pid-file\t\tspecify a filename to write our PID to\n\n"
               "\tif port is set to 0, stdin is used (for running from inetd)\n"
               "\tif file_to_export contains '%%s', it is substituted with the IP\n"
               "\t\taddress of the machine trying to connect\n" );
@@ -317,6 +401,7 @@ SERVER* cmdline(int argc, char *argv[]) {
                {"authorize-file", required_argument, NULL, 'l'},
                {"idle-time", required_argument, NULL, 'a'},
                {"config-file", required_argument, NULL, 'C'},
+               {"pid-file", required_argument, NULL, 'p'},
                {0,0,0,0}
        };
        SERVER *serve;
@@ -328,9 +413,8 @@ SERVER* cmdline(int argc, char *argv[]) {
                return NULL;
        }
        serve=g_new0(SERVER, 1);
-       serve->hunksize=OFFT_MAX;
        serve->authname = g_strdup(default_authname);
-       while((c=getopt_long(argc, argv, "-a:C:cl:mr", long_options, &i))>=0) {
+       while((c=getopt_long(argc, argv, "-a:C:cl:mrp:", long_options, &i))>=0) {
                switch (c) {
                case 1:
                        /* non-option argument */
@@ -368,7 +452,9 @@ SERVER* cmdline(int argc, char *argv[]) {
                        break;
                case 'm':
                        serve->flags |= F_MULTIFILE;
-                       serve->hunksize = 1*GIGA;
+                       break;
+               case 'p':
+                       strncpy(pidftemplate, optarg, 256);
                        break;
                case 'c': 
                        serve->flags |=F_COPYONWRITE;
@@ -429,33 +515,45 @@ void remove_server(gpointer s) {
  * @param f the name of the config file
  * @param e a GError. @see CFILE_ERRORS for what error values this function can
  *     return.
- * @return a GHashTable of SERVER* pointers, with the port number as the hash
- *     key. If the config file is empty or does not exist, returns an empty
- *     GHashTable; if the config file contains an error, returns NULL, and
- *     e is set appropriately
+ * @return a GArray of SERVER structures. If the config file is empty or does
+ *     not exist, returns an empty GArray; if the config file contains an
+ *     error, returns NULL, and e is set appropriately
  **/
 GArray* parse_cfile(gchar* f, GError** e) {
+       const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
+       const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
        SERVER s;
-       PARAM p[] = {
+       gchar *virtstyle=NULL;
+       PARAM lp[] = {
                { "exportname", TRUE,   PARAM_STRING,   NULL, 0 },
                { "port",       TRUE,   PARAM_INT,      NULL, 0 },
                { "authfile",   FALSE,  PARAM_STRING,   NULL, 0 },
                { "timeout",    FALSE,  PARAM_INT,      NULL, 0 },
                { "filesize",   FALSE,  PARAM_INT,      NULL, 0 },
+               { "virtstyle",  FALSE,  PARAM_STRING,   NULL, 0 },
                { "readonly",   FALSE,  PARAM_BOOL,     NULL, F_READONLY },
                { "multifile",  FALSE,  PARAM_BOOL,     NULL, F_MULTIFILE },
                { "copyonwrite", FALSE, PARAM_BOOL,     NULL, F_COPYONWRITE },
+               { "autoreadonly", FALSE, PARAM_BOOL,    NULL, F_AUTOREADONLY },
+               { "sparse_cow", FALSE,  PARAM_BOOL,     NULL, F_SPARSE },
        };
-       const int p_size=8;
+       const int lp_size=11;
+       PARAM gp[] = {
+               { "user",       FALSE, PARAM_STRING,    &runuser,       0 },
+               { "group",      FALSE, PARAM_STRING,    &rungroup,      0 },
+       };
+       PARAM* p=gp;
+       int p_size=2;
        GKeyFile *cfile;
        GError *err = NULL;
+       const char *err_msg=NULL;
        GQuark errdomain;
        GArray *retval=NULL;
        gchar **groups;
        gboolean value;
-       gint i,j;
+       gint i;
+       gint j;
 
-       memset(&s, '\0', sizeof(SERVER));
        errdomain = g_quark_from_string("parse_cfile");
        cfile = g_key_file_new();
        retval = g_array_new(FALSE, TRUE, sizeof(SERVER));
@@ -471,13 +569,21 @@ GArray* parse_cfile(gchar* f, GError** e) {
                return NULL;
        }
        groups = g_key_file_get_groups(cfile, NULL);
-       for(i=1;groups[i];i++) {
-               p[0].target=&(s.exportname);
-               p[1].target=&(s.port);
-               p[2].target=&(s.authname);
-               p[3].target=&(s.timeout);
-               p[4].target=&(s.expected_size);
-               p[5].target=p[6].target=p[7].target=p[8].target=&(s.flags);
+       for(i=0;groups[i];i++) {
+               memset(&s, '\0', sizeof(SERVER));
+               lp[0].target=&(s.exportname);
+               lp[1].target=&(s.port);
+               lp[2].target=&(s.authname);
+               lp[3].target=&(s.timeout);
+               lp[4].target=&(s.expected_size);
+               lp[5].target=&(virtstyle);
+               lp[6].target=lp[7].target=lp[8].target=
+                               lp[9].target=lp[10].target=&(s.flags);
+               /* After the [generic] group, start parsing exports */
+               if(i==1) {
+                       p=lp;
+                       p_size=lp_size;
+               } 
                for(j=0;j<p_size;j++) {
                        g_assert(p[j].target != NULL);
                        g_assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL);
@@ -501,31 +607,64 @@ GArray* parse_cfile(gchar* f, GError** e) {
                                                        groups[i],
                                                        p[j].paramname, &err);
                                        if(!err) {
-                                               *((gint*)p[j].target) |= value;
+                                               if(value) {
+                                                       *((gint*)p[j].target) |= p[j].flagval;
+                                               } else {
+                                                       *((gint*)p[j].target) &= ~(p[j].flagval);
+                                               }
                                        }
                                        break;
                        }
                        if(err) {
                                if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
-                                       if(p[j].required) {
-                                               g_set_error(e, errdomain, CFILE_KEY_MISSING, "Could not find required value %s in group %s: %s", p[j].paramname, groups[i], err->message);
-                                               g_array_free(retval, TRUE);
-                                               g_error_free(err);
-                                               g_key_file_free(cfile);
-                                               return NULL;
-                                       } else {
+                                       if(!p[j].required) {
+                                               /* Ignore not-found error for optional values */
                                                g_clear_error(&err);
                                                continue;
+                                       } else {
+                                               err_msg = MISSING_REQUIRED_ERROR;
                                        }
-                                       g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Could not parse %s in group %s: %s", p[j].paramname, groups[i], err->message);
+                               } else {
+                                       err_msg = DEFAULT_ERROR;
+                               }
+                               g_set_error(e, errdomain, CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
+                               g_array_free(retval, TRUE);
+                               g_error_free(err);
+                               g_key_file_free(cfile);
+                               return NULL;
+                       }
+               }
+               if(virtstyle) {
+                       if(!strncmp(virtstyle, "none", 4)) {
+                               s.virtstyle=VIRT_NONE;
+                       } else if(!strncmp(virtstyle, "ipliteral", 9)) {
+                               s.virtstyle=VIRT_IPLIT;
+                       } else if(!strncmp(virtstyle, "iphash", 6)) {
+                               s.virtstyle=VIRT_IPHASH;
+                       } else if(!strncmp(virtstyle, "cidrhash", 8)) {
+                               s.virtstyle=VIRT_CIDR;
+                               if(strlen(virtstyle)<10) {
+                                       g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
                                        g_array_free(retval, TRUE);
-                                       g_error_free(err);
                                        g_key_file_free(cfile);
                                        return NULL;
                                }
+                               s.cidrlen=strtol(virtstyle+8, NULL, 0);
+                       } else {
+                               g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
+                               g_array_free(retval, TRUE);
+                               g_key_file_free(cfile);
+                               return NULL;
                        }
+               } else {
+                       s.virtstyle=VIRT_IPLIT;
+               }
+               /* Don't need to free this, it's not our string */
+               virtstyle=NULL;
+               /* Don't append values for the [generic] group */
+               if(i>0) {
+                       g_array_append_val(retval, s);
                }
-               g_array_append_val(retval, s);
        }
        return retval;
 }
@@ -536,12 +675,12 @@ GArray* parse_cfile(gchar* f, GError** e) {
  * is severely wrong)
  **/
 void sigchld_handler(int s) {
-        int* status=NULL;
+        int status;
        int* i;
        pid_t pid;
 
-       while((pid=wait(status)) > 0) {
-               if(WIFEXITED(status)) {
+       while((pid=waitpid(-1, &status, WNOHANG)) > 0) {
+               if(WIFEXITED(&status)) {
                        msg3(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
                }
                i=g_hash_table_lookup(children, &pid);
@@ -590,11 +729,11 @@ void sigterm_handler(int s) {
 /**
  * Detect the size of a file.
  *
- * @param export An open filedescriptor
+ * @param fhandle An open filedescriptor
  * @return the size of the file, or OFFT_MAX if detection was
  * impossible.
  **/
-off_t size_autodetect(int export) {
+off_t size_autodetect(int fhandle) {
        off_t es;
        u32 es32;
        struct stat stat_buf;
@@ -603,8 +742,8 @@ off_t size_autodetect(int export) {
 #ifdef HAVE_SYS_MOUNT_H
 #ifdef HAVE_SYS_IOCTL_H
 #ifdef BLKGETSIZE
-       DEBUG("looking for export size with ioctl BLKGETSIZE\n");
-       if (!ioctl(export, BLKGETSIZE, &es32) && es32) {
+       DEBUG("looking for fhandle size with ioctl BLKGETSIZE\n");
+       if (!ioctl(fhandle, BLKGETSIZE, &es32) && es32) {
                es = (off_t)es32 * (off_t)512;
                return es;
        }
@@ -612,9 +751,9 @@ off_t size_autodetect(int export) {
 #endif /* HAVE_SYS_IOCTL_H */
 #endif /* HAVE_SYS_MOUNT_H */
 
-       DEBUG("looking for export size with fstat\n");
+       DEBUG("looking for fhandle size with fstat\n");
        stat_buf.st_size = 0;
-       error = fstat(export, &stat_buf);
+       error = fstat(fhandle, &stat_buf);
        if (!error) {
                if(stat_buf.st_size > 0)
                        return (off_t)stat_buf.st_size;
@@ -622,8 +761,8 @@ off_t size_autodetect(int export) {
                 err("fstat failed: %m");
         }
 
-       DEBUG("looking for export size with lseek SEEK_END\n");
-       es = lseek(export, (off_t)0, SEEK_END);
+       DEBUG("looking for fhandle size with lseek SEEK_END\n");
+       es = lseek(fhandle, (off_t)0, SEEK_END);
        if (es > ((off_t)0)) {
                return es;
         } else {
@@ -635,6 +774,54 @@ off_t size_autodetect(int export) {
 }
 
 /**
+ * Get the file handle and offset, given an export offset.
+ *
+ * @param export An array of FILE_INFO, one per export file; the binary search below relies on ascending startoff
+ * @param a The offset to get corresponding file/offset for
+ * @param fhandle [out] File descriptor
+ * @param foffset [out] Offset into fhandle
+ * @param maxbytes [out] Tells how many bytes can be read/written
+ * from fhandle starting at foffset (0 if there is no limit)
+ * @return 0 on success, -1 if a is negative
+ **/
+int get_filepos(GArray* export, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
+       /* Negative offset not allowed */
+       if(a < 0)
+               return -1;
+
+       /* Binary search for last file with starting offset <= a */
+       FILE_INFO fi;
+       int start = 0;
+       int end = export->len - 1;
+       while( start <= end ) {
+               int mid = (start + end) / 2;
+               fi = g_array_index(export, FILE_INFO, mid);
+               if( fi.startoff < a ) {
+                       start = mid + 1;
+               } else if( fi.startoff > a ) {
+                       end = mid - 1;
+               } else {
+                       start = end = mid; /* exact hit: a is the first byte of file mid */
+                       break;
+               }
+       }
+
+       /* end should never go negative, since first startoff is 0 and a >= 0 */
+       g_assert(end >= 0);
+
+       fi = g_array_index(export, FILE_INFO, end);
+       *fhandle = fi.fhandle;
+       *foffset = a - fi.startoff;
+       *maxbytes = 0; /* stays 0 (no limit) when this is the last file */
+       if( end+1 < export->len ) {
+               FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
+               *maxbytes = fi_next.startoff - a; /* bytes left before the next file starts */
+       }
+
+       return 0;
+}
+
+/**
  * seek to a position in a file, with error handling.
  * @param handle a filedescriptor
  * @param a position to seek to
@@ -658,13 +845,35 @@ void myseek(int handle,off_t a) {
  * @param client The client we're serving for
  * @return The number of bytes actually written, or -1 in case of an error
  **/
-int rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) {
-       ssize_t res;
+ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) {
+       int fhandle;
+       off_t foffset;
+       size_t maxbytes;
+
+       if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes)) /* map export offset to file + offset */
+               return -1;
+       if(maxbytes && len > maxbytes) /* maxbytes==0 means no limit */
+               len = maxbytes; /* stop where the next file starts; the result may be a short count */
+
+       DEBUG4("(WRITE to fd %d offset %Lu len %u), ", fhandle, foffset, len);
 
-       myseek(g_array_index(client->export, int, (int)(a/client->server->hunksize)), a%client->server->hunksize);
-       ;
-       res = write(g_array_index(client->export, int, (int)((off_t)a/(off_t)(client->server->hunksize))), buf, len);
-       return (res < 0 || (size_t)res != len);
+       myseek(fhandle, foffset);
+       return write(fhandle, buf, len); /* result of write(): may be less than the requested len */
+}
+
+/**
+ * Call rawexpwrite repeatedly (one call never crosses into the next file) until all len bytes of buf are written at export offset a.
+ * @return 0 on success (including len == 0), nonzero on failure
+ **/
+int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client) {
+       ssize_t ret=0;  /* must start defined: when len is 0 the loop condition never assigns it */
+
+       while(len > 0 && (ret=rawexpwrite(a, buf, len, client)) > 0 ) {
+               a += ret;
+               buf += ret;
+               len -= ret;
+       }
+       return (ret < 0 || len != 0);
 }
 
 /**
@@ -678,13 +887,35 @@ int rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client) {
  * @return The number of bytes actually read, or -1 in case of an
  * error.
  **/
-int rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
-       ssize_t res;
+ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
+       int fhandle;
+       off_t foffset;
+       size_t maxbytes;
+
+       if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes)) /* map export offset to file + offset */
+               return -1;
+       if(maxbytes && len > maxbytes) /* maxbytes==0 means no limit */
+               len = maxbytes; /* stop where the next file starts; the result may be a short count */
+
+       DEBUG4("(READ from fd %d offset %Lu len %u), ", fhandle, foffset, len);
+
+       myseek(fhandle, foffset);
+       return read(fhandle, buf, len); /* result of read(): may be less than the requested len */
+}
+
+/**
+ * Call rawexpread repeatedly (one call never crosses into the next file) until all len bytes at export offset a are read into buf.
+ * @return 0 on success (including len == 0), nonzero on failure
+ **/
+int rawexpread_fully(off_t a, char *buf, size_t len, CLIENT *client) {
+       ssize_t ret=0;  /* must start defined: when len is 0 the loop condition never assigns it */
 
-       myseek(g_array_index(client->export,int,(int)a/client->server->hunksize),
-                       a%client->server->hunksize);
-       res = read(g_array_index(client->export,int,(int)a/client->server->hunksize), buf, len);
-       return (res < 0 || (size_t)res != len);
+       while(len > 0 && (ret=rawexpread(a, buf, len, client)) > 0 ) {
+               a += ret;
+               buf += ret;
+               len -= ret;
+       }
+       return (ret < 0 || len != 0);
 }
 
 /**
@@ -695,14 +926,14 @@ int rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
  * @param buf A buffer to read into
  * @param len The size of buf
  * @param client The client we're going to read for
- * @return The number of bytes actually read, or -1 in case of an error
+ * @return 0 on success, nonzero on failure
  **/
 int expread(off_t a, char *buf, size_t len, CLIENT *client) {
        off_t rdlen, offset;
        off_t mapcnt, mapl, maph, pagestart;
 
        if (!(client->server->flags & F_COPYONWRITE))
-               return rawexpread(a, buf, len, client);
+               return(rawexpread_fully(a, buf, len, client));
        DEBUG3("Asked to read %d bytes at %Lu.\n", len, (unsigned long long)a);
 
        mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
@@ -720,7 +951,7 @@ int expread(off_t a, char *buf, size_t len, CLIENT *client) {
                } else { /* the block is not there */
                        DEBUG2("Page %Lu is not here, we read the original one\n",
                               (unsigned long long)mapcnt);
-                       if(rawexpread(a, buf, rdlen, client)) return -1;
+                       if(rawexpread_fully(a, buf, rdlen, client)) return -1;
                }
                len-=rdlen; a+=rdlen; buf+=rdlen;
        }
@@ -736,7 +967,7 @@ int expread(off_t a, char *buf, size_t len, CLIENT *client) {
  * @param buf The buffer to write from
  * @param len The length of buf
  * @param client The client we're going to write for.
- * @return The number of bytes actually written, or -1 in case of an error
+ * @return 0 on success, nonzero on failure
  **/
 int expwrite(off_t a, char *buf, size_t len, CLIENT *client) {
        char pagebuf[DIFFPAGESIZE];
@@ -746,7 +977,7 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) {
        off_t offset;
 
        if (!(client->server->flags & F_COPYONWRITE))
-               return(rawexpwrite(a,buf,len, client)); 
+               return(rawexpwrite_fully(a, buf, len, client)); 
        DEBUG3("Asked to write %d bytes at %Lu.\n", len, (unsigned long long)a);
 
        mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
@@ -765,16 +996,12 @@ int expwrite(off_t a, char *buf, size_t len, CLIENT *client) {
                        if (write(client->difffile, buf, wrlen) != wrlen) return -1 ;
                } else { /* the block is not there */
                        myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ;
-                       client->difmap[mapcnt]=client->difffilelen++ ;
+                       client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
                        DEBUG3("Page %Lu is not here, we put it at %lu\n",
                               (unsigned long long)mapcnt,
                               (unsigned long)(client->difmap[mapcnt]));
                        rdlen=DIFFPAGESIZE ;
-                       if (rdlen+pagestart%(client->server->hunksize) >
-                                       (client->server->hunksize)) 
-                               rdlen=client->server->hunksize -
-                                       (pagestart%client->server->hunksize);
-                       if (rawexpread(pagestart, pagebuf, rdlen, client))
+                       if (rawexpread_fully(pagestart, pagebuf, rdlen, client))
                                return -1;
                        memcpy(pagebuf+offset,buf,wrlen) ;
                        if (write(client->difffile, pagebuf, DIFFPAGESIZE) !=
@@ -795,7 +1022,7 @@ void negotiate(CLIENT *client) {
        char zeros[300];
        u64 size_host;
 
-       memset(zeros, 0, 290);
+       memset(zeros, '\0', 290);
        if (write(client->net, INIT_PASSWD, 8) < 0)
                err("Negotiation failed: %m");
        cliserv_magic = htonll(cliserv_magic);
@@ -847,8 +1074,8 @@ int mainloop(CLIENT *client) {
 
                if (request.type==NBD_CMD_DISC) {
                        msg2(LOG_INFO, "Disconnect request received.");
-                       if (client->difmap) g_free(client->difmap) ;
-                       if (client->difffile>=0) { 
+                       if (client->server->flags & F_COPYONWRITE) { 
+                               if (client->difmap) g_free(client->difmap) ;
                                close(client->difffile);
                                unlink(client->difffilename);
                                free(client->difffilename);
@@ -875,8 +1102,7 @@ int mainloop(CLIENT *client) {
                        continue;
                }
 
-               if (((ssize_t)((off_t)request.from + len) > client->exportsize) ||
-                   ((client->server->flags & F_READONLY) && request.type)) {
+               if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
                        DEBUG("[RANGE!]");
                        ERROR(client, reply);
                        continue;
@@ -886,9 +1112,13 @@ int mainloop(CLIENT *client) {
                        DEBUG("wr: net->buf, ");
                        readit(client->net, buf, len);
                        DEBUG("buf->exp, ");
-                       if ((client->server->flags & F_AUTOREADONLY) ||
-                                       expwrite(request.from, buf, len,
-                                               client)) {
+                       if ((client->server->flags & F_READONLY) ||
+                           (client->server->flags & F_AUTOREADONLY)) {
+                               DEBUG("[WRITE to READONLY!]");
+                               ERROR(client, reply);
+                               continue;
+                       }
+                       if (expwrite(request.from, buf, len, client)) {
                                DEBUG("Write failed: %m" );
                                ERROR(client, reply);
                                continue;
@@ -915,36 +1145,88 @@ int mainloop(CLIENT *client) {
 }
 
 /**
- * Split a single exportfile into multiple ones, if that was asked.
- * @return 0 on success, -1 on failure
- * @param client information on the client which we want to split
+ * Set up client export array, which is an array of FILE_INFO.
+ * Also, split a single exportfile into multiple ones, if that was asked.
+ *
+ * With F_MULTIFILE set, files named "<exportname>.0", "<exportname>.1", ...
+ * are opened until the first one that cannot be opened; otherwise exactly
+ * one file, client->exportname, is opened. Each file starts where the
+ * previous one ends (sizes come from size_autodetect()), and
+ * client->exportsize becomes the total, or expected_size when that is set.
+ *
+ * @param client information on the client which we want to setup export for
  **/
-int splitexport(CLIENT* client) {
-       off_t i;
-       int fhandle;
+void setupexport(CLIENT* client) {
+       int i;
+       off_t laststartoff = 0, lastsize = 0;
+       int multifile = (client->server->flags & F_MULTIFILE);
+
+       client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
 
-       client->export = g_array_new(TRUE, TRUE, sizeof(int));
-       for (i=0; i<client->exportsize; i+=client->server->hunksize) {
+       /* If multi-file, open as many files as we can.
+        * If not, open exactly one file.
+        * Calculate file sizes as we go to get total size. */
+       for(i=0; ; i++) {
+               FILE_INFO fi;
                gchar *tmpname;
+               /* open(2) access flags are plain ints; mode_t is for permission bits */
+               int mode = (client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR;
 
-               if(client->server->flags & F_MULTIFILE) {
-                       tmpname=g_strdup_printf("%s.%d", client->exportname,
-                                       (int)(i/client->server->hunksize));
+               if(multifile) {
+                       tmpname=g_strdup_printf("%s.%d", client->exportname, i);
                } else {
                        tmpname=g_strdup(client->exportname);
                }
                DEBUG2( "Opening %s\n", tmpname );
-               if((fhandle = open(tmpname, (client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR)) == -1) {
-                       /* Read WRITE ACCESS was requested by media is only read only */
-                       client->server->flags |= F_AUTOREADONLY;
-                       client->server->flags |= F_READONLY;
-                       if((fhandle = open(tmpname, O_RDONLY)) == -1)
-                               err("Could not open exported file: %m");
+               fi.fhandle = open(tmpname, mode);
+               if(fi.fhandle == -1 && mode == O_RDWR) {
+                       /* Try again because maybe media was read-only */
+                       fi.fhandle = open(tmpname, O_RDONLY);
+                       if(fi.fhandle != -1) {
+                               client->server->flags |= F_AUTOREADONLY;
+                               client->server->flags |= F_READONLY;
+                       }
                }
-               g_array_insert_val(client->export,i/client->server->hunksize,fhandle);
+               if(fi.fhandle == -1) {
+                       if(multifile && i>0) {
+                               /* Ran out of numbered files: the normal end of
+                                * a multi-file export. Release the name before
+                                * leaving the loop so it is not leaked. */
+                               g_free(tmpname);
+                               break;
+                       }
+                       err("Could not open exported file: %m");
+               }
+               fi.startoff = laststartoff + lastsize;
+               g_array_append_val(client->export, fi);
                g_free(tmpname);
+
+               /* Starting offset and size of this file will be used to
+                * calculate starting offset of next file */
+               laststartoff = fi.startoff;
+               lastsize = size_autodetect(fi.fhandle);
+
+               if(!multifile)
+                       break;
+       }
+
+       /* Set export size to total calculated size */
+       client->exportsize = laststartoff + lastsize;
+
+       /* Export size may be overridden */
+       if(client->server->expected_size) {
+               /* desired size must be <= total calculated size */
+               if(client->server->expected_size > client->exportsize) {
+                       err("Size of exported file is too big\n");
+               }
+
+               client->exportsize = client->server->expected_size;
+       }
+
+       msg3(LOG_INFO, "Size of exported file/device is %Lu", (unsigned long long)client->exportsize);
+       if(multifile) {
+               msg3(LOG_INFO, "Total number of files: %d", i);
        }
-       return 0;
 }
 
 int copyonwrite_prepare(CLIENT* client) {
@@ -973,22 +1242,7 @@ int copyonwrite_prepare(CLIENT* client) {
  * @param client a connected client
  **/
 void serveconnection(CLIENT *client) {
-       splitexport(client);
-
-       if (!client->server->expected_size) {
-               client->exportsize = size_autodetect(g_array_index(client->export,int,0));
-       } else {
-               /* Perhaps we should check first. Not now. */
-               client->exportsize = client->server->expected_size;
-       }
-       if (client->exportsize > OFFT_MAX) {
-               /* uhm, well... In a parallel universe, this *might* be
-                * possible... */
-               err("Size of exported file is too big\n");
-       }
-       else {
-               msg3(LOG_INFO, "size of exported file/device is %Lu", (unsigned long long)client->exportsize);
-       }
+       setupexport(client);
 
        if (client->server->flags & F_COPYONWRITE) {
                copyonwrite_prepare(client);
@@ -1002,7 +1256,8 @@ void serveconnection(CLIENT *client) {
 /**
  * Find the name of the file we have to serve. This will use g_strdup_printf
  * to put the IP address of the client inside a filename containing
- * "%s". That name is then written to client->exportname.
+ * "%s" (in the form as specified by the "virtstyle" option). That name
+ * is then written to client->exportname.
  *
  * @param net A socket connected to an nbd client
  * @param client information about the client. The IP address in human-readable
@@ -1011,14 +1266,59 @@ void serveconnection(CLIENT *client) {
  **/
 void set_peername(int net, CLIENT *client) {
        struct sockaddr_in addrin;
-       int addrinlen = sizeof( addrin );
-       char *peername ;
+       struct sockaddr_in netaddr;
+       /* getpeername() reads and writes a socklen_t through this pointer */
+       socklen_t addrinlen = sizeof( addrin );
+       char *peername;
+       char *netname;
+       char *tmp;
+       int i;
 
        if (getpeername(net, (struct sockaddr *) &addrin, (socklen_t *)&addrinlen) < 0)
                err("getsockname failed: %m");
-       peername = inet_ntoa(addrin.sin_addr);
-       client->exportname=g_strdup_printf(client->server->exportname, peername);
+       peername = g_strdup(inet_ntoa(addrin.sin_addr));
+       switch(client->server->virtstyle) {
+               case VIRT_NONE:
+                       client->exportname=g_strdup(client->server->exportname);
+                       break;
+               case VIRT_IPHASH:
+                       for(i=0;i<strlen(peername);i++) {
+                               if(peername[i]=='.') {
+                                       peername[i]='/';
+                               }
+                       }
+                       /* fall through: the rewritten address is substituted below */
+               case VIRT_IPLIT:
+                       client->exportname=g_strdup_printf(client->server->exportname, peername);
+                       break;
+               case VIRT_CIDR: {
+                       /* s_addr is in network byte order: mask the host
+                        * bits in host order, or the wrong octets are
+                        * cleared on little-endian machines */
+                       guint32 hostaddr;
+                       int hostbits = 32-(client->server->cidrlen);
+
+                       netaddr = addrin;
+                       hostaddr = ntohl(netaddr.sin_addr.s_addr);
+                       if(hostbits >= 32) {
+                               /* shifting a 32-bit value by 32 is undefined */
+                               hostaddr = 0;
+                       } else if(hostbits > 0) {
+                               hostaddr = (hostaddr >> hostbits) << hostbits;
+                       }
+                       netaddr.sin_addr.s_addr = htonl(hostaddr);
+                       netname = inet_ntoa(netaddr.sin_addr);
+                       tmp=g_strdup_printf("%s/%s", netname, peername);
+                       client->exportname=g_strdup_printf(client->server->exportname, tmp);
+                       /* tmp was only needed to build exportname */
+                       g_free(tmp);
+                       break;
+               }
+       }
 
        msg4(LOG_INFO, "connect from %s, assigned file is %s", 
             peername, client->exportname);
        client->clientname=g_strdup(peername);
+       /* peername is still read by the two statements above, so it can
+        * only be released here */
+       g_free(peername);
@@ -1046,11 +1327,14 @@ void daemonize(SERVER* serve) {
        if(daemon(0,0)<0) {
                err("daemon");
        }
-       if(serve) {
-               snprintf(pidfname, sizeof(char)*255, "/var/run/nbd-server.%d.pid", serve->port);
-       } else {
-               strncpy(pidfname, "/var/run/nbd-server.pid", sizeof(char)*255);
+       if(!*pidftemplate) {
+               if(serve) {
+                       strncpy(pidftemplate, "/var/run/server.%d.pid", 255);
+               } else {
+                       strncpy(pidftemplate, "/var/run/server.pid", 255);
+               }
        }
+       snprintf(pidfname, 255, pidftemplate, serve ? serve->port : 0);
        pidf=fopen(pidfname, "w");
        if(pidf) {
                fprintf(pidf,"%d\n", (int)getpid());
@@ -1073,6 +1357,7 @@ void setup_serve(SERVER *serve) {
        struct sockaddr_in addrin;
        struct sigaction sa;
        int addrinlen = sizeof(addrin);
+       int sock_flags;
 #ifndef sun
        int yes=1;
 #else
@@ -1089,6 +1374,14 @@ void setup_serve(SERVER *serve) {
                err("setsockopt SO_KEEPALIVE");
        }
 
+       /* make the listening socket non-blocking */
+       if ((sock_flags = fcntl(serve->socket, F_GETFL, 0)) == -1) {
+               err("fcntl F_GETFL");
+       }
+       if (fcntl(serve->socket, F_SETFL, sock_flags | O_NONBLOCK) == -1) {
+               err("fcntl F_SETFL O_NONBLOCK on server socket");
+       }
+
        DEBUG("Waiting for connections... bind, ");
        addrin.sin_family = AF_INET;
        addrin.sin_port = htons(serve->port);
@@ -1108,7 +1401,6 @@ void setup_serve(SERVER *serve) {
        sa.sa_flags = SA_RESTART;
        if(sigaction(SIGTERM, &sa, NULL) == -1)
                err("sigaction: %m");
-       children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
 }
 
 /**
@@ -1120,6 +1412,7 @@ void setup_servers(GArray* servers) {
        for(i=0;i<servers->len;i++) {
                setup_serve(&(g_array_index(servers, SERVER, i)));
        }
+       children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
 }
 
 /**
@@ -1129,8 +1422,11 @@ int serveloop(GArray* servers) {
        struct sockaddr_in addrin;
        socklen_t addrinlen=sizeof(addrin);
        SERVER *serve;
-       int i, max, sock;
-       fd_set mset, rset;
+       int i;
+       int max;
+       int sock;
+       fd_set mset;
+       fd_set rset;
        struct timeval tv;
 
        /* 
@@ -1160,12 +1456,22 @@ int serveloop(GArray* servers) {
                        for(i=0;i<servers->len;i++) {
                                serve=&(g_array_index(servers, SERVER, i));
                                if(FD_ISSET(serve->socket, &rset)) {
+                                       int sock_flags;
+
                                        if ((net=accept(serve->socket, (struct sockaddr *) &addrin, &addrinlen)) < 0)
                                                err("accept: %m");
 
                                        client = g_malloc(sizeof(CLIENT));
                                        client->server=serve;
                                        client->exportsize=OFFT_MAX;
+                                       /* Read the flags of the freshly accepted socket
+                                        * itself, not those of the listening socket */
+                                       if ((sock_flags = fcntl(net, F_GETFL, 0)) == -1) {
+                                               err("fcntl F_GETFL");
+                                       }
+                                       if (fcntl(net, F_SETFL, sock_flags | O_NONBLOCK) == -1) {
+                                               err("fcntl F_SETFL O_NONBLOCK on client socket");
+                                       }
                                        client->net=net;
                                        set_peername(net, client);
                                        if (!authorized_client(client)) {
@@ -1208,6 +1512,47 @@ int serveloop(GArray* servers) {
 }
 
 /**
+ * Drop privileges to the configured group and/or user.
+ *
+ * Reads the globals rungroup and runuser; a NULL name leaves that ID
+ * unchanged. The group is switched before the user: once setuid() has
+ * moved the process to an unprivileged user, a later setgid() would
+ * normally be refused. An unknown name or a failing setgid()/setuid() is
+ * reported through err(); the function then returns at once in case
+ * err() itself returns.
+ **/
+void dousers(void) {
+       struct passwd *pw;
+       struct group *gr;
+
+       if(rungroup) {
+               gr=getgrnam(rungroup);
+               if(!gr) {
+                       /* getgrnam() returns NULL for an unknown group */
+                       msg3(LOG_ERR, "Invalid group name: %s", rungroup);
+                       err("Could not look up group to run as");
+                       return;
+               }
+               if(setgid(gr->gr_gid)<0) {
+                       err("Could not set GID: %m");
+                       return;
+               }
+       }
+       if(runuser) {
+               pw=getpwnam(runuser);
+               if(!pw) {
+                       /* getpwnam() returns NULL for an unknown user */
+                       msg3(LOG_ERR, "Invalid user name: %s", runuser);
+                       err("Could not look up user to run as");
+                       return;
+               }
+               if(setuid(pw->pw_uid)<0) {
+                       err("Could not set UID: %m");
+                       return;
+               }
+       }
+}
+
+/**
  * Main entry point...
  **/
 int main(int argc, char *argv[]) {
@@ -1220,6 +1542,8 @@ int main(int argc, char *argv[]) {
                exit(-1) ;
        }
 
+       memset(pidftemplate, '\0', 256);
+
        logging();
        config_file_pos = g_strdup(CFILE);
        serve=cmdline(argc, argv);
@@ -1260,6 +1584,7 @@ int main(int argc, char *argv[]) {
        }
        daemonize(serve);
        setup_servers(servers);
+       dousers();
        serveloop(servers);
        return 0 ;
 }