Add nbd-trdump manpage
[nbd.git] / nbd-server.c
1 /*
2  * Network Block Device - server
3  *
4  * Copyright 1996-1998 Pavel Machek, distribute under GPL
5  *  <pavel@atrey.karlin.mff.cuni.cz>
6  * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7  * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
8  *
9  * Version 1.0 - hopefully 64-bit-clean
10  * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11  * Version 1.2 - autodetect size of block devices, thanks to Peter T. Breuer <ptb@it.uc3m.es>
12  * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13  *      type, or don't have 64 bit file offsets by defining FS_32BIT
14  *      in compile options for nbd-server *only*. This can be done
15  *      with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16  *      original autoconf input file, or I would make it a configure
17  *      option.) Ken Yap <ken@nlc.net.au>.
18  * Version 1.6 - fix autodetection of block device size and really make 64 bit
19  *      clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20  * Version 2.0 - Version synchronised with client
21  * Version 2.1 - Reap zombie client processes when they exit. Removed
22  *      (uncommented) the _IO magic, it's no longer necessary. Wouter
23  *      Verhelst <wouter@debian.org>
24  * Version 2.2 - Auto switch to read-only mode (useful for floppies).
25  * Version 2.3 - Fixed code so that Large File Support works. This
26  *      removes the FS_32BIT compile-time directive; define
27  *      _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28  *      using FS_32BIT. This will allow you to use files >2GB instead of
29  *      having to use the -m option. Wouter Verhelst <wouter@debian.org>
30  * Version 2.4 - Added code to keep track of children, so that we can
31  *      properly kill them from initscripts. Add a call to daemon(),
32  *      so that processes don't think they have to wait for us, which is
33  *      interesting for initscripts as well. Wouter Verhelst
34  *      <wouter@debian.org>
35  * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36  *      zero after fork()ing, resulting in nbd-server going berserk
37  *      when it receives a signal with at least one child open. Wouter
38  *      Verhelst <wouter@debian.org>
39  * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40  *      rectified type of mainloop::size_host (sf.net bugs 814435 and
41  *      817385); close the PID file after writing to it, so that the
42  *      daemon can actually be found. Wouter Verhelst
43  *      <wouter@debian.org>
44  * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45  *      correctly put in network endianness. Many types were corrected
46  *      (size_t and off_t instead of int).  <vspaceg@sourceforge.net>
47  * Version 2.6 - Some code cleanup.
48  * Version 2.7 - Better build system.
49  * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a 
50  *      lot more work, but this is a start. Wouter Verhelst
51  *      <wouter@debian.org>
52  * 16/03/2010 - Add IPv6 support.
53  *      Kitt Tientanopajai <kitt@kitty.in.th>
54  *      Neutron Soutmun <neo.neutron@gmail.com>
55  *      Suriya Soutmun <darksolar@gmail.com>
56  */
57
58 /* Includes LFS defines, which defines behaviours of some of the following
59  * headers, so must come before those */
60 #include "lfs.h"
61
62 #include <sys/types.h>
63 #include <sys/socket.h>
64 #include <sys/stat.h>
65 #include <sys/select.h>         /* select */
66 #include <sys/wait.h>           /* wait */
67 #ifdef HAVE_SYS_IOCTL_H
68 #include <sys/ioctl.h>
69 #endif
70 #include <sys/param.h>
71 #ifdef HAVE_SYS_MOUNT_H
72 #include <sys/mount.h>          /* For BLKGETSIZE */
73 #endif
74 #include <signal.h>             /* sigaction */
75 #include <errno.h>
76 #include <netinet/tcp.h>
77 #include <netinet/in.h>
78 #include <netdb.h>
79 #include <syslog.h>
80 #include <unistd.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 #include <fcntl.h>
85 #include <arpa/inet.h>
86 #include <strings.h>
87 #include <dirent.h>
88 #include <unistd.h>
89 #include <getopt.h>
90 #include <pwd.h>
91 #include <grp.h>
92
93 #include <glib.h>
94
95 /* used in cliserv.h, so must come first */
96 #define MY_NAME "nbd_server"
97 #include "cliserv.h"
98
99 #ifdef WITH_SDP
100 #include <sdp_inet.h>
101 #endif
102
/**
 * Directory holding the system-wide configuration. Only defined here when
 * the build did not already supply SYSCONFDIR.
 */
#if !defined(SYSCONFDIR)
#  define SYSCONFDIR "/etc"
#endif

/** Default position of the config file */
#define CFILE SYSCONFDIR "/nbd-server/config"
108
109 /** Where our config file actually is */
110 gchar* config_file_pos;
111
112 /** What user we're running as */
113 gchar* runuser=NULL;
114 /** What group we're running as */
115 gchar* rungroup=NULL;
116 /** whether to export using the old negotiation protocol (port-based) */
117 gboolean do_oldstyle=FALSE;
118
119 /* Whether we should avoid forking */
120 int dontfork = 0;
121
/**
 * Logging macros. Nothing goes to syslog unless ISSERVER is defined; without
 * it the priority argument is dropped and the message goes to g_message().
 */
#if defined(ISSERVER)
#  define msg2(prio, fmt)               syslog(prio, fmt)
#  define msg3(prio, fmt, arg1)         syslog(prio, fmt, arg1)
#  define msg4(prio, fmt, arg1, arg2)   syslog(prio, fmt, arg1, arg2)
#else
#  define msg2(prio, fmt)               g_message(fmt)
#  define msg3(prio, fmt, arg1)         g_message(fmt, arg1)
#  define msg4(prio, fmt, arg1, arg2)   g_message(fmt, arg1, arg2)
#endif

/* Debugging macros: define DODBG to make DEBUG() print through printf(). */
//#define DODBG
#if defined(DODBG)
#  define DEBUG(...) printf(__VA_ARGS__)
#else
#  define DEBUG(...)
#endif

/* Fall back to an empty version string when the build supplies none. */
#if !defined(PACKAGE_VERSION)
#  define PACKAGE_VERSION ""
#endif
/**
 * The highest value a variable of type off_t can reach. off_t is a signed
 * integer, so this is every bit set except for the leftmost (sign) one.
 *
 * The value is computed in an unsigned type: shifting a 1 into the sign bit
 * of a signed integer is undefined behaviour. The whole expansion is
 * parenthesized so it can be used safely inside larger expressions.
 **/
#define OFFT_MAX ((off_t)((1ULL << (sizeof(off_t)*8-1)) - 1))
#define LINELEN 256       /**< Size of static buffer used to read the
			       authorization file (yuck) */
#define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
#define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
#define F_READONLY 1      /**< flag to tell us a file is readonly */
#define F_MULTIFILE 2     /**< flag to tell us a file is exported using -m */
#define F_COPYONWRITE 4   /**< flag to tell us a file is exported using
			    copyonwrite */
#define F_AUTOREADONLY 8  /**< flag to tell us a file is set to autoreadonly */
#define F_SPARSE 16       /**< flag to tell us copyonwrite should use a sparse file */
#define F_SDP 32          /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */
#define F_SYNC 64         /**< Whether to fsync() after a write */
#define F_FLUSH 128       /**< Whether server wants FLUSH to be sent by the client */
#define F_FUA 256         /**< Whether server wants FUA to be sent by the client */
#define F_ROTATIONAL 512  /**< Whether server wants the client to implement the elevator algorithm */
163 GHashTable *children;
164 char pidfname[256]; /**< name of our PID file */
165 char pidftemplate[256]; /**< template to be used for the filename of the PID file */
166 char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
167
168 int modernsock=0;         /**< Socket for the modern handler. Not used
169                                if a client was only specified on the
170                                command line; only port used if
171                                oldstyle is set to false (and then the
172                                command-line client isn't used, gna gna) */
173 char* modern_listen;      /**< listenaddr value for modernsock */
174
/**
 * Types of virtualization
 **/
typedef enum {
	/** No virtualization */
	VIRT_NONE = 0,
	/** Literal IP address as part of the filename */
	VIRT_IPLIT,
	/** Replacing all dots in an ip address by a / before doing the same
	 *  as in IPLIT */
	VIRT_IPHASH,
	/** Every subnet in its own directory */
	VIRT_CIDR,
} VIRT_STYLE;
185
186 /**
187  * Variables associated with a server.
188  **/
189 typedef struct {
190         gchar* exportname;    /**< (unprocessed) filename of the file we're exporting */
191         off_t expected_size; /**< size of the exported file as it was told to
192                                us through configuration */
193         gchar* listenaddr;   /**< The IP address we're listening on */
194         unsigned int port;   /**< port we're exporting this file at */
195         char* authname;      /**< filename of the authorization file */
196         int flags;           /**< flags associated with this exported file */
197         int socket;          /**< The socket of this server. */
198         int socket_family;   /**< family of the socket */
199         VIRT_STYLE virtstyle;/**< The style of virtualization, if any */
200         uint8_t cidrlen;     /**< The length of the mask when we use
201                                   CIDR-style virtualization */
202         gchar* prerun;       /**< command to be ran after connecting a client,
203                                   but before starting to serve */
204         gchar* postrun;      /**< command that will be ran after the client
205                                   disconnects */
206         gchar* servename;    /**< name of the export as selected by nbd-client */
207         int max_connections; /**< maximum number of opened connections */
208         gchar* transactionlog;/**< filename for transaction log */
209 } SERVER;
210
/**
 * One exported file: its open descriptor and the offset at which it starts.
 **/
typedef struct {
	/** file descriptor */
	int fhandle;
	/** starting offset of this file */
	off_t startoff;
} FILE_INFO;
218
219 typedef struct {
220         off_t exportsize;    /**< size of the file we're exporting */
221         char *clientname;    /**< peer */
222         char *exportname;    /**< (processed) filename of the file we're exporting */
223         GArray *export;    /**< array of FILE_INFO of exported files;
224                                array size is always 1 unless we're
225                                doing the multiple file option */
226         int net;             /**< The actual client socket */
227         SERVER *server;      /**< The server this client is getting data from */
228         char* difffilename;  /**< filename of the copy-on-write file, if any */
229         int difffile;        /**< filedescriptor of copyonwrite file. @todo
230                                shouldn't this be an array too? (cfr export) Or
231                                make -m and -c mutually exclusive */
232         u32 difffilelen;     /**< number of pages in difffile */
233         u32 *difmap;         /**< see comment on the global difmap for this one */
234         gboolean modern;     /**< client was negotiated using modern negotiation protocol */
235         int transactionlogfd;/**< fd for transaction log */
236 } CLIENT;
237
/**
 * Type of configuration file values
 **/
typedef enum {
	/** This parameter is an integer */
	PARAM_INT,
	/** This parameter is a string */
	PARAM_STRING,
	/** This parameter is a boolean */
	PARAM_BOOL,
} PARAM_TYPE;
246
247 /**
248  * Configuration file values
249  **/
250 typedef struct {
251         gchar *paramname;       /**< Name of the parameter, as it appears in
252                                   the config file */
253         gboolean required;      /**< Whether this is a required (as opposed to
254                                   optional) parameter */
255         PARAM_TYPE ptype;       /**< Type of the parameter. */
256         gpointer target;        /**< Pointer to where the data of this
257                                   parameter should be written. If ptype is
258                                   PARAM_BOOL, the data is or'ed rather than
259                                   overwritten. */
260         gint flagval;           /**< Flag mask for this parameter in case ptype
261                                   is PARAM_BOOL. */
262 } PARAM;
263
264 static inline const char * getcommandname(uint64_t command) {
265         switch (command) {
266         case NBD_CMD_READ:
267                 return "NBD_CMD_READ";
268         case NBD_CMD_WRITE:
269                 return "NBD_CMD_WRITE";
270         case NBD_CMD_DISC:
271                 return "NBD_CMD_DISC";
272         case NBD_CMD_FLUSH:
273                 return "NBD_CMD_FLUSH";
274         default:
275                 break;
276         }
277         return "UNKNOWN";
278 }
279
280 /**
281  * Check whether a client is allowed to connect. Works with an authorization
282  * file which contains one line per machine, no wildcards.
283  *
284  * @param opts The client who's trying to connect.
285  * @return 0 - authorization refused, 1 - OK
286  **/
287 int authorized_client(CLIENT *opts) {
288         const char *ERRMSG="Invalid entry '%s' in authfile '%s', so, refusing all connections.";
289         FILE *f ;
290         char line[LINELEN]; 
291         char *tmp;
292         struct in_addr addr;
293         struct in_addr client;
294         struct in_addr cltemp;
295         int len;
296
297         if ((f=fopen(opts->server->authname,"r"))==NULL) {
298                 msg4(LOG_INFO,"Can't open authorization file %s (%s).",
299                      opts->server->authname,strerror(errno)) ;
300                 return 1 ; 
301         }
302   
303         inet_aton(opts->clientname, &client);
304         while (fgets(line,LINELEN,f)!=NULL) {
305                 if((tmp=index(line, '/'))) {
306                         if(strlen(line)<=tmp-line) {
307                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
308                                 return 0;
309                         }
310                         *(tmp++)=0;
311                         if(!inet_aton(line,&addr)) {
312                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
313                                 return 0;
314                         }
315                         len=strtol(tmp, NULL, 0);
316                         addr.s_addr>>=32-len;
317                         addr.s_addr<<=32-len;
318                         memcpy(&cltemp,&client,sizeof(client));
319                         cltemp.s_addr>>=32-len;
320                         cltemp.s_addr<<=32-len;
321                         if(addr.s_addr == cltemp.s_addr) {
322                                 return 1;
323                         }
324                 }
325                 if (strncmp(line,opts->clientname,strlen(opts->clientname))==0) {
326                         fclose(f);
327                         return 1;
328                 }
329         }
330         fclose(f);
331         return 0;
332 }
333
/**
 * Read data from a file descriptor into a buffer. Keeps reading until
 * exactly len bytes have been received; a read that fails with EAGAIN is
 * retried.
 *
 * End of file before len bytes have arrived is reported through err() with
 * its own message. read() does not set errno in that case, so errno must not
 * be consulted: a stale EAGAIN would otherwise make this loop spin forever.
 *
 * NOTE(review): err() is assumed not to return -- confirm in cliserv.h.
 *
 * @param f a file descriptor
 * @param buf a buffer
 * @param len the number of bytes to be read
 **/
static inline void readit(int f, void *buf, size_t len) {
	char *pos = buf;
	ssize_t res;

	while (len > 0) {
		DEBUG("*");
		res = read(f, pos, len);
		if (res > 0) {
			len -= res;
			pos += res;
		} else if (res == 0) {
			err("Read failed: End of file");
		} else if (errno != EAGAIN) {
			err("Read failed: %m");
		}
	}
}
355
/**
 * Write data from a buffer into a filedescriptor. Repeats write() until all
 * len bytes have been handed over; a write() result of zero or less is
 * reported through err().
 *
 * @param f a file descriptor
 * @param buf a buffer containing data
 * @param len the number of bytes to be written
 **/
static inline void writeit(int f, void *buf, size_t len) {
	char *cursor = buf;
	size_t remaining = len;

	while (remaining > 0) {
		ssize_t written;

		DEBUG("+");
		written = write(f, cursor, remaining);
		if (written <= 0) {
			err("Send failed: %m");
		}
		remaining -= written;
		cursor += written;
	}
}
373
374 /**
375  * Print out a message about how to use nbd-server. Split out to a separate
376  * function so that we can call it from multiple places
377  */
378 void usage() {
379         printf("This is nbd-server version " VERSION "\n");
380         printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections]\n"
381                "\t-r|--read-only\t\tread only\n"
382                "\t-m|--multi-file\t\tmultiple file\n"
383                "\t-c|--copy-on-write\tcopy on write\n"
384                "\t-C|--config-file\tspecify an alternate configuration file\n"
385                "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
386                "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
387                "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
388                "\t-M|--max-connections\tspecify the maximum number of opened connections\n\n"
389                "\tif port is set to 0, stdin is used (for running from inetd)\n"
390                "\tif file_to_export contains '%%s', it is substituted with the IP\n"
391                "\t\taddress of the machine trying to connect\n" 
392                "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
393         printf("Using configuration file %s\n", CFILE);
394 }
395
396 /* Dumps a config file section of the given SERVER*, and exits. */
397 void dump_section(SERVER* serve, gchar* section_header) {
398         printf("[%s]\n", section_header);
399         printf("\texportname = %s\n", serve->exportname);
400         printf("\tlistenaddr = %s\n", serve->listenaddr);
401         printf("\tport = %d\n", serve->port);
402         if(serve->flags & F_READONLY) {
403                 printf("\treadonly = true\n");
404         }
405         if(serve->flags & F_MULTIFILE) {
406                 printf("\tmultifile = true\n");
407         }
408         if(serve->flags & F_COPYONWRITE) {
409                 printf("\tcopyonwrite = true\n");
410         }
411         if(serve->expected_size) {
412                 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
413         }
414         if(serve->authname) {
415                 printf("\tauthfile = %s\n", serve->authname);
416         }
417         exit(EXIT_SUCCESS);
418 }
419
420 /**
421  * Parse the command line.
422  *
423  * @param argc the argc argument to main()
424  * @param argv the argv argument to main()
425  **/
426 SERVER* cmdline(int argc, char *argv[]) {
427         int i=0;
428         int nonspecial=0;
429         int c;
430         struct option long_options[] = {
431                 {"read-only", no_argument, NULL, 'r'},
432                 {"multi-file", no_argument, NULL, 'm'},
433                 {"copy-on-write", no_argument, NULL, 'c'},
434                 {"dont-fork", no_argument, NULL, 'd'},
435                 {"authorize-file", required_argument, NULL, 'l'},
436                 {"config-file", required_argument, NULL, 'C'},
437                 {"pid-file", required_argument, NULL, 'p'},
438                 {"output-config", required_argument, NULL, 'o'},
439                 {"max-connection", required_argument, NULL, 'M'},
440                 {0,0,0,0}
441         };
442         SERVER *serve;
443         off_t es;
444         size_t last;
445         char suffix;
446         gboolean do_output=FALSE;
447         gchar* section_header="";
448         gchar** addr_port;
449
450         if(argc==1) {
451                 return NULL;
452         }
453         serve=g_new0(SERVER, 1);
454         serve->authname = g_strdup(default_authname);
455         serve->virtstyle=VIRT_IPLIT;
456         while((c=getopt_long(argc, argv, "-C:cdl:mo:rp:M:", long_options, &i))>=0) {
457                 switch (c) {
458                 case 1:
459                         /* non-option argument */
460                         switch(nonspecial++) {
461                         case 0:
462                                 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
463                                         addr_port=g_strsplit(optarg, ":", 2);
464
465                                         /* Check for "@" - maybe user using this separator
466                                                  for IPv4 address */
467                                         if(!addr_port[1]) {
468                                                 g_strfreev(addr_port);
469                                                 addr_port=g_strsplit(optarg, "@", 2);
470                                         }
471                                 } else {
472                                         addr_port=g_strsplit(optarg, "@", 2);
473                                 }
474
475                                 if(addr_port[1]) {
476                                         serve->port=strtol(addr_port[1], NULL, 0);
477                                         serve->listenaddr=g_strdup(addr_port[0]);
478                                 } else {
479                                         serve->listenaddr=NULL;
480                                         serve->port=strtol(addr_port[0], NULL, 0);
481                                 }
482                                 g_strfreev(addr_port);
483                                 break;
484                         case 1:
485                                 serve->exportname = g_strdup(optarg);
486                                 if(serve->exportname[0] != '/') {
487                                         fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
488                                         exit(EXIT_FAILURE);
489                                 }
490                                 break;
491                         case 2:
492                                 last=strlen(optarg)-1;
493                                 suffix=optarg[last];
494                                 if (suffix == 'k' || suffix == 'K' ||
495                                     suffix == 'm' || suffix == 'M')
496                                         optarg[last] = '\0';
497                                 es = (off_t)atoll(optarg);
498                                 switch (suffix) {
499                                         case 'm':
500                                         case 'M':  es <<= 10;
501                                         case 'k':
502                                         case 'K':  es <<= 10;
503                                         default :  break;
504                                 }
505                                 serve->expected_size = es;
506                                 break;
507                         }
508                         break;
509                 case 'r':
510                         serve->flags |= F_READONLY;
511                         break;
512                 case 'm':
513                         serve->flags |= F_MULTIFILE;
514                         break;
515                 case 'o':
516                         do_output = TRUE;
517                         section_header = g_strdup(optarg);
518                         break;
519                 case 'p':
520                         strncpy(pidftemplate, optarg, 256);
521                         break;
522                 case 'c': 
523                         serve->flags |=F_COPYONWRITE;
524                         break;
525                 case 'd': 
526                         dontfork = 1;
527                         break;
528                 case 'C':
529                         g_free(config_file_pos);
530                         config_file_pos=g_strdup(optarg);
531                         break;
532                 case 'l':
533                         g_free(serve->authname);
534                         serve->authname=g_strdup(optarg);
535                         break;
536                 case 'M':
537                         serve->max_connections = strtol(optarg, NULL, 0);
538                         break;
539                 default:
540                         usage();
541                         exit(EXIT_FAILURE);
542                         break;
543                 }
544         }
545         /* What's left: the port to export, the name of the to be exported
546          * file, and, optionally, the size of the file, in that order. */
547         if(nonspecial<2) {
548                 g_free(serve);
549                 serve=NULL;
550         } else {
551                 do_oldstyle = TRUE;
552         }
553         if(do_output) {
554                 if(!serve) {
555                         g_critical("Need a complete configuration on the command line to output a config file section!");
556                         exit(EXIT_FAILURE);
557                 }
558                 dump_section(serve, section_header);
559         }
560         return serve;
561 }
562
/**
 * Error codes for config file parsing
 **/
typedef enum {
	/** The configuration file is not found */
	CFILE_NOTFOUND = 0,
	/** The (required) group "generic" is missing */
	CFILE_MISSING_GENERIC,
	/** A (required) key is missing */
	CFILE_KEY_MISSING,
	/** A value is syntactically invalid */
	CFILE_VALUE_INVALID,
	/** A value is not supported in this build */
	CFILE_VALUE_UNSUPPORTED,
	/** Programmer error */
	CFILE_PROGERR,
	/** A config file was specified that does not define any exports */
	CFILE_NO_EXPORTS,
	/** The reserved port was specified for an old-style export. */
	CFILE_INCORRECT_PORT,
} CFILE_ERRORS;
578
579 /**
580  * Remove a SERVER from memory. Used from the hash table
581  **/
582 void remove_server(gpointer s) {
583         SERVER *server;
584
585         server=(SERVER*)s;
586         g_free(server->exportname);
587         if(server->authname)
588                 g_free(server->authname);
589         if(server->listenaddr)
590                 g_free(server->listenaddr);
591         if(server->prerun)
592                 g_free(server->prerun);
593         if(server->postrun)
594                 g_free(server->postrun);
595         if(server->transactionlog)
596                 g_free(server->transactionlog);
597         g_free(server);
598 }
599
600 /**
601  * duplicate server
602  * @param s the old server we want to duplicate
603  * @return new duplicated server
604  **/
605 SERVER* dup_serve(SERVER *s) {
606         SERVER *serve = NULL;
607
608         serve=g_new0(SERVER, 1);
609         if(serve == NULL)
610                 return NULL;
611
612         if(s->exportname)
613                 serve->exportname = g_strdup(s->exportname);
614
615         serve->expected_size = s->expected_size;
616
617         if(s->listenaddr)
618                 serve->listenaddr = g_strdup(s->listenaddr);
619
620         serve->port = s->port;
621
622         if(s->authname)
623                 serve->authname = strdup(s->authname);
624
625         serve->flags = s->flags;
626         serve->socket = s->socket;
627         serve->socket_family = s->socket_family;
628         serve->virtstyle = s->virtstyle;
629         serve->cidrlen = s->cidrlen;
630
631         if(s->prerun)
632                 serve->prerun = g_strdup(s->prerun);
633
634         if(s->postrun)
635                 serve->postrun = g_strdup(s->postrun);
636
637         if(s->transactionlog)
638                 serve->transactionlog = g_strdup(s->transactionlog);
639         
640         if(s->servename)
641                 serve->servename = g_strdup(s->servename);
642
643         serve->max_connections = s->max_connections;
644
645         return serve;
646 }
647
648 /**
649  * append new server to array
650  * @param s server
651  * @param a server array
652  * @return 0 success, -1 error
653  */
654 int append_serve(SERVER *s, GArray *a) {
655         SERVER *ns = NULL;
656         struct addrinfo hints;
657         struct addrinfo *ai = NULL;
658         struct addrinfo *rp = NULL;
659         char   host[NI_MAXHOST];
660         gchar  *port = NULL;
661         int e;
662         int ret;
663
664         if(!s) {
665                 err("Invalid parsing server");
666                 return -1;
667         }
668
669         port = g_strdup_printf("%d", s->port);
670
671         memset(&hints,'\0',sizeof(hints));
672         hints.ai_family = AF_UNSPEC;
673         hints.ai_socktype = SOCK_STREAM;
674         hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE;
675         hints.ai_protocol = IPPROTO_TCP;
676
677         e = getaddrinfo(s->listenaddr, port, &hints, &ai);
678
679         if (port)
680                 g_free(port);
681
682         if(e == 0) {
683                 for (rp = ai; rp != NULL; rp = rp->ai_next) {
684                         e = getnameinfo(rp->ai_addr, rp->ai_addrlen, host, sizeof(host), NULL, 0, NI_NUMERICHOST);
685
686                         if (e != 0) { // error
687                                 fprintf(stderr, "getnameinfo: %s\n", gai_strerror(e));
688                                 continue;
689                         }
690
691                         // duplicate server and set listenaddr to resolved IP address
692                         ns = dup_serve (s);
693                         if (ns) {
694                                 ns->listenaddr = g_strdup(host);
695                                 ns->socket_family = rp->ai_family;
696                                 g_array_append_val(a, *ns);
697                                 free(ns);
698                                 ns = NULL;
699                         }
700                 }
701
702                 ret = 0;
703         } else {
704                 fprintf(stderr, "getaddrinfo failed on listen host/address: %s (%s)\n", s->listenaddr ? s->listenaddr : "any", gai_strerror(e));
705                 ret = -1;
706         }
707
708         if (ai)
709                 freeaddrinfo(ai);
710
711         return ret;
712 }
713
714 /**
715  * Parse the config file.
716  *
717  * @param f the name of the config file
718  * @param e a GError. @see CFILE_ERRORS for what error values this function can
719  *      return.
720  * @return a Array of SERVER* pointers, If the config file is empty or does not
721  *      exist, returns an empty GHashTable; if the config file contains an
722  *      error, returns NULL, and e is set appropriately
723  **/
724 GArray* parse_cfile(gchar* f, GError** e) {
725         const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
726         const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
727         SERVER s;
728         gchar *virtstyle=NULL;
729         PARAM lp[] = {
730                 { "exportname", TRUE,   PARAM_STRING,   &(s.exportname),        0 },
731                 { "port",       TRUE,   PARAM_INT,      &(s.port),              0 },
732                 { "authfile",   FALSE,  PARAM_STRING,   &(s.authname),          0 },
733                 { "filesize",   FALSE,  PARAM_INT,      &(s.expected_size),     0 },
734                 { "virtstyle",  FALSE,  PARAM_STRING,   &(virtstyle),           0 },
735                 { "prerun",     FALSE,  PARAM_STRING,   &(s.prerun),            0 },
736                 { "postrun",    FALSE,  PARAM_STRING,   &(s.postrun),           0 },
737                 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog),   0 },
738                 { "readonly",   FALSE,  PARAM_BOOL,     &(s.flags),             F_READONLY },
739                 { "multifile",  FALSE,  PARAM_BOOL,     &(s.flags),             F_MULTIFILE },
740                 { "copyonwrite", FALSE, PARAM_BOOL,     &(s.flags),             F_COPYONWRITE },
741                 { "sparse_cow", FALSE,  PARAM_BOOL,     &(s.flags),             F_SPARSE },
742                 { "sdp",        FALSE,  PARAM_BOOL,     &(s.flags),             F_SDP },
743                 { "sync",       FALSE,  PARAM_BOOL,     &(s.flags),             F_SYNC },
744                 { "flush",      FALSE,  PARAM_BOOL,     &(s.flags),             F_FLUSH },
745                 { "fua",        FALSE,  PARAM_BOOL,     &(s.flags),             F_FUA },
746                 { "rotational", FALSE,  PARAM_BOOL,     &(s.flags),             F_ROTATIONAL },
747                 { "listenaddr", FALSE,  PARAM_STRING,   &(s.listenaddr),        0 },
748                 { "maxconnections", FALSE, PARAM_INT,   &(s.max_connections),   0 },
749         };
750         const int lp_size=sizeof(lp)/sizeof(PARAM);
751         PARAM gp[] = {
752                 { "user",       FALSE, PARAM_STRING,    &runuser,       0 },
753                 { "group",      FALSE, PARAM_STRING,    &rungroup,      0 },
754                 { "oldstyle",   FALSE, PARAM_BOOL,      &do_oldstyle,   1 },
755                 { "listenaddr", FALSE, PARAM_STRING,    &modern_listen, 0 },
756         };
757         PARAM* p=gp;
758         int p_size=sizeof(gp)/sizeof(PARAM);
759         GKeyFile *cfile;
760         GError *err = NULL;
761         const char *err_msg=NULL;
762         GQuark errdomain;
763         GArray *retval=NULL;
764         gchar **groups;
765         gboolean value;
766         gchar* startgroup;
767         gint i;
768         gint j;
769
770         errdomain = g_quark_from_string("parse_cfile");
771         cfile = g_key_file_new();
772         retval = g_array_new(FALSE, TRUE, sizeof(SERVER));
773         if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
774                         G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
775                 g_set_error(e, errdomain, CFILE_NOTFOUND, "Could not open config file %s.", f);
776                 g_key_file_free(cfile);
777                 return retval;
778         }
779         startgroup = g_key_file_get_start_group(cfile);
780         if(!startgroup || strcmp(startgroup, "generic")) {
781                 g_set_error(e, errdomain, CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
782                 g_key_file_free(cfile);
783                 return NULL;
784         }
785         groups = g_key_file_get_groups(cfile, NULL);
786         for(i=0;groups[i];i++) {
787                 memset(&s, '\0', sizeof(SERVER));
788
789                 /* After the [generic] group, start parsing exports */
790                 if(i==1) {
791                         p=lp;
792                         p_size=lp_size;
793                 } 
794                 for(j=0;j<p_size;j++) {
795                         g_assert(p[j].target != NULL);
796                         g_assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL);
797                         switch(p[j].ptype) {
798                                 case PARAM_INT:
799                                         *((gint*)p[j].target) =
800                                                 g_key_file_get_integer(cfile,
801                                                                 groups[i],
802                                                                 p[j].paramname,
803                                                                 &err);
804                                         break;
805                                 case PARAM_STRING:
806                                         *((gchar**)p[j].target) =
807                                                 g_key_file_get_string(cfile,
808                                                                 groups[i],
809                                                                 p[j].paramname,
810                                                                 &err);
811                                         break;
812                                 case PARAM_BOOL:
813                                         value = g_key_file_get_boolean(cfile,
814                                                         groups[i],
815                                                         p[j].paramname, &err);
816                                         if(!err) {
817                                                 if(value) {
818                                                         *((gint*)p[j].target) |= p[j].flagval;
819                                                 } else {
820                                                         *((gint*)p[j].target) &= ~(p[j].flagval);
821                                                 }
822                                         }
823                                         break;
824                         }
825                         if(!strcmp(p[j].paramname, "port") && !strcmp(p[j].target, NBD_DEFAULT_PORT)) {
826                                 g_set_error(e, errdomain, CFILE_INCORRECT_PORT, "Config file specifies default port for oldstyle export");
827                                 g_key_file_free(cfile);
828                                 return NULL;
829                         }
830                         if(err) {
831                                 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
832                                         if(!p[j].required) {
833                                                 /* Ignore not-found error for optional values */
834                                                 g_clear_error(&err);
835                                                 continue;
836                                         } else {
837                                                 err_msg = MISSING_REQUIRED_ERROR;
838                                         }
839                                 } else {
840                                         err_msg = DEFAULT_ERROR;
841                                 }
842                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
843                                 g_array_free(retval, TRUE);
844                                 g_error_free(err);
845                                 g_key_file_free(cfile);
846                                 return NULL;
847                         }
848                 }
849                 if(virtstyle) {
850                         if(!strncmp(virtstyle, "none", 4)) {
851                                 s.virtstyle=VIRT_NONE;
852                         } else if(!strncmp(virtstyle, "ipliteral", 9)) {
853                                 s.virtstyle=VIRT_IPLIT;
854                         } else if(!strncmp(virtstyle, "iphash", 6)) {
855                                 s.virtstyle=VIRT_IPHASH;
856                         } else if(!strncmp(virtstyle, "cidrhash", 8)) {
857                                 s.virtstyle=VIRT_CIDR;
858                                 if(strlen(virtstyle)<10) {
859                                         g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
860                                         g_array_free(retval, TRUE);
861                                         g_key_file_free(cfile);
862                                         return NULL;
863                                 }
864                                 s.cidrlen=strtol(virtstyle+8, NULL, 0);
865                         } else {
866                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
867                                 g_array_free(retval, TRUE);
868                                 g_key_file_free(cfile);
869                                 return NULL;
870                         }
871                         if(s.port && !do_oldstyle) {
872                                 g_warning("A port was specified, but oldstyle exports were not requested. This may not do what you expect.");
873                                 g_warning("Please read 'man 5 nbd-server' and search for oldstyle for more info");
874                         }
875                 } else {
876                         s.virtstyle=VIRT_IPLIT;
877                 }
878                 /* Don't need to free this, it's not our string */
879                 virtstyle=NULL;
880                 /* Don't append values for the [generic] group */
881                 if(i>0) {
882                         s.socket_family = AF_UNSPEC;
883                         s.servename = groups[i];
884
885                         append_serve(&s, retval);
886                 } else {
887                         if(!do_oldstyle) {
888                                 lp[1].required = 0;
889                         }
890                 }
891 #ifndef WITH_SDP
892                 if(s.flags & F_SDP) {
893                         g_set_error(e, errdomain, CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
894                         g_array_free(retval, TRUE);
895                         g_key_file_free(cfile);
896                         return NULL;
897                 }
898 #endif
899         }
900         if(i==1) {
901                 g_set_error(e, errdomain, CFILE_NO_EXPORTS, "The config file does not specify any exports");
902         }
903         g_key_file_free(cfile);
904         return retval;
905 }
906
907 /**
908  * Signal handler for SIGCHLD
909  * @param s the signal we're handling (must be SIGCHLD, or something
910  * is severely wrong)
911  **/
912 void sigchld_handler(int s) {
913         int status;
914         int* i;
915         pid_t pid;
916
917         while((pid=waitpid(-1, &status, WNOHANG)) > 0) {
918                 if(WIFEXITED(status)) {
919                         msg3(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
920                 }
921                 i=g_hash_table_lookup(children, &pid);
922                 if(!i) {
923                         msg3(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
924                 } else {
925                         DEBUG("Removing %d from the list of children", pid);
926                         g_hash_table_remove(children, &pid);
927                 }
928         }
929 }
930
931 /**
932  * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
933  *
934  * @param key the key
935  * @param value the value corresponding to the above key
936  * @param user_data a pointer which we always set to 1, so that we know what
937  * will happen next.
938  **/
939 void killchild(gpointer key, gpointer value, gpointer user_data) {
940         pid_t *pid=value;
941         int *parent=user_data;
942
943         kill(*pid, SIGTERM);
944         *parent=1;
945 }
946
947 /**
948  * Handle SIGTERM and dispatch it to our children
949  * @param s the signal we're handling (must be SIGTERM, or something
950  * is severely wrong).
951  **/
952 void sigterm_handler(int s) {
953         int parent=0;
954
955         g_hash_table_foreach(children, killchild, &parent);
956
957         if(parent) {
958                 unlink(pidfname);
959         }
960
961         exit(EXIT_SUCCESS);
962 }
963
964 /**
965  * Detect the size of a file.
966  *
967  * @param fhandle An open filedescriptor
968  * @return the size of the file, or OFFT_MAX if detection was
969  * impossible.
970  **/
971 off_t size_autodetect(int fhandle) {
972         off_t es;
973         u64 bytes;
974         struct stat stat_buf;
975         int error;
976
977 #ifdef HAVE_SYS_MOUNT_H
978 #ifdef HAVE_SYS_IOCTL_H
979 #ifdef BLKGETSIZE64
980         DEBUG("looking for export size with ioctl BLKGETSIZE64\n");
981         if (!ioctl(fhandle, BLKGETSIZE64, &bytes) && bytes) {
982                 return (off_t)bytes;
983         }
984 #endif /* BLKGETSIZE64 */
985 #endif /* HAVE_SYS_IOCTL_H */
986 #endif /* HAVE_SYS_MOUNT_H */
987
988         DEBUG("looking for fhandle size with fstat\n");
989         stat_buf.st_size = 0;
990         error = fstat(fhandle, &stat_buf);
991         if (!error) {
992                 if(stat_buf.st_size > 0)
993                         return (off_t)stat_buf.st_size;
994         } else {
995                 err("fstat failed: %m");
996         }
997
998         DEBUG("looking for fhandle size with lseek SEEK_END\n");
999         es = lseek(fhandle, (off_t)0, SEEK_END);
1000         if (es > ((off_t)0)) {
1001                 return es;
1002         } else {
1003                 DEBUG("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4)));
1004         }
1005
1006         err("Could not find size of exported block device: %m");
1007         return OFFT_MAX;
1008 }
1009
1010 /**
1011  * Get the file handle and offset, given an export offset.
1012  *
1013  * @param export An array of export files
1014  * @param a The offset to get corresponding file/offset for
1015  * @param fhandle [out] File descriptor
1016  * @param foffset [out] Offset into fhandle
1017  * @param maxbytes [out] Tells how many bytes can be read/written
1018  * from fhandle starting at foffset (0 if there is no limit)
1019  * @return 0 on success, -1 on failure
1020  **/
1021 int get_filepos(GArray* export, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1022         /* Negative offset not allowed */
1023         if(a < 0)
1024                 return -1;
1025
1026         /* Binary search for last file with starting offset <= a */
1027         FILE_INFO fi;
1028         int start = 0;
1029         int end = export->len - 1;
1030         while( start <= end ) {
1031                 int mid = (start + end) / 2;
1032                 fi = g_array_index(export, FILE_INFO, mid);
1033                 if( fi.startoff < a ) {
1034                         start = mid + 1;
1035                 } else if( fi.startoff > a ) {
1036                         end = mid - 1;
1037                 } else {
1038                         start = end = mid;
1039                         break;
1040                 }
1041         }
1042
1043         /* end should never go negative, since first startoff is 0 and a >= 0 */
1044         g_assert(end >= 0);
1045
1046         fi = g_array_index(export, FILE_INFO, end);
1047         *fhandle = fi.fhandle;
1048         *foffset = a - fi.startoff;
1049         *maxbytes = 0;
1050         if( end+1 < export->len ) {
1051                 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1052                 *maxbytes = fi_next.startoff - a;
1053         }
1054
1055         return 0;
1056 }
1057
/**
 * Seek to an absolute position in a file; a failing lseek() is
 * reported through err().
 * @param handle a filedescriptor
 * @param a position to seek to
 * @todo get rid of this; lastpoint is a global variable right now, but it
 * shouldn't be. If we pass it on as a parameter, that makes things a *lot*
 * easier.
 **/
void myseek(int handle,off_t a) {
	off_t where = lseek(handle, a, SEEK_SET);

	if (where < 0)
		err("Can not seek locally!\n");
}
1071
1072 /**
1073  * Write an amount of bytes at a given offset to the right file. This
1074  * abstracts the write-side of the multiple file option.
1075  *
1076  * @param a The offset where the write should start
1077  * @param buf The buffer to write from
1078  * @param len The length of buf
1079  * @param client The client we're serving for
1080  * @return The number of bytes actually written, or -1 in case of an error
1081  **/
1082 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1083         int fhandle;
1084         off_t foffset;
1085         size_t maxbytes;
1086         ssize_t retval;
1087
1088         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1089                 return -1;
1090         if(maxbytes && len > maxbytes)
1091                 len = maxbytes;
1092
1093         DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
1094
1095         myseek(fhandle, foffset);
1096         retval = write(fhandle, buf, len);
1097         if(client->server->flags & F_SYNC) {
1098                 fsync(fhandle);
1099         } else if (fua) {
1100
1101           /* This is where we would do the following
1102            *   #ifdef USE_SYNC_FILE_RANGE
1103            * However, we don't, for the reasons set out below
1104            * by Christoph Hellwig <hch@infradead.org>
1105            *
1106            * [BEGINS] 
1107            * fdatasync is equivalent to fsync except that it does not flush
1108            * non-essential metadata (basically just timestamps in practice), but it
1109            * does flush metadata requried to find the data again, e.g. allocation
1110            * information and extent maps.  sync_file_range does nothing but flush
1111            * out pagecache content - it means you basically won't get your data
1112            * back in case of a crash if you either:
1113            * 
1114            *  a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1115            *  b) are using a sparse file on a filesystem
1116            *  c) are using a fallocate-preallocated file on a filesystem
1117            *  d) use any file on a COW filesystem like btrfs
1118            * 
1119            * e.g. it only does anything useful for you if you do not have a volatile
1120            * write cache, and either use a raw block device node, or just overwrite
1121            * an already fully allocated (and not preallocated) file on a non-COW
1122            * filesystem.
1123            * [ENDS]
1124            *
1125            * What we should do is open a second FD with O_DSYNC set, then write to
1126            * that when appropriate. However, with a Linux client, every REQ_FUA
1127            * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1128            * problems.
1129            *
1130            */
1131 #if 0
1132                 sync_file_range(fhandle, foffset, len,
1133                                 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1134                                 SYNC_FILE_RANGE_WAIT_AFTER);
1135 #else
1136                 fdatasync(fhandle);
1137 #endif
1138         }
1139         return retval;
1140 }
1141
1142 /**
1143  * Call rawexpwrite repeatedly until all data has been written.
1144  * @return 0 on success, nonzero on failure
1145  **/
1146 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1147         ssize_t ret=0;
1148
1149         while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
1150                 a += ret;
1151                 buf += ret;
1152                 len -= ret;
1153         }
1154         return (ret < 0 || len != 0);
1155 }
1156
1157 /**
1158  * Read an amount of bytes at a given offset from the right file. This
1159  * abstracts the read-side of the multiple files option.
1160  *
1161  * @param a The offset where the read should start
1162  * @param buf A buffer to read into
1163  * @param len The size of buf
1164  * @param client The client we're serving for
1165  * @return The number of bytes actually read, or -1 in case of an
1166  * error.
1167  **/
1168 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
1169         int fhandle;
1170         off_t foffset;
1171         size_t maxbytes;
1172
1173         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1174                 return -1;
1175         if(maxbytes && len > maxbytes)
1176                 len = maxbytes;
1177
1178         DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
1179
1180         myseek(fhandle, foffset);
1181         return read(fhandle, buf, len);
1182 }
1183
1184 /**
1185  * Call rawexpread repeatedly until all data has been read.
1186  * @return 0 on success, nonzero on failure
1187  **/
1188 int rawexpread_fully(off_t a, char *buf, size_t len, CLIENT *client) {
1189         ssize_t ret=0;
1190
1191         while(len > 0 && (ret=rawexpread(a, buf, len, client)) > 0 ) {
1192                 a += ret;
1193                 buf += ret;
1194                 len -= ret;
1195         }
1196         return (ret < 0 || len != 0);
1197 }
1198
1199 /**
1200  * Read an amount of bytes at a given offset from the right file. This
1201  * abstracts the read-side of the copyonwrite stuff, and calls
1202  * rawexpread() with the right parameters to do the actual work.
1203  * @param a The offset where the read should start
1204  * @param buf A buffer to read into
1205  * @param len The size of buf
1206  * @param client The client we're going to read for
1207  * @return 0 on success, nonzero on failure
1208  **/
1209 int expread(off_t a, char *buf, size_t len, CLIENT *client) {
1210         off_t rdlen, offset;
1211         off_t mapcnt, mapl, maph, pagestart;
1212
1213         if (!(client->server->flags & F_COPYONWRITE))
1214                 return(rawexpread_fully(a, buf, len, client));
1215         DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1216
1217         mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1218
1219         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1220                 pagestart=mapcnt*DIFFPAGESIZE;
1221                 offset=a-pagestart;
1222                 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1223                         len : (size_t)DIFFPAGESIZE-offset;
1224                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1225                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1226                                (unsigned long)(client->difmap[mapcnt]));
1227                         myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1228                         if (read(client->difffile, buf, rdlen) != rdlen) return -1;
1229                 } else { /* the block is not there */
1230                         DEBUG("Page %llu is not here, we read the original one\n",
1231                                (unsigned long long)mapcnt);
1232                         if(rawexpread_fully(a, buf, rdlen, client)) return -1;
1233                 }
1234                 len-=rdlen; a+=rdlen; buf+=rdlen;
1235         }
1236         return 0;
1237 }
1238
1239 /**
1240  * Write an amount of bytes at a given offset to the right file. This
1241  * abstracts the write-side of the copyonwrite option, and calls
1242  * rawexpwrite() with the right parameters to do the actual work.
1243  *
1244  * @param a The offset where the write should start
1245  * @param buf The buffer to write from
1246  * @param len The length of buf
1247  * @param client The client we're going to write for.
1248  * @return 0 on success, nonzero on failure
1249  **/
1250 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1251         char pagebuf[DIFFPAGESIZE];
1252         off_t mapcnt,mapl,maph;
1253         off_t wrlen,rdlen; 
1254         off_t pagestart;
1255         off_t offset;
1256
1257         if (!(client->server->flags & F_COPYONWRITE))
1258                 return(rawexpwrite_fully(a, buf, len, client, fua)); 
1259         DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1260
1261         mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1262
1263         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1264                 pagestart=mapcnt*DIFFPAGESIZE ;
1265                 offset=a-pagestart ;
1266                 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1267                         len : (size_t)DIFFPAGESIZE-offset;
1268
1269                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1270                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1271                                (unsigned long)(client->difmap[mapcnt])) ;
1272                         myseek(client->difffile,
1273                                         client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1274                         if (write(client->difffile, buf, wrlen) != wrlen) return -1 ;
1275                 } else { /* the block is not there */
1276                         myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ;
1277                         client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1278                         DEBUG("Page %llu is not here, we put it at %lu\n",
1279                                (unsigned long long)mapcnt,
1280                                (unsigned long)(client->difmap[mapcnt]));
1281                         rdlen=DIFFPAGESIZE ;
1282                         if (rawexpread_fully(pagestart, pagebuf, rdlen, client))
1283                                 return -1;
1284                         memcpy(pagebuf+offset,buf,wrlen) ;
1285                         if (write(client->difffile, pagebuf, DIFFPAGESIZE) !=
1286                                         DIFFPAGESIZE)
1287                                 return -1;
1288                 }                                                   
1289                 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
1290         }
1291         if (client->server->flags & F_SYNC) {
1292                 fsync(client->difffile);
1293         } else if (fua) {
1294                 /* open question: would it be cheaper to do multiple sync_file_ranges?
1295                    as we iterate through the above?
1296                  */
1297                 fdatasync(client->difffile);
1298         }
1299         return 0;
1300 }
1301
1302 int expflush(CLIENT *client) {
1303         gint i;
1304
1305         if (client->server->flags & F_COPYONWRITE) {
1306                 return fsync(client->difffile);
1307         }
1308         
1309         for (i = 0; i < client->export->len; i++) {
1310                 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1311                 if (fsync(fi.fhandle) < 0)
1312                         return -1;
1313         }
1314         
1315         return 0;
1316 }
1317
1318 /**
1319  * Do the initial negotiation.
1320  *
1321  * @param client The client we're negotiating with.
1322  **/
1323 CLIENT* negotiate(int net, CLIENT *client, GArray* servers) {
1324         char zeros[128];
1325         uint64_t size_host;
1326         uint32_t flags = NBD_FLAG_HAS_FLAGS;
1327         uint16_t smallflags = 0;
1328         uint64_t magic;
1329
1330         memset(zeros, '\0', sizeof(zeros));
1331         if(!client || !client->modern) {
1332                 /* common */
1333                 if (write(net, INIT_PASSWD, 8) < 0) {
1334                         err_nonfatal("Negotiation failed: %m");
1335                         if(client)
1336                                 exit(EXIT_FAILURE);
1337                 }
1338                 if(!client || client->modern) {
1339                         /* modern */
1340                         magic = htonll(opts_magic);
1341                 } else {
1342                         /* oldstyle */
1343                         magic = htonll(cliserv_magic);
1344                 }
1345                 if (write(net, &magic, sizeof(magic)) < 0) {
1346                         err_nonfatal("Negotiation failed: %m");
1347                         if(client)
1348                                 exit(EXIT_FAILURE);
1349                 }
1350         }
1351         if(!client) {
1352                 /* modern */
1353                 uint32_t reserved;
1354                 uint32_t opt;
1355                 uint32_t namelen;
1356                 char* name;
1357                 int i;
1358
1359                 if(!servers)
1360                         err("programmer error");
1361                 if (write(net, &smallflags, sizeof(uint16_t)) < 0)
1362                         err("Negotiation failed: %m");
1363                 if (read(net, &reserved, sizeof(reserved)) < 0)
1364                         err("Negotiation failed: %m");
1365                 if (read(net, &magic, sizeof(magic)) < 0)
1366                         err("Negotiation failed: %m");
1367                 magic = ntohll(magic);
1368                 if(magic != opts_magic) {
1369                         close(net);
1370                         return NULL;
1371                 }
1372                 if (read(net, &opt, sizeof(opt)) < 0)
1373                         err("Negotiation failed: %m");
1374                 opt = ntohl(opt);
1375                 if(opt != NBD_OPT_EXPORT_NAME) {
1376                         close(net);
1377                         return NULL;
1378                 }
1379                 if (read(net, &namelen, sizeof(namelen)) < 0)
1380                         err("Negotiation failed: %m");
1381                 namelen = ntohl(namelen);
1382                 name = malloc(namelen+1);
1383                 name[namelen]=0;
1384                 if (read(net, name, namelen) < 0)
1385                         err("Negotiation failed: %m");
1386                 for(i=0; i<servers->len; i++) {
1387                         SERVER* serve = &(g_array_index(servers, SERVER, i));
1388                         if(!strcmp(serve->servename, name)) {
1389                                 CLIENT* client = g_new0(CLIENT, 1);
1390                                 client->server = serve;
1391                                 client->exportsize = OFFT_MAX;
1392                                 client->net = net;
1393                                 client->modern = TRUE;
1394                                 client->transactionlogfd = -1;
1395                                 free(name);
1396                                 return client;
1397                         }
1398                 }
1399                 free(name);
1400                 return NULL;
1401         }
1402         /* common */
1403         size_host = htonll((u64)(client->exportsize));
1404         if (write(net, &size_host, 8) < 0)
1405                 err("Negotiation failed: %m");
1406         if (client->server->flags & F_READONLY)
1407                 flags |= NBD_FLAG_READ_ONLY;
1408         if (client->server->flags & F_FLUSH)
1409                 flags |= NBD_FLAG_SEND_FLUSH;
1410         if (client->server->flags & F_FUA)
1411                 flags |= NBD_FLAG_SEND_FUA;
1412         if (client->server->flags & F_ROTATIONAL)
1413                 flags |= NBD_FLAG_ROTATIONAL;
1414         if (!client->modern) {
1415                 /* oldstyle */
1416                 flags = htonl(flags);
1417                 if (write(client->net, &flags, 4) < 0)
1418                         err("Negotiation failed: %m");
1419         } else {
1420                 /* modern */
1421                 smallflags = (uint16_t)(flags & ~((uint16_t)0));
1422                 smallflags = htons(smallflags);
1423                 if (write(client->net, &smallflags, sizeof(smallflags)) < 0) {
1424                         err("Negotiation failed: %m");
1425                 }
1426         }
1427         /* common */
1428         if (write(client->net, zeros, 124) < 0)
1429                 err("Negotiation failed: %m");
1430         return NULL;
1431 }
1432
/**
 * Sending macro: write a reply structure to the client socket and, when
 * transaction logging is enabled, append the same bytes to the transaction
 * log.
 *
 * Note that this macro reads a variable named `client' from the scope in
 * which it is expanded (to find the transaction log descriptor); it is not
 * passed as an argument.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement,
 * including inside an unbraced if/else.
 */
#define SEND(net,reply) do { \
	writeit((net), &(reply), sizeof(reply)); \
	if (client->transactionlogfd != -1) \
		writeit(client->transactionlogfd, &(reply), sizeof(reply)); \
} while (0)
/**
 * Error macro: send the reply with its error field set to errcode
 * (converted to network byte order), then reset the error field to zero so
 * the reply structure can be reused for the next request.
 */
#define ERROR(client,reply,errcode) do { \
	(reply).error = htonl(errcode); \
	SEND((client)->net, reply); \
	(reply).error = 0; \
} while (0)
1439 /**
1440  * Serve a file to a single client.
1441  *
1442  * @todo This beast needs to be split up in many tiny little manageable
1443  * pieces. Preferably with a chainsaw.
1444  *
1445  * @param client The client we're going to serve to.
1446  * @return when the client disconnects
1447  **/
1448 int mainloop(CLIENT *client) {
1449         struct nbd_request request;
1450         struct nbd_reply reply;
1451         gboolean go_on=TRUE;
1452 #ifdef DODBG
1453         int i = 0;
1454 #endif
1455         negotiate(client->net, client, NULL);
1456         DEBUG("Entering request loop!\n");
1457         reply.magic = htonl(NBD_REPLY_MAGIC);
1458         reply.error = 0;
1459         while (go_on) {
1460                 char buf[BUFSIZE];
1461                 char* p;
1462                 size_t len;
1463                 size_t currlen;
1464                 size_t writelen;
1465                 uint16_t command;
1466 #ifdef DODBG
1467                 i++;
1468                 printf("%d: ", i);
1469 #endif
1470                 readit(client->net, &request, sizeof(request));
1471                 if (client->transactionlogfd != -1)
1472                         writeit(client->transactionlogfd, &request, sizeof(request));
1473
1474                 request.from = ntohll(request.from);
1475                 request.type = ntohl(request.type);
1476                 command = request.type & NBD_CMD_MASK_COMMAND;
1477                 len = ntohl(request.len);
1478
1479                 DEBUG("%s from %llu (%llu) len %d, ", getcommandname(command),
1480                                 (unsigned long long)request.from,
1481                                 (unsigned long long)request.from / 512, (unsigned int)len);
1482
1483                 if (request.magic != htonl(NBD_REQUEST_MAGIC))
1484                         err("Not enough magic.");
1485
1486                 memcpy(reply.handle, request.handle, sizeof(reply.handle));
1487
1488                 if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) {
1489                         if ((request.from + len) > (OFFT_MAX)) {
1490                                 DEBUG("[Number too large!]");
1491                                 ERROR(client, reply, EINVAL);
1492                                 continue;
1493                         }
1494
1495                         if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
1496                                 DEBUG("[RANGE!]");
1497                                 ERROR(client, reply, EINVAL);
1498                                 continue;
1499                         }
1500
1501                         currlen = len;
1502                         if (currlen > BUFSIZE - sizeof(struct nbd_reply)) {
1503                                 currlen = BUFSIZE - sizeof(struct nbd_reply);
1504                                 msg2(LOG_INFO, "oversized request (this is not a problem)");
1505                         }
1506                 }
1507
1508                 switch (command) {
1509
1510                 case NBD_CMD_DISC:
1511                         msg2(LOG_INFO, "Disconnect request received.");
1512                         if (client->server->flags & F_COPYONWRITE) { 
1513                                 if (client->difmap) g_free(client->difmap) ;
1514                                 close(client->difffile);
1515                                 unlink(client->difffilename);
1516                                 free(client->difffilename);
1517                         }
1518                         go_on=FALSE;
1519                         continue;
1520
1521                 case NBD_CMD_WRITE:
1522                         DEBUG("wr: net->buf, ");
1523                         while(len > 0) {
1524                                 readit(client->net, buf, currlen);
1525                                 DEBUG("buf->exp, ");
1526                                 if ((client->server->flags & F_READONLY) ||
1527                                     (client->server->flags & F_AUTOREADONLY)) {
1528                                         DEBUG("[WRITE to READONLY!]");
1529                                         ERROR(client, reply, EPERM);
1530                                         continue;
1531                                 }
1532                                 if (expwrite(request.from, buf, len, client,
1533                                              request.type & NBD_CMD_FLAG_FUA)) {
1534                                         DEBUG("Write failed: %m" );
1535                                         ERROR(client, reply, errno);
1536                                         continue;
1537                                 }
1538                                 len -= currlen;
1539                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1540                         }
1541                         SEND(client->net, reply);
1542                         DEBUG("OK!\n");
1543                         continue;
1544
1545                 case NBD_CMD_FLUSH:
1546                         DEBUG("fl: ");
1547                         if (expflush(client)) {
1548                                 DEBUG("Flush failed: %m");
1549                                 ERROR(client, reply, errno);
1550                                 continue;
1551                         }
1552                         SEND(client->net, reply);
1553                         DEBUG("OK!\n");
1554                         continue;
1555
1556                 case NBD_CMD_READ:
1557                         DEBUG("exp->buf, ");
1558                         memcpy(buf, &reply, sizeof(struct nbd_reply));
1559                         if (client->transactionlogfd != -1)
1560                                 writeit(client->transactionlogfd, &reply, sizeof(reply));
1561                         p = buf + sizeof(struct nbd_reply);
1562                         writelen = currlen + sizeof(struct nbd_reply);
1563                         while(len > 0) {
1564                                 if (expread(request.from, p, currlen, client)) {
1565                                         DEBUG("Read failed: %m");
1566                                         ERROR(client, reply, errno);
1567                                         continue;
1568                                 }
1569                                 
1570                                 DEBUG("buf->net, ");
1571                                 writeit(client->net, buf, writelen);
1572                                 len -= currlen;
1573                                 request.from += currlen;
1574                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1575                                 p = buf;
1576                                 writelen = currlen;
1577                         }
1578                         DEBUG("OK!\n");
1579                         continue;
1580
1581                 default:
1582                         DEBUG ("Ignoring unknown command\n");
1583                         continue;
1584                 }
1585         }
1586         return 0;
1587 }
1588
1589 /**
1590  * Set up client export array, which is an array of FILE_INFO.
1591  * Also, split a single exportfile into multiple ones, if that was asked.
1592  * @param client information on the client which we want to setup export for
1593  **/
1594 void setupexport(CLIENT* client) {
1595         int i;
1596         off_t laststartoff = 0, lastsize = 0;
1597         int multifile = (client->server->flags & F_MULTIFILE);
1598
1599         client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
1600
1601         /* If multi-file, open as many files as we can.
1602          * If not, open exactly one file.
1603          * Calculate file sizes as we go to get total size. */
1604         for(i=0; ; i++) {
1605                 FILE_INFO fi;
1606                 gchar *tmpname;
1607                 gchar* error_string;
1608                 mode_t mode = (client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR;
1609
1610                 if(multifile) {
1611                         tmpname=g_strdup_printf("%s.%d", client->exportname, i);
1612                 } else {
1613                         tmpname=g_strdup(client->exportname);
1614                 }
1615                 DEBUG( "Opening %s\n", tmpname );
1616                 fi.fhandle = open(tmpname, mode);
1617                 if(fi.fhandle == -1 && mode == O_RDWR) {
1618                         /* Try again because maybe media was read-only */
1619                         fi.fhandle = open(tmpname, O_RDONLY);
1620                         if(fi.fhandle != -1) {
1621                                 /* Opening the base file in copyonwrite mode is
1622                                  * okay */
1623                                 if(!(client->server->flags & F_COPYONWRITE)) {
1624                                         client->server->flags |= F_AUTOREADONLY;
1625                                         client->server->flags |= F_READONLY;
1626                                 }
1627                         }
1628                 }
1629                 if(fi.fhandle == -1) {
1630                         if(multifile && i>0)
1631                                 break;
1632                         error_string=g_strdup_printf(
1633                                 "Could not open exported file %s: %%m",
1634                                 tmpname);
1635                         err(error_string);
1636                 }
1637                 fi.startoff = laststartoff + lastsize;
1638                 g_array_append_val(client->export, fi);
1639                 g_free(tmpname);
1640
1641                 /* Starting offset and size of this file will be used to
1642                  * calculate starting offset of next file */
1643                 laststartoff = fi.startoff;
1644                 lastsize = size_autodetect(fi.fhandle);
1645
1646                 if(!multifile)
1647                         break;
1648         }
1649
1650         /* Set export size to total calculated size */
1651         client->exportsize = laststartoff + lastsize;
1652
1653         /* Export size may be overridden */
1654         if(client->server->expected_size) {
1655                 /* desired size must be <= total calculated size */
1656                 if(client->server->expected_size > client->exportsize) {
1657                         err("Size of exported file is too big\n");
1658                 }
1659
1660                 client->exportsize = client->server->expected_size;
1661         }
1662
1663         msg3(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
1664         if(multifile) {
1665                 msg3(LOG_INFO, "Total number of files: %d", i);
1666         }
1667 }
1668
1669 int copyonwrite_prepare(CLIENT* client) {
1670         off_t i;
1671         if ((client->difffilename = malloc(1024))==NULL)
1672                 err("Failed to allocate string for diff file name");
1673         snprintf(client->difffilename, 1024, "%s-%s-%d.diff",client->exportname,client->clientname,
1674                 (int)getpid()) ;
1675         client->difffilename[1023]='\0';
1676         msg3(LOG_INFO,"About to create map and diff file %s",client->difffilename) ;
1677         client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
1678         if (client->difffile<0) err("Could not create diff file (%m)") ;
1679         if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL)
1680                 err("Could not allocate memory") ;
1681         for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1 ;
1682
1683         return 0;
1684 }
1685
1686 /**
1687  * Run a command. This is used for the ``prerun'' and ``postrun'' config file
1688  * options
1689  *
1690  * @param command the command to be ran. Read from the config file
1691  * @param file the file name we're about to export
1692  **/
1693 int do_run(gchar* command, gchar* file) {
1694         gchar* cmd;
1695         int retval=0;
1696
1697         if(command && *command) {
1698                 cmd = g_strdup_printf(command, file);
1699                 retval=system(cmd);
1700                 g_free(cmd);
1701         }
1702         return retval;
1703 }
1704
1705 /**
1706  * Serve a connection. 
1707  *
1708  * @todo allow for multithreading, perhaps use libevent. Not just yet, though;
1709  * follow the road map.
1710  *
1711  * @param client a connected client
1712  **/
1713 void serveconnection(CLIENT *client) {
1714         if (client->server->transactionlog && (client->transactionlogfd == -1))
1715         {
1716                 if (-1 == (client->transactionlogfd = open(client->server->transactionlog,
1717                                                            O_WRONLY | O_CREAT,
1718                                                            S_IRUSR | S_IWUSR)))
1719                         g_warning("Could not open transaction log %s",
1720                                   client->server->transactionlog);
1721         }
1722
1723         if(do_run(client->server->prerun, client->exportname)) {
1724                 exit(EXIT_FAILURE);
1725         }
1726         setupexport(client);
1727
1728         if (client->server->flags & F_COPYONWRITE) {
1729                 copyonwrite_prepare(client);
1730         }
1731
1732         setmysockopt(client->net);
1733
1734         mainloop(client);
1735         do_run(client->server->postrun, client->exportname);
1736
1737         if (-1 != client->transactionlogfd)
1738         {
1739                 close(client->transactionlogfd);
1740                 client->transactionlogfd = -1;
1741         }
1742 }
1743
1744 /**
1745  * Find the name of the file we have to serve. This will use g_strdup_printf
1746  * to put the IP address of the client inside a filename containing
1747  * "%s" (in the form as specified by the "virtstyle" option). That name
1748  * is then written to client->exportname.
1749  *
1750  * @param net A socket connected to an nbd client
1751  * @param client information about the client. The IP address in human-readable
1752  * format will be written to a new char* buffer, the address of which will be
1753  * stored in client->clientname.
1754  **/
1755 void set_peername(int net, CLIENT *client) {
1756         struct sockaddr_storage addrin;
1757         struct sockaddr_storage netaddr;
1758         struct sockaddr_in  *netaddr4 = NULL;
1759         struct sockaddr_in6 *netaddr6 = NULL;
1760         size_t addrinlen = sizeof( addrin );
1761         struct addrinfo hints;
1762         struct addrinfo *ai = NULL;
1763         char peername[NI_MAXHOST];
1764         char netname[NI_MAXHOST];
1765         char *tmp = NULL;
1766         int i;
1767         int e;
1768         int shift;
1769
1770         if (getpeername(net, (struct sockaddr *) &addrin, (socklen_t *)&addrinlen) < 0)
1771                 err("getsockname failed: %m");
1772
1773         getnameinfo((struct sockaddr *)&addrin, (socklen_t)addrinlen,
1774                 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST);
1775
1776         memset(&hints, '\0', sizeof (hints));
1777         hints.ai_flags = AI_ADDRCONFIG;
1778         e = getaddrinfo(peername, NULL, &hints, &ai);
1779
1780         if(e != 0) {
1781                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
1782                 freeaddrinfo(ai);
1783                 return;
1784         }
1785
1786         switch(client->server->virtstyle) {
1787                 case VIRT_NONE:
1788                         client->exportname=g_strdup(client->server->exportname);
1789                         break;
1790                 case VIRT_IPHASH:
1791                         for(i=0;i<strlen(peername);i++) {
1792                                 if(peername[i]=='.') {
1793                                         peername[i]='/';
1794                                 }
1795                         }
1796                 case VIRT_IPLIT:
1797                         client->exportname=g_strdup_printf(client->server->exportname, peername);
1798                         break;
1799                 case VIRT_CIDR:
1800                         memcpy(&netaddr, &addrin, addrinlen);
1801                         if(ai->ai_family == AF_INET) {
1802                                 netaddr4 = (struct sockaddr_in *)&netaddr;
1803                                 (netaddr4->sin_addr).s_addr>>=32-(client->server->cidrlen);
1804                                 (netaddr4->sin_addr).s_addr<<=32-(client->server->cidrlen);
1805
1806                                 getnameinfo((struct sockaddr *) netaddr4, (socklen_t) addrinlen,
1807                                                         netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1808                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1809                         }else if(ai->ai_family == AF_INET6) {
1810                                 netaddr6 = (struct sockaddr_in6 *)&netaddr;
1811
1812                                 shift = 128-(client->server->cidrlen);
1813                                 i = 3;
1814                                 while(shift >= 32) {
1815                                         ((netaddr6->sin6_addr).s6_addr32[i])=0;
1816                                         shift-=32;
1817                                         i--;
1818                                 }
1819                                 (netaddr6->sin6_addr).s6_addr32[i]>>=shift;
1820                                 (netaddr6->sin6_addr).s6_addr32[i]<<=shift;
1821
1822                                 getnameinfo((struct sockaddr *)netaddr6, (socklen_t)addrinlen,
1823                                             netname, sizeof(netname), NULL, 0, NI_NUMERICHOST);
1824                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1825                         }
1826
1827                         if(tmp != NULL)
1828                           client->exportname=g_strdup_printf(client->server->exportname, tmp);
1829
1830                         break;
1831         }
1832
1833         freeaddrinfo(ai);
1834         msg4(LOG_INFO, "connect from %s, assigned file is %s", 
1835              peername, client->exportname);
1836         client->clientname=g_strdup(peername);
1837 }
1838
1839 /**
1840  * Destroy a pid_t*
1841  * @param data a pointer to pid_t which should be freed
1842  **/
1843 void destroy_pid_t(gpointer data) {
1844         g_free(data);
1845 }
1846
1847 /**
1848  * Loop through the available servers, and serve them. Never returns.
1849  **/
1850 int serveloop(GArray* servers) {
1851         struct sockaddr_storage addrin;
1852         socklen_t addrinlen=sizeof(addrin);
1853         int i;
1854         int max;
1855         int sock;
1856         fd_set mset;
1857         fd_set rset;
1858
1859         /* 
1860          * Set up the master fd_set. The set of descriptors we need
1861          * to select() for never changes anyway and it buys us a *lot*
1862          * of time to only build this once. However, if we ever choose
1863          * to not fork() for clients anymore, we may have to revisit
1864          * this.
1865          */
1866         max=0;
1867         FD_ZERO(&mset);
1868         for(i=0;i<servers->len;i++) {
1869                 if((sock=(g_array_index(servers, SERVER, i)).socket)) {
1870                         FD_SET(sock, &mset);
1871                         max=sock>max?sock:max;
1872                 }
1873         }
1874         if(modernsock) {
1875                 FD_SET(modernsock, &mset);
1876                 max=modernsock>max?modernsock:max;
1877         }
1878         for(;;) {
1879                 CLIENT *client = NULL;
1880                 pid_t *pid;
1881
1882                 memcpy(&rset, &mset, sizeof(fd_set));
1883                 if(select(max+1, &rset, NULL, NULL, NULL)>0) {
1884                         int net = 0;
1885                         SERVER* serve=NULL;
1886
1887                         DEBUG("accept, ");
1888                         if(FD_ISSET(modernsock, &rset)) {
1889                                 if((net=accept(modernsock, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1890                                         err("accept: %m");
1891                                 client = negotiate(net, NULL, servers);
1892                                 if(!client) {
1893                                         err_nonfatal("negotiation failed");
1894                                         close(net);
1895                                         net=0;
1896                                         continue;
1897                                 }
1898                                 serve = client->server;
1899                         }
1900                         for(i=0;i<servers->len && !net;i++) {
1901                                 serve=&(g_array_index(servers, SERVER, i));
1902                                 if(FD_ISSET(serve->socket, &rset)) {
1903                                         if ((net=accept(serve->socket, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1904                                                 err("accept: %m");
1905                                 }
1906                         }
1907                         if(net) {
1908                                 int sock_flags;
1909
1910                                 if(serve->max_connections > 0 &&
1911                                    g_hash_table_size(children) >= serve->max_connections) {
1912                                         msg2(LOG_INFO, "Max connections reached");
1913                                         close(net);
1914                                         continue;
1915                                 }
1916                                 if((sock_flags = fcntl(net, F_GETFL, 0))==-1) {
1917                                         err("fcntl F_GETFL");
1918                                 }
1919                                 if(fcntl(net, F_SETFL, sock_flags &~O_NONBLOCK)==-1) {
1920                                         err("fcntl F_SETFL ~O_NONBLOCK");
1921                                 }
1922                                 if(!client) {
1923                                         client = g_new0(CLIENT, 1);
1924                                         client->server=serve;
1925                                         client->exportsize=OFFT_MAX;
1926                                         client->net=net;
1927                                         client->transactionlogfd = -1;
1928                                 }
1929                                 set_peername(net, client);
1930                                 if (!authorized_client(client)) {
1931                                         msg2(LOG_INFO,"Unauthorized client") ;
1932                                         close(net);
1933                                         continue;
1934                                 }
1935                                 msg2(LOG_INFO,"Authorized client") ;
1936                                 pid=g_malloc(sizeof(pid_t));
1937
1938                                 if (!dontfork) {
1939                                         if ((*pid=fork())<0) {
1940                                                 msg3(LOG_INFO,"Could not fork (%s)",strerror(errno)) ;
1941                                                 close(net);
1942                                                 continue;
1943                                         }
1944                                         if (*pid>0) { /* parent */
1945                                                 close(net);
1946                                                 g_hash_table_insert(children, pid, pid);
1947                                                 continue;
1948                                         }
1949                                         /* child */
1950                                         g_hash_table_destroy(children);
1951                                         for(i=0;i<servers->len;i++) {
1952                                                 serve=&g_array_index(servers, SERVER, i);
1953                                                 close(serve->socket);
1954                                         }
1955                                         /* FALSE does not free the
1956                                            actual data. This is required,
1957                                            because the client has a
1958                                            direct reference into that
1959                                            data, and otherwise we get a
1960                                            segfault... */
1961                                         g_array_free(servers, FALSE);
1962                                 }
1963
1964                                 msg2(LOG_INFO,"Starting to serve");
1965                                 serveconnection(client);
1966                                 exit(EXIT_SUCCESS);
1967                         }
1968                 }
1969         }
1970 }
1971
/**
 * Set the options we want on a listening socket: SO_REUSEADDR and
 * SO_KEEPALIVE are switched on and the descriptor is made non-blocking.
 * Any failure is reported through err().
 *
 * @param socket the listening socket to configure
 **/
void dosockopts(int socket) {
#ifndef sun
	int yes=1;
#else
	char yes='1';
#endif /* sun */
	int sock_flags;

	/* lose the pesky "Address already in use" error message */
	/* The option length must match the actual type of `yes', which
	 * differs between the two branches above. */
	if (setsockopt(socket,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(yes)) == -1) {
		err("setsockopt SO_REUSEADDR");
	}
	if (setsockopt(socket,SOL_SOCKET,SO_KEEPALIVE,&yes,sizeof(yes)) == -1) {
		err("setsockopt SO_KEEPALIVE");
	}

	/* make the listening socket non-blocking */
	if ((sock_flags = fcntl(socket, F_GETFL, 0)) == -1) {
		err("fcntl F_GETFL");
	}
	if (fcntl(socket, F_SETFL, sock_flags | O_NONBLOCK) == -1) {
		err("fcntl F_SETFL O_NONBLOCK");
	}
}
1996
1997 /**
1998  * Connect a server's socket.
1999  *
2000  * @param serve the server we want to connect.
2001  **/
2002 int setup_serve(SERVER *serve) {
2003         struct addrinfo hints;
2004         struct addrinfo *ai = NULL;
2005         gchar *port = NULL;
2006         int e;
2007
2008         if(!do_oldstyle) {
2009                 return serve->servename ? 1 : 0;
2010         }
2011         memset(&hints,'\0',sizeof(hints));
2012         hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG | AI_NUMERICSERV;
2013         hints.ai_socktype = SOCK_STREAM;
2014         hints.ai_family = serve->socket_family;
2015
2016         port = g_strdup_printf ("%d", serve->port);
2017         if (port == NULL)
2018                 return 0;
2019
2020         e = getaddrinfo(serve->listenaddr,port,&hints,&ai);
2021
2022         g_free(port);
2023
2024         if(e != 0) {
2025                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
2026                 serve->socket = -1;
2027                 freeaddrinfo(ai);
2028                 exit(EXIT_FAILURE);
2029         }
2030
2031         if(serve->socket_family == AF_UNSPEC)
2032                 serve->socket_family = ai->ai_family;
2033
2034 #ifdef WITH_SDP
2035         if ((serve->flags) && F_SDP) {
2036                 if (ai->ai_family == AF_INET)
2037                         ai->ai_family = AF_INET_SDP;
2038                 else (ai->ai_family == AF_INET6)
2039                         ai->ai_family = AF_INET6_SDP;
2040         }
2041 #endif
2042         if ((serve->socket = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) < 0)
2043                 err("socket: %m");
2044
2045         dosockopts(serve->socket);
2046
2047         DEBUG("Waiting for connections... bind, ");
2048         e = bind(serve->socket, ai->ai_addr, ai->ai_addrlen);
2049         if (e != 0 && errno != EADDRINUSE)
2050                 err("bind: %m");
2051         DEBUG("listen, ");
2052         if (listen(serve->socket, 1) < 0)
2053                 err("listen: %m");
2054
2055         freeaddrinfo (ai);
2056         if(serve->servename) {
2057                 return 1;
2058         } else {
2059                 return 0;
2060         }
2061 }
2062
/**
 * Open the socket on which modern-style (named export) negotiation is
 * offered. The socket is created for the first address getaddrinfo()
 * returns for modern_listen on NBD_DEFAULT_PORT, configured with
 * dosockopts(), bound and put into listening state. The descriptor is
 * stored in the global modernsock; every failure is fatal.
 **/
void open_modern(void) {
	struct addrinfo hints;
	struct addrinfo* ai = NULL;
	int e;

	memset(&hints, '\0', sizeof(hints));
	hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_family = AF_UNSPEC;
	hints.ai_protocol = IPPROTO_TCP;
	e = getaddrinfo(modern_listen, NBD_DEFAULT_PORT, &hints, &ai);
	if(e != 0) {
		fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
		exit(EXIT_FAILURE);
	}
	if((modernsock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol))<0) {
		err("socket: %m");
	}

	dosockopts(modernsock);

	if(bind(modernsock, ai->ai_addr, ai->ai_addrlen)) {
		err("bind: %m");
	}
	if(listen(modernsock, 10) <0) {
		err("listen: %m");
	}

	freeaddrinfo(ai);
}
2094
2095 /**
2096  * Connect our servers.
2097  **/
2098 void setup_servers(GArray* servers) {
2099         int i;
2100         struct sigaction sa;
2101         int want_modern=0;
2102
2103         for(i=0;i<servers->len;i++) {
2104                 want_modern |= setup_serve(&(g_array_index(servers, SERVER, i)));
2105         }
2106         if(want_modern) {
2107                 open_modern();
2108         }
2109         children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
2110
2111         sa.sa_handler = sigchld_handler;
2112         sigemptyset(&sa.sa_mask);
2113         sa.sa_flags = SA_RESTART;
2114         if(sigaction(SIGCHLD, &sa, NULL) == -1)
2115                 err("sigaction: %m");
2116         sa.sa_handler = sigterm_handler;
2117         sigemptyset(&sa.sa_mask);
2118         sa.sa_flags = SA_RESTART;
2119         if(sigaction(SIGTERM, &sa, NULL) == -1)
2120                 err("sigaction: %m");
2121 }
2122
2123 /**
2124  * Go daemon (unless we specified at compile time that we didn't want this)
2125  * @param serve the first server of our configuration. If its port is zero,
2126  *      then do not daemonize, because we're doing inetd then. This parameter
2127  *      is only used to create a PID file of the form
2128  *      /var/run/nbd-server.&lt;port&gt;.pid; it's not modified in any way.
2129  **/
2130 #if !defined(NODAEMON)
2131 void daemonize(SERVER* serve) {
2132         FILE*pidf;
2133
2134         if(serve && !(serve->port)) {
2135                 return;
2136         }
2137         if(daemon(0,0)<0) {
2138                 err("daemon");
2139         }
2140         if(!*pidftemplate) {
2141                 if(serve) {
2142                         strncpy(pidftemplate, "/var/run/nbd-server.%d.pid", 255);
2143                 } else {
2144                         strncpy(pidftemplate, "/var/run/nbd-server.pid", 255);
2145                 }
2146         }
2147         snprintf(pidfname, 255, pidftemplate, serve ? serve->port : 0);
2148         pidf=fopen(pidfname, "w");
2149         if(pidf) {
2150                 fprintf(pidf,"%d\n", (int)getpid());
2151                 fclose(pidf);
2152         } else {
2153                 perror("fopen");
2154                 fprintf(stderr, "Not fatal; continuing");
2155         }
2156 }
2157 #else
2158 #define daemonize(serve)
2159 #endif /* !defined(NODAEMON) */
2160
2161 /*
2162  * Everything beyond this point (in the file) is run in non-daemon mode.
2163  * The stuff above daemonize() isn't.
2164  */
2165
2166 void serve_err(SERVER* serve, const char* msg) G_GNUC_NORETURN;
2167
2168 void serve_err(SERVER* serve, const char* msg) {
2169         g_message("Export of %s on port %d failed:", serve->exportname,
2170                         serve->port);
2171         err(msg);
2172 }
2173
2174 /**
2175  * Set up user-ID and/or group-ID
2176  **/
2177 void dousers(void) {
2178         struct passwd *pw;
2179         struct group *gr;
2180         gchar* str;
2181         if(rungroup) {
2182                 gr=getgrnam(rungroup);
2183                 if(!gr) {
2184                         str = g_strdup_printf("Invalid group name: %s", rungroup);
2185                         err(str);
2186                 }
2187                 if(setgid(gr->gr_gid)<0) {
2188                         err("Could not set GID: %m"); 
2189                 }
2190         }
2191         if(runuser) {
2192                 pw=getpwnam(runuser);
2193                 if(!pw) {
2194                         str = g_strdup_printf("Invalid user name: %s", runuser);
2195                         err(str);
2196                 }
2197                 if(setuid(pw->pw_uid)<0) {
2198                         err("Could not set UID: %m");
2199                 }
2200         }
2201 }
2202
2203 #ifndef ISSERVER
2204 void glib_message_syslog_redirect(const gchar *log_domain,
2205                                   GLogLevelFlags log_level,
2206                                   const gchar *message,
2207                                   gpointer user_data)
2208 {
2209     int level=LOG_DEBUG;
2210     
2211     switch( log_level )
2212     {
2213       case G_LOG_FLAG_FATAL:
2214       case G_LOG_LEVEL_CRITICAL:
2215       case G_LOG_LEVEL_ERROR:    
2216         level=LOG_ERR; 
2217         break;
2218       case G_LOG_LEVEL_WARNING:
2219         level=LOG_WARNING;
2220         break;
2221       case G_LOG_LEVEL_MESSAGE:
2222       case G_LOG_LEVEL_INFO:
2223         level=LOG_INFO;
2224         break;
2225       case G_LOG_LEVEL_DEBUG:
2226         level=LOG_DEBUG;
2227       default:
2228         level=LOG_ERR;
2229     }
2230     syslog(level, "%s", message);
2231 }
2232 #endif
2233
2234 /**
2235  * Main entry point...
2236  **/
2237 int main(int argc, char *argv[]) {
2238         SERVER *serve;
2239         GArray *servers;
2240         GError *err=NULL;
2241
2242         if (sizeof( struct nbd_request )!=28) {
2243                 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
2244                 exit(EXIT_FAILURE) ;
2245         }
2246
2247         memset(pidftemplate, '\0', 256);
2248
2249         logging();
2250         config_file_pos = g_strdup(CFILE);
2251         serve=cmdline(argc, argv);
2252         servers = parse_cfile(config_file_pos, &err);
2253         
2254         if(serve) {
2255                 serve->socket_family = AF_UNSPEC;
2256
2257                 append_serve(serve, servers);
2258      
2259                 if (!(serve->port)) {
2260                         CLIENT *client;
2261 #ifndef ISSERVER
2262                         /* You really should define ISSERVER if you're going to use
2263                          * inetd mode, but if you don't, closing stdout and stderr
2264                          * (which inetd had connected to the client socket) will let it
2265                          * work. */
2266                         close(1);
2267                         close(2);
2268                         open("/dev/null", O_WRONLY);
2269                         open("/dev/null", O_WRONLY);
2270                         g_log_set_default_handler( glib_message_syslog_redirect, NULL );
2271 #endif
2272                         client=g_malloc(sizeof(CLIENT));
2273                         client->server=serve;
2274                         client->net=0;
2275                         client->exportsize=OFFT_MAX;
2276                         set_peername(0,client);
2277                         serveconnection(client);
2278                         return 0;
2279                 }
2280         }
2281     
2282         if(!servers || !servers->len) {
2283                 if(err && !(err->domain == g_quark_from_string("parse_cfile")
2284                                 && err->code == CFILE_NOTFOUND)) {
2285                         g_warning("Could not parse config file: %s", 
2286                                         err ? err->message : "Unknown error");
2287                 }
2288         }
2289         if(serve) {
2290                 g_warning("Specifying an export on the command line is deprecated.");
2291                 g_warning("Please use a configuration file instead.");
2292         }
2293
2294         if((!serve) && (!servers||!servers->len)) {
2295                 g_message("No configured exports; quitting.");
2296                 exit(EXIT_FAILURE);
2297         }
2298         if (!dontfork)
2299                 daemonize(serve);
2300         setup_servers(servers);
2301         dousers();
2302         serveloop(servers);
2303         return 0 ;
2304 }