Do not use sync_file_range
[nbd.git] / nbd-server.c
1 /*
2  * Network Block Device - server
3  *
4  * Copyright 1996-1998 Pavel Machek, distribute under GPL
5  *  <pavel@atrey.karlin.mff.cuni.cz>
6  * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7  * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
8  *
9  * Version 1.0 - hopefully 64-bit-clean
10  * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11  * Version 1.2 - autodetect size of block devices, thanx to Peter T. Breuer" <ptb@it.uc3m.es>
12  * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13  *      type, or don't have 64 bit file offsets by defining FS_32BIT
14  *      in compile options for nbd-server *only*. This can be done
15  *      with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16  *      original autoconf input file, or I would make it a configure
17  *      option.) Ken Yap <ken@nlc.net.au>.
18  * Version 1.6 - fix autodetection of block device size and really make 64 bit
19  *      clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20  * Version 2.0 - Version synchronised with client
21  * Version 2.1 - Reap zombie client processes when they exit. Removed
22  *      (uncommented) the _IO magic, it's no longer necessary. Wouter
23  *      Verhelst <wouter@debian.org>
24  * Version 2.2 - Auto switch to read-only mode (usefull for floppies).
25  * Version 2.3 - Fixed code so that Large File Support works. This
26  *      removes the FS_32BIT compile-time directive; define
27  *      _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28  *      using FS_32BIT. This will allow you to use files >2GB instead of
29  *      having to use the -m option. Wouter Verhelst <wouter@debian.org>
30  * Version 2.4 - Added code to keep track of children, so that we can
31  *      properly kill them from initscripts. Add a call to daemon(),
32  *      so that processes don't think they have to wait for us, which is
33  *      interesting for initscripts as well. Wouter Verhelst
34  *      <wouter@debian.org>
35  * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36  *      zero after fork()ing, resulting in nbd-server going berserk
37  *      when it receives a signal with at least one child open. Wouter
38  *      Verhelst <wouter@debian.org>
39  * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40  *      rectified type of mainloop::size_host (sf.net bugs 814435 and
41  *      817385); close the PID file after writing to it, so that the
42  *      daemon can actually be found. Wouter Verhelst
43  *      <wouter@debian.org>
44  * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45  *      correctly put in network endianness. Many types were corrected
46  *      (size_t and off_t instead of int).  <vspaceg@sourceforge.net>
47  * Version 2.6 - Some code cleanup.
48  * Version 2.7 - Better build system.
49  * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a 
50  *      lot more work, but this is a start. Wouter Verhelst
51  *      <wouter@debian.org>
52  * 16/03/2010 - Add IPv6 support.
53  *      Kitt Tientanopajai <kitt@kitty.in.th>
54  *      Neutron Soutmun <neo.neutron@gmail.com>
55  *      Suriya Soutmun <darksolar@gmail.com>
56  */
57
58 /* Includes LFS defines, which defines behaviours of some of the following
59  * headers, so must come before those */
60 #include "lfs.h"
61
62 #include <sys/types.h>
63 #include <sys/socket.h>
64 #include <sys/stat.h>
65 #include <sys/select.h>         /* select */
66 #include <sys/wait.h>           /* wait */
67 #ifdef HAVE_SYS_IOCTL_H
68 #include <sys/ioctl.h>
69 #endif
70 #include <sys/param.h>
71 #ifdef HAVE_SYS_MOUNT_H
72 #include <sys/mount.h>          /* For BLKGETSIZE */
73 #endif
74 #include <signal.h>             /* sigaction */
75 #include <errno.h>
76 #include <netinet/tcp.h>
77 #include <netinet/in.h>
78 #include <netdb.h>
79 #include <syslog.h>
80 #include <unistd.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 #include <fcntl.h>
85 #include <arpa/inet.h>
86 #include <strings.h>
87 #include <dirent.h>
88 #include <unistd.h>
89 #include <getopt.h>
90 #include <pwd.h>
91 #include <grp.h>
92
93 #include <glib.h>
94
95 /* used in cliserv.h, so must come first */
96 #define MY_NAME "nbd_server"
97 #include "cliserv.h"
98
99 #ifdef WITH_SDP
100 #include <sdp_inet.h>
101 #endif
102
103 /** Default position of the config file */
104 #ifndef SYSCONFDIR
105 #define SYSCONFDIR "/etc"
106 #endif
107 #define CFILE SYSCONFDIR "/nbd-server/config"
108
109 /** Where our config file actually is */
110 gchar* config_file_pos;
111
112 /** What user we're running as */
113 gchar* runuser=NULL;
114 /** What group we're running as */
115 gchar* rungroup=NULL;
116 /** whether to export using the old negotiation protocol (port-based) */
117 gboolean do_oldstyle=FALSE;
118
119 /* Whether we should avoid forking */
120 int dontfork = 0;
121
122 /** Logging macros, now nothing goes to syslog unless you say ISSERVER */
123 #ifdef ISSERVER
124 #define msg2(a,b) syslog(a,b)
125 #define msg3(a,b,c) syslog(a,b,c)
126 #define msg4(a,b,c,d) syslog(a,b,c,d)
127 #else
128 #define msg2(a,b) g_message(b)
129 #define msg3(a,b,c) g_message(b,c)
130 #define msg4(a,b,c,d) g_message(b,c,d)
131 #endif
132
133 /* Debugging macros */
134 //#define DODBG
135 #ifdef DODBG
136 #define DEBUG( a ) printf( a )
137 #define DEBUG2( a,b ) printf( a,b )
138 #define DEBUG3( a,b,c ) printf( a,b,c )
139 #define DEBUG4( a,b,c,d ) printf( a,b,c,d )
140 #define DEBUG5( a,b,c,d,e ) printf( a,b,c,d,e )
141 #else
142 #define DEBUG( a )
143 #define DEBUG2( a,b ) 
144 #define DEBUG3( a,b,c ) 
145 #define DEBUG4( a,b,c,d ) 
146 #define DEBUG5( a,b,c,d,e ) 
147 #endif
148 #ifndef PACKAGE_VERSION
149 #define PACKAGE_VERSION ""
150 #endif
151 /**
152  * The highest value a variable of type off_t can reach. This is a signed
153  * integer, so set all bits except for the leftmost one.
154  **/
155 #define OFFT_MAX ~((off_t)1<<(sizeof(off_t)*8-1))
156 #define LINELEN 256       /**< Size of static buffer used to read the
157                                authorization file (yuck) */
158 #define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
159 #define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
160 #define F_READONLY 1      /**< flag to tell us a file is readonly */
161 #define F_MULTIFILE 2     /**< flag to tell us a file is exported using -m */
162 #define F_COPYONWRITE 4   /**< flag to tell us a file is exported using
163                             copyonwrite */
164 #define F_AUTOREADONLY 8  /**< flag to tell us a file is set to autoreadonly */
165 #define F_SPARSE 16       /**< flag to tell us copyronwrite should use a sparse file */
166 #define F_SDP 32          /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */
167 #define F_SYNC 64         /**< Whether to fsync() after a write */
168 #define F_FLUSH 128       /**< Whether server wants FLUSH to be sent by the client */
169 #define F_FUA 256         /**< Whether server wants FUA to be sent by the client */
170 #define F_ROTATIONAL 512  /**< Whether server wants the client to implement the elevator algorithm */
171 GHashTable *children;
172 char pidfname[256]; /**< name of our PID file */
173 char pidftemplate[256]; /**< template to be used for the filename of the PID file */
174 char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
175
176 int modernsock=0;         /**< Socket for the modern handler. Not used
177                                if a client was only specified on the
178                                command line; only port used if
179                                oldstyle is set to false (and then the
180                                command-line client isn't used, gna gna) */
181 char* modern_listen;      /**< listenaddr value for modernsock */
182
183 /**
184  * Types of virtuatlization
185  **/
186 typedef enum {
187         VIRT_NONE=0,    /**< No virtualization */
188         VIRT_IPLIT,     /**< Literal IP address as part of the filename */
189         VIRT_IPHASH,    /**< Replacing all dots in an ip address by a / before
190                              doing the same as in IPLIT */
191         VIRT_CIDR,      /**< Every subnet in its own directory */
192 } VIRT_STYLE;
193
194 /**
195  * Variables associated with a server.
196  **/
197 typedef struct {
198         gchar* exportname;    /**< (unprocessed) filename of the file we're exporting */
199         off_t expected_size; /**< size of the exported file as it was told to
200                                us through configuration */
201         gchar* listenaddr;   /**< The IP address we're listening on */
202         unsigned int port;   /**< port we're exporting this file at */
203         char* authname;      /**< filename of the authorization file */
204         int flags;           /**< flags associated with this exported file */
205         int socket;          /**< The socket of this server. */
206         int socket_family;   /**< family of the socket */
207         VIRT_STYLE virtstyle;/**< The style of virtualization, if any */
208         uint8_t cidrlen;     /**< The length of the mask when we use
209                                   CIDR-style virtualization */
210         gchar* prerun;       /**< command to be ran after connecting a client,
211                                   but before starting to serve */
212         gchar* postrun;      /**< command that will be ran after the client
213                                   disconnects */
214         gchar* servename;    /**< name of the export as selected by nbd-client */
215         int max_connections; /**< maximum number of opened connections */
216         gchar* transactionlog;/**< filename for transaction log */
217 } SERVER;
218
219 /**
220  * Variables associated with a client socket.
221  **/
222 typedef struct {
223         int fhandle;      /**< file descriptor */
224         off_t startoff;   /**< starting offset of this file */
225 } FILE_INFO;
226
227 typedef struct {
228         off_t exportsize;    /**< size of the file we're exporting */
229         char *clientname;    /**< peer */
230         char *exportname;    /**< (processed) filename of the file we're exporting */
231         GArray *export;    /**< array of FILE_INFO of exported files;
232                                array size is always 1 unless we're
233                                doing the multiple file option */
234         int net;             /**< The actual client socket */
235         SERVER *server;      /**< The server this client is getting data from */
236         char* difffilename;  /**< filename of the copy-on-write file, if any */
237         int difffile;        /**< filedescriptor of copyonwrite file. @todo
238                                shouldn't this be an array too? (cfr export) Or
239                                make -m and -c mutually exclusive */
240         u32 difffilelen;     /**< number of pages in difffile */
241         u32 *difmap;         /**< see comment on the global difmap for this one */
242         gboolean modern;     /**< client was negotiated using modern negotiation protocol */
243         int transactionlogfd;/**< fd for transaction log */
244 } CLIENT;
245
246 /**
247  * Type of configuration file values
248  **/
249 typedef enum {
250         PARAM_INT,              /**< This parameter is an integer */
251         PARAM_STRING,           /**< This parameter is a string */
252         PARAM_BOOL,             /**< This parameter is a boolean */
253 } PARAM_TYPE;
254
255 /**
256  * Configuration file values
257  **/
258 typedef struct {
259         gchar *paramname;       /**< Name of the parameter, as it appears in
260                                   the config file */
261         gboolean required;      /**< Whether this is a required (as opposed to
262                                   optional) parameter */
263         PARAM_TYPE ptype;       /**< Type of the parameter. */
264         gpointer target;        /**< Pointer to where the data of this
265                                   parameter should be written. If ptype is
266                                   PARAM_BOOL, the data is or'ed rather than
267                                   overwritten. */
268         gint flagval;           /**< Flag mask for this parameter in case ptype
269                                   is PARAM_BOOL. */
270 } PARAM;
271
272 /**
273  * Check whether a client is allowed to connect. Works with an authorization
274  * file which contains one line per machine, no wildcards.
275  *
276  * @param opts The client who's trying to connect.
277  * @return 0 - authorization refused, 1 - OK
278  **/
279 int authorized_client(CLIENT *opts) {
280         const char *ERRMSG="Invalid entry '%s' in authfile '%s', so, refusing all connections.";
281         FILE *f ;
282         char line[LINELEN]; 
283         char *tmp;
284         struct in_addr addr;
285         struct in_addr client;
286         struct in_addr cltemp;
287         int len;
288
289         if ((f=fopen(opts->server->authname,"r"))==NULL) {
290                 msg4(LOG_INFO,"Can't open authorization file %s (%s).",
291                      opts->server->authname,strerror(errno)) ;
292                 return 1 ; 
293         }
294   
295         inet_aton(opts->clientname, &client);
296         while (fgets(line,LINELEN,f)!=NULL) {
297                 if((tmp=index(line, '/'))) {
298                         if(strlen(line)<=tmp-line) {
299                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
300                                 return 0;
301                         }
302                         *(tmp++)=0;
303                         if(!inet_aton(line,&addr)) {
304                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
305                                 return 0;
306                         }
307                         len=strtol(tmp, NULL, 0);
308                         addr.s_addr>>=32-len;
309                         addr.s_addr<<=32-len;
310                         memcpy(&cltemp,&client,sizeof(client));
311                         cltemp.s_addr>>=32-len;
312                         cltemp.s_addr<<=32-len;
313                         if(addr.s_addr == cltemp.s_addr) {
314                                 return 1;
315                         }
316                 }
317                 if (strncmp(line,opts->clientname,strlen(opts->clientname))==0) {
318                         fclose(f);
319                         return 1;
320                 }
321         }
322         fclose(f);
323         return 0;
324 }
325
326 /**
327  * Read data from a file descriptor into a buffer
328  *
329  * @param f a file descriptor
330  * @param buf a buffer
331  * @param len the number of bytes to be read
332  **/
333 static inline void readit(int f, void *buf, size_t len) {
334         ssize_t res;
335         while (len > 0) {
336                 DEBUG("*");
337                 if ((res = read(f, buf, len)) <= 0) {
338                         if(errno != EAGAIN) {
339                                 err("Read failed: %m");
340                         }
341                 } else {
342                         len -= res;
343                         buf += res;
344                 }
345         }
346 }
347
348 /**
349  * Write data from a buffer into a filedescriptor
350  *
351  * @param f a file descriptor
352  * @param buf a buffer containing data
353  * @param len the number of bytes to be written
354  **/
355 static inline void writeit(int f, void *buf, size_t len) {
356         ssize_t res;
357         while (len > 0) {
358                 DEBUG("+");
359                 if ((res = write(f, buf, len)) <= 0)
360                         err("Send failed: %m");
361                 len -= res;
362                 buf += res;
363         }
364 }
365
366 /**
367  * Print out a message about how to use nbd-server. Split out to a separate
368  * function so that we can call it from multiple places
369  */
370 void usage() {
371         printf("This is nbd-server version " VERSION "\n");
372         printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections]\n"
373                "\t-r|--read-only\t\tread only\n"
374                "\t-m|--multi-file\t\tmultiple file\n"
375                "\t-c|--copy-on-write\tcopy on write\n"
376                "\t-C|--config-file\tspecify an alternate configuration file\n"
377                "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
378                "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
379                "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
380                "\t-M|--max-connections\tspecify the maximum number of opened connections\n\n"
381                "\tif port is set to 0, stdin is used (for running from inetd)\n"
382                "\tif file_to_export contains '%%s', it is substituted with the IP\n"
383                "\t\taddress of the machine trying to connect\n" 
384                "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
385         printf("Using configuration file %s\n", CFILE);
386 }
387
388 /* Dumps a config file section of the given SERVER*, and exits. */
389 void dump_section(SERVER* serve, gchar* section_header) {
390         printf("[%s]\n", section_header);
391         printf("\texportname = %s\n", serve->exportname);
392         printf("\tlistenaddr = %s\n", serve->listenaddr);
393         printf("\tport = %d\n", serve->port);
394         if(serve->flags & F_READONLY) {
395                 printf("\treadonly = true\n");
396         }
397         if(serve->flags & F_MULTIFILE) {
398                 printf("\tmultifile = true\n");
399         }
400         if(serve->flags & F_COPYONWRITE) {
401                 printf("\tcopyonwrite = true\n");
402         }
403         if(serve->expected_size) {
404                 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
405         }
406         if(serve->authname) {
407                 printf("\tauthfile = %s\n", serve->authname);
408         }
409         exit(EXIT_SUCCESS);
410 }
411
412 /**
413  * Parse the command line.
414  *
415  * @param argc the argc argument to main()
416  * @param argv the argv argument to main()
417  **/
418 SERVER* cmdline(int argc, char *argv[]) {
419         int i=0;
420         int nonspecial=0;
421         int c;
422         struct option long_options[] = {
423                 {"read-only", no_argument, NULL, 'r'},
424                 {"multi-file", no_argument, NULL, 'm'},
425                 {"copy-on-write", no_argument, NULL, 'c'},
426                 {"dont-fork", no_argument, NULL, 'd'},
427                 {"authorize-file", required_argument, NULL, 'l'},
428                 {"config-file", required_argument, NULL, 'C'},
429                 {"pid-file", required_argument, NULL, 'p'},
430                 {"output-config", required_argument, NULL, 'o'},
431                 {"max-connection", required_argument, NULL, 'M'},
432                 {0,0,0,0}
433         };
434         SERVER *serve;
435         off_t es;
436         size_t last;
437         char suffix;
438         gboolean do_output=FALSE;
439         gchar* section_header="";
440         gchar** addr_port;
441
442         if(argc==1) {
443                 return NULL;
444         }
445         serve=g_new0(SERVER, 1);
446         serve->authname = g_strdup(default_authname);
447         serve->virtstyle=VIRT_IPLIT;
448         while((c=getopt_long(argc, argv, "-C:cdl:mo:rp:M:", long_options, &i))>=0) {
449                 switch (c) {
450                 case 1:
451                         /* non-option argument */
452                         switch(nonspecial++) {
453                         case 0:
454                                 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
455                                         addr_port=g_strsplit(optarg, ":", 2);
456
457                                         /* Check for "@" - maybe user using this separator
458                                                  for IPv4 address */
459                                         if(!addr_port[1]) {
460                                                 g_strfreev(addr_port);
461                                                 addr_port=g_strsplit(optarg, "@", 2);
462                                         }
463                                 } else {
464                                         addr_port=g_strsplit(optarg, "@", 2);
465                                 }
466
467                                 if(addr_port[1]) {
468                                         serve->port=strtol(addr_port[1], NULL, 0);
469                                         serve->listenaddr=g_strdup(addr_port[0]);
470                                 } else {
471                                         serve->listenaddr=NULL;
472                                         serve->port=strtol(addr_port[0], NULL, 0);
473                                 }
474                                 g_strfreev(addr_port);
475                                 break;
476                         case 1:
477                                 serve->exportname = g_strdup(optarg);
478                                 if(serve->exportname[0] != '/') {
479                                         fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
480                                         exit(EXIT_FAILURE);
481                                 }
482                                 break;
483                         case 2:
484                                 last=strlen(optarg)-1;
485                                 suffix=optarg[last];
486                                 if (suffix == 'k' || suffix == 'K' ||
487                                     suffix == 'm' || suffix == 'M')
488                                         optarg[last] = '\0';
489                                 es = (off_t)atoll(optarg);
490                                 switch (suffix) {
491                                         case 'm':
492                                         case 'M':  es <<= 10;
493                                         case 'k':
494                                         case 'K':  es <<= 10;
495                                         default :  break;
496                                 }
497                                 serve->expected_size = es;
498                                 break;
499                         }
500                         break;
501                 case 'r':
502                         serve->flags |= F_READONLY;
503                         break;
504                 case 'm':
505                         serve->flags |= F_MULTIFILE;
506                         break;
507                 case 'o':
508                         do_output = TRUE;
509                         section_header = g_strdup(optarg);
510                         break;
511                 case 'p':
512                         strncpy(pidftemplate, optarg, 256);
513                         break;
514                 case 'c': 
515                         serve->flags |=F_COPYONWRITE;
516                         break;
517                 case 'd': 
518                         dontfork = 1;
519                         break;
520                 case 'C':
521                         g_free(config_file_pos);
522                         config_file_pos=g_strdup(optarg);
523                         break;
524                 case 'l':
525                         g_free(serve->authname);
526                         serve->authname=g_strdup(optarg);
527                         break;
528                 case 'M':
529                         serve->max_connections = strtol(optarg, NULL, 0);
530                         break;
531                 default:
532                         usage();
533                         exit(EXIT_FAILURE);
534                         break;
535                 }
536         }
537         /* What's left: the port to export, the name of the to be exported
538          * file, and, optionally, the size of the file, in that order. */
539         if(nonspecial<2) {
540                 g_free(serve);
541                 serve=NULL;
542         } else {
543                 do_oldstyle = TRUE;
544         }
545         if(do_output) {
546                 if(!serve) {
547                         g_critical("Need a complete configuration on the command line to output a config file section!");
548                         exit(EXIT_FAILURE);
549                 }
550                 dump_section(serve, section_header);
551         }
552         return serve;
553 }
554
555 /**
556  * Error codes for config file parsing
557  **/
558 typedef enum {
559         CFILE_NOTFOUND,         /**< The configuration file is not found */
560         CFILE_MISSING_GENERIC,  /**< The (required) group "generic" is missing */
561         CFILE_KEY_MISSING,      /**< A (required) key is missing */
562         CFILE_VALUE_INVALID,    /**< A value is syntactically invalid */
563         CFILE_VALUE_UNSUPPORTED,/**< A value is not supported in this build */
564         CFILE_PROGERR,          /**< Programmer error */
565         CFILE_NO_EXPORTS,       /**< A config file was specified that does not
566                                      define any exports */
567         CFILE_INCORRECT_PORT,   /**< The reserved port was specified for an
568                                      old-style export. */
569 } CFILE_ERRORS;
570
571 /**
572  * Remove a SERVER from memory. Used from the hash table
573  **/
574 void remove_server(gpointer s) {
575         SERVER *server;
576
577         server=(SERVER*)s;
578         g_free(server->exportname);
579         if(server->authname)
580                 g_free(server->authname);
581         if(server->listenaddr)
582                 g_free(server->listenaddr);
583         if(server->prerun)
584                 g_free(server->prerun);
585         if(server->postrun)
586                 g_free(server->postrun);
587         if(server->transactionlog)
588                 g_free(server->transactionlog);
589         g_free(server);
590 }
591
592 /**
593  * duplicate server
594  * @param s the old server we want to duplicate
595  * @return new duplicated server
596  **/
597 SERVER* dup_serve(SERVER *s) {
598         SERVER *serve = NULL;
599
600         serve=g_new0(SERVER, 1);
601         if(serve == NULL)
602                 return NULL;
603
604         if(s->exportname)
605                 serve->exportname = g_strdup(s->exportname);
606
607         serve->expected_size = s->expected_size;
608
609         if(s->listenaddr)
610                 serve->listenaddr = g_strdup(s->listenaddr);
611
612         serve->port = s->port;
613
614         if(s->authname)
615                 serve->authname = strdup(s->authname);
616
617         serve->flags = s->flags;
618         serve->socket = s->socket;
619         serve->socket_family = s->socket_family;
620         serve->virtstyle = s->virtstyle;
621         serve->cidrlen = s->cidrlen;
622
623         if(s->prerun)
624                 serve->prerun = g_strdup(s->prerun);
625
626         if(s->postrun)
627                 serve->postrun = g_strdup(s->postrun);
628
629         if(s->transactionlog)
630                 serve->transactionlog = g_strdup(s->transactionlog);
631         
632         if(s->servename)
633                 serve->servename = g_strdup(s->servename);
634
635         serve->max_connections = s->max_connections;
636
637         return serve;
638 }
639
640 /**
641  * append new server to array
642  * @param s server
643  * @param a server array
644  * @return 0 success, -1 error
645  */
646 int append_serve(SERVER *s, GArray *a) {
647         SERVER *ns = NULL;
648         struct addrinfo hints;
649         struct addrinfo *ai = NULL;
650         struct addrinfo *rp = NULL;
651         char   host[NI_MAXHOST];
652         gchar  *port = NULL;
653         int e;
654         int ret;
655
656         if(!s) {
657                 err("Invalid parsing server");
658                 return -1;
659         }
660
661         port = g_strdup_printf("%d", s->port);
662
663         memset(&hints,'\0',sizeof(hints));
664         hints.ai_family = AF_UNSPEC;
665         hints.ai_socktype = SOCK_STREAM;
666         hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE;
667         hints.ai_protocol = IPPROTO_TCP;
668
669         e = getaddrinfo(s->listenaddr, port, &hints, &ai);
670
671         if (port)
672                 g_free(port);
673
674         if(e == 0) {
675                 for (rp = ai; rp != NULL; rp = rp->ai_next) {
676                         e = getnameinfo(rp->ai_addr, rp->ai_addrlen, host, sizeof(host), NULL, 0, NI_NUMERICHOST);
677
678                         if (e != 0) { // error
679                                 fprintf(stderr, "getnameinfo: %s\n", gai_strerror(e));
680                                 continue;
681                         }
682
683                         // duplicate server and set listenaddr to resolved IP address
684                         ns = dup_serve (s);
685                         if (ns) {
686                                 ns->listenaddr = g_strdup(host);
687                                 ns->socket_family = rp->ai_family;
688                                 g_array_append_val(a, *ns);
689                                 free(ns);
690                                 ns = NULL;
691                         }
692                 }
693
694                 ret = 0;
695         } else {
696                 fprintf(stderr, "getaddrinfo failed on listen host/address: %s (%s)\n", s->listenaddr ? s->listenaddr : "any", gai_strerror(e));
697                 ret = -1;
698         }
699
700         if (ai)
701                 freeaddrinfo(ai);
702
703         return ret;
704 }
705
706 /**
707  * Parse the config file.
708  *
709  * @param f the name of the config file
710  * @param e a GError. @see CFILE_ERRORS for what error values this function can
711  *      return.
712  * @return a Array of SERVER* pointers, If the config file is empty or does not
713  *      exist, returns an empty GHashTable; if the config file contains an
714  *      error, returns NULL, and e is set appropriately
715  **/
716 GArray* parse_cfile(gchar* f, GError** e) {
717         const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
718         const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
719         SERVER s;
720         gchar *virtstyle=NULL;
721         PARAM lp[] = {
722                 { "exportname", TRUE,   PARAM_STRING,   &(s.exportname),        0 },
723                 { "port",       TRUE,   PARAM_INT,      &(s.port),              0 },
724                 { "authfile",   FALSE,  PARAM_STRING,   &(s.authname),          0 },
725                 { "filesize",   FALSE,  PARAM_INT,      &(s.expected_size),     0 },
726                 { "virtstyle",  FALSE,  PARAM_STRING,   &(virtstyle),           0 },
727                 { "prerun",     FALSE,  PARAM_STRING,   &(s.prerun),            0 },
728                 { "postrun",    FALSE,  PARAM_STRING,   &(s.postrun),           0 },
729                 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog),   0 },
730                 { "readonly",   FALSE,  PARAM_BOOL,     &(s.flags),             F_READONLY },
731                 { "multifile",  FALSE,  PARAM_BOOL,     &(s.flags),             F_MULTIFILE },
732                 { "copyonwrite", FALSE, PARAM_BOOL,     &(s.flags),             F_COPYONWRITE },
733                 { "sparse_cow", FALSE,  PARAM_BOOL,     &(s.flags),             F_SPARSE },
734                 { "sdp",        FALSE,  PARAM_BOOL,     &(s.flags),             F_SDP },
735                 { "sync",       FALSE,  PARAM_BOOL,     &(s.flags),             F_SYNC },
736                 { "flush",      FALSE,  PARAM_BOOL,     &(s.flags),             F_FLUSH },
737                 { "fua",        FALSE,  PARAM_BOOL,     &(s.flags),             F_FUA },
738                 { "rotational", FALSE,  PARAM_BOOL,     &(s.flags),             F_ROTATIONAL },
739                 { "listenaddr", FALSE,  PARAM_STRING,   &(s.listenaddr),        0 },
740                 { "maxconnections", FALSE, PARAM_INT,   &(s.max_connections),   0 },
741         };
742         const int lp_size=sizeof(lp)/sizeof(PARAM);
743         PARAM gp[] = {
744                 { "user",       FALSE, PARAM_STRING,    &runuser,       0 },
745                 { "group",      FALSE, PARAM_STRING,    &rungroup,      0 },
746                 { "oldstyle",   FALSE, PARAM_BOOL,      &do_oldstyle,   1 },
747                 { "listenaddr", FALSE, PARAM_STRING,    &modern_listen, 0 },
748         };
749         PARAM* p=gp;
750         int p_size=sizeof(gp)/sizeof(PARAM);
751         GKeyFile *cfile;
752         GError *err = NULL;
753         const char *err_msg=NULL;
754         GQuark errdomain;
755         GArray *retval=NULL;
756         gchar **groups;
757         gboolean value;
758         gchar* startgroup;
759         gint i;
760         gint j;
761
762         errdomain = g_quark_from_string("parse_cfile");
763         cfile = g_key_file_new();
764         retval = g_array_new(FALSE, TRUE, sizeof(SERVER));
765         if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
766                         G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
767                 g_set_error(e, errdomain, CFILE_NOTFOUND, "Could not open config file %s.", f);
768                 g_key_file_free(cfile);
769                 return retval;
770         }
771         startgroup = g_key_file_get_start_group(cfile);
772         if(!startgroup || strcmp(startgroup, "generic")) {
773                 g_set_error(e, errdomain, CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
774                 g_key_file_free(cfile);
775                 return NULL;
776         }
777         groups = g_key_file_get_groups(cfile, NULL);
778         for(i=0;groups[i];i++) {
779                 memset(&s, '\0', sizeof(SERVER));
780
781                 /* After the [generic] group, start parsing exports */
782                 if(i==1) {
783                         p=lp;
784                         p_size=lp_size;
785                 } 
786                 for(j=0;j<p_size;j++) {
787                         g_assert(p[j].target != NULL);
788                         g_assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL);
789                         switch(p[j].ptype) {
790                                 case PARAM_INT:
791                                         *((gint*)p[j].target) =
792                                                 g_key_file_get_integer(cfile,
793                                                                 groups[i],
794                                                                 p[j].paramname,
795                                                                 &err);
796                                         break;
797                                 case PARAM_STRING:
798                                         *((gchar**)p[j].target) =
799                                                 g_key_file_get_string(cfile,
800                                                                 groups[i],
801                                                                 p[j].paramname,
802                                                                 &err);
803                                         break;
804                                 case PARAM_BOOL:
805                                         value = g_key_file_get_boolean(cfile,
806                                                         groups[i],
807                                                         p[j].paramname, &err);
808                                         if(!err) {
809                                                 if(value) {
810                                                         *((gint*)p[j].target) |= p[j].flagval;
811                                                 } else {
812                                                         *((gint*)p[j].target) &= ~(p[j].flagval);
813                                                 }
814                                         }
815                                         break;
816                         }
817                         if(!strcmp(p[j].paramname, "port") && !strcmp(p[j].target, NBD_DEFAULT_PORT)) {
818                                 g_set_error(e, errdomain, CFILE_INCORRECT_PORT, "Config file specifies default port for oldstyle export");
819                                 g_key_file_free(cfile);
820                                 return NULL;
821                         }
822                         if(err) {
823                                 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
824                                         if(!p[j].required) {
825                                                 /* Ignore not-found error for optional values */
826                                                 g_clear_error(&err);
827                                                 continue;
828                                         } else {
829                                                 err_msg = MISSING_REQUIRED_ERROR;
830                                         }
831                                 } else {
832                                         err_msg = DEFAULT_ERROR;
833                                 }
834                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
835                                 g_array_free(retval, TRUE);
836                                 g_error_free(err);
837                                 g_key_file_free(cfile);
838                                 return NULL;
839                         }
840                 }
841                 if(virtstyle) {
842                         if(!strncmp(virtstyle, "none", 4)) {
843                                 s.virtstyle=VIRT_NONE;
844                         } else if(!strncmp(virtstyle, "ipliteral", 9)) {
845                                 s.virtstyle=VIRT_IPLIT;
846                         } else if(!strncmp(virtstyle, "iphash", 6)) {
847                                 s.virtstyle=VIRT_IPHASH;
848                         } else if(!strncmp(virtstyle, "cidrhash", 8)) {
849                                 s.virtstyle=VIRT_CIDR;
850                                 if(strlen(virtstyle)<10) {
851                                         g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
852                                         g_array_free(retval, TRUE);
853                                         g_key_file_free(cfile);
854                                         return NULL;
855                                 }
856                                 s.cidrlen=strtol(virtstyle+8, NULL, 0);
857                         } else {
858                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
859                                 g_array_free(retval, TRUE);
860                                 g_key_file_free(cfile);
861                                 return NULL;
862                         }
863                         if(s.port && !do_oldstyle) {
864                                 g_warning("A port was specified, but oldstyle exports were not requested. This may not do what you expect.");
865                                 g_warning("Please read 'man 5 nbd-server' and search for oldstyle for more info");
866                         }
867                 } else {
868                         s.virtstyle=VIRT_IPLIT;
869                 }
870                 /* Don't need to free this, it's not our string */
871                 virtstyle=NULL;
872                 /* Don't append values for the [generic] group */
873                 if(i>0) {
874                         s.socket_family = AF_UNSPEC;
875                         s.servename = groups[i];
876
877                         append_serve(&s, retval);
878                 } else {
879                         if(!do_oldstyle) {
880                                 lp[1].required = 0;
881                         }
882                 }
883 #ifndef WITH_SDP
884                 if(s.flags & F_SDP) {
885                         g_set_error(e, errdomain, CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
886                         g_array_free(retval, TRUE);
887                         g_key_file_free(cfile);
888                         return NULL;
889                 }
890 #endif
891         }
892         if(i==1) {
893                 g_set_error(e, errdomain, CFILE_NO_EXPORTS, "The config file does not specify any exports");
894         }
895         g_key_file_free(cfile);
896         return retval;
897 }
898
899 /**
900  * Signal handler for SIGCHLD
901  * @param s the signal we're handling (must be SIGCHLD, or something
902  * is severely wrong)
903  **/
904 void sigchld_handler(int s) {
905         int status;
906         int* i;
907         pid_t pid;
908
909         while((pid=waitpid(-1, &status, WNOHANG)) > 0) {
910                 if(WIFEXITED(status)) {
911                         msg3(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
912                 }
913                 i=g_hash_table_lookup(children, &pid);
914                 if(!i) {
915                         msg3(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
916                 } else {
917                         DEBUG2("Removing %d from the list of children", pid);
918                         g_hash_table_remove(children, &pid);
919                 }
920         }
921 }
922
923 /**
924  * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
925  *
926  * @param key the key
927  * @param value the value corresponding to the above key
928  * @param user_data a pointer which we always set to 1, so that we know what
929  * will happen next.
930  **/
931 void killchild(gpointer key, gpointer value, gpointer user_data) {
932         pid_t *pid=value;
933         int *parent=user_data;
934
935         kill(*pid, SIGTERM);
936         *parent=1;
937 }
938
939 /**
940  * Handle SIGTERM and dispatch it to our children
941  * @param s the signal we're handling (must be SIGTERM, or something
942  * is severely wrong).
943  **/
944 void sigterm_handler(int s) {
945         int parent=0;
946
947         g_hash_table_foreach(children, killchild, &parent);
948
949         if(parent) {
950                 unlink(pidfname);
951         }
952
953         exit(EXIT_SUCCESS);
954 }
955
956 /**
957  * Detect the size of a file.
958  *
959  * @param fhandle An open filedescriptor
960  * @return the size of the file, or OFFT_MAX if detection was
961  * impossible.
962  **/
963 off_t size_autodetect(int fhandle) {
964         off_t es;
965         u64 bytes;
966         struct stat stat_buf;
967         int error;
968
969 #ifdef HAVE_SYS_MOUNT_H
970 #ifdef HAVE_SYS_IOCTL_H
971 #ifdef BLKGETSIZE64
972         DEBUG("looking for export size with ioctl BLKGETSIZE64\n");
973         if (!ioctl(fhandle, BLKGETSIZE64, &bytes) && bytes) {
974                 return (off_t)bytes;
975         }
976 #endif /* BLKGETSIZE64 */
977 #endif /* HAVE_SYS_IOCTL_H */
978 #endif /* HAVE_SYS_MOUNT_H */
979
980         DEBUG("looking for fhandle size with fstat\n");
981         stat_buf.st_size = 0;
982         error = fstat(fhandle, &stat_buf);
983         if (!error) {
984                 if(stat_buf.st_size > 0)
985                         return (off_t)stat_buf.st_size;
986         } else {
987                 err("fstat failed: %m");
988         }
989
990         DEBUG("looking for fhandle size with lseek SEEK_END\n");
991         es = lseek(fhandle, (off_t)0, SEEK_END);
992         if (es > ((off_t)0)) {
993                 return es;
994         } else {
995                 DEBUG2("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4)));
996         }
997
998         err("Could not find size of exported block device: %m");
999         return OFFT_MAX;
1000 }
1001
1002 /**
1003  * Get the file handle and offset, given an export offset.
1004  *
1005  * @param export An array of export files
1006  * @param a The offset to get corresponding file/offset for
1007  * @param fhandle [out] File descriptor
1008  * @param foffset [out] Offset into fhandle
1009  * @param maxbytes [out] Tells how many bytes can be read/written
1010  * from fhandle starting at foffset (0 if there is no limit)
1011  * @return 0 on success, -1 on failure
1012  **/
1013 int get_filepos(GArray* export, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1014         /* Negative offset not allowed */
1015         if(a < 0)
1016                 return -1;
1017
1018         /* Binary search for last file with starting offset <= a */
1019         FILE_INFO fi;
1020         int start = 0;
1021         int end = export->len - 1;
1022         while( start <= end ) {
1023                 int mid = (start + end) / 2;
1024                 fi = g_array_index(export, FILE_INFO, mid);
1025                 if( fi.startoff < a ) {
1026                         start = mid + 1;
1027                 } else if( fi.startoff > a ) {
1028                         end = mid - 1;
1029                 } else {
1030                         start = end = mid;
1031                         break;
1032                 }
1033         }
1034
1035         /* end should never go negative, since first startoff is 0 and a >= 0 */
1036         g_assert(end >= 0);
1037
1038         fi = g_array_index(export, FILE_INFO, end);
1039         *fhandle = fi.fhandle;
1040         *foffset = a - fi.startoff;
1041         *maxbytes = 0;
1042         if( end+1 < export->len ) {
1043                 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1044                 *maxbytes = fi_next.startoff - a;
1045         }
1046
1047         return 0;
1048 }
1049
1050 /**
1051  * seek to a position in a file, with error handling.
1052  * @param handle a filedescriptor
1053  * @param a position to seek to
1054  * @todo get rid of this; lastpoint is a global variable right now, but it
1055  * shouldn't be. If we pass it on as a parameter, that makes things a *lot*
1056  * easier.
1057  **/
1058 void myseek(int handle,off_t a) {
1059         if (lseek(handle, a, SEEK_SET) < 0) {
1060                 err("Can not seek locally!\n");
1061         }
1062 }
1063
1064 /**
1065  * Write an amount of bytes at a given offset to the right file. This
1066  * abstracts the write-side of the multiple file option.
1067  *
1068  * @param a The offset where the write should start
1069  * @param buf The buffer to write from
1070  * @param len The length of buf
1071  * @param client The client we're serving for
1072  * @return The number of bytes actually written, or -1 in case of an error
1073  **/
1074 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1075         int fhandle;
1076         off_t foffset;
1077         size_t maxbytes;
1078         ssize_t retval;
1079
1080         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1081                 return -1;
1082         if(maxbytes && len > maxbytes)
1083                 len = maxbytes;
1084
1085         DEBUG5("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, foffset, len, fua);
1086
1087         myseek(fhandle, foffset);
1088         retval = write(fhandle, buf, len);
1089         if(client->server->flags & F_SYNC) {
1090                 fsync(fhandle);
1091         } else if (fua) {
1092
1093           /* This is where we would do the following
1094            *   #ifdef USE_SYNC_FILE_RANGE
1095            * However, we don't, for the reasons set out below
1096            * by Christoph Hellwig <hch@infradead.org>
1097            *
1098            * [BEGINS] 
1099            * fdatasync is equivalent to fsync except that it does not flush
1100            * non-essential metadata (basically just timestamps in practice), but it
1101            * does flush metadata requried to find the data again, e.g. allocation
1102            * information and extent maps.  sync_file_range does nothing but flush
1103            * out pagecache content - it means you basically won't get your data
1104            * back in case of a crash if you either:
1105            * 
1106            *  a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1107            *  b) are using a sparse file on a filesystem
1108            *  c) are using a fallocate-preallocated file on a filesystem
1109            *  d) use any file on a COW filesystem like btrfs
1110            * 
1111            * e.g. it only does anything useful for you if you do not have a volatile
1112            * write cache, and either use a raw block device node, or just overwrite
1113            * an already fully allocated (and not preallocated) file on a non-COW
1114            * filesystem.
1115            * [ENDS]
1116            *
1117            * What we should do is open a second FD with O_DSYNC set, then write to
1118            * that when appropriate. However, with a Linux client, every REQ_FUA
1119            * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1120            * problems.
1121            *
1122            */
1123 #if 0
1124                 sync_file_range(fhandle, foffset, len,
1125                                 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1126                                 SYNC_FILE_RANGE_WAIT_AFTER);
1127 #else
1128                 fdatasync(fhandle);
1129 #endif
1130         }
1131         return retval;
1132 }
1133
1134 /**
1135  * Call rawexpwrite repeatedly until all data has been written.
1136  * @return 0 on success, nonzero on failure
1137  **/
1138 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1139         ssize_t ret=0;
1140
1141         while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
1142                 a += ret;
1143                 buf += ret;
1144                 len -= ret;
1145         }
1146         return (ret < 0 || len != 0);
1147 }
1148
1149 /**
1150  * Read an amount of bytes at a given offset from the right file. This
1151  * abstracts the read-side of the multiple files option.
1152  *
1153  * @param a The offset where the read should start
1154  * @param buf A buffer to read into
1155  * @param len The size of buf
1156  * @param client The client we're serving for
1157  * @return The number of bytes actually read, or -1 in case of an
1158  * error.
1159  **/
1160 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
1161         int fhandle;
1162         off_t foffset;
1163         size_t maxbytes;
1164
1165         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1166                 return -1;
1167         if(maxbytes && len > maxbytes)
1168                 len = maxbytes;
1169
1170         DEBUG4("(READ from fd %d offset %llu len %u), ", fhandle, foffset, len);
1171
1172         myseek(fhandle, foffset);
1173         return read(fhandle, buf, len);
1174 }
1175
1176 /**
1177  * Call rawexpread repeatedly until all data has been read.
1178  * @return 0 on success, nonzero on failure
1179  **/
1180 int rawexpread_fully(off_t a, char *buf, size_t len, CLIENT *client) {
1181         ssize_t ret=0;
1182
1183         while(len > 0 && (ret=rawexpread(a, buf, len, client)) > 0 ) {
1184                 a += ret;
1185                 buf += ret;
1186                 len -= ret;
1187         }
1188         return (ret < 0 || len != 0);
1189 }
1190
1191 /**
1192  * Read an amount of bytes at a given offset from the right file. This
1193  * abstracts the read-side of the copyonwrite stuff, and calls
1194  * rawexpread() with the right parameters to do the actual work.
1195  * @param a The offset where the read should start
1196  * @param buf A buffer to read into
1197  * @param len The size of buf
1198  * @param client The client we're going to read for
1199  * @return 0 on success, nonzero on failure
1200  **/
1201 int expread(off_t a, char *buf, size_t len, CLIENT *client) {
1202         off_t rdlen, offset;
1203         off_t mapcnt, mapl, maph, pagestart;
1204
1205         if (!(client->server->flags & F_COPYONWRITE))
1206                 return(rawexpread_fully(a, buf, len, client));
1207         DEBUG3("Asked to read %d bytes at %llu.\n", len, (unsigned long long)a);
1208
1209         mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1210
1211         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1212                 pagestart=mapcnt*DIFFPAGESIZE;
1213                 offset=a-pagestart;
1214                 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1215                         len : (size_t)DIFFPAGESIZE-offset;
1216                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1217                         DEBUG3("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1218                                (unsigned long)(client->difmap[mapcnt]));
1219                         myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1220                         if (read(client->difffile, buf, rdlen) != rdlen) return -1;
1221                 } else { /* the block is not there */
1222                         DEBUG2("Page %llu is not here, we read the original one\n",
1223                                (unsigned long long)mapcnt);
1224                         if(rawexpread_fully(a, buf, rdlen, client)) return -1;
1225                 }
1226                 len-=rdlen; a+=rdlen; buf+=rdlen;
1227         }
1228         return 0;
1229 }
1230
1231 /**
1232  * Write an amount of bytes at a given offset to the right file. This
1233  * abstracts the write-side of the copyonwrite option, and calls
1234  * rawexpwrite() with the right parameters to do the actual work.
1235  *
1236  * @param a The offset where the write should start
1237  * @param buf The buffer to write from
1238  * @param len The length of buf
1239  * @param client The client we're going to write for.
1240  * @return 0 on success, nonzero on failure
1241  **/
1242 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1243         char pagebuf[DIFFPAGESIZE];
1244         off_t mapcnt,mapl,maph;
1245         off_t wrlen,rdlen; 
1246         off_t pagestart;
1247         off_t offset;
1248
1249         if (!(client->server->flags & F_COPYONWRITE))
1250                 return(rawexpwrite_fully(a, buf, len, client, fua)); 
1251         DEBUG3("Asked to write %d bytes at %llu.\n", len, (unsigned long long)a);
1252
1253         mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1254
1255         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1256                 pagestart=mapcnt*DIFFPAGESIZE ;
1257                 offset=a-pagestart ;
1258                 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1259                         len : (size_t)DIFFPAGESIZE-offset;
1260
1261                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1262                         DEBUG3("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1263                                (unsigned long)(client->difmap[mapcnt])) ;
1264                         myseek(client->difffile,
1265                                         client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1266                         if (write(client->difffile, buf, wrlen) != wrlen) return -1 ;
1267                 } else { /* the block is not there */
1268                         myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ;
1269                         client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1270                         DEBUG3("Page %llu is not here, we put it at %lu\n",
1271                                (unsigned long long)mapcnt,
1272                                (unsigned long)(client->difmap[mapcnt]));
1273                         rdlen=DIFFPAGESIZE ;
1274                         if (rawexpread_fully(pagestart, pagebuf, rdlen, client))
1275                                 return -1;
1276                         memcpy(pagebuf+offset,buf,wrlen) ;
1277                         if (write(client->difffile, pagebuf, DIFFPAGESIZE) !=
1278                                         DIFFPAGESIZE)
1279                                 return -1;
1280                 }                                                   
1281                 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
1282         }
1283         if (client->server->flags & F_SYNC) {
1284                 fsync(client->difffile);
1285         } else if (fua) {
1286                 /* open question: would it be cheaper to do multiple sync_file_ranges?
1287                    as we iterate through the above?
1288                  */
1289                 fdatasync(client->difffile);
1290         }
1291         return 0;
1292 }
1293
1294 int expflush(CLIENT *client) {
1295         int fhandle;
1296         off_t foffset;
1297         size_t maxbytes;
1298         gint i;
1299
1300         if (client->server->flags & F_COPYONWRITE) {
1301                 return fsync(client->difffile);
1302         }
1303         
1304         for (i = 0; i < client->export->len; i++) {
1305                 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1306                 if (fsync(fi.fhandle) < 0)
1307                         return -1;
1308         }
1309         
1310         return 0;
1311 }
1312
1313 /**
1314  * Do the initial negotiation.
1315  *
1316  * @param client The client we're negotiating with.
1317  **/
1318 CLIENT* negotiate(int net, CLIENT *client, GArray* servers) {
1319         char zeros[128];
1320         uint64_t size_host;
1321         uint32_t flags = NBD_FLAG_HAS_FLAGS;
1322         uint16_t smallflags = 0;
1323         uint64_t magic;
1324
1325         memset(zeros, '\0', sizeof(zeros));
1326         if(!client || !client->modern) {
1327                 /* common */
1328                 if (write(net, INIT_PASSWD, 8) < 0) {
1329                         err_nonfatal("Negotiation failed: %m");
1330                         if(client)
1331                                 exit(EXIT_FAILURE);
1332                 }
1333                 if(!client || client->modern) {
1334                         /* modern */
1335                         magic = htonll(opts_magic);
1336                 } else {
1337                         /* oldstyle */
1338                         magic = htonll(cliserv_magic);
1339                 }
1340                 if (write(net, &magic, sizeof(magic)) < 0) {
1341                         err_nonfatal("Negotiation failed: %m");
1342                         if(client)
1343                                 exit(EXIT_FAILURE);
1344                 }
1345         }
1346         if(!client) {
1347                 /* modern */
1348                 uint32_t reserved;
1349                 uint32_t opt;
1350                 uint32_t namelen;
1351                 char* name;
1352                 int i;
1353
1354                 if(!servers)
1355                         err("programmer error");
1356                 if (write(net, &smallflags, sizeof(uint16_t)) < 0)
1357                         err("Negotiation failed: %m");
1358                 if (read(net, &reserved, sizeof(reserved)) < 0)
1359                         err("Negotiation failed: %m");
1360                 if (read(net, &magic, sizeof(magic)) < 0)
1361                         err("Negotiation failed: %m");
1362                 magic = ntohll(magic);
1363                 if(magic != opts_magic) {
1364                         close(net);
1365                         return NULL;
1366                 }
1367                 if (read(net, &opt, sizeof(opt)) < 0)
1368                         err("Negotiation failed: %m");
1369                 opt = ntohl(opt);
1370                 if(opt != NBD_OPT_EXPORT_NAME) {
1371                         close(net);
1372                         return NULL;
1373                 }
1374                 if (read(net, &namelen, sizeof(namelen)) < 0)
1375                         err("Negotiation failed: %m");
1376                 namelen = ntohl(namelen);
1377                 name = malloc(namelen+1);
1378                 name[namelen]=0;
1379                 if (read(net, name, namelen) < 0)
1380                         err("Negotiation failed: %m");
1381                 for(i=0; i<servers->len; i++) {
1382                         SERVER* serve = &(g_array_index(servers, SERVER, i));
1383                         if(!strcmp(serve->servename, name)) {
1384                                 CLIENT* client = g_new0(CLIENT, 1);
1385                                 client->server = serve;
1386                                 client->exportsize = OFFT_MAX;
1387                                 client->net = net;
1388                                 client->modern = TRUE;
1389                                 client->transactionlogfd = -1;
1390                                 free(name);
1391                                 return client;
1392                         }
1393                 }
1394                 free(name);
1395                 return NULL;
1396         }
1397         /* common */
1398         size_host = htonll((u64)(client->exportsize));
1399         if (write(net, &size_host, 8) < 0)
1400                 err("Negotiation failed: %m");
1401         if (client->server->flags & F_READONLY)
1402                 flags |= NBD_FLAG_READ_ONLY;
1403         if (client->server->flags & F_FLUSH)
1404                 flags |= NBD_FLAG_SEND_FLUSH;
1405         if (client->server->flags & F_FUA)
1406                 flags |= NBD_FLAG_SEND_FUA;
1407         if (client->server->flags & F_ROTATIONAL)
1408                 flags |= NBD_FLAG_ROTATIONAL;
1409         if (!client->modern) {
1410                 /* oldstyle */
1411                 flags = htonl(flags);
1412                 if (write(client->net, &flags, 4) < 0)
1413                         err("Negotiation failed: %m");
1414         } else {
1415                 /* modern */
1416                 smallflags = (uint16_t)(flags & ~((uint16_t)0));
1417                 smallflags = htons(smallflags);
1418                 if (write(client->net, &smallflags, sizeof(smallflags)) < 0) {
1419                         err("Negotiation failed: %m");
1420                 }
1421         }
1422         /* common */
1423         if (write(client->net, zeros, 124) < 0)
1424                 err("Negotiation failed: %m");
1425         return NULL;
1426 }
1427
1428 /** sending macro. */
1429 #define SEND(net,reply) { writeit( net, &reply, sizeof( reply )); \
1430         if (client->transactionlogfd != -1) \
1431                 writeit(client->transactionlogfd, &reply, sizeof(reply)); }
1432 /** error macro. */
1433 #define ERROR(client,reply,errcode) { reply.error = htonl(errcode); SEND(client->net,reply); reply.error = 0; }
1434 /**
1435  * Serve a file to a single client.
1436  *
1437  * @todo This beast needs to be split up in many tiny little manageable
1438  * pieces. Preferably with a chainsaw.
1439  *
1440  * @param client The client we're going to serve to.
1441  * @return when the client disconnects
1442  **/
1443 int mainloop(CLIENT *client) {
1444         struct nbd_request request;
1445         struct nbd_reply reply;
1446         gboolean go_on=TRUE;
1447 #ifdef DODBG
1448         int i = 0;
1449 #endif
1450         negotiate(client->net, client, NULL);
1451         DEBUG("Entering request loop!\n");
1452         reply.magic = htonl(NBD_REPLY_MAGIC);
1453         reply.error = 0;
1454         while (go_on) {
1455                 char buf[BUFSIZE];
1456                 char* p;
1457                 size_t len;
1458                 size_t currlen;
1459                 size_t writelen;
1460                 uint16_t command;
1461 #ifdef DODBG
1462                 i++;
1463                 printf("%d: ", i);
1464 #endif
1465                 readit(client->net, &request, sizeof(request));
1466                 if (client->transactionlogfd != -1)
1467                         writeit(client->transactionlogfd, &request, sizeof(request));
1468
1469                 request.from = ntohll(request.from);
1470                 request.type = ntohl(request.type);
1471                 command = request.type & NBD_CMD_MASK_COMMAND;
1472
1473                 if (command==NBD_CMD_DISC) {
1474                         msg2(LOG_INFO, "Disconnect request received.");
1475                         if (client->server->flags & F_COPYONWRITE) { 
1476                                 if (client->difmap) g_free(client->difmap) ;
1477                                 close(client->difffile);
1478                                 unlink(client->difffilename);
1479                                 free(client->difffilename);
1480                         }
1481                         go_on=FALSE;
1482                         continue;
1483                 }
1484
1485                 len = ntohl(request.len);
1486
1487                 if (request.magic != htonl(NBD_REQUEST_MAGIC))
1488                         err("Not enough magic.");
1489                 if (len > BUFSIZE - sizeof(struct nbd_reply)) {
1490                         currlen = BUFSIZE - sizeof(struct nbd_reply);
1491                         msg2(LOG_INFO, "oversized request (this is not a problem)");
1492                 } else {
1493                         currlen = len;
1494                 }
1495 #ifdef DODBG
1496                 printf("%s from %llu (%llu) len %d, ", command ? "WRITE" :
1497                                 "READ", (unsigned long long)request.from,
1498                                 (unsigned long long)request.from / 512, len);
1499 #endif
1500                 memcpy(reply.handle, request.handle, sizeof(reply.handle));
1501
1502                 if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) {
1503                         if ((request.from + len) > (OFFT_MAX)) {
1504                                 DEBUG("[Number too large!]");
1505                                 ERROR(client, reply, EINVAL);
1506                                 continue;
1507                         }
1508
1509                         if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
1510                                 DEBUG("[RANGE!]");
1511                                 ERROR(client, reply, EINVAL);
1512                                 continue;
1513                         }
1514                 }
1515
1516                 if (command==NBD_CMD_WRITE) {
1517                         DEBUG("wr: net->buf, ");
1518                         while(len > 0) {
1519                                 readit(client->net, buf, currlen);
1520                                 DEBUG("buf->exp, ");
1521                                 if ((client->server->flags & F_READONLY) ||
1522                                     (client->server->flags & F_AUTOREADONLY)) {
1523                                         DEBUG("[WRITE to READONLY!]");
1524                                         ERROR(client, reply, EPERM);
1525                                         continue;
1526                                 }
1527                                 if (expwrite(request.from, buf, len, client,
1528                                              request.type & NBD_CMD_FLAG_FUA)) {
1529                                         DEBUG("Write failed: %m" );
1530                                         ERROR(client, reply, errno);
1531                                         continue;
1532                                 }
1533                                 SEND(client->net, reply);
1534                                 DEBUG("OK!\n");
1535                                 len -= currlen;
1536                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1537                         }
1538                         continue;
1539                 }
1540
1541                 if (command==NBD_CMD_FLUSH) {
1542                         DEBUG("fl: ");
1543                         if (expflush(client)) {
1544                                 DEBUG("Flush failed: %m");
1545                                 ERROR(client, reply, errno);
1546                                 continue;
1547                         }
1548                         SEND(client->net, reply);
1549                         DEBUG("OK!\n");
1550                         continue;
1551                 }
1552
1553                 if (command==NBD_CMD_READ) {
1554                         DEBUG("exp->buf, ");
1555                         memcpy(buf, &reply, sizeof(struct nbd_reply));
1556                         if (client->transactionlogfd != -1)
1557                                 writeit(client->transactionlogfd, &reply, sizeof(reply));
1558                         p = buf + sizeof(struct nbd_reply);
1559                         writelen = currlen + sizeof(struct nbd_reply);
1560                         while(len > 0) {
1561                                 if (expread(request.from, p, currlen, client)) {
1562                                         DEBUG("Read failed: %m");
1563                                         ERROR(client, reply, errno);
1564                                         continue;
1565                                 }
1566                                 
1567                                 DEBUG("buf->net, ");
1568                                 writeit(client->net, buf, writelen);
1569                                 len -= currlen;
1570                                 request.from += currlen;
1571                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1572                                 p = buf;
1573                                 writelen = currlen;
1574                         }
1575                         DEBUG("OK!\n");
1576                         continue;
1577                 }
1578
1579                 DEBUG ("Ignoring unknown command\n");
1580         }
1581         return 0;
1582 }
1583
1584 /**
1585  * Set up client export array, which is an array of FILE_INFO.
1586  * Also, split a single exportfile into multiple ones, if that was asked.
1587  * @param client information on the client which we want to setup export for
1588  **/
1589 void setupexport(CLIENT* client) {
1590         int i;
1591         off_t laststartoff = 0, lastsize = 0;
1592         int multifile = (client->server->flags & F_MULTIFILE);
1593
1594         client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
1595
1596         /* If multi-file, open as many files as we can.
1597          * If not, open exactly one file.
1598          * Calculate file sizes as we go to get total size. */
1599         for(i=0; ; i++) {
1600                 FILE_INFO fi;
1601                 gchar *tmpname;
1602                 gchar* error_string;
1603                 mode_t mode = (client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR;
1604
1605                 if(multifile) {
1606                         tmpname=g_strdup_printf("%s.%d", client->exportname, i);
1607                 } else {
1608                         tmpname=g_strdup(client->exportname);
1609                 }
1610                 DEBUG2( "Opening %s\n", tmpname );
1611                 fi.fhandle = open(tmpname, mode);
1612                 if(fi.fhandle == -1 && mode == O_RDWR) {
1613                         /* Try again because maybe media was read-only */
1614                         fi.fhandle = open(tmpname, O_RDONLY);
1615                         if(fi.fhandle != -1) {
1616                                 /* Opening the base file in copyonwrite mode is
1617                                  * okay */
1618                                 if(!(client->server->flags & F_COPYONWRITE)) {
1619                                         client->server->flags |= F_AUTOREADONLY;
1620                                         client->server->flags |= F_READONLY;
1621                                 }
1622                         }
1623                 }
1624                 if(fi.fhandle == -1) {
1625                         if(multifile && i>0)
1626                                 break;
1627                         error_string=g_strdup_printf(
1628                                 "Could not open exported file %s: %%m",
1629                                 tmpname);
1630                         err(error_string);
1631                 }
1632                 fi.startoff = laststartoff + lastsize;
1633                 g_array_append_val(client->export, fi);
1634                 g_free(tmpname);
1635
1636                 /* Starting offset and size of this file will be used to
1637                  * calculate starting offset of next file */
1638                 laststartoff = fi.startoff;
1639                 lastsize = size_autodetect(fi.fhandle);
1640
1641                 if(!multifile)
1642                         break;
1643         }
1644
1645         /* Set export size to total calculated size */
1646         client->exportsize = laststartoff + lastsize;
1647
1648         /* Export size may be overridden */
1649         if(client->server->expected_size) {
1650                 /* desired size must be <= total calculated size */
1651                 if(client->server->expected_size > client->exportsize) {
1652                         err("Size of exported file is too big\n");
1653                 }
1654
1655                 client->exportsize = client->server->expected_size;
1656         }
1657
1658         msg3(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
1659         if(multifile) {
1660                 msg3(LOG_INFO, "Total number of files: %d", i);
1661         }
1662 }
1663
1664 int copyonwrite_prepare(CLIENT* client) {
1665         off_t i;
1666         if ((client->difffilename = malloc(1024))==NULL)
1667                 err("Failed to allocate string for diff file name");
1668         snprintf(client->difffilename, 1024, "%s-%s-%d.diff",client->exportname,client->clientname,
1669                 (int)getpid()) ;
1670         client->difffilename[1023]='\0';
1671         msg3(LOG_INFO,"About to create map and diff file %s",client->difffilename) ;
1672         client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
1673         if (client->difffile<0) err("Could not create diff file (%m)") ;
1674         if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL)
1675                 err("Could not allocate memory") ;
1676         for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1 ;
1677
1678         return 0;
1679 }
1680
1681 /**
1682  * Run a command. This is used for the ``prerun'' and ``postrun'' config file
1683  * options
1684  *
1685  * @param command the command to be ran. Read from the config file
1686  * @param file the file name we're about to export
1687  **/
1688 int do_run(gchar* command, gchar* file) {
1689         gchar* cmd;
1690         int retval=0;
1691
1692         if(command && *command) {
1693                 cmd = g_strdup_printf(command, file);
1694                 retval=system(cmd);
1695                 g_free(cmd);
1696         }
1697         return retval;
1698 }
1699
1700 /**
1701  * Serve a connection. 
1702  *
1703  * @todo allow for multithreading, perhaps use libevent. Not just yet, though;
1704  * follow the road map.
1705  *
1706  * @param client a connected client
1707  **/
1708 void serveconnection(CLIENT *client) {
1709         if (client->server->transactionlog && (client->transactionlogfd == -1))
1710         {
1711                 if (-1 == (client->transactionlogfd = open(client->server->transactionlog,
1712                                                            O_WRONLY | O_CREAT,
1713                                                            S_IRUSR | S_IWUSR)))
1714                         g_warning("Could not open transaction log %s",
1715                                   client->server->transactionlog);
1716         }
1717
1718         if(do_run(client->server->prerun, client->exportname)) {
1719                 exit(EXIT_FAILURE);
1720         }
1721         setupexport(client);
1722
1723         if (client->server->flags & F_COPYONWRITE) {
1724                 copyonwrite_prepare(client);
1725         }
1726
1727         setmysockopt(client->net);
1728
1729         mainloop(client);
1730         do_run(client->server->postrun, client->exportname);
1731
1732         if (-1 != client->transactionlogfd)
1733         {
1734                 close(client->transactionlogfd);
1735                 client->transactionlogfd = -1;
1736         }
1737 }
1738
1739 /**
1740  * Find the name of the file we have to serve. This will use g_strdup_printf
1741  * to put the IP address of the client inside a filename containing
1742  * "%s" (in the form as specified by the "virtstyle" option). That name
1743  * is then written to client->exportname.
1744  *
1745  * @param net A socket connected to an nbd client
1746  * @param client information about the client. The IP address in human-readable
1747  * format will be written to a new char* buffer, the address of which will be
1748  * stored in client->clientname.
1749  **/
1750 void set_peername(int net, CLIENT *client) {
1751         struct sockaddr_storage addrin;
1752         struct sockaddr_storage netaddr;
1753         struct sockaddr_in  *netaddr4 = NULL;
1754         struct sockaddr_in6 *netaddr6 = NULL;
1755         size_t addrinlen = sizeof( addrin );
1756         struct addrinfo hints;
1757         struct addrinfo *ai = NULL;
1758         char peername[NI_MAXHOST];
1759         char netname[NI_MAXHOST];
1760         char *tmp = NULL;
1761         int i;
1762         int e;
1763         int shift;
1764
1765         if (getpeername(net, (struct sockaddr *) &addrin, (socklen_t *)&addrinlen) < 0)
1766                 err("getsockname failed: %m");
1767
1768         getnameinfo((struct sockaddr *)&addrin, (socklen_t)addrinlen,
1769                 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST);
1770
1771         memset(&hints, '\0', sizeof (hints));
1772         hints.ai_flags = AI_ADDRCONFIG;
1773         e = getaddrinfo(peername, NULL, &hints, &ai);
1774
1775         if(e != 0) {
1776                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
1777                 freeaddrinfo(ai);
1778                 return;
1779         }
1780
1781         switch(client->server->virtstyle) {
1782                 case VIRT_NONE:
1783                         client->exportname=g_strdup(client->server->exportname);
1784                         break;
1785                 case VIRT_IPHASH:
1786                         for(i=0;i<strlen(peername);i++) {
1787                                 if(peername[i]=='.') {
1788                                         peername[i]='/';
1789                                 }
1790                         }
1791                 case VIRT_IPLIT:
1792                         client->exportname=g_strdup_printf(client->server->exportname, peername);
1793                         break;
1794                 case VIRT_CIDR:
1795                         memcpy(&netaddr, &addrin, addrinlen);
1796                         if(ai->ai_family == AF_INET) {
1797                                 netaddr4 = (struct sockaddr_in *)&netaddr;
1798                                 (netaddr4->sin_addr).s_addr>>=32-(client->server->cidrlen);
1799                                 (netaddr4->sin_addr).s_addr<<=32-(client->server->cidrlen);
1800
1801                                 getnameinfo((struct sockaddr *) netaddr4, (socklen_t) addrinlen,
1802                                                         netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1803                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1804                         }else if(ai->ai_family == AF_INET6) {
1805                                 netaddr6 = (struct sockaddr_in6 *)&netaddr;
1806
1807                                 shift = 128-(client->server->cidrlen);
1808                                 i = 3;
1809                                 while(shift >= 32) {
1810                                         ((netaddr6->sin6_addr).s6_addr32[i])=0;
1811                                         shift-=32;
1812                                         i--;
1813                                 }
1814                                 (netaddr6->sin6_addr).s6_addr32[i]>>=shift;
1815                                 (netaddr6->sin6_addr).s6_addr32[i]<<=shift;
1816
1817                                 getnameinfo((struct sockaddr *)netaddr6, (socklen_t)addrinlen,
1818                                             netname, sizeof(netname), NULL, 0, NI_NUMERICHOST);
1819                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1820                         }
1821
1822                         if(tmp != NULL)
1823                           client->exportname=g_strdup_printf(client->server->exportname, tmp);
1824
1825                         break;
1826         }
1827
1828         freeaddrinfo(ai);
1829         msg4(LOG_INFO, "connect from %s, assigned file is %s", 
1830              peername, client->exportname);
1831         client->clientname=g_strdup(peername);
1832 }
1833
1834 /**
1835  * Destroy a pid_t*
1836  * @param data a pointer to pid_t which should be freed
1837  **/
1838 void destroy_pid_t(gpointer data) {
1839         g_free(data);
1840 }
1841
1842 /**
1843  * Loop through the available servers, and serve them. Never returns.
1844  **/
1845 int serveloop(GArray* servers) {
1846         struct sockaddr_storage addrin;
1847         socklen_t addrinlen=sizeof(addrin);
1848         int i;
1849         int max;
1850         int sock;
1851         fd_set mset;
1852         fd_set rset;
1853
1854         /* 
1855          * Set up the master fd_set. The set of descriptors we need
1856          * to select() for never changes anyway and it buys us a *lot*
1857          * of time to only build this once. However, if we ever choose
1858          * to not fork() for clients anymore, we may have to revisit
1859          * this.
1860          */
1861         max=0;
1862         FD_ZERO(&mset);
1863         for(i=0;i<servers->len;i++) {
1864                 if((sock=(g_array_index(servers, SERVER, i)).socket)) {
1865                         FD_SET(sock, &mset);
1866                         max=sock>max?sock:max;
1867                 }
1868         }
1869         if(modernsock) {
1870                 FD_SET(modernsock, &mset);
1871                 max=modernsock>max?modernsock:max;
1872         }
1873         for(;;) {
1874                 CLIENT *client = NULL;
1875                 pid_t *pid;
1876
1877                 memcpy(&rset, &mset, sizeof(fd_set));
1878                 if(select(max+1, &rset, NULL, NULL, NULL)>0) {
1879                         int net = 0;
1880                         SERVER* serve=NULL;
1881
1882                         DEBUG("accept, ");
1883                         if(FD_ISSET(modernsock, &rset)) {
1884                                 if((net=accept(modernsock, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1885                                         err("accept: %m");
1886                                 client = negotiate(net, NULL, servers);
1887                                 if(!client) {
1888                                         err_nonfatal("negotiation failed");
1889                                         close(net);
1890                                         net=0;
1891                                         continue;
1892                                 }
1893                                 serve = client->server;
1894                         }
1895                         for(i=0;i<servers->len && !net;i++) {
1896                                 serve=&(g_array_index(servers, SERVER, i));
1897                                 if(FD_ISSET(serve->socket, &rset)) {
1898                                         if ((net=accept(serve->socket, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1899                                                 err("accept: %m");
1900                                 }
1901                         }
1902                         if(net) {
1903                                 int sock_flags;
1904
1905                                 if(serve->max_connections > 0 &&
1906                                    g_hash_table_size(children) >= serve->max_connections) {
1907                                         msg2(LOG_INFO, "Max connections reached");
1908                                         close(net);
1909                                         continue;
1910                                 }
1911                                 if((sock_flags = fcntl(net, F_GETFL, 0))==-1) {
1912                                         err("fcntl F_GETFL");
1913                                 }
1914                                 if(fcntl(net, F_SETFL, sock_flags &~O_NONBLOCK)==-1) {
1915                                         err("fcntl F_SETFL ~O_NONBLOCK");
1916                                 }
1917                                 if(!client) {
1918                                         client = g_new0(CLIENT, 1);
1919                                         client->server=serve;
1920                                         client->exportsize=OFFT_MAX;
1921                                         client->net=net;
1922                                         client->transactionlogfd = -1;
1923                                 }
1924                                 set_peername(net, client);
1925                                 if (!authorized_client(client)) {
1926                                         msg2(LOG_INFO,"Unauthorized client") ;
1927                                         close(net);
1928                                         continue;
1929                                 }
1930                                 msg2(LOG_INFO,"Authorized client") ;
1931                                 pid=g_malloc(sizeof(pid_t));
1932
1933                                 if (!dontfork) {
1934                                         if ((*pid=fork())<0) {
1935                                                 msg3(LOG_INFO,"Could not fork (%s)",strerror(errno)) ;
1936                                                 close(net);
1937                                                 continue;
1938                                         }
1939                                         if (*pid>0) { /* parent */
1940                                                 close(net);
1941                                                 g_hash_table_insert(children, pid, pid);
1942                                                 continue;
1943                                         }
1944                                         /* child */
1945                                         g_hash_table_destroy(children);
1946                                         for(i=0;i<servers->len;i++) {
1947                                                 serve=&g_array_index(servers, SERVER, i);
1948                                                 close(serve->socket);
1949                                         }
1950                                         /* FALSE does not free the
1951                                            actual data. This is required,
1952                                            because the client has a
1953                                            direct reference into that
1954                                            data, and otherwise we get a
1955                                            segfault... */
1956                                         g_array_free(servers, FALSE);
1957                                 }
1958
1959                                 msg2(LOG_INFO,"Starting to serve");
1960                                 serveconnection(client);
1961                                 exit(EXIT_SUCCESS);
1962                         }
1963                 }
1964         }
1965 }
1966
1967 void dosockopts(int socket) {
1968 #ifndef sun
1969         int yes=1;
1970 #else
1971         char yes='1';
1972 #endif /* sun */
1973         int sock_flags;
1974
1975         /* lose the pesky "Address already in use" error message */
1976         if (setsockopt(socket,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(int)) == -1) {
1977                 err("setsockopt SO_REUSEADDR");
1978         }
1979         if (setsockopt(socket,SOL_SOCKET,SO_KEEPALIVE,&yes,sizeof(int)) == -1) {
1980                 err("setsockopt SO_KEEPALIVE");
1981         }
1982
1983         /* make the listening socket non-blocking */
1984         if ((sock_flags = fcntl(socket, F_GETFL, 0)) == -1) {
1985                 err("fcntl F_GETFL");
1986         }
1987         if (fcntl(socket, F_SETFL, sock_flags | O_NONBLOCK) == -1) {
1988                 err("fcntl F_SETFL O_NONBLOCK");
1989         }
1990 }
1991
1992 /**
1993  * Connect a server's socket.
1994  *
1995  * @param serve the server we want to connect.
1996  **/
1997 int setup_serve(SERVER *serve) {
1998         struct addrinfo hints;
1999         struct addrinfo *ai = NULL;
2000         gchar *port = NULL;
2001         int e;
2002
2003         if(!do_oldstyle) {
2004                 return serve->servename ? 1 : 0;
2005         }
2006         memset(&hints,'\0',sizeof(hints));
2007         hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG | AI_NUMERICSERV;
2008         hints.ai_socktype = SOCK_STREAM;
2009         hints.ai_family = serve->socket_family;
2010
2011         port = g_strdup_printf ("%d", serve->port);
2012         if (port == NULL)
2013                 return 0;
2014
2015         e = getaddrinfo(serve->listenaddr,port,&hints,&ai);
2016
2017         g_free(port);
2018
2019         if(e != 0) {
2020                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
2021                 serve->socket = -1;
2022                 freeaddrinfo(ai);
2023                 exit(EXIT_FAILURE);
2024         }
2025
2026         if(serve->socket_family == AF_UNSPEC)
2027                 serve->socket_family = ai->ai_family;
2028
2029 #ifdef WITH_SDP
2030         if ((serve->flags) && F_SDP) {
2031                 if (ai->ai_family == AF_INET)
2032                         ai->ai_family = AF_INET_SDP;
2033                 else (ai->ai_family == AF_INET6)
2034                         ai->ai_family = AF_INET6_SDP;
2035         }
2036 #endif
2037         if ((serve->socket = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) < 0)
2038                 err("socket: %m");
2039
2040         dosockopts(serve->socket);
2041
2042         DEBUG("Waiting for connections... bind, ");
2043         e = bind(serve->socket, ai->ai_addr, ai->ai_addrlen);
2044         if (e != 0 && errno != EADDRINUSE)
2045                 err("bind: %m");
2046         DEBUG("listen, ");
2047         if (listen(serve->socket, 1) < 0)
2048                 err("listen: %m");
2049
2050         freeaddrinfo (ai);
2051         if(serve->servename) {
2052                 return 1;
2053         } else {
2054                 return 0;
2055         }
2056 }
2057
2058 void open_modern(void) {
2059         struct addrinfo hints;
2060         struct addrinfo* ai = NULL;
2061         struct sock_flags;
2062         int e;
2063
2064         memset(&hints, '\0', sizeof(hints));
2065         hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
2066         hints.ai_socktype = SOCK_STREAM;
2067         hints.ai_family = AF_UNSPEC;
2068         hints.ai_protocol = IPPROTO_TCP;
2069         e = getaddrinfo(modern_listen, NBD_DEFAULT_PORT, &hints, &ai);
2070         if(e != 0) {
2071                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
2072                 exit(EXIT_FAILURE);
2073         }
2074         if((modernsock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol))<0) {
2075                 err("socket: %m");
2076         }
2077
2078         dosockopts(modernsock);
2079
2080         if(bind(modernsock, ai->ai_addr, ai->ai_addrlen)) {
2081                 err("bind: %m");
2082         }
2083         if(listen(modernsock, 10) <0) {
2084                 err("listen: %m");
2085         }
2086
2087         freeaddrinfo(ai);
2088 }
2089
2090 /**
2091  * Connect our servers.
2092  **/
2093 void setup_servers(GArray* servers) {
2094         int i;
2095         struct sigaction sa;
2096         int want_modern=0;
2097
2098         for(i=0;i<servers->len;i++) {
2099                 want_modern |= setup_serve(&(g_array_index(servers, SERVER, i)));
2100         }
2101         if(want_modern) {
2102                 open_modern();
2103         }
2104         children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
2105
2106         sa.sa_handler = sigchld_handler;
2107         sigemptyset(&sa.sa_mask);
2108         sa.sa_flags = SA_RESTART;
2109         if(sigaction(SIGCHLD, &sa, NULL) == -1)
2110                 err("sigaction: %m");
2111         sa.sa_handler = sigterm_handler;
2112         sigemptyset(&sa.sa_mask);
2113         sa.sa_flags = SA_RESTART;
2114         if(sigaction(SIGTERM, &sa, NULL) == -1)
2115                 err("sigaction: %m");
2116 }
2117
2118 /**
2119  * Go daemon (unless we specified at compile time that we didn't want this)
2120  * @param serve the first server of our configuration. If its port is zero,
2121  *      then do not daemonize, because we're doing inetd then. This parameter
2122  *      is only used to create a PID file of the form
2123  *      /var/run/nbd-server.&lt;port&gt;.pid; it's not modified in any way.
2124  **/
2125 #if !defined(NODAEMON)
2126 void daemonize(SERVER* serve) {
2127         FILE*pidf;
2128
2129         if(serve && !(serve->port)) {
2130                 return;
2131         }
2132         if(daemon(0,0)<0) {
2133                 err("daemon");
2134         }
2135         if(!*pidftemplate) {
2136                 if(serve) {
2137                         strncpy(pidftemplate, "/var/run/nbd-server.%d.pid", 255);
2138                 } else {
2139                         strncpy(pidftemplate, "/var/run/nbd-server.pid", 255);
2140                 }
2141         }
2142         snprintf(pidfname, 255, pidftemplate, serve ? serve->port : 0);
2143         pidf=fopen(pidfname, "w");
2144         if(pidf) {
2145                 fprintf(pidf,"%d\n", (int)getpid());
2146                 fclose(pidf);
2147         } else {
2148                 perror("fopen");
2149                 fprintf(stderr, "Not fatal; continuing");
2150         }
2151 }
2152 #else
2153 #define daemonize(serve)
2154 #endif /* !defined(NODAEMON) */
2155
2156 /*
2157  * Everything beyond this point (in the file) is run in non-daemon mode.
2158  * The stuff above daemonize() isn't.
2159  */
2160
2161 void serve_err(SERVER* serve, const char* msg) G_GNUC_NORETURN;
2162
2163 void serve_err(SERVER* serve, const char* msg) {
2164         g_message("Export of %s on port %d failed:", serve->exportname,
2165                         serve->port);
2166         err(msg);
2167 }
2168
2169 /**
2170  * Set up user-ID and/or group-ID
2171  **/
2172 void dousers(void) {
2173         struct passwd *pw;
2174         struct group *gr;
2175         gchar* str;
2176         if(rungroup) {
2177                 gr=getgrnam(rungroup);
2178                 if(!gr) {
2179                         str = g_strdup_printf("Invalid group name: %s", rungroup);
2180                         err(str);
2181                 }
2182                 if(setgid(gr->gr_gid)<0) {
2183                         err("Could not set GID: %m"); 
2184                 }
2185         }
2186         if(runuser) {
2187                 pw=getpwnam(runuser);
2188                 if(!pw) {
2189                         str = g_strdup_printf("Invalid user name: %s", runuser);
2190                         err(str);
2191                 }
2192                 if(setuid(pw->pw_uid)<0) {
2193                         err("Could not set UID: %m");
2194                 }
2195         }
2196 }
2197
2198 #ifndef ISSERVER
2199 void glib_message_syslog_redirect(const gchar *log_domain,
2200                                   GLogLevelFlags log_level,
2201                                   const gchar *message,
2202                                   gpointer user_data)
2203 {
2204     int level=LOG_DEBUG;
2205     
2206     switch( log_level )
2207     {
2208       case G_LOG_FLAG_FATAL:
2209       case G_LOG_LEVEL_CRITICAL:
2210       case G_LOG_LEVEL_ERROR:    
2211         level=LOG_ERR; 
2212         break;
2213       case G_LOG_LEVEL_WARNING:
2214         level=LOG_WARNING;
2215         break;
2216       case G_LOG_LEVEL_MESSAGE:
2217       case G_LOG_LEVEL_INFO:
2218         level=LOG_INFO;
2219         break;
2220       case G_LOG_LEVEL_DEBUG:
2221         level=LOG_DEBUG;
2222       default:
2223         level=LOG_ERR;
2224     }
2225     syslog(level, "%s", message);
2226 }
2227 #endif
2228
2229 /**
2230  * Main entry point...
2231  **/
2232 int main(int argc, char *argv[]) {
2233         SERVER *serve;
2234         GArray *servers;
2235         GError *err=NULL;
2236
2237         if (sizeof( struct nbd_request )!=28) {
2238                 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
2239                 exit(EXIT_FAILURE) ;
2240         }
2241
2242         memset(pidftemplate, '\0', 256);
2243
2244         logging();
2245         config_file_pos = g_strdup(CFILE);
2246         serve=cmdline(argc, argv);
2247         servers = parse_cfile(config_file_pos, &err);
2248         
2249         if(serve) {
2250                 serve->socket_family = AF_UNSPEC;
2251
2252                 append_serve(serve, servers);
2253      
2254                 if (!(serve->port)) {
2255                         CLIENT *client;
2256 #ifndef ISSERVER
2257                         /* You really should define ISSERVER if you're going to use
2258                          * inetd mode, but if you don't, closing stdout and stderr
2259                          * (which inetd had connected to the client socket) will let it
2260                          * work. */
2261                         close(1);
2262                         close(2);
2263                         open("/dev/null", O_WRONLY);
2264                         open("/dev/null", O_WRONLY);
2265                         g_log_set_default_handler( glib_message_syslog_redirect, NULL );
2266 #endif
2267                         client=g_malloc(sizeof(CLIENT));
2268                         client->server=serve;
2269                         client->net=0;
2270                         client->exportsize=OFFT_MAX;
2271                         set_peername(0,client);
2272                         serveconnection(client);
2273                         return 0;
2274                 }
2275         }
2276     
2277         if(!servers || !servers->len) {
2278                 if(err && !(err->domain == g_quark_from_string("parse_cfile")
2279                                 && err->code == CFILE_NOTFOUND)) {
2280                         g_warning("Could not parse config file: %s", 
2281                                         err ? err->message : "Unknown error");
2282                 }
2283         }
2284         if(serve) {
2285                 g_warning("Specifying an export on the command line is deprecated.");
2286                 g_warning("Please use a configuration file instead.");
2287         }
2288
2289         if((!serve) && (!servers||!servers->len)) {
2290                 g_message("No configured exports; quitting.");
2291                 exit(EXIT_FAILURE);
2292         }
2293         if (!dontfork)
2294                 daemonize(serve);
2295         setup_servers(servers);
2296         dousers();
2297         serveloop(servers);
2298         return 0 ;
2299 }