Modernize DEBUG macros, and make code -Wall -Werror clean
[nbd.git] / nbd-server.c
1 /*
2  * Network Block Device - server
3  *
4  * Copyright 1996-1998 Pavel Machek, distribute under GPL
5  *  <pavel@atrey.karlin.mff.cuni.cz>
6  * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7  * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
8  *
9  * Version 1.0 - hopefully 64-bit-clean
10  * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11  * Version 1.2 - autodetect size of block devices, thanks to Peter T. Breuer <ptb@it.uc3m.es>
12  * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13  *      type, or don't have 64 bit file offsets by defining FS_32BIT
14  *      in compile options for nbd-server *only*. This can be done
15  *      with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16  *      original autoconf input file, or I would make it a configure
17  *      option.) Ken Yap <ken@nlc.net.au>.
18  * Version 1.6 - fix autodetection of block device size and really make 64 bit
19  *      clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20  * Version 2.0 - Version synchronised with client
21  * Version 2.1 - Reap zombie client processes when they exit. Removed
22  *      (uncommented) the _IO magic, it's no longer necessary. Wouter
23  *      Verhelst <wouter@debian.org>
24  * Version 2.2 - Auto switch to read-only mode (useful for floppies).
25  * Version 2.3 - Fixed code so that Large File Support works. This
26  *      removes the FS_32BIT compile-time directive; define
27  *      _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28  *      using FS_32BIT. This will allow you to use files >2GB instead of
29  *      having to use the -m option. Wouter Verhelst <wouter@debian.org>
30  * Version 2.4 - Added code to keep track of children, so that we can
31  *      properly kill them from initscripts. Add a call to daemon(),
32  *      so that processes don't think they have to wait for us, which is
33  *      interesting for initscripts as well. Wouter Verhelst
34  *      <wouter@debian.org>
35  * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36  *      zero after fork()ing, resulting in nbd-server going berserk
37  *      when it receives a signal with at least one child open. Wouter
38  *      Verhelst <wouter@debian.org>
39  * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40  *      rectified type of mainloop::size_host (sf.net bugs 814435 and
41  *      817385); close the PID file after writing to it, so that the
42  *      daemon can actually be found. Wouter Verhelst
43  *      <wouter@debian.org>
44  * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45  *      correctly put in network endianness. Many types were corrected
46  *      (size_t and off_t instead of int).  <vspaceg@sourceforge.net>
47  * Version 2.6 - Some code cleanup.
48  * Version 2.7 - Better build system.
49  * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a 
50  *      lot more work, but this is a start. Wouter Verhelst
51  *      <wouter@debian.org>
52  * 16/03/2010 - Add IPv6 support.
53  *      Kitt Tientanopajai <kitt@kitty.in.th>
54  *      Neutron Soutmun <neo.neutron@gmail.com>
55  *      Suriya Soutmun <darksolar@gmail.com>
56  */
57
58 /* Includes LFS defines, which defines behaviours of some of the following
59  * headers, so must come before those */
60 #include "lfs.h"
61
62 #include <sys/types.h>
63 #include <sys/socket.h>
64 #include <sys/stat.h>
65 #include <sys/select.h>         /* select */
66 #include <sys/wait.h>           /* wait */
67 #ifdef HAVE_SYS_IOCTL_H
68 #include <sys/ioctl.h>
69 #endif
70 #include <sys/param.h>
71 #ifdef HAVE_SYS_MOUNT_H
72 #include <sys/mount.h>          /* For BLKGETSIZE */
73 #endif
74 #include <signal.h>             /* sigaction */
75 #include <errno.h>
76 #include <netinet/tcp.h>
77 #include <netinet/in.h>
78 #include <netdb.h>
79 #include <syslog.h>
80 #include <unistd.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 #include <fcntl.h>
85 #include <arpa/inet.h>
86 #include <strings.h>
87 #include <dirent.h>
88 #include <unistd.h>
89 #include <getopt.h>
90 #include <pwd.h>
91 #include <grp.h>
92
93 #include <glib.h>
94
95 /* used in cliserv.h, so must come first */
96 #define MY_NAME "nbd_server"
97 #include "cliserv.h"
98
99 #ifdef WITH_SDP
100 #include <sdp_inet.h>
101 #endif
102
103 /** Default position of the config file */
104 #ifndef SYSCONFDIR
105 #define SYSCONFDIR "/etc"
106 #endif
107 #define CFILE SYSCONFDIR "/nbd-server/config"
108
109 /** Where our config file actually is */
110 gchar* config_file_pos;
111
112 /** What user we're running as */
113 gchar* runuser=NULL;
114 /** What group we're running as */
115 gchar* rungroup=NULL;
116 /** whether to export using the old negotiation protocol (port-based) */
117 gboolean do_oldstyle=FALSE;
118
119 /* Whether we should avoid forking */
120 int dontfork = 0;
121
122 /** Logging macros, now nothing goes to syslog unless you say ISSERVER */
123 #ifdef ISSERVER
124 #define msg2(a,b) syslog(a,b)
125 #define msg3(a,b,c) syslog(a,b,c)
126 #define msg4(a,b,c,d) syslog(a,b,c,d)
127 #else
128 #define msg2(a,b) g_message(b)
129 #define msg3(a,b,c) g_message(b,c)
130 #define msg4(a,b,c,d) g_message(b,c,d)
131 #endif
132
133 /* Debugging macros */
134 //#define DODBG
135 #ifdef DODBG
136 #define DEBUG(...) printf(__VA_ARGS__)
137 #else
138 #define DEBUG(...)
139 #endif
140 #ifndef PACKAGE_VERSION
141 #define PACKAGE_VERSION ""
142 #endif
/**
 * The highest value a variable of type off_t can reach. off_t is a signed
 * integer, so this is every bit set except for the leftmost (sign) bit.
 *
 * The value is computed in an unsigned type: shifting a 1 into the sign bit
 * of a signed type is undefined behaviour. The expansion is fully
 * parenthesized so it can be dropped into any larger expression.
 * Assumes off_t is no wider than unsigned long long.
 **/
#define OFFT_MAX ((off_t)((1ULL<<(sizeof(off_t)*8-1))-1))
148 #define LINELEN 256       /**< Size of static buffer used to read the
149                                authorization file (yuck) */
150 #define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
151 #define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
152 #define F_READONLY 1      /**< flag to tell us a file is readonly */
153 #define F_MULTIFILE 2     /**< flag to tell us a file is exported using -m */
154 #define F_COPYONWRITE 4   /**< flag to tell us a file is exported using
155                             copyonwrite */
156 #define F_AUTOREADONLY 8  /**< flag to tell us a file is set to autoreadonly */
157 #define F_SPARSE 16       /**< flag to tell us copyronwrite should use a sparse file */
158 #define F_SDP 32          /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */
159 #define F_SYNC 64         /**< Whether to fsync() after a write */
160 #define F_FLUSH 128       /**< Whether server wants FLUSH to be sent by the client */
161 #define F_FUA 256         /**< Whether server wants FUA to be sent by the client */
162 #define F_ROTATIONAL 512  /**< Whether server wants the client to implement the elevator algorithm */
163 GHashTable *children;
164 char pidfname[256]; /**< name of our PID file */
165 char pidftemplate[256]; /**< template to be used for the filename of the PID file */
166 char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
167
168 int modernsock=0;         /**< Socket for the modern handler. Not used
169                                if a client was only specified on the
170                                command line; only port used if
171                                oldstyle is set to false (and then the
172                                command-line client isn't used, gna gna) */
173 char* modern_listen;      /**< listenaddr value for modernsock */
174
/**
 * Types of virtualization
 **/
typedef enum {
        /** No virtualization */
        VIRT_NONE = 0,
        /** Literal IP address as part of the filename */
        VIRT_IPLIT,
        /** Replacing all dots in an ip address by a / before doing the same
            as in IPLIT */
        VIRT_IPHASH,
        /** Every subnet in its own directory */
        VIRT_CIDR,
} VIRT_STYLE;
185
186 /**
187  * Variables associated with a server.
188  **/
189 typedef struct {
190         gchar* exportname;    /**< (unprocessed) filename of the file we're exporting */
191         off_t expected_size; /**< size of the exported file as it was told to
192                                us through configuration */
193         gchar* listenaddr;   /**< The IP address we're listening on */
194         unsigned int port;   /**< port we're exporting this file at */
195         char* authname;      /**< filename of the authorization file */
196         int flags;           /**< flags associated with this exported file */
197         int socket;          /**< The socket of this server. */
198         int socket_family;   /**< family of the socket */
199         VIRT_STYLE virtstyle;/**< The style of virtualization, if any */
200         uint8_t cidrlen;     /**< The length of the mask when we use
201                                   CIDR-style virtualization */
202         gchar* prerun;       /**< command to be ran after connecting a client,
203                                   but before starting to serve */
204         gchar* postrun;      /**< command that will be ran after the client
205                                   disconnects */
206         gchar* servename;    /**< name of the export as selected by nbd-client */
207         int max_connections; /**< maximum number of opened connections */
208         gchar* transactionlog;/**< filename for transaction log */
209 } SERVER;
210
/**
 * One file taking part in an export: its descriptor and the offset at which
 * it starts (CLIENT::export is an array of these).
 **/
typedef struct {
        /** file descriptor */
        int fhandle;
        /** starting offset of this file */
        off_t startoff;
} FILE_INFO;
218
219 typedef struct {
220         off_t exportsize;    /**< size of the file we're exporting */
221         char *clientname;    /**< peer */
222         char *exportname;    /**< (processed) filename of the file we're exporting */
223         GArray *export;    /**< array of FILE_INFO of exported files;
224                                array size is always 1 unless we're
225                                doing the multiple file option */
226         int net;             /**< The actual client socket */
227         SERVER *server;      /**< The server this client is getting data from */
228         char* difffilename;  /**< filename of the copy-on-write file, if any */
229         int difffile;        /**< filedescriptor of copyonwrite file. @todo
230                                shouldn't this be an array too? (cfr export) Or
231                                make -m and -c mutually exclusive */
232         u32 difffilelen;     /**< number of pages in difffile */
233         u32 *difmap;         /**< see comment on the global difmap for this one */
234         gboolean modern;     /**< client was negotiated using modern negotiation protocol */
235         int transactionlogfd;/**< fd for transaction log */
236 } CLIENT;
237
/**
 * Type of configuration file values
 **/
typedef enum {
        /** This parameter is an integer */
        PARAM_INT,
        /** This parameter is a string */
        PARAM_STRING,
        /** This parameter is a boolean */
        PARAM_BOOL,
} PARAM_TYPE;
246
247 /**
248  * Configuration file values
249  **/
250 typedef struct {
251         gchar *paramname;       /**< Name of the parameter, as it appears in
252                                   the config file */
253         gboolean required;      /**< Whether this is a required (as opposed to
254                                   optional) parameter */
255         PARAM_TYPE ptype;       /**< Type of the parameter. */
256         gpointer target;        /**< Pointer to where the data of this
257                                   parameter should be written. If ptype is
258                                   PARAM_BOOL, the data is or'ed rather than
259                                   overwritten. */
260         gint flagval;           /**< Flag mask for this parameter in case ptype
261                                   is PARAM_BOOL. */
262 } PARAM;
263
264 /**
265  * Check whether a client is allowed to connect. Works with an authorization
266  * file which contains one line per machine, no wildcards.
267  *
268  * @param opts The client who's trying to connect.
269  * @return 0 - authorization refused, 1 - OK
270  **/
271 int authorized_client(CLIENT *opts) {
272         const char *ERRMSG="Invalid entry '%s' in authfile '%s', so, refusing all connections.";
273         FILE *f ;
274         char line[LINELEN]; 
275         char *tmp;
276         struct in_addr addr;
277         struct in_addr client;
278         struct in_addr cltemp;
279         int len;
280
281         if ((f=fopen(opts->server->authname,"r"))==NULL) {
282                 msg4(LOG_INFO,"Can't open authorization file %s (%s).",
283                      opts->server->authname,strerror(errno)) ;
284                 return 1 ; 
285         }
286   
287         inet_aton(opts->clientname, &client);
288         while (fgets(line,LINELEN,f)!=NULL) {
289                 if((tmp=index(line, '/'))) {
290                         if(strlen(line)<=tmp-line) {
291                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
292                                 return 0;
293                         }
294                         *(tmp++)=0;
295                         if(!inet_aton(line,&addr)) {
296                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
297                                 return 0;
298                         }
299                         len=strtol(tmp, NULL, 0);
300                         addr.s_addr>>=32-len;
301                         addr.s_addr<<=32-len;
302                         memcpy(&cltemp,&client,sizeof(client));
303                         cltemp.s_addr>>=32-len;
304                         cltemp.s_addr<<=32-len;
305                         if(addr.s_addr == cltemp.s_addr) {
306                                 return 1;
307                         }
308                 }
309                 if (strncmp(line,opts->clientname,strlen(opts->clientname))==0) {
310                         fclose(f);
311                         return 1;
312                 }
313         }
314         fclose(f);
315         return 0;
316 }
317
/**
 * Read data from a file descriptor into a buffer. Keeps calling read()
 * until exactly len bytes have arrived; EAGAIN is retried, any other
 * failure and end-of-file are reported through err().
 *
 * NOTE(review): err() comes from cliserv.h and is assumed not to return.
 *
 * @param f a file descriptor
 * @param buf a buffer
 * @param len the number of bytes to be read
 **/
static inline void readit(int f, void *buf, size_t len) {
        /* Arithmetic on void* is a GNU extension; walk a char* instead */
        char *cursor = buf;
        ssize_t res;
        while (len > 0) {
                DEBUG("*");
                res = read(f, cursor, len);
                if (res > 0) {
                        len -= res;
                        cursor += res;
                } else if (res == 0) {
                        /* End of file: read() does not set errno here, so
                         * neither the EAGAIN test nor %m would be
                         * meaningful. A stale EAGAIN would otherwise make
                         * this loop spin forever. */
                        err("Read failed: End of file");
                } else if (errno != EAGAIN) {
                        err("Read failed: %m");
                }
        }
}
339
/**
 * Write data from a buffer into a filedescriptor. Keeps calling write()
 * until all len bytes have been written; a write() result <= 0 is reported
 * through err().
 *
 * NOTE(review): err() comes from cliserv.h and is assumed not to return.
 *
 * @param f a file descriptor
 * @param buf a buffer containing data
 * @param len the number of bytes to be written
 **/
static inline void writeit(int f, void *buf, size_t len) {
        /* Arithmetic on void* is a GNU extension; walk a char* instead */
        const char *cursor = buf;
        ssize_t res;
        while (len > 0) {
                DEBUG("+");
                if ((res = write(f, cursor, len)) <= 0) {
                        err("Send failed: %m");
                        /* Never adjust the counters with a non-positive
                         * result, even if err() were to come back */
                        continue;
                }
                len -= res;
                cursor += res;
        }
}
357
358 /**
359  * Print out a message about how to use nbd-server. Split out to a separate
360  * function so that we can call it from multiple places
361  */
362 void usage() {
363         printf("This is nbd-server version " VERSION "\n");
364         printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections]\n"
365                "\t-r|--read-only\t\tread only\n"
366                "\t-m|--multi-file\t\tmultiple file\n"
367                "\t-c|--copy-on-write\tcopy on write\n"
368                "\t-C|--config-file\tspecify an alternate configuration file\n"
369                "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
370                "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
371                "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
372                "\t-M|--max-connections\tspecify the maximum number of opened connections\n\n"
373                "\tif port is set to 0, stdin is used (for running from inetd)\n"
374                "\tif file_to_export contains '%%s', it is substituted with the IP\n"
375                "\t\taddress of the machine trying to connect\n" 
376                "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
377         printf("Using configuration file %s\n", CFILE);
378 }
379
380 /* Dumps a config file section of the given SERVER*, and exits. */
381 void dump_section(SERVER* serve, gchar* section_header) {
382         printf("[%s]\n", section_header);
383         printf("\texportname = %s\n", serve->exportname);
384         printf("\tlistenaddr = %s\n", serve->listenaddr);
385         printf("\tport = %d\n", serve->port);
386         if(serve->flags & F_READONLY) {
387                 printf("\treadonly = true\n");
388         }
389         if(serve->flags & F_MULTIFILE) {
390                 printf("\tmultifile = true\n");
391         }
392         if(serve->flags & F_COPYONWRITE) {
393                 printf("\tcopyonwrite = true\n");
394         }
395         if(serve->expected_size) {
396                 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
397         }
398         if(serve->authname) {
399                 printf("\tauthfile = %s\n", serve->authname);
400         }
401         exit(EXIT_SUCCESS);
402 }
403
404 /**
405  * Parse the command line.
406  *
407  * @param argc the argc argument to main()
408  * @param argv the argv argument to main()
409  **/
410 SERVER* cmdline(int argc, char *argv[]) {
411         int i=0;
412         int nonspecial=0;
413         int c;
414         struct option long_options[] = {
415                 {"read-only", no_argument, NULL, 'r'},
416                 {"multi-file", no_argument, NULL, 'm'},
417                 {"copy-on-write", no_argument, NULL, 'c'},
418                 {"dont-fork", no_argument, NULL, 'd'},
419                 {"authorize-file", required_argument, NULL, 'l'},
420                 {"config-file", required_argument, NULL, 'C'},
421                 {"pid-file", required_argument, NULL, 'p'},
422                 {"output-config", required_argument, NULL, 'o'},
423                 {"max-connection", required_argument, NULL, 'M'},
424                 {0,0,0,0}
425         };
426         SERVER *serve;
427         off_t es;
428         size_t last;
429         char suffix;
430         gboolean do_output=FALSE;
431         gchar* section_header="";
432         gchar** addr_port;
433
434         if(argc==1) {
435                 return NULL;
436         }
437         serve=g_new0(SERVER, 1);
438         serve->authname = g_strdup(default_authname);
439         serve->virtstyle=VIRT_IPLIT;
440         while((c=getopt_long(argc, argv, "-C:cdl:mo:rp:M:", long_options, &i))>=0) {
441                 switch (c) {
442                 case 1:
443                         /* non-option argument */
444                         switch(nonspecial++) {
445                         case 0:
446                                 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
447                                         addr_port=g_strsplit(optarg, ":", 2);
448
449                                         /* Check for "@" - maybe user using this separator
450                                                  for IPv4 address */
451                                         if(!addr_port[1]) {
452                                                 g_strfreev(addr_port);
453                                                 addr_port=g_strsplit(optarg, "@", 2);
454                                         }
455                                 } else {
456                                         addr_port=g_strsplit(optarg, "@", 2);
457                                 }
458
459                                 if(addr_port[1]) {
460                                         serve->port=strtol(addr_port[1], NULL, 0);
461                                         serve->listenaddr=g_strdup(addr_port[0]);
462                                 } else {
463                                         serve->listenaddr=NULL;
464                                         serve->port=strtol(addr_port[0], NULL, 0);
465                                 }
466                                 g_strfreev(addr_port);
467                                 break;
468                         case 1:
469                                 serve->exportname = g_strdup(optarg);
470                                 if(serve->exportname[0] != '/') {
471                                         fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
472                                         exit(EXIT_FAILURE);
473                                 }
474                                 break;
475                         case 2:
476                                 last=strlen(optarg)-1;
477                                 suffix=optarg[last];
478                                 if (suffix == 'k' || suffix == 'K' ||
479                                     suffix == 'm' || suffix == 'M')
480                                         optarg[last] = '\0';
481                                 es = (off_t)atoll(optarg);
482                                 switch (suffix) {
483                                         case 'm':
484                                         case 'M':  es <<= 10;
485                                         case 'k':
486                                         case 'K':  es <<= 10;
487                                         default :  break;
488                                 }
489                                 serve->expected_size = es;
490                                 break;
491                         }
492                         break;
493                 case 'r':
494                         serve->flags |= F_READONLY;
495                         break;
496                 case 'm':
497                         serve->flags |= F_MULTIFILE;
498                         break;
499                 case 'o':
500                         do_output = TRUE;
501                         section_header = g_strdup(optarg);
502                         break;
503                 case 'p':
504                         strncpy(pidftemplate, optarg, 256);
505                         break;
506                 case 'c': 
507                         serve->flags |=F_COPYONWRITE;
508                         break;
509                 case 'd': 
510                         dontfork = 1;
511                         break;
512                 case 'C':
513                         g_free(config_file_pos);
514                         config_file_pos=g_strdup(optarg);
515                         break;
516                 case 'l':
517                         g_free(serve->authname);
518                         serve->authname=g_strdup(optarg);
519                         break;
520                 case 'M':
521                         serve->max_connections = strtol(optarg, NULL, 0);
522                         break;
523                 default:
524                         usage();
525                         exit(EXIT_FAILURE);
526                         break;
527                 }
528         }
529         /* What's left: the port to export, the name of the to be exported
530          * file, and, optionally, the size of the file, in that order. */
531         if(nonspecial<2) {
532                 g_free(serve);
533                 serve=NULL;
534         } else {
535                 do_oldstyle = TRUE;
536         }
537         if(do_output) {
538                 if(!serve) {
539                         g_critical("Need a complete configuration on the command line to output a config file section!");
540                         exit(EXIT_FAILURE);
541                 }
542                 dump_section(serve, section_header);
543         }
544         return serve;
545 }
546
/**
 * Error codes for config file parsing
 **/
typedef enum {
        /** The configuration file is not found */
        CFILE_NOTFOUND,
        /** The (required) group "generic" is missing */
        CFILE_MISSING_GENERIC,
        /** A (required) key is missing */
        CFILE_KEY_MISSING,
        /** A value is syntactically invalid */
        CFILE_VALUE_INVALID,
        /** A value is not supported in this build */
        CFILE_VALUE_UNSUPPORTED,
        /** Programmer error */
        CFILE_PROGERR,
        /** A config file was specified that does not define any exports */
        CFILE_NO_EXPORTS,
        /** The reserved port was specified for an old-style export. */
        CFILE_INCORRECT_PORT,
} CFILE_ERRORS;
562
563 /**
564  * Remove a SERVER from memory. Used from the hash table
565  **/
566 void remove_server(gpointer s) {
567         SERVER *server;
568
569         server=(SERVER*)s;
570         g_free(server->exportname);
571         if(server->authname)
572                 g_free(server->authname);
573         if(server->listenaddr)
574                 g_free(server->listenaddr);
575         if(server->prerun)
576                 g_free(server->prerun);
577         if(server->postrun)
578                 g_free(server->postrun);
579         if(server->transactionlog)
580                 g_free(server->transactionlog);
581         g_free(server);
582 }
583
584 /**
585  * duplicate server
586  * @param s the old server we want to duplicate
587  * @return new duplicated server
588  **/
589 SERVER* dup_serve(SERVER *s) {
590         SERVER *serve = NULL;
591
592         serve=g_new0(SERVER, 1);
593         if(serve == NULL)
594                 return NULL;
595
596         if(s->exportname)
597                 serve->exportname = g_strdup(s->exportname);
598
599         serve->expected_size = s->expected_size;
600
601         if(s->listenaddr)
602                 serve->listenaddr = g_strdup(s->listenaddr);
603
604         serve->port = s->port;
605
606         if(s->authname)
607                 serve->authname = strdup(s->authname);
608
609         serve->flags = s->flags;
610         serve->socket = s->socket;
611         serve->socket_family = s->socket_family;
612         serve->virtstyle = s->virtstyle;
613         serve->cidrlen = s->cidrlen;
614
615         if(s->prerun)
616                 serve->prerun = g_strdup(s->prerun);
617
618         if(s->postrun)
619                 serve->postrun = g_strdup(s->postrun);
620
621         if(s->transactionlog)
622                 serve->transactionlog = g_strdup(s->transactionlog);
623         
624         if(s->servename)
625                 serve->servename = g_strdup(s->servename);
626
627         serve->max_connections = s->max_connections;
628
629         return serve;
630 }
631
632 /**
633  * append new server to array
634  * @param s server
635  * @param a server array
636  * @return 0 success, -1 error
637  */
638 int append_serve(SERVER *s, GArray *a) {
639         SERVER *ns = NULL;
640         struct addrinfo hints;
641         struct addrinfo *ai = NULL;
642         struct addrinfo *rp = NULL;
643         char   host[NI_MAXHOST];
644         gchar  *port = NULL;
645         int e;
646         int ret;
647
648         if(!s) {
649                 err("Invalid parsing server");
650                 return -1;
651         }
652
653         port = g_strdup_printf("%d", s->port);
654
655         memset(&hints,'\0',sizeof(hints));
656         hints.ai_family = AF_UNSPEC;
657         hints.ai_socktype = SOCK_STREAM;
658         hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE;
659         hints.ai_protocol = IPPROTO_TCP;
660
661         e = getaddrinfo(s->listenaddr, port, &hints, &ai);
662
663         if (port)
664                 g_free(port);
665
666         if(e == 0) {
667                 for (rp = ai; rp != NULL; rp = rp->ai_next) {
668                         e = getnameinfo(rp->ai_addr, rp->ai_addrlen, host, sizeof(host), NULL, 0, NI_NUMERICHOST);
669
670                         if (e != 0) { // error
671                                 fprintf(stderr, "getnameinfo: %s\n", gai_strerror(e));
672                                 continue;
673                         }
674
675                         // duplicate server and set listenaddr to resolved IP address
676                         ns = dup_serve (s);
677                         if (ns) {
678                                 ns->listenaddr = g_strdup(host);
679                                 ns->socket_family = rp->ai_family;
680                                 g_array_append_val(a, *ns);
681                                 free(ns);
682                                 ns = NULL;
683                         }
684                 }
685
686                 ret = 0;
687         } else {
688                 fprintf(stderr, "getaddrinfo failed on listen host/address: %s (%s)\n", s->listenaddr ? s->listenaddr : "any", gai_strerror(e));
689                 ret = -1;
690         }
691
692         if (ai)
693                 freeaddrinfo(ai);
694
695         return ret;
696 }
697
698 /**
699  * Parse the config file.
700  *
701  * @param f the name of the config file
702  * @param e a GError. @see CFILE_ERRORS for what error values this function can
703  *      return.
704  * @return a GArray of SERVER structures. If the config file is empty or does
705  *      not exist, returns an empty GArray; if the config file contains an
706  *      error, returns NULL, and e is set appropriately
707  **/
708 GArray* parse_cfile(gchar* f, GError** e) {
709         const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
710         const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
711         SERVER s;
712         gchar *virtstyle=NULL;
713         PARAM lp[] = {
714                 { "exportname", TRUE,   PARAM_STRING,   &(s.exportname),        0 },
715                 { "port",       TRUE,   PARAM_INT,      &(s.port),              0 },
716                 { "authfile",   FALSE,  PARAM_STRING,   &(s.authname),          0 },
717                 { "filesize",   FALSE,  PARAM_INT,      &(s.expected_size),     0 },
718                 { "virtstyle",  FALSE,  PARAM_STRING,   &(virtstyle),           0 },
719                 { "prerun",     FALSE,  PARAM_STRING,   &(s.prerun),            0 },
720                 { "postrun",    FALSE,  PARAM_STRING,   &(s.postrun),           0 },
721                 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog),   0 },
722                 { "readonly",   FALSE,  PARAM_BOOL,     &(s.flags),             F_READONLY },
723                 { "multifile",  FALSE,  PARAM_BOOL,     &(s.flags),             F_MULTIFILE },
724                 { "copyonwrite", FALSE, PARAM_BOOL,     &(s.flags),             F_COPYONWRITE },
725                 { "sparse_cow", FALSE,  PARAM_BOOL,     &(s.flags),             F_SPARSE },
726                 { "sdp",        FALSE,  PARAM_BOOL,     &(s.flags),             F_SDP },
727                 { "sync",       FALSE,  PARAM_BOOL,     &(s.flags),             F_SYNC },
728                 { "flush",      FALSE,  PARAM_BOOL,     &(s.flags),             F_FLUSH },
729                 { "fua",        FALSE,  PARAM_BOOL,     &(s.flags),             F_FUA },
730                 { "rotational", FALSE,  PARAM_BOOL,     &(s.flags),             F_ROTATIONAL },
731                 { "listenaddr", FALSE,  PARAM_STRING,   &(s.listenaddr),        0 },
732                 { "maxconnections", FALSE, PARAM_INT,   &(s.max_connections),   0 },
733         };
734         const int lp_size=sizeof(lp)/sizeof(PARAM);
735         PARAM gp[] = {
736                 { "user",       FALSE, PARAM_STRING,    &runuser,       0 },
737                 { "group",      FALSE, PARAM_STRING,    &rungroup,      0 },
738                 { "oldstyle",   FALSE, PARAM_BOOL,      &do_oldstyle,   1 },
739                 { "listenaddr", FALSE, PARAM_STRING,    &modern_listen, 0 },
740         };
741         PARAM* p=gp;
742         int p_size=sizeof(gp)/sizeof(PARAM);
743         GKeyFile *cfile;
744         GError *err = NULL;
745         const char *err_msg=NULL;
746         GQuark errdomain;
747         GArray *retval=NULL;
748         gchar **groups;
749         gboolean value;
750         gchar* startgroup;
751         gint i;
752         gint j;
753
754         errdomain = g_quark_from_string("parse_cfile");
755         cfile = g_key_file_new();
756         retval = g_array_new(FALSE, TRUE, sizeof(SERVER));
757         if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
758                         G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
759                 g_set_error(e, errdomain, CFILE_NOTFOUND, "Could not open config file %s.", f);
760                 g_key_file_free(cfile);
761                 return retval;
762         }
763         startgroup = g_key_file_get_start_group(cfile);
764         if(!startgroup || strcmp(startgroup, "generic")) {
765                 g_set_error(e, errdomain, CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
766                 g_key_file_free(cfile);
767                 return NULL;
768         }
769         groups = g_key_file_get_groups(cfile, NULL);
770         for(i=0;groups[i];i++) {
771                 memset(&s, '\0', sizeof(SERVER));
772
773                 /* After the [generic] group, start parsing exports */
774                 if(i==1) {
775                         p=lp;
776                         p_size=lp_size;
777                 } 
778                 for(j=0;j<p_size;j++) {
779                         g_assert(p[j].target != NULL);
780                         g_assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL);
781                         switch(p[j].ptype) {
782                                 case PARAM_INT:
783                                         *((gint*)p[j].target) =
784                                                 g_key_file_get_integer(cfile,
785                                                                 groups[i],
786                                                                 p[j].paramname,
787                                                                 &err);
788                                         break;
789                                 case PARAM_STRING:
790                                         *((gchar**)p[j].target) =
791                                                 g_key_file_get_string(cfile,
792                                                                 groups[i],
793                                                                 p[j].paramname,
794                                                                 &err);
795                                         break;
796                                 case PARAM_BOOL:
797                                         value = g_key_file_get_boolean(cfile,
798                                                         groups[i],
799                                                         p[j].paramname, &err);
800                                         if(!err) {
801                                                 if(value) {
802                                                         *((gint*)p[j].target) |= p[j].flagval;
803                                                 } else {
804                                                         *((gint*)p[j].target) &= ~(p[j].flagval);
805                                                 }
806                                         }
807                                         break;
808                         }
809                         if(!strcmp(p[j].paramname, "port") && !strcmp(p[j].target, NBD_DEFAULT_PORT)) {
810                                 g_set_error(e, errdomain, CFILE_INCORRECT_PORT, "Config file specifies default port for oldstyle export");
811                                 g_key_file_free(cfile);
812                                 return NULL;
813                         }
814                         if(err) {
815                                 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
816                                         if(!p[j].required) {
817                                                 /* Ignore not-found error for optional values */
818                                                 g_clear_error(&err);
819                                                 continue;
820                                         } else {
821                                                 err_msg = MISSING_REQUIRED_ERROR;
822                                         }
823                                 } else {
824                                         err_msg = DEFAULT_ERROR;
825                                 }
826                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
827                                 g_array_free(retval, TRUE);
828                                 g_error_free(err);
829                                 g_key_file_free(cfile);
830                                 return NULL;
831                         }
832                 }
833                 if(virtstyle) {
834                         if(!strncmp(virtstyle, "none", 4)) {
835                                 s.virtstyle=VIRT_NONE;
836                         } else if(!strncmp(virtstyle, "ipliteral", 9)) {
837                                 s.virtstyle=VIRT_IPLIT;
838                         } else if(!strncmp(virtstyle, "iphash", 6)) {
839                                 s.virtstyle=VIRT_IPHASH;
840                         } else if(!strncmp(virtstyle, "cidrhash", 8)) {
841                                 s.virtstyle=VIRT_CIDR;
842                                 if(strlen(virtstyle)<10) {
843                                         g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
844                                         g_array_free(retval, TRUE);
845                                         g_key_file_free(cfile);
846                                         return NULL;
847                                 }
848                                 s.cidrlen=strtol(virtstyle+8, NULL, 0);
849                         } else {
850                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
851                                 g_array_free(retval, TRUE);
852                                 g_key_file_free(cfile);
853                                 return NULL;
854                         }
855                         if(s.port && !do_oldstyle) {
856                                 g_warning("A port was specified, but oldstyle exports were not requested. This may not do what you expect.");
857                                 g_warning("Please read 'man 5 nbd-server' and search for oldstyle for more info");
858                         }
859                 } else {
860                         s.virtstyle=VIRT_IPLIT;
861                 }
862                 /* Don't need to free this, it's not our string */
863                 virtstyle=NULL;
864                 /* Don't append values for the [generic] group */
865                 if(i>0) {
866                         s.socket_family = AF_UNSPEC;
867                         s.servename = groups[i];
868
869                         append_serve(&s, retval);
870                 } else {
871                         if(!do_oldstyle) {
872                                 lp[1].required = 0;
873                         }
874                 }
875 #ifndef WITH_SDP
876                 if(s.flags & F_SDP) {
877                         g_set_error(e, errdomain, CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
878                         g_array_free(retval, TRUE);
879                         g_key_file_free(cfile);
880                         return NULL;
881                 }
882 #endif
883         }
884         if(i==1) {
885                 g_set_error(e, errdomain, CFILE_NO_EXPORTS, "The config file does not specify any exports");
886         }
887         g_key_file_free(cfile);
888         return retval;
889 }
890
891 /**
892  * Signal handler for SIGCHLD
893  * @param s the signal we're handling (must be SIGCHLD, or something
894  * is severely wrong)
895  **/
896 void sigchld_handler(int s) {
897         int status;
898         int* i;
899         pid_t pid;
900
901         while((pid=waitpid(-1, &status, WNOHANG)) > 0) {
902                 if(WIFEXITED(status)) {
903                         msg3(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
904                 }
905                 i=g_hash_table_lookup(children, &pid);
906                 if(!i) {
907                         msg3(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
908                 } else {
909                         DEBUG("Removing %d from the list of children", pid);
910                         g_hash_table_remove(children, &pid);
911                 }
912         }
913 }
914
915 /**
916  * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
917  *
918  * @param key the key
919  * @param value the value corresponding to the above key
920  * @param user_data a pointer which we always set to 1, so that we know what
921  * will happen next.
922  **/
923 void killchild(gpointer key, gpointer value, gpointer user_data) {
924         pid_t *pid=value;
925         int *parent=user_data;
926
927         kill(*pid, SIGTERM);
928         *parent=1;
929 }
930
931 /**
932  * Handle SIGTERM and dispatch it to our children
933  * @param s the signal we're handling (must be SIGTERM, or something
934  * is severely wrong).
935  **/
936 void sigterm_handler(int s) {
937         int parent=0;
938
939         g_hash_table_foreach(children, killchild, &parent);
940
941         if(parent) {
942                 unlink(pidfname);
943         }
944
945         exit(EXIT_SUCCESS);
946 }
947
948 /**
949  * Detect the size of a file.
950  *
951  * @param fhandle An open filedescriptor
952  * @return the size of the file, or OFFT_MAX if detection was
953  * impossible.
954  **/
955 off_t size_autodetect(int fhandle) {
956         off_t es;
957         u64 bytes;
958         struct stat stat_buf;
959         int error;
960
961 #ifdef HAVE_SYS_MOUNT_H
962 #ifdef HAVE_SYS_IOCTL_H
963 #ifdef BLKGETSIZE64
964         DEBUG("looking for export size with ioctl BLKGETSIZE64\n");
965         if (!ioctl(fhandle, BLKGETSIZE64, &bytes) && bytes) {
966                 return (off_t)bytes;
967         }
968 #endif /* BLKGETSIZE64 */
969 #endif /* HAVE_SYS_IOCTL_H */
970 #endif /* HAVE_SYS_MOUNT_H */
971
972         DEBUG("looking for fhandle size with fstat\n");
973         stat_buf.st_size = 0;
974         error = fstat(fhandle, &stat_buf);
975         if (!error) {
976                 if(stat_buf.st_size > 0)
977                         return (off_t)stat_buf.st_size;
978         } else {
979                 err("fstat failed: %m");
980         }
981
982         DEBUG("looking for fhandle size with lseek SEEK_END\n");
983         es = lseek(fhandle, (off_t)0, SEEK_END);
984         if (es > ((off_t)0)) {
985                 return es;
986         } else {
987                 DEBUG("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4)));
988         }
989
990         err("Could not find size of exported block device: %m");
991         return OFFT_MAX;
992 }
993
994 /**
995  * Get the file handle and offset, given an export offset.
996  *
997  * @param export An array of export files
998  * @param a The offset to get corresponding file/offset for
999  * @param fhandle [out] File descriptor
1000  * @param foffset [out] Offset into fhandle
1001  * @param maxbytes [out] Tells how many bytes can be read/written
1002  * from fhandle starting at foffset (0 if there is no limit)
1003  * @return 0 on success, -1 on failure
1004  **/
1005 int get_filepos(GArray* export, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1006         /* Negative offset not allowed */
1007         if(a < 0)
1008                 return -1;
1009
1010         /* Binary search for last file with starting offset <= a */
1011         FILE_INFO fi;
1012         int start = 0;
1013         int end = export->len - 1;
1014         while( start <= end ) {
1015                 int mid = (start + end) / 2;
1016                 fi = g_array_index(export, FILE_INFO, mid);
1017                 if( fi.startoff < a ) {
1018                         start = mid + 1;
1019                 } else if( fi.startoff > a ) {
1020                         end = mid - 1;
1021                 } else {
1022                         start = end = mid;
1023                         break;
1024                 }
1025         }
1026
1027         /* end should never go negative, since first startoff is 0 and a >= 0 */
1028         g_assert(end >= 0);
1029
1030         fi = g_array_index(export, FILE_INFO, end);
1031         *fhandle = fi.fhandle;
1032         *foffset = a - fi.startoff;
1033         *maxbytes = 0;
1034         if( end+1 < export->len ) {
1035                 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1036                 *maxbytes = fi_next.startoff - a;
1037         }
1038
1039         return 0;
1040 }
1041
/**
 * Move a file descriptor to an absolute position, reporting a failure
 * through err().
 *
 * @param handle a filedescriptor
 * @param a position to seek to
 * @todo get rid of this function. (The earlier version of this note
 * referred to a global variable `lastpoint`; this function does not use
 * it.)
 **/
void myseek(int handle,off_t a) {
	const off_t reached = lseek(handle, a, SEEK_SET);

	if (reached < 0)
		err("Can not seek locally!\n");
}
1055
1056 /**
1057  * Write an amount of bytes at a given offset to the right file. This
1058  * abstracts the write-side of the multiple file option.
1059  *
1060  * @param a The offset where the write should start
1061  * @param buf The buffer to write from
1062  * @param len The length of buf
1063  * @param client The client we're serving for
1064  * @return The number of bytes actually written, or -1 in case of an error
1065  **/
1066 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1067         int fhandle;
1068         off_t foffset;
1069         size_t maxbytes;
1070         ssize_t retval;
1071
1072         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1073                 return -1;
1074         if(maxbytes && len > maxbytes)
1075                 len = maxbytes;
1076
1077         DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
1078
1079         myseek(fhandle, foffset);
1080         retval = write(fhandle, buf, len);
1081         if(client->server->flags & F_SYNC) {
1082                 fsync(fhandle);
1083         } else if (fua) {
1084
1085           /* This is where we would do the following
1086            *   #ifdef USE_SYNC_FILE_RANGE
1087            * However, we don't, for the reasons set out below
1088            * by Christoph Hellwig <hch@infradead.org>
1089            *
1090            * [BEGINS] 
1091            * fdatasync is equivalent to fsync except that it does not flush
1092            * non-essential metadata (basically just timestamps in practice), but it
1093            * does flush metadata requried to find the data again, e.g. allocation
1094            * information and extent maps.  sync_file_range does nothing but flush
1095            * out pagecache content - it means you basically won't get your data
1096            * back in case of a crash if you either:
1097            * 
1098            *  a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1099            *  b) are using a sparse file on a filesystem
1100            *  c) are using a fallocate-preallocated file on a filesystem
1101            *  d) use any file on a COW filesystem like btrfs
1102            * 
1103            * e.g. it only does anything useful for you if you do not have a volatile
1104            * write cache, and either use a raw block device node, or just overwrite
1105            * an already fully allocated (and not preallocated) file on a non-COW
1106            * filesystem.
1107            * [ENDS]
1108            *
1109            * What we should do is open a second FD with O_DSYNC set, then write to
1110            * that when appropriate. However, with a Linux client, every REQ_FUA
1111            * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1112            * problems.
1113            *
1114            */
1115 #if 0
1116                 sync_file_range(fhandle, foffset, len,
1117                                 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1118                                 SYNC_FILE_RANGE_WAIT_AFTER);
1119 #else
1120                 fdatasync(fhandle);
1121 #endif
1122         }
1123         return retval;
1124 }
1125
1126 /**
1127  * Call rawexpwrite repeatedly until all data has been written.
1128  * @return 0 on success, nonzero on failure
1129  **/
1130 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1131         ssize_t ret=0;
1132
1133         while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
1134                 a += ret;
1135                 buf += ret;
1136                 len -= ret;
1137         }
1138         return (ret < 0 || len != 0);
1139 }
1140
1141 /**
1142  * Read an amount of bytes at a given offset from the right file. This
1143  * abstracts the read-side of the multiple files option.
1144  *
1145  * @param a The offset where the read should start
1146  * @param buf A buffer to read into
1147  * @param len The size of buf
1148  * @param client The client we're serving for
1149  * @return The number of bytes actually read, or -1 in case of an
1150  * error.
1151  **/
1152 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
1153         int fhandle;
1154         off_t foffset;
1155         size_t maxbytes;
1156
1157         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1158                 return -1;
1159         if(maxbytes && len > maxbytes)
1160                 len = maxbytes;
1161
1162         DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
1163
1164         myseek(fhandle, foffset);
1165         return read(fhandle, buf, len);
1166 }
1167
1168 /**
1169  * Call rawexpread repeatedly until all data has been read.
1170  * @return 0 on success, nonzero on failure
1171  **/
1172 int rawexpread_fully(off_t a, char *buf, size_t len, CLIENT *client) {
1173         ssize_t ret=0;
1174
1175         while(len > 0 && (ret=rawexpread(a, buf, len, client)) > 0 ) {
1176                 a += ret;
1177                 buf += ret;
1178                 len -= ret;
1179         }
1180         return (ret < 0 || len != 0);
1181 }
1182
1183 /**
1184  * Read an amount of bytes at a given offset from the right file. This
1185  * abstracts the read-side of the copyonwrite stuff, and calls
1186  * rawexpread() with the right parameters to do the actual work.
1187  * @param a The offset where the read should start
1188  * @param buf A buffer to read into
1189  * @param len The size of buf
1190  * @param client The client we're going to read for
1191  * @return 0 on success, nonzero on failure
1192  **/
1193 int expread(off_t a, char *buf, size_t len, CLIENT *client) {
1194         off_t rdlen, offset;
1195         off_t mapcnt, mapl, maph, pagestart;
1196
1197         if (!(client->server->flags & F_COPYONWRITE))
1198                 return(rawexpread_fully(a, buf, len, client));
1199         DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1200
1201         mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1202
1203         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1204                 pagestart=mapcnt*DIFFPAGESIZE;
1205                 offset=a-pagestart;
1206                 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1207                         len : (size_t)DIFFPAGESIZE-offset;
1208                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1209                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1210                                (unsigned long)(client->difmap[mapcnt]));
1211                         myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1212                         if (read(client->difffile, buf, rdlen) != rdlen) return -1;
1213                 } else { /* the block is not there */
1214                         DEBUG("Page %llu is not here, we read the original one\n",
1215                                (unsigned long long)mapcnt);
1216                         if(rawexpread_fully(a, buf, rdlen, client)) return -1;
1217                 }
1218                 len-=rdlen; a+=rdlen; buf+=rdlen;
1219         }
1220         return 0;
1221 }
1222
1223 /**
1224  * Write an amount of bytes at a given offset to the right file. This
1225  * abstracts the write-side of the copyonwrite option, and calls
1226  * rawexpwrite() with the right parameters to do the actual work.
1227  *
1228  * @param a The offset where the write should start
1229  * @param buf The buffer to write from
1230  * @param len The length of buf
1231  * @param client The client we're going to write for.
1232  * @return 0 on success, nonzero on failure
1233  **/
1234 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1235         char pagebuf[DIFFPAGESIZE];
1236         off_t mapcnt,mapl,maph;
1237         off_t wrlen,rdlen; 
1238         off_t pagestart;
1239         off_t offset;
1240
1241         if (!(client->server->flags & F_COPYONWRITE))
1242                 return(rawexpwrite_fully(a, buf, len, client, fua)); 
1243         DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1244
1245         mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1246
1247         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1248                 pagestart=mapcnt*DIFFPAGESIZE ;
1249                 offset=a-pagestart ;
1250                 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1251                         len : (size_t)DIFFPAGESIZE-offset;
1252
1253                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1254                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1255                                (unsigned long)(client->difmap[mapcnt])) ;
1256                         myseek(client->difffile,
1257                                         client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1258                         if (write(client->difffile, buf, wrlen) != wrlen) return -1 ;
1259                 } else { /* the block is not there */
1260                         myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ;
1261                         client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1262                         DEBUG("Page %llu is not here, we put it at %lu\n",
1263                                (unsigned long long)mapcnt,
1264                                (unsigned long)(client->difmap[mapcnt]));
1265                         rdlen=DIFFPAGESIZE ;
1266                         if (rawexpread_fully(pagestart, pagebuf, rdlen, client))
1267                                 return -1;
1268                         memcpy(pagebuf+offset,buf,wrlen) ;
1269                         if (write(client->difffile, pagebuf, DIFFPAGESIZE) !=
1270                                         DIFFPAGESIZE)
1271                                 return -1;
1272                 }                                                   
1273                 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
1274         }
1275         if (client->server->flags & F_SYNC) {
1276                 fsync(client->difffile);
1277         } else if (fua) {
1278                 /* open question: would it be cheaper to do multiple sync_file_ranges?
1279                    as we iterate through the above?
1280                  */
1281                 fdatasync(client->difffile);
1282         }
1283         return 0;
1284 }
1285
1286 int expflush(CLIENT *client) {
1287         gint i;
1288
1289         if (client->server->flags & F_COPYONWRITE) {
1290                 return fsync(client->difffile);
1291         }
1292         
1293         for (i = 0; i < client->export->len; i++) {
1294                 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1295                 if (fsync(fi.fhandle) < 0)
1296                         return -1;
1297         }
1298         
1299         return 0;
1300 }
1301
1302 /**
1303  * Do the initial negotiation.
1304  *
1305  * @param client The client we're negotiating with.
1306  **/
1307 CLIENT* negotiate(int net, CLIENT *client, GArray* servers) {
1308         char zeros[128];
1309         uint64_t size_host;
1310         uint32_t flags = NBD_FLAG_HAS_FLAGS;
1311         uint16_t smallflags = 0;
1312         uint64_t magic;
1313
1314         memset(zeros, '\0', sizeof(zeros));
1315         if(!client || !client->modern) {
1316                 /* common */
1317                 if (write(net, INIT_PASSWD, 8) < 0) {
1318                         err_nonfatal("Negotiation failed: %m");
1319                         if(client)
1320                                 exit(EXIT_FAILURE);
1321                 }
1322                 if(!client || client->modern) {
1323                         /* modern */
1324                         magic = htonll(opts_magic);
1325                 } else {
1326                         /* oldstyle */
1327                         magic = htonll(cliserv_magic);
1328                 }
1329                 if (write(net, &magic, sizeof(magic)) < 0) {
1330                         err_nonfatal("Negotiation failed: %m");
1331                         if(client)
1332                                 exit(EXIT_FAILURE);
1333                 }
1334         }
1335         if(!client) {
1336                 /* modern */
1337                 uint32_t reserved;
1338                 uint32_t opt;
1339                 uint32_t namelen;
1340                 char* name;
1341                 int i;
1342
1343                 if(!servers)
1344                         err("programmer error");
1345                 if (write(net, &smallflags, sizeof(uint16_t)) < 0)
1346                         err("Negotiation failed: %m");
1347                 if (read(net, &reserved, sizeof(reserved)) < 0)
1348                         err("Negotiation failed: %m");
1349                 if (read(net, &magic, sizeof(magic)) < 0)
1350                         err("Negotiation failed: %m");
1351                 magic = ntohll(magic);
1352                 if(magic != opts_magic) {
1353                         close(net);
1354                         return NULL;
1355                 }
1356                 if (read(net, &opt, sizeof(opt)) < 0)
1357                         err("Negotiation failed: %m");
1358                 opt = ntohl(opt);
1359                 if(opt != NBD_OPT_EXPORT_NAME) {
1360                         close(net);
1361                         return NULL;
1362                 }
1363                 if (read(net, &namelen, sizeof(namelen)) < 0)
1364                         err("Negotiation failed: %m");
1365                 namelen = ntohl(namelen);
1366                 name = malloc(namelen+1);
1367                 name[namelen]=0;
1368                 if (read(net, name, namelen) < 0)
1369                         err("Negotiation failed: %m");
1370                 for(i=0; i<servers->len; i++) {
1371                         SERVER* serve = &(g_array_index(servers, SERVER, i));
1372                         if(!strcmp(serve->servename, name)) {
1373                                 CLIENT* client = g_new0(CLIENT, 1);
1374                                 client->server = serve;
1375                                 client->exportsize = OFFT_MAX;
1376                                 client->net = net;
1377                                 client->modern = TRUE;
1378                                 client->transactionlogfd = -1;
1379                                 free(name);
1380                                 return client;
1381                         }
1382                 }
1383                 free(name);
1384                 return NULL;
1385         }
1386         /* common */
1387         size_host = htonll((u64)(client->exportsize));
1388         if (write(net, &size_host, 8) < 0)
1389                 err("Negotiation failed: %m");
1390         if (client->server->flags & F_READONLY)
1391                 flags |= NBD_FLAG_READ_ONLY;
1392         if (client->server->flags & F_FLUSH)
1393                 flags |= NBD_FLAG_SEND_FLUSH;
1394         if (client->server->flags & F_FUA)
1395                 flags |= NBD_FLAG_SEND_FUA;
1396         if (client->server->flags & F_ROTATIONAL)
1397                 flags |= NBD_FLAG_ROTATIONAL;
1398         if (!client->modern) {
1399                 /* oldstyle */
1400                 flags = htonl(flags);
1401                 if (write(client->net, &flags, 4) < 0)
1402                         err("Negotiation failed: %m");
1403         } else {
1404                 /* modern */
1405                 smallflags = (uint16_t)(flags & ~((uint16_t)0));
1406                 smallflags = htons(smallflags);
1407                 if (write(client->net, &smallflags, sizeof(smallflags)) < 0) {
1408                         err("Negotiation failed: %m");
1409                 }
1410         }
1411         /* common */
1412         if (write(client->net, zeros, 124) < 0)
1413                 err("Negotiation failed: %m");
1414         return NULL;
1415 }
1416
/**
 * sending macro: write `reply` to `net`, and also to the transaction log
 * if one is open. A variable named `client` must be in scope where the
 * macro is used.
 */
#define SEND(net,reply) { \
	writeit((net), &(reply), sizeof(reply)); \
	if (client->transactionlogfd != -1) \
		writeit(client->transactionlogfd, &(reply), sizeof(reply)); }
/**
 * error macro: send `reply` with its error field set to `errcode` (in
 * network byte order), then reset the field to 0 for the next use.
 */
#define ERROR(client,reply,errcode) { \
	(reply).error = htonl(errcode); \
	SEND((client)->net,reply); \
	(reply).error = 0; }
1423 /**
1424  * Serve a file to a single client.
1425  *
1426  * @todo This beast needs to be split up in many tiny little manageable
1427  * pieces. Preferably with a chainsaw.
1428  *
1429  * @param client The client we're going to serve to.
1430  * @return when the client disconnects
1431  **/
1432 int mainloop(CLIENT *client) {
1433         struct nbd_request request;
1434         struct nbd_reply reply;
1435         gboolean go_on=TRUE;
1436 #ifdef DODBG
1437         int i = 0;
1438 #endif
1439         negotiate(client->net, client, NULL);
1440         DEBUG("Entering request loop!\n");
1441         reply.magic = htonl(NBD_REPLY_MAGIC);
1442         reply.error = 0;
1443         while (go_on) {
1444                 char buf[BUFSIZE];
1445                 char* p;
1446                 size_t len;
1447                 size_t currlen;
1448                 size_t writelen;
1449                 uint16_t command;
1450 #ifdef DODBG
1451                 i++;
1452                 printf("%d: ", i);
1453 #endif
1454                 readit(client->net, &request, sizeof(request));
1455                 if (client->transactionlogfd != -1)
1456                         writeit(client->transactionlogfd, &request, sizeof(request));
1457
1458                 request.from = ntohll(request.from);
1459                 request.type = ntohl(request.type);
1460                 command = request.type & NBD_CMD_MASK_COMMAND;
1461
1462                 if (command==NBD_CMD_DISC) {
1463                         msg2(LOG_INFO, "Disconnect request received.");
1464                         if (client->server->flags & F_COPYONWRITE) { 
1465                                 if (client->difmap) g_free(client->difmap) ;
1466                                 close(client->difffile);
1467                                 unlink(client->difffilename);
1468                                 free(client->difffilename);
1469                         }
1470                         go_on=FALSE;
1471                         continue;
1472                 }
1473
1474                 len = ntohl(request.len);
1475
1476                 if (request.magic != htonl(NBD_REQUEST_MAGIC))
1477                         err("Not enough magic.");
1478                 if (len > BUFSIZE - sizeof(struct nbd_reply)) {
1479                         currlen = BUFSIZE - sizeof(struct nbd_reply);
1480                         msg2(LOG_INFO, "oversized request (this is not a problem)");
1481                 } else {
1482                         currlen = len;
1483                 }
1484                 DEBUG("%s from %llu (%llu) len %d, ", command ? "WRITE" :
1485                                 "READ", (unsigned long long)request.from,
1486                                 (unsigned long long)request.from / 512, (unsigned int)len);
1487                 memcpy(reply.handle, request.handle, sizeof(reply.handle));
1488
1489                 if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) {
1490                         if ((request.from + len) > (OFFT_MAX)) {
1491                                 DEBUG("[Number too large!]");
1492                                 ERROR(client, reply, EINVAL);
1493                                 continue;
1494                         }
1495
1496                         if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
1497                                 DEBUG("[RANGE!]");
1498                                 ERROR(client, reply, EINVAL);
1499                                 continue;
1500                         }
1501                 }
1502
1503                 if (command==NBD_CMD_WRITE) {
1504                         DEBUG("wr: net->buf, ");
1505                         while(len > 0) {
1506                                 readit(client->net, buf, currlen);
1507                                 DEBUG("buf->exp, ");
1508                                 if ((client->server->flags & F_READONLY) ||
1509                                     (client->server->flags & F_AUTOREADONLY)) {
1510                                         DEBUG("[WRITE to READONLY!]");
1511                                         ERROR(client, reply, EPERM);
1512                                         continue;
1513                                 }
1514                                 if (expwrite(request.from, buf, len, client,
1515                                              request.type & NBD_CMD_FLAG_FUA)) {
1516                                         DEBUG("Write failed: %m" );
1517                                         ERROR(client, reply, errno);
1518                                         continue;
1519                                 }
1520                                 SEND(client->net, reply);
1521                                 DEBUG("OK!\n");
1522                                 len -= currlen;
1523                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1524                         }
1525                         continue;
1526                 }
1527
1528                 if (command==NBD_CMD_FLUSH) {
1529                         DEBUG("fl: ");
1530                         if (expflush(client)) {
1531                                 DEBUG("Flush failed: %m");
1532                                 ERROR(client, reply, errno);
1533                                 continue;
1534                         }
1535                         SEND(client->net, reply);
1536                         DEBUG("OK!\n");
1537                         continue;
1538                 }
1539
1540                 if (command==NBD_CMD_READ) {
1541                         DEBUG("exp->buf, ");
1542                         memcpy(buf, &reply, sizeof(struct nbd_reply));
1543                         if (client->transactionlogfd != -1)
1544                                 writeit(client->transactionlogfd, &reply, sizeof(reply));
1545                         p = buf + sizeof(struct nbd_reply);
1546                         writelen = currlen + sizeof(struct nbd_reply);
1547                         while(len > 0) {
1548                                 if (expread(request.from, p, currlen, client)) {
1549                                         DEBUG("Read failed: %m");
1550                                         ERROR(client, reply, errno);
1551                                         continue;
1552                                 }
1553                                 
1554                                 DEBUG("buf->net, ");
1555                                 writeit(client->net, buf, writelen);
1556                                 len -= currlen;
1557                                 request.from += currlen;
1558                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1559                                 p = buf;
1560                                 writelen = currlen;
1561                         }
1562                         DEBUG("OK!\n");
1563                         continue;
1564                 }
1565
1566                 DEBUG ("Ignoring unknown command\n");
1567         }
1568         return 0;
1569 }
1570
1571 /**
1572  * Set up client export array, which is an array of FILE_INFO.
1573  * Also, split a single exportfile into multiple ones, if that was asked.
1574  * @param client information on the client which we want to setup export for
1575  **/
1576 void setupexport(CLIENT* client) {
1577         int i;
1578         off_t laststartoff = 0, lastsize = 0;
1579         int multifile = (client->server->flags & F_MULTIFILE);
1580
1581         client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
1582
1583         /* If multi-file, open as many files as we can.
1584          * If not, open exactly one file.
1585          * Calculate file sizes as we go to get total size. */
1586         for(i=0; ; i++) {
1587                 FILE_INFO fi;
1588                 gchar *tmpname;
1589                 gchar* error_string;
1590                 mode_t mode = (client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR;
1591
1592                 if(multifile) {
1593                         tmpname=g_strdup_printf("%s.%d", client->exportname, i);
1594                 } else {
1595                         tmpname=g_strdup(client->exportname);
1596                 }
1597                 DEBUG( "Opening %s\n", tmpname );
1598                 fi.fhandle = open(tmpname, mode);
1599                 if(fi.fhandle == -1 && mode == O_RDWR) {
1600                         /* Try again because maybe media was read-only */
1601                         fi.fhandle = open(tmpname, O_RDONLY);
1602                         if(fi.fhandle != -1) {
1603                                 /* Opening the base file in copyonwrite mode is
1604                                  * okay */
1605                                 if(!(client->server->flags & F_COPYONWRITE)) {
1606                                         client->server->flags |= F_AUTOREADONLY;
1607                                         client->server->flags |= F_READONLY;
1608                                 }
1609                         }
1610                 }
1611                 if(fi.fhandle == -1) {
1612                         if(multifile && i>0)
1613                                 break;
1614                         error_string=g_strdup_printf(
1615                                 "Could not open exported file %s: %%m",
1616                                 tmpname);
1617                         err(error_string);
1618                 }
1619                 fi.startoff = laststartoff + lastsize;
1620                 g_array_append_val(client->export, fi);
1621                 g_free(tmpname);
1622
1623                 /* Starting offset and size of this file will be used to
1624                  * calculate starting offset of next file */
1625                 laststartoff = fi.startoff;
1626                 lastsize = size_autodetect(fi.fhandle);
1627
1628                 if(!multifile)
1629                         break;
1630         }
1631
1632         /* Set export size to total calculated size */
1633         client->exportsize = laststartoff + lastsize;
1634
1635         /* Export size may be overridden */
1636         if(client->server->expected_size) {
1637                 /* desired size must be <= total calculated size */
1638                 if(client->server->expected_size > client->exportsize) {
1639                         err("Size of exported file is too big\n");
1640                 }
1641
1642                 client->exportsize = client->server->expected_size;
1643         }
1644
1645         msg3(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
1646         if(multifile) {
1647                 msg3(LOG_INFO, "Total number of files: %d", i);
1648         }
1649 }
1650
1651 int copyonwrite_prepare(CLIENT* client) {
1652         off_t i;
1653         if ((client->difffilename = malloc(1024))==NULL)
1654                 err("Failed to allocate string for diff file name");
1655         snprintf(client->difffilename, 1024, "%s-%s-%d.diff",client->exportname,client->clientname,
1656                 (int)getpid()) ;
1657         client->difffilename[1023]='\0';
1658         msg3(LOG_INFO,"About to create map and diff file %s",client->difffilename) ;
1659         client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
1660         if (client->difffile<0) err("Could not create diff file (%m)") ;
1661         if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL)
1662                 err("Could not allocate memory") ;
1663         for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1 ;
1664
1665         return 0;
1666 }
1667
1668 /**
1669  * Run a command. This is used for the ``prerun'' and ``postrun'' config file
1670  * options
1671  *
1672  * @param command the command to be ran. Read from the config file
1673  * @param file the file name we're about to export
1674  **/
1675 int do_run(gchar* command, gchar* file) {
1676         gchar* cmd;
1677         int retval=0;
1678
1679         if(command && *command) {
1680                 cmd = g_strdup_printf(command, file);
1681                 retval=system(cmd);
1682                 g_free(cmd);
1683         }
1684         return retval;
1685 }
1686
1687 /**
1688  * Serve a connection. 
1689  *
1690  * @todo allow for multithreading, perhaps use libevent. Not just yet, though;
1691  * follow the road map.
1692  *
1693  * @param client a connected client
1694  **/
1695 void serveconnection(CLIENT *client) {
1696         if (client->server->transactionlog && (client->transactionlogfd == -1))
1697         {
1698                 if (-1 == (client->transactionlogfd = open(client->server->transactionlog,
1699                                                            O_WRONLY | O_CREAT,
1700                                                            S_IRUSR | S_IWUSR)))
1701                         g_warning("Could not open transaction log %s",
1702                                   client->server->transactionlog);
1703         }
1704
1705         if(do_run(client->server->prerun, client->exportname)) {
1706                 exit(EXIT_FAILURE);
1707         }
1708         setupexport(client);
1709
1710         if (client->server->flags & F_COPYONWRITE) {
1711                 copyonwrite_prepare(client);
1712         }
1713
1714         setmysockopt(client->net);
1715
1716         mainloop(client);
1717         do_run(client->server->postrun, client->exportname);
1718
1719         if (-1 != client->transactionlogfd)
1720         {
1721                 close(client->transactionlogfd);
1722                 client->transactionlogfd = -1;
1723         }
1724 }
1725
1726 /**
1727  * Find the name of the file we have to serve. This will use g_strdup_printf
1728  * to put the IP address of the client inside a filename containing
1729  * "%s" (in the form as specified by the "virtstyle" option). That name
1730  * is then written to client->exportname.
1731  *
1732  * @param net A socket connected to an nbd client
1733  * @param client information about the client. The IP address in human-readable
1734  * format will be written to a new char* buffer, the address of which will be
1735  * stored in client->clientname.
1736  **/
1737 void set_peername(int net, CLIENT *client) {
1738         struct sockaddr_storage addrin;
1739         struct sockaddr_storage netaddr;
1740         struct sockaddr_in  *netaddr4 = NULL;
1741         struct sockaddr_in6 *netaddr6 = NULL;
1742         size_t addrinlen = sizeof( addrin );
1743         struct addrinfo hints;
1744         struct addrinfo *ai = NULL;
1745         char peername[NI_MAXHOST];
1746         char netname[NI_MAXHOST];
1747         char *tmp = NULL;
1748         int i;
1749         int e;
1750         int shift;
1751
1752         if (getpeername(net, (struct sockaddr *) &addrin, (socklen_t *)&addrinlen) < 0)
1753                 err("getsockname failed: %m");
1754
1755         getnameinfo((struct sockaddr *)&addrin, (socklen_t)addrinlen,
1756                 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST);
1757
1758         memset(&hints, '\0', sizeof (hints));
1759         hints.ai_flags = AI_ADDRCONFIG;
1760         e = getaddrinfo(peername, NULL, &hints, &ai);
1761
1762         if(e != 0) {
1763                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
1764                 freeaddrinfo(ai);
1765                 return;
1766         }
1767
1768         switch(client->server->virtstyle) {
1769                 case VIRT_NONE:
1770                         client->exportname=g_strdup(client->server->exportname);
1771                         break;
1772                 case VIRT_IPHASH:
1773                         for(i=0;i<strlen(peername);i++) {
1774                                 if(peername[i]=='.') {
1775                                         peername[i]='/';
1776                                 }
1777                         }
1778                 case VIRT_IPLIT:
1779                         client->exportname=g_strdup_printf(client->server->exportname, peername);
1780                         break;
1781                 case VIRT_CIDR:
1782                         memcpy(&netaddr, &addrin, addrinlen);
1783                         if(ai->ai_family == AF_INET) {
1784                                 netaddr4 = (struct sockaddr_in *)&netaddr;
1785                                 (netaddr4->sin_addr).s_addr>>=32-(client->server->cidrlen);
1786                                 (netaddr4->sin_addr).s_addr<<=32-(client->server->cidrlen);
1787
1788                                 getnameinfo((struct sockaddr *) netaddr4, (socklen_t) addrinlen,
1789                                                         netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1790                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1791                         }else if(ai->ai_family == AF_INET6) {
1792                                 netaddr6 = (struct sockaddr_in6 *)&netaddr;
1793
1794                                 shift = 128-(client->server->cidrlen);
1795                                 i = 3;
1796                                 while(shift >= 32) {
1797                                         ((netaddr6->sin6_addr).s6_addr32[i])=0;
1798                                         shift-=32;
1799                                         i--;
1800                                 }
1801                                 (netaddr6->sin6_addr).s6_addr32[i]>>=shift;
1802                                 (netaddr6->sin6_addr).s6_addr32[i]<<=shift;
1803
1804                                 getnameinfo((struct sockaddr *)netaddr6, (socklen_t)addrinlen,
1805                                             netname, sizeof(netname), NULL, 0, NI_NUMERICHOST);
1806                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1807                         }
1808
1809                         if(tmp != NULL)
1810                           client->exportname=g_strdup_printf(client->server->exportname, tmp);
1811
1812                         break;
1813         }
1814
1815         freeaddrinfo(ai);
1816         msg4(LOG_INFO, "connect from %s, assigned file is %s", 
1817              peername, client->exportname);
1818         client->clientname=g_strdup(peername);
1819 }
1820
1821 /**
1822  * Destroy a pid_t*
1823  * @param data a pointer to pid_t which should be freed
1824  **/
1825 void destroy_pid_t(gpointer data) {
1826         g_free(data);
1827 }
1828
1829 /**
1830  * Loop through the available servers, and serve them. Never returns.
1831  **/
1832 int serveloop(GArray* servers) {
1833         struct sockaddr_storage addrin;
1834         socklen_t addrinlen=sizeof(addrin);
1835         int i;
1836         int max;
1837         int sock;
1838         fd_set mset;
1839         fd_set rset;
1840
1841         /* 
1842          * Set up the master fd_set. The set of descriptors we need
1843          * to select() for never changes anyway and it buys us a *lot*
1844          * of time to only build this once. However, if we ever choose
1845          * to not fork() for clients anymore, we may have to revisit
1846          * this.
1847          */
1848         max=0;
1849         FD_ZERO(&mset);
1850         for(i=0;i<servers->len;i++) {
1851                 if((sock=(g_array_index(servers, SERVER, i)).socket)) {
1852                         FD_SET(sock, &mset);
1853                         max=sock>max?sock:max;
1854                 }
1855         }
1856         if(modernsock) {
1857                 FD_SET(modernsock, &mset);
1858                 max=modernsock>max?modernsock:max;
1859         }
1860         for(;;) {
1861                 CLIENT *client = NULL;
1862                 pid_t *pid;
1863
1864                 memcpy(&rset, &mset, sizeof(fd_set));
1865                 if(select(max+1, &rset, NULL, NULL, NULL)>0) {
1866                         int net = 0;
1867                         SERVER* serve=NULL;
1868
1869                         DEBUG("accept, ");
1870                         if(FD_ISSET(modernsock, &rset)) {
1871                                 if((net=accept(modernsock, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1872                                         err("accept: %m");
1873                                 client = negotiate(net, NULL, servers);
1874                                 if(!client) {
1875                                         err_nonfatal("negotiation failed");
1876                                         close(net);
1877                                         net=0;
1878                                         continue;
1879                                 }
1880                                 serve = client->server;
1881                         }
1882                         for(i=0;i<servers->len && !net;i++) {
1883                                 serve=&(g_array_index(servers, SERVER, i));
1884                                 if(FD_ISSET(serve->socket, &rset)) {
1885                                         if ((net=accept(serve->socket, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1886                                                 err("accept: %m");
1887                                 }
1888                         }
1889                         if(net) {
1890                                 int sock_flags;
1891
1892                                 if(serve->max_connections > 0 &&
1893                                    g_hash_table_size(children) >= serve->max_connections) {
1894                                         msg2(LOG_INFO, "Max connections reached");
1895                                         close(net);
1896                                         continue;
1897                                 }
1898                                 if((sock_flags = fcntl(net, F_GETFL, 0))==-1) {
1899                                         err("fcntl F_GETFL");
1900                                 }
1901                                 if(fcntl(net, F_SETFL, sock_flags &~O_NONBLOCK)==-1) {
1902                                         err("fcntl F_SETFL ~O_NONBLOCK");
1903                                 }
1904                                 if(!client) {
1905                                         client = g_new0(CLIENT, 1);
1906                                         client->server=serve;
1907                                         client->exportsize=OFFT_MAX;
1908                                         client->net=net;
1909                                         client->transactionlogfd = -1;
1910                                 }
1911                                 set_peername(net, client);
1912                                 if (!authorized_client(client)) {
1913                                         msg2(LOG_INFO,"Unauthorized client") ;
1914                                         close(net);
1915                                         continue;
1916                                 }
1917                                 msg2(LOG_INFO,"Authorized client") ;
1918                                 pid=g_malloc(sizeof(pid_t));
1919
1920                                 if (!dontfork) {
1921                                         if ((*pid=fork())<0) {
1922                                                 msg3(LOG_INFO,"Could not fork (%s)",strerror(errno)) ;
1923                                                 close(net);
1924                                                 continue;
1925                                         }
1926                                         if (*pid>0) { /* parent */
1927                                                 close(net);
1928                                                 g_hash_table_insert(children, pid, pid);
1929                                                 continue;
1930                                         }
1931                                         /* child */
1932                                         g_hash_table_destroy(children);
1933                                         for(i=0;i<servers->len;i++) {
1934                                                 serve=&g_array_index(servers, SERVER, i);
1935                                                 close(serve->socket);
1936                                         }
1937                                         /* FALSE does not free the
1938                                            actual data. This is required,
1939                                            because the client has a
1940                                            direct reference into that
1941                                            data, and otherwise we get a
1942                                            segfault... */
1943                                         g_array_free(servers, FALSE);
1944                                 }
1945
1946                                 msg2(LOG_INFO,"Starting to serve");
1947                                 serveconnection(client);
1948                                 exit(EXIT_SUCCESS);
1949                         }
1950                 }
1951         }
1952 }
1953
/**
 * Apply our standard options to a listening socket: SO_REUSEADDR and
 * SO_KEEPALIVE are switched on and the descriptor is made non-blocking.
 * Any failure is reported through err().
 *
 * @param socket the socket to configure
 **/
void dosockopts(int socket) {
	/* NOTE(review): on sun the option value is a char while the length
	 * passed below is still sizeof(int) -- confirm what that platform
	 * expects before changing either. */
#ifndef sun
	int yes=1;
#else
	char yes='1';
#endif /* sun */
	int sock_flags;
	int rc;

	/* lose the pesky "Address already in use" error message */
	rc = setsockopt(socket,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(int));
	if (rc == -1) {
		err("setsockopt SO_REUSEADDR");
	}

	rc = setsockopt(socket,SOL_SOCKET,SO_KEEPALIVE,&yes,sizeof(int));
	if (rc == -1) {
		err("setsockopt SO_KEEPALIVE");
	}

	/* make the listening socket non-blocking */
	sock_flags = fcntl(socket, F_GETFL, 0);
	if (sock_flags == -1) {
		err("fcntl F_GETFL");
	}

	rc = fcntl(socket, F_SETFL, sock_flags | O_NONBLOCK);
	if (rc == -1) {
		err("fcntl F_SETFL O_NONBLOCK");
	}
}
1978
1979 /**
1980  * Connect a server's socket.
1981  *
1982  * @param serve the server we want to connect.
1983  **/
1984 int setup_serve(SERVER *serve) {
1985         struct addrinfo hints;
1986         struct addrinfo *ai = NULL;
1987         gchar *port = NULL;
1988         int e;
1989
1990         if(!do_oldstyle) {
1991                 return serve->servename ? 1 : 0;
1992         }
1993         memset(&hints,'\0',sizeof(hints));
1994         hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG | AI_NUMERICSERV;
1995         hints.ai_socktype = SOCK_STREAM;
1996         hints.ai_family = serve->socket_family;
1997
1998         port = g_strdup_printf ("%d", serve->port);
1999         if (port == NULL)
2000                 return 0;
2001
2002         e = getaddrinfo(serve->listenaddr,port,&hints,&ai);
2003
2004         g_free(port);
2005
2006         if(e != 0) {
2007                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
2008                 serve->socket = -1;
2009                 freeaddrinfo(ai);
2010                 exit(EXIT_FAILURE);
2011         }
2012
2013         if(serve->socket_family == AF_UNSPEC)
2014                 serve->socket_family = ai->ai_family;
2015
2016 #ifdef WITH_SDP
2017         if ((serve->flags) && F_SDP) {
2018                 if (ai->ai_family == AF_INET)
2019                         ai->ai_family = AF_INET_SDP;
2020                 else (ai->ai_family == AF_INET6)
2021                         ai->ai_family = AF_INET6_SDP;
2022         }
2023 #endif
2024         if ((serve->socket = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) < 0)
2025                 err("socket: %m");
2026
2027         dosockopts(serve->socket);
2028
2029         DEBUG("Waiting for connections... bind, ");
2030         e = bind(serve->socket, ai->ai_addr, ai->ai_addrlen);
2031         if (e != 0 && errno != EADDRINUSE)
2032                 err("bind: %m");
2033         DEBUG("listen, ");
2034         if (listen(serve->socket, 1) < 0)
2035                 err("listen: %m");
2036
2037         freeaddrinfo (ai);
2038         if(serve->servename) {
2039                 return 1;
2040         } else {
2041                 return 0;
2042         }
2043 }
2044
/**
 * Open the socket used for newstyle negotiation.
 *
 * Resolves modern_listen with NBD_DEFAULT_PORT, creates a TCP socket for
 * the first result, configures it through dosockopts(), then binds it and
 * starts listening. The descriptor is stored in modernsock. Resolution
 * failure terminates the program; other failures go through err().
 **/
void open_modern(void) {
	struct addrinfo hints;
	struct addrinfo* ai = NULL;
	int e;

	memset(&hints, '\0', sizeof(hints));
	hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_family = AF_UNSPEC;
	hints.ai_protocol = IPPROTO_TCP;
	e = getaddrinfo(modern_listen, NBD_DEFAULT_PORT, &hints, &ai);
	if(e != 0) {
		fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
		exit(EXIT_FAILURE);
	}
	if((modernsock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol))<0) {
		err("socket: %m");
	}

	dosockopts(modernsock);

	if(bind(modernsock, ai->ai_addr, ai->ai_addrlen)) {
		err("bind: %m");
	}
	if(listen(modernsock, 10) <0) {
		err("listen: %m");
	}

	freeaddrinfo(ai);
}
2076
2077 /**
2078  * Connect our servers.
2079  **/
2080 void setup_servers(GArray* servers) {
2081         int i;
2082         struct sigaction sa;
2083         int want_modern=0;
2084
2085         for(i=0;i<servers->len;i++) {
2086                 want_modern |= setup_serve(&(g_array_index(servers, SERVER, i)));
2087         }
2088         if(want_modern) {
2089                 open_modern();
2090         }
2091         children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
2092
2093         sa.sa_handler = sigchld_handler;
2094         sigemptyset(&sa.sa_mask);
2095         sa.sa_flags = SA_RESTART;
2096         if(sigaction(SIGCHLD, &sa, NULL) == -1)
2097                 err("sigaction: %m");
2098         sa.sa_handler = sigterm_handler;
2099         sigemptyset(&sa.sa_mask);
2100         sa.sa_flags = SA_RESTART;
2101         if(sigaction(SIGTERM, &sa, NULL) == -1)
2102                 err("sigaction: %m");
2103 }
2104
2105 /**
2106  * Go daemon (unless we specified at compile time that we didn't want this)
2107  * @param serve the first server of our configuration. If its port is zero,
2108  *      then do not daemonize, because we're doing inetd then. This parameter
2109  *      is only used to create a PID file of the form
2110  *      /var/run/nbd-server.&lt;port&gt;.pid; it's not modified in any way.
2111  **/
2112 #if !defined(NODAEMON)
2113 void daemonize(SERVER* serve) {
2114         FILE*pidf;
2115
2116         if(serve && !(serve->port)) {
2117                 return;
2118         }
2119         if(daemon(0,0)<0) {
2120                 err("daemon");
2121         }
2122         if(!*pidftemplate) {
2123                 if(serve) {
2124                         strncpy(pidftemplate, "/var/run/nbd-server.%d.pid", 255);
2125                 } else {
2126                         strncpy(pidftemplate, "/var/run/nbd-server.pid", 255);
2127                 }
2128         }
2129         snprintf(pidfname, 255, pidftemplate, serve ? serve->port : 0);
2130         pidf=fopen(pidfname, "w");
2131         if(pidf) {
2132                 fprintf(pidf,"%d\n", (int)getpid());
2133                 fclose(pidf);
2134         } else {
2135                 perror("fopen");
2136                 fprintf(stderr, "Not fatal; continuing");
2137         }
2138 }
2139 #else
2140 #define daemonize(serve)
2141 #endif /* !defined(NODAEMON) */
2142
2143 /*
2144  * Everything beyond this point (in the file) is run in non-daemon mode.
2145  * The stuff above daemonize() isn't.
2146  */
2147
2148 void serve_err(SERVER* serve, const char* msg) G_GNUC_NORETURN;
2149
2150 void serve_err(SERVER* serve, const char* msg) {
2151         g_message("Export of %s on port %d failed:", serve->exportname,
2152                         serve->port);
2153         err(msg);
2154 }
2155
2156 /**
2157  * Set up user-ID and/or group-ID
2158  **/
2159 void dousers(void) {
2160         struct passwd *pw;
2161         struct group *gr;
2162         gchar* str;
2163         if(rungroup) {
2164                 gr=getgrnam(rungroup);
2165                 if(!gr) {
2166                         str = g_strdup_printf("Invalid group name: %s", rungroup);
2167                         err(str);
2168                 }
2169                 if(setgid(gr->gr_gid)<0) {
2170                         err("Could not set GID: %m"); 
2171                 }
2172         }
2173         if(runuser) {
2174                 pw=getpwnam(runuser);
2175                 if(!pw) {
2176                         str = g_strdup_printf("Invalid user name: %s", runuser);
2177                         err(str);
2178                 }
2179                 if(setuid(pw->pw_uid)<0) {
2180                         err("Could not set UID: %m");
2181                 }
2182         }
2183 }
2184
2185 #ifndef ISSERVER
2186 void glib_message_syslog_redirect(const gchar *log_domain,
2187                                   GLogLevelFlags log_level,
2188                                   const gchar *message,
2189                                   gpointer user_data)
2190 {
2191     int level=LOG_DEBUG;
2192     
2193     switch( log_level )
2194     {
2195       case G_LOG_FLAG_FATAL:
2196       case G_LOG_LEVEL_CRITICAL:
2197       case G_LOG_LEVEL_ERROR:    
2198         level=LOG_ERR; 
2199         break;
2200       case G_LOG_LEVEL_WARNING:
2201         level=LOG_WARNING;
2202         break;
2203       case G_LOG_LEVEL_MESSAGE:
2204       case G_LOG_LEVEL_INFO:
2205         level=LOG_INFO;
2206         break;
2207       case G_LOG_LEVEL_DEBUG:
2208         level=LOG_DEBUG;
2209       default:
2210         level=LOG_ERR;
2211     }
2212     syslog(level, "%s", message);
2213 }
2214 #endif
2215
2216 /**
2217  * Main entry point...
2218  **/
2219 int main(int argc, char *argv[]) {
2220         SERVER *serve;
2221         GArray *servers;
2222         GError *err=NULL;
2223
2224         if (sizeof( struct nbd_request )!=28) {
2225                 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
2226                 exit(EXIT_FAILURE) ;
2227         }
2228
2229         memset(pidftemplate, '\0', 256);
2230
2231         logging();
2232         config_file_pos = g_strdup(CFILE);
2233         serve=cmdline(argc, argv);
2234         servers = parse_cfile(config_file_pos, &err);
2235         
2236         if(serve) {
2237                 serve->socket_family = AF_UNSPEC;
2238
2239                 append_serve(serve, servers);
2240      
2241                 if (!(serve->port)) {
2242                         CLIENT *client;
2243 #ifndef ISSERVER
2244                         /* You really should define ISSERVER if you're going to use
2245                          * inetd mode, but if you don't, closing stdout and stderr
2246                          * (which inetd had connected to the client socket) will let it
2247                          * work. */
2248                         close(1);
2249                         close(2);
2250                         open("/dev/null", O_WRONLY);
2251                         open("/dev/null", O_WRONLY);
2252                         g_log_set_default_handler( glib_message_syslog_redirect, NULL );
2253 #endif
2254                         client=g_malloc(sizeof(CLIENT));
2255                         client->server=serve;
2256                         client->net=0;
2257                         client->exportsize=OFFT_MAX;
2258                         set_peername(0,client);
2259                         serveconnection(client);
2260                         return 0;
2261                 }
2262         }
2263     
2264         if(!servers || !servers->len) {
2265                 if(err && !(err->domain == g_quark_from_string("parse_cfile")
2266                                 && err->code == CFILE_NOTFOUND)) {
2267                         g_warning("Could not parse config file: %s", 
2268                                         err ? err->message : "Unknown error");
2269                 }
2270         }
2271         if(serve) {
2272                 g_warning("Specifying an export on the command line is deprecated.");
2273                 g_warning("Please use a configuration file instead.");
2274         }
2275
2276         if((!serve) && (!servers||!servers->len)) {
2277                 g_message("No configured exports; quitting.");
2278                 exit(EXIT_FAILURE);
2279         }
2280         if (!dontfork)
2281                 daemonize(serve);
2282         setup_servers(servers);
2283         dousers();
2284         serveloop(servers);
2285         return 0 ;
2286 }