nbd-tester-client: ignore SIGPIPE so we pick up and print the error
[nbd.git] / nbd-server.c
1 /*
2  * Network Block Device - server
3  *
4  * Copyright 1996-1998 Pavel Machek, distribute under GPL
5  *  <pavel@atrey.karlin.mff.cuni.cz>
6  * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7  * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
8  *
9  * Version 1.0 - hopefully 64-bit-clean
10  * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11  * Version 1.2 - autodetect size of block devices, thanx to Peter T. Breuer" <ptb@it.uc3m.es>
12  * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13  *      type, or don't have 64 bit file offsets by defining FS_32BIT
14  *      in compile options for nbd-server *only*. This can be done
15  *      with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16  *      original autoconf input file, or I would make it a configure
17  *      option.) Ken Yap <ken@nlc.net.au>.
18  * Version 1.6 - fix autodetection of block device size and really make 64 bit
19  *      clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20  * Version 2.0 - Version synchronised with client
21  * Version 2.1 - Reap zombie client processes when they exit. Removed
22  *      (uncommented) the _IO magic, it's no longer necessary. Wouter
23  *      Verhelst <wouter@debian.org>
24  * Version 2.2 - Auto switch to read-only mode (useful for floppies).
25  * Version 2.3 - Fixed code so that Large File Support works. This
26  *      removes the FS_32BIT compile-time directive; define
27  *      _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28  *      using FS_32BIT. This will allow you to use files >2GB instead of
29  *      having to use the -m option. Wouter Verhelst <wouter@debian.org>
30  * Version 2.4 - Added code to keep track of children, so that we can
31  *      properly kill them from initscripts. Add a call to daemon(),
32  *      so that processes don't think they have to wait for us, which is
33  *      interesting for initscripts as well. Wouter Verhelst
34  *      <wouter@debian.org>
35  * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36  *      zero after fork()ing, resulting in nbd-server going berserk
37  *      when it receives a signal with at least one child open. Wouter
38  *      Verhelst <wouter@debian.org>
39  * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40  *      rectified type of mainloop::size_host (sf.net bugs 814435 and
41  *      817385); close the PID file after writing to it, so that the
42  *      daemon can actually be found. Wouter Verhelst
43  *      <wouter@debian.org>
44  * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45  *      correctly put in network endianness. Many types were corrected
46  *      (size_t and off_t instead of int).  <vspaceg@sourceforge.net>
47  * Version 2.6 - Some code cleanup.
48  * Version 2.7 - Better build system.
49  * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a 
50  *      lot more work, but this is a start. Wouter Verhelst
51  *      <wouter@debian.org>
52  * 16/03/2010 - Add IPv6 support.
53  *      Kitt Tientanopajai <kitt@kitty.in.th>
54  *      Neutron Soutmun <neo.neutron@gmail.com>
55  *      Suriya Soutmun <darksolar@gmail.com>
56  */
57
58 /* Includes LFS defines, which defines behaviours of some of the following
59  * headers, so must come before those */
60 #include "lfs.h"
61
62 #include <sys/types.h>
63 #include <sys/socket.h>
64 #include <sys/stat.h>
65 #include <sys/select.h>         /* select */
66 #include <sys/wait.h>           /* wait */
67 #ifdef HAVE_SYS_IOCTL_H
68 #include <sys/ioctl.h>
69 #endif
70 #include <sys/param.h>
71 #ifdef HAVE_SYS_MOUNT_H
72 #include <sys/mount.h>          /* For BLKGETSIZE */
73 #endif
74 #include <signal.h>             /* sigaction */
75 #include <errno.h>
76 #include <netinet/tcp.h>
77 #include <netinet/in.h>
78 #include <netdb.h>
79 #include <syslog.h>
80 #include <unistd.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 #include <fcntl.h>
85 #include <arpa/inet.h>
86 #include <strings.h>
87 #include <dirent.h>
88 #include <unistd.h>
89 #include <getopt.h>
90 #include <pwd.h>
91 #include <grp.h>
92
93 #include <glib.h>
94
95 /* used in cliserv.h, so must come first */
96 #define MY_NAME "nbd_server"
97 #include "cliserv.h"
98
99 #ifdef WITH_SDP
100 #include <sdp_inet.h>
101 #endif
102
103 /** Default position of the config file */
104 #ifndef SYSCONFDIR
105 #define SYSCONFDIR "/etc"
106 #endif
107 #define CFILE SYSCONFDIR "/nbd-server/config"
108
109 /** Where our config file actually is */
110 gchar* config_file_pos;
111
112 /** What user we're running as */
113 gchar* runuser=NULL;
114 /** What group we're running as */
115 gchar* rungroup=NULL;
116 /** whether to export using the old negotiation protocol (port-based) */
117 gboolean do_oldstyle=FALSE;
118
119 /* Whether we should avoid forking */
120 int dontfork = 0;
121
/**
 * Logging macros. Nothing goes to syslog unless ISSERVER is defined; without
 * it the priority argument (a) is dropped and the message is handed to
 * g_message() instead.
 **/
#ifndef ISSERVER
#define msg2(a,b) g_message(b)
#define msg3(a,b,c) g_message(b,c)
#define msg4(a,b,c,d) g_message(b,c,d)
#else
#define msg2(a,b) syslog(a,b)
#define msg3(a,b,c) syslog(a,b,c)
#define msg4(a,b,c,d) syslog(a,b,c,d)
#endif

/**
 * Debugging macro. DEBUG() expands to nothing unless DODBG is defined, in
 * which case it forwards its arguments to printf().
 **/
/* #define DODBG */
#ifndef DODBG
#define DEBUG(...)
#else
#define DEBUG(...) printf(__VA_ARGS__)
#endif

/** Fallback for builds that do not supply a package version string */
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION ""
#endif
/**
 * The highest value a variable of type off_t can reach. off_t is a signed
 * integer, so this is every bit set except for the leftmost one.
 *
 * The value is built in an unsigned type and only then converted to off_t:
 * left-shifting a 1 into the sign bit of a signed type is undefined
 * behaviour. The expansion is fully parenthesised so that the macro can be
 * used inside larger expressions.
 **/
#define OFFT_MAX ((off_t)((((unsigned long long)1) << (sizeof(off_t)*8-1)) - 1))
#define LINELEN 256       /**< Size of static buffer used to read the
                               authorization file (yuck) */
#define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
#define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
#define F_READONLY 1      /**< flag to tell us a file is readonly */
#define F_MULTIFILE 2     /**< flag to tell us a file is exported using -m */
#define F_COPYONWRITE 4   /**< flag to tell us a file is exported using
                            copyonwrite */
#define F_AUTOREADONLY 8  /**< flag to tell us a file is set to autoreadonly */
#define F_SPARSE 16       /**< flag to tell us copyonwrite should use a sparse file */
#define F_SDP 32          /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */
#define F_SYNC 64         /**< Whether to fsync() after a write */
#define F_FLUSH 128       /**< Whether server wants FLUSH to be sent by the client */
#define F_FUA 256         /**< Whether server wants FUA to be sent by the client */
#define F_ROTATIONAL 512  /**< Whether server wants the client to implement the elevator algorithm */
163 GHashTable *children;
164 char pidfname[256]; /**< name of our PID file */
165 char pidftemplate[256]; /**< template to be used for the filename of the PID file */
166 char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
167
168 #define NEG_INIT        (1 << 0)
169 #define NEG_OLD         (1 << 1)
170 #define NEG_MODERN      (1 << 2)
171
172 int modernsock=0;         /**< Socket for the modern handler. Not used
173                                if a client was only specified on the
174                                command line; only port used if
175                                oldstyle is set to false (and then the
176                                command-line client isn't used, gna gna) */
177 char* modern_listen;      /**< listenaddr value for modernsock */
178 char* modernport=NBD_DEFAULT_PORT; /**< Port number on which to listen for
179                                       new-style nbd-client connections */
180
/**
 * Types of virtualization
 **/
typedef enum {
	VIRT_NONE   = 0,	/**< No virtualization */
	VIRT_IPLIT  = 1,	/**< Literal IP address as part of the filename */
	VIRT_IPHASH = 2,	/**< Replace every dot in the IP address by a /,
				     then proceed as for IPLIT */
	VIRT_CIDR   = 3		/**< Every subnet in its own directory */
} VIRT_STYLE;
191
192 /**
193  * Variables associated with a server.
194  **/
195 typedef struct {
196         gchar* exportname;    /**< (unprocessed) filename of the file we're exporting */
197         off_t expected_size; /**< size of the exported file as it was told to
198                                us through configuration */
199         gchar* listenaddr;   /**< The IP address we're listening on */
200         unsigned int port;   /**< port we're exporting this file at */
201         char* authname;      /**< filename of the authorization file */
202         int flags;           /**< flags associated with this exported file */
203         int socket;          /**< The socket of this server. */
204         int socket_family;   /**< family of the socket */
205         VIRT_STYLE virtstyle;/**< The style of virtualization, if any */
206         uint8_t cidrlen;     /**< The length of the mask when we use
207                                   CIDR-style virtualization */
208         gchar* prerun;       /**< command to be ran after connecting a client,
209                                   but before starting to serve */
210         gchar* postrun;      /**< command that will be ran after the client
211                                   disconnects */
212         gchar* servename;    /**< name of the export as selected by nbd-client */
213         int max_connections; /**< maximum number of opened connections */
214         gchar* transactionlog;/**< filename for transaction log */
215 } SERVER;
216
/**
 * One file that is part of an export: its descriptor and where it starts.
 **/
typedef struct {
	int fhandle;		/**< file descriptor */
	off_t startoff;		/**< starting offset of this file */
} FILE_INFO;
224
225 typedef struct {
226         off_t exportsize;    /**< size of the file we're exporting */
227         char *clientname;    /**< peer */
228         char *exportname;    /**< (processed) filename of the file we're exporting */
229         GArray *export;    /**< array of FILE_INFO of exported files;
230                                array size is always 1 unless we're
231                                doing the multiple file option */
232         int net;             /**< The actual client socket */
233         SERVER *server;      /**< The server this client is getting data from */
234         char* difffilename;  /**< filename of the copy-on-write file, if any */
235         int difffile;        /**< filedescriptor of copyonwrite file. @todo
236                                shouldn't this be an array too? (cfr export) Or
237                                make -m and -c mutually exclusive */
238         u32 difffilelen;     /**< number of pages in difffile */
239         u32 *difmap;         /**< see comment on the global difmap for this one */
240         gboolean modern;     /**< client was negotiated using modern negotiation protocol */
241         int transactionlogfd;/**< fd for transaction log */
242 } CLIENT;
243
/**
 * The kinds of value a configuration file parameter can hold
 **/
typedef enum {
	PARAM_INT    = 0,	/**< This parameter is an integer */
	PARAM_STRING = 1,	/**< This parameter is a string */
	PARAM_BOOL   = 2	/**< This parameter is a boolean */
} PARAM_TYPE;
252
253 /**
254  * Configuration file values
255  **/
256 typedef struct {
257         gchar *paramname;       /**< Name of the parameter, as it appears in
258                                   the config file */
259         gboolean required;      /**< Whether this is a required (as opposed to
260                                   optional) parameter */
261         PARAM_TYPE ptype;       /**< Type of the parameter. */
262         gpointer target;        /**< Pointer to where the data of this
263                                   parameter should be written. If ptype is
264                                   PARAM_BOOL, the data is or'ed rather than
265                                   overwritten. */
266         gint flagval;           /**< Flag mask for this parameter in case ptype
267                                   is PARAM_BOOL. */
268 } PARAM;
269
270 /**
271  * Translate a command name into human readable form
272  *
273  * @param command The command number (after applying NBD_CMD_MASK_COMMAND)
274  * @return pointer to the command name
275  **/
276 static inline const char * getcommandname(uint64_t command) {
277         switch (command) {
278         case NBD_CMD_READ:
279                 return "NBD_CMD_READ";
280         case NBD_CMD_WRITE:
281                 return "NBD_CMD_WRITE";
282         case NBD_CMD_DISC:
283                 return "NBD_CMD_DISC";
284         case NBD_CMD_FLUSH:
285                 return "NBD_CMD_FLUSH";
286         default:
287                 break;
288         }
289         return "UNKNOWN";
290 }
291
292 /**
293  * Check whether a client is allowed to connect. Works with an authorization
294  * file which contains one line per machine, no wildcards.
295  *
296  * @param opts The client who's trying to connect.
297  * @return 0 - authorization refused, 1 - OK
298  **/
299 int authorized_client(CLIENT *opts) {
300         const char *ERRMSG="Invalid entry '%s' in authfile '%s', so, refusing all connections.";
301         FILE *f ;
302         char line[LINELEN]; 
303         char *tmp;
304         struct in_addr addr;
305         struct in_addr client;
306         struct in_addr cltemp;
307         int len;
308
309         if ((f=fopen(opts->server->authname,"r"))==NULL) {
310                 msg4(LOG_INFO,"Can't open authorization file %s (%s).",
311                      opts->server->authname,strerror(errno)) ;
312                 return 1 ; 
313         }
314   
315         inet_aton(opts->clientname, &client);
316         while (fgets(line,LINELEN,f)!=NULL) {
317                 if((tmp=index(line, '/'))) {
318                         if(strlen(line)<=tmp-line) {
319                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
320                                 return 0;
321                         }
322                         *(tmp++)=0;
323                         if(!inet_aton(line,&addr)) {
324                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
325                                 return 0;
326                         }
327                         len=strtol(tmp, NULL, 0);
328                         addr.s_addr>>=32-len;
329                         addr.s_addr<<=32-len;
330                         memcpy(&cltemp,&client,sizeof(client));
331                         cltemp.s_addr>>=32-len;
332                         cltemp.s_addr<<=32-len;
333                         if(addr.s_addr == cltemp.s_addr) {
334                                 return 1;
335                         }
336                 }
337                 if (strncmp(line,opts->clientname,strlen(opts->clientname))==0) {
338                         fclose(f);
339                         return 1;
340                 }
341         }
342         fclose(f);
343         return 0;
344 }
345
/**
 * Read data from a file descriptor into a buffer. Keeps reading until exactly
 * len bytes have arrived; a read that fails with EAGAIN is simply retried.
 * Any other failure, and end-of-file before len bytes were read, is reported
 * through err().
 *
 * @param f a file descriptor
 * @param buf a buffer
 * @param len the number of bytes to be read
 **/
static inline void readit(int f, void *buf, size_t len) {
	char *cursor = buf;	/* byte pointer: arithmetic on void* is not standard C */
	ssize_t res;

	while (len > 0) {
		DEBUG("*");
		res = read(f, cursor, len);
		if (res > 0) {
			len -= res;
			cursor += res;
		} else if (res == 0) {
			/* End of file does not set errno; checking it here
			 * could find a stale EAGAIN and spin forever. */
			err("Read failed: End of file");
		} else if (errno != EAGAIN) {
			err("Read failed: %m");
		}
	}
}
367
/**
 * Read and throw away data from an FD that we are not interested in. The
 * data is pulled through buf in pieces of at most bufsiz bytes.
 *
 * @param f a file descriptor
 * @param buf a scratch buffer
 * @param len the number of bytes to consume
 * @param bufsiz the size of the buffer
 **/
static inline void consume(int f, void * buf, size_t len, size_t bufsiz) {
	size_t chunk;

	for (; len > 0; len -= chunk) {
		chunk = (len < bufsiz) ? len : bufsiz;
		readit(f, buf, chunk);
	}
}
384
385
/**
 * Write the contents of a buffer to a file descriptor, repeating the write
 * until all len bytes are out. A write that returns zero or less is reported
 * through err().
 *
 * @param f a file descriptor
 * @param buf a buffer containing data
 * @param len the number of bytes to be written
 **/
static inline void writeit(int f, void *buf, size_t len) {
	char *cursor = buf;
	ssize_t written;

	for (; len > 0; len -= written, cursor += written) {
		DEBUG("+");
		written = write(f, cursor, len);
		if (written <= 0)
			err("Send failed: %m");
	}
}
403
404 /**
405  * Print out a message about how to use nbd-server. Split out to a separate
406  * function so that we can call it from multiple places
407  */
408 void usage() {
409         printf("This is nbd-server version " VERSION "\n");
410         printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections]\n"
411                "\t-r|--read-only\t\tread only\n"
412                "\t-m|--multi-file\t\tmultiple file\n"
413                "\t-c|--copy-on-write\tcopy on write\n"
414                "\t-C|--config-file\tspecify an alternate configuration file\n"
415                "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
416                "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
417                "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
418                "\t-M|--max-connections\tspecify the maximum number of opened connections\n\n"
419                "\tif port is set to 0, stdin is used (for running from inetd).\n"
420                "\tif file_to_export contains '%%s', it is substituted with the IP\n"
421                "\t\taddress of the machine trying to connect\n" 
422                "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
423         printf("Using configuration file %s\n", CFILE);
424 }
425
426 /* Dumps a config file section of the given SERVER*, and exits. */
427 void dump_section(SERVER* serve, gchar* section_header) {
428         printf("[%s]\n", section_header);
429         printf("\texportname = %s\n", serve->exportname);
430         printf("\tlistenaddr = %s\n", serve->listenaddr);
431         printf("\tport = %d\n", serve->port);
432         if(serve->flags & F_READONLY) {
433                 printf("\treadonly = true\n");
434         }
435         if(serve->flags & F_MULTIFILE) {
436                 printf("\tmultifile = true\n");
437         }
438         if(serve->flags & F_COPYONWRITE) {
439                 printf("\tcopyonwrite = true\n");
440         }
441         if(serve->expected_size) {
442                 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
443         }
444         if(serve->authname) {
445                 printf("\tauthfile = %s\n", serve->authname);
446         }
447         exit(EXIT_SUCCESS);
448 }
449
450 /**
451  * Parse the command line.
452  *
453  * @param argc the argc argument to main()
454  * @param argv the argv argument to main()
455  **/
456 SERVER* cmdline(int argc, char *argv[]) {
457         int i=0;
458         int nonspecial=0;
459         int c;
460         struct option long_options[] = {
461                 {"read-only", no_argument, NULL, 'r'},
462                 {"multi-file", no_argument, NULL, 'm'},
463                 {"copy-on-write", no_argument, NULL, 'c'},
464                 {"dont-fork", no_argument, NULL, 'd'},
465                 {"authorize-file", required_argument, NULL, 'l'},
466                 {"config-file", required_argument, NULL, 'C'},
467                 {"pid-file", required_argument, NULL, 'p'},
468                 {"output-config", required_argument, NULL, 'o'},
469                 {"max-connection", required_argument, NULL, 'M'},
470                 {0,0,0,0}
471         };
472         SERVER *serve;
473         off_t es;
474         size_t last;
475         char suffix;
476         gboolean do_output=FALSE;
477         gchar* section_header="";
478         gchar** addr_port;
479
480         if(argc==1) {
481                 return NULL;
482         }
483         serve=g_new0(SERVER, 1);
484         serve->authname = g_strdup(default_authname);
485         serve->virtstyle=VIRT_IPLIT;
486         while((c=getopt_long(argc, argv, "-C:cdl:mo:rp:M:", long_options, &i))>=0) {
487                 switch (c) {
488                 case 1:
489                         /* non-option argument */
490                         switch(nonspecial++) {
491                         case 0:
492                                 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
493                                         addr_port=g_strsplit(optarg, ":", 2);
494
495                                         /* Check for "@" - maybe user using this separator
496                                                  for IPv4 address */
497                                         if(!addr_port[1]) {
498                                                 g_strfreev(addr_port);
499                                                 addr_port=g_strsplit(optarg, "@", 2);
500                                         }
501                                 } else {
502                                         addr_port=g_strsplit(optarg, "@", 2);
503                                 }
504
505                                 if(addr_port[1]) {
506                                         serve->port=strtol(addr_port[1], NULL, 0);
507                                         serve->listenaddr=g_strdup(addr_port[0]);
508                                 } else {
509                                         serve->listenaddr=NULL;
510                                         serve->port=strtol(addr_port[0], NULL, 0);
511                                 }
512                                 g_strfreev(addr_port);
513                                 break;
514                         case 1:
515                                 serve->exportname = g_strdup(optarg);
516                                 if(serve->exportname[0] != '/') {
517                                         fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
518                                         exit(EXIT_FAILURE);
519                                 }
520                                 break;
521                         case 2:
522                                 last=strlen(optarg)-1;
523                                 suffix=optarg[last];
524                                 if (suffix == 'k' || suffix == 'K' ||
525                                     suffix == 'm' || suffix == 'M')
526                                         optarg[last] = '\0';
527                                 es = (off_t)atoll(optarg);
528                                 switch (suffix) {
529                                         case 'm':
530                                         case 'M':  es <<= 10;
531                                         case 'k':
532                                         case 'K':  es <<= 10;
533                                         default :  break;
534                                 }
535                                 serve->expected_size = es;
536                                 break;
537                         }
538                         break;
539                 case 'r':
540                         serve->flags |= F_READONLY;
541                         break;
542                 case 'm':
543                         serve->flags |= F_MULTIFILE;
544                         break;
545                 case 'o':
546                         do_output = TRUE;
547                         section_header = g_strdup(optarg);
548                         break;
549                 case 'p':
550                         strncpy(pidftemplate, optarg, 256);
551                         break;
552                 case 'c': 
553                         serve->flags |=F_COPYONWRITE;
554                         break;
555                 case 'd': 
556                         dontfork = 1;
557                         break;
558                 case 'C':
559                         g_free(config_file_pos);
560                         config_file_pos=g_strdup(optarg);
561                         break;
562                 case 'l':
563                         g_free(serve->authname);
564                         serve->authname=g_strdup(optarg);
565                         break;
566                 case 'M':
567                         serve->max_connections = strtol(optarg, NULL, 0);
568                         break;
569                 default:
570                         usage();
571                         exit(EXIT_FAILURE);
572                         break;
573                 }
574         }
575         /* What's left: the port to export, the name of the to be exported
576          * file, and, optionally, the size of the file, in that order. */
577         if(nonspecial<2) {
578                 g_free(serve);
579                 serve=NULL;
580         } else {
581                 do_oldstyle = TRUE;
582         }
583         if(do_output) {
584                 if(!serve) {
585                         g_critical("Need a complete configuration on the command line to output a config file section!");
586                         exit(EXIT_FAILURE);
587                 }
588                 dump_section(serve, section_header);
589         }
590         return serve;
591 }
592
/**
 * Error codes that config file parsing can report
 **/
typedef enum {
	CFILE_NOTFOUND          = 0,	/**< The configuration file is not found */
	CFILE_MISSING_GENERIC   = 1,	/**< The (required) group "generic" is missing */
	CFILE_KEY_MISSING       = 2,	/**< A (required) key is missing */
	CFILE_VALUE_INVALID     = 3,	/**< A value is syntactically invalid */
	CFILE_VALUE_UNSUPPORTED = 4,	/**< A value is not supported in this build */
	CFILE_PROGERR           = 5,	/**< Programmer error */
	CFILE_NO_EXPORTS        = 6,	/**< A config file was specified that
					     does not define any exports */
	CFILE_INCORRECT_PORT    = 7	/**< The reserved port was specified for
					     an old-style export. */
} CFILE_ERRORS;
608
609 /**
610  * Remove a SERVER from memory. Used from the hash table
611  **/
612 void remove_server(gpointer s) {
613         SERVER *server;
614
615         server=(SERVER*)s;
616         g_free(server->exportname);
617         if(server->authname)
618                 g_free(server->authname);
619         if(server->listenaddr)
620                 g_free(server->listenaddr);
621         if(server->prerun)
622                 g_free(server->prerun);
623         if(server->postrun)
624                 g_free(server->postrun);
625         if(server->transactionlog)
626                 g_free(server->transactionlog);
627         g_free(server);
628 }
629
630 /**
631  * duplicate server
632  * @param s the old server we want to duplicate
633  * @return new duplicated server
634  **/
635 SERVER* dup_serve(SERVER *s) {
636         SERVER *serve = NULL;
637
638         serve=g_new0(SERVER, 1);
639         if(serve == NULL)
640                 return NULL;
641
642         if(s->exportname)
643                 serve->exportname = g_strdup(s->exportname);
644
645         serve->expected_size = s->expected_size;
646
647         if(s->listenaddr)
648                 serve->listenaddr = g_strdup(s->listenaddr);
649
650         serve->port = s->port;
651
652         if(s->authname)
653                 serve->authname = strdup(s->authname);
654
655         serve->flags = s->flags;
656         serve->socket = s->socket;
657         serve->socket_family = s->socket_family;
658         serve->virtstyle = s->virtstyle;
659         serve->cidrlen = s->cidrlen;
660
661         if(s->prerun)
662                 serve->prerun = g_strdup(s->prerun);
663
664         if(s->postrun)
665                 serve->postrun = g_strdup(s->postrun);
666
667         if(s->transactionlog)
668                 serve->transactionlog = g_strdup(s->transactionlog);
669         
670         if(s->servename)
671                 serve->servename = g_strdup(s->servename);
672
673         serve->max_connections = s->max_connections;
674
675         return serve;
676 }
677
678 /**
679  * append new server to array
680  * @param s server
681  * @param a server array
682  * @return 0 success, -1 error
683  */
684 int append_serve(SERVER *s, GArray *a) {
685         SERVER *ns = NULL;
686         struct addrinfo hints;
687         struct addrinfo *ai = NULL;
688         struct addrinfo *rp = NULL;
689         char   host[NI_MAXHOST];
690         gchar  *port = NULL;
691         int e;
692         int ret;
693
694         if(!s) {
695                 err("Invalid parsing server");
696                 return -1;
697         }
698
699         port = g_strdup_printf("%d", s->port);
700
701         memset(&hints,'\0',sizeof(hints));
702         hints.ai_family = AF_UNSPEC;
703         hints.ai_socktype = SOCK_STREAM;
704         hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE;
705         hints.ai_protocol = IPPROTO_TCP;
706
707         e = getaddrinfo(s->listenaddr, port, &hints, &ai);
708
709         if (port)
710                 g_free(port);
711
712         if(e == 0) {
713                 for (rp = ai; rp != NULL; rp = rp->ai_next) {
714                         e = getnameinfo(rp->ai_addr, rp->ai_addrlen, host, sizeof(host), NULL, 0, NI_NUMERICHOST);
715
716                         if (e != 0) { // error
717                                 fprintf(stderr, "getnameinfo: %s\n", gai_strerror(e));
718                                 continue;
719                         }
720
721                         // duplicate server and set listenaddr to resolved IP address
722                         ns = dup_serve (s);
723                         if (ns) {
724                                 ns->listenaddr = g_strdup(host);
725                                 ns->socket_family = rp->ai_family;
726                                 g_array_append_val(a, *ns);
727                                 free(ns);
728                                 ns = NULL;
729                         }
730                 }
731
732                 ret = 0;
733         } else {
734                 fprintf(stderr, "getaddrinfo failed on listen host/address: %s (%s)\n", s->listenaddr ? s->listenaddr : "any", gai_strerror(e));
735                 ret = -1;
736         }
737
738         if (ai)
739                 freeaddrinfo(ai);
740
741         return ret;
742 }
743
744 /**
745  * Parse the config file.
746  *
747  * @param f the name of the config file
748  * @param e a GError. @see CFILE_ERRORS for what error values this function can
749  *      return.
750  * @return a Array of SERVER* pointers, If the config file is empty or does not
751  *      exist, returns an empty GHashTable; if the config file contains an
752  *      error, returns NULL, and e is set appropriately
753  **/
754 GArray* parse_cfile(gchar* f, GError** e) {
755         const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
756         const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
757         SERVER s;
758         gchar *virtstyle=NULL;
759         PARAM lp[] = {
760                 { "exportname", TRUE,   PARAM_STRING,   &(s.exportname),        0 },
761                 { "port",       TRUE,   PARAM_INT,      &(s.port),              0 },
762                 { "authfile",   FALSE,  PARAM_STRING,   &(s.authname),          0 },
763                 { "filesize",   FALSE,  PARAM_INT,      &(s.expected_size),     0 },
764                 { "virtstyle",  FALSE,  PARAM_STRING,   &(virtstyle),           0 },
765                 { "prerun",     FALSE,  PARAM_STRING,   &(s.prerun),            0 },
766                 { "postrun",    FALSE,  PARAM_STRING,   &(s.postrun),           0 },
767                 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog),   0 },
768                 { "readonly",   FALSE,  PARAM_BOOL,     &(s.flags),             F_READONLY },
769                 { "multifile",  FALSE,  PARAM_BOOL,     &(s.flags),             F_MULTIFILE },
770                 { "copyonwrite", FALSE, PARAM_BOOL,     &(s.flags),             F_COPYONWRITE },
771                 { "sparse_cow", FALSE,  PARAM_BOOL,     &(s.flags),             F_SPARSE },
772                 { "sdp",        FALSE,  PARAM_BOOL,     &(s.flags),             F_SDP },
773                 { "sync",       FALSE,  PARAM_BOOL,     &(s.flags),             F_SYNC },
774                 { "flush",      FALSE,  PARAM_BOOL,     &(s.flags),             F_FLUSH },
775                 { "fua",        FALSE,  PARAM_BOOL,     &(s.flags),             F_FUA },
776                 { "rotational", FALSE,  PARAM_BOOL,     &(s.flags),             F_ROTATIONAL },
777                 { "listenaddr", FALSE,  PARAM_STRING,   &(s.listenaddr),        0 },
778                 { "maxconnections", FALSE, PARAM_INT,   &(s.max_connections),   0 },
779         };
780         const int lp_size=sizeof(lp)/sizeof(PARAM);
781         PARAM gp[] = {
782                 { "user",       FALSE, PARAM_STRING,    &runuser,       0 },
783                 { "group",      FALSE, PARAM_STRING,    &rungroup,      0 },
784                 { "oldstyle",   FALSE, PARAM_BOOL,      &do_oldstyle,   1 },
785                 { "listenaddr", FALSE, PARAM_STRING,    &modern_listen, 0 },
786                 { "port",       FALSE, PARAM_STRING,    &modernport,    0 },
787         };
788         PARAM* p=gp;
789         int p_size=sizeof(gp)/sizeof(PARAM);
790         GKeyFile *cfile;
791         GError *err = NULL;
792         const char *err_msg=NULL;
793         GQuark errdomain;
794         GArray *retval=NULL;
795         gchar **groups;
796         gboolean bval;
797         gint ival;
798         gchar* sval;
799         gchar* startgroup;
800         gint i;
801         gint j;
802
803         errdomain = g_quark_from_string("parse_cfile");
804         cfile = g_key_file_new();
805         retval = g_array_new(FALSE, TRUE, sizeof(SERVER));
806         if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
807                         G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
808                 g_set_error(e, errdomain, CFILE_NOTFOUND, "Could not open config file %s.", f);
809                 g_key_file_free(cfile);
810                 return retval;
811         }
812         startgroup = g_key_file_get_start_group(cfile);
813         if(!startgroup || strcmp(startgroup, "generic")) {
814                 g_set_error(e, errdomain, CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
815                 g_key_file_free(cfile);
816                 return NULL;
817         }
818         groups = g_key_file_get_groups(cfile, NULL);
819         for(i=0;groups[i];i++) {
820                 memset(&s, '\0', sizeof(SERVER));
821
822                 /* After the [generic] group, start parsing exports */
823                 if(i==1) {
824                         p=lp;
825                         p_size=lp_size;
826                 } 
827                 for(j=0;j<p_size;j++) {
828                         g_assert(p[j].target != NULL);
829                         g_assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL);
830                         switch(p[j].ptype) {
831                                 case PARAM_INT:
832                                         ival = g_key_file_get_integer(cfile,
833                                                                 groups[i],
834                                                                 p[j].paramname,
835                                                                 &err);
836                                         if(!err) {
837                                                 *((gint*)p[j].target) = ival;
838                                         }
839                                         break;
840                                 case PARAM_STRING:
841                                         sval = g_key_file_get_string(cfile,
842                                                                 groups[i],
843                                                                 p[j].paramname,
844                                                                 &err);
845                                         if(!err) {
846                                                 *((gchar**)p[j].target) = sval;
847                                         }
848                                         break;
849                                 case PARAM_BOOL:
850                                         bval = g_key_file_get_boolean(cfile,
851                                                         groups[i],
852                                                         p[j].paramname, &err);
853                                         if(!err) {
854                                                 if(bval) {
855                                                         *((gint*)p[j].target) |= p[j].flagval;
856                                                 } else {
857                                                         *((gint*)p[j].target) &= ~(p[j].flagval);
858                                                 }
859                                         }
860                                         break;
861                         }
862                         if(err) {
863                                 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
864                                         if(!p[j].required) {
865                                                 /* Ignore not-found error for optional values */
866                                                 g_clear_error(&err);
867                                                 continue;
868                                         } else {
869                                                 err_msg = MISSING_REQUIRED_ERROR;
870                                         }
871                                 } else {
872                                         err_msg = DEFAULT_ERROR;
873                                 }
874                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
875                                 g_array_free(retval, TRUE);
876                                 g_error_free(err);
877                                 g_key_file_free(cfile);
878                                 return NULL;
879                         }
880                 }
881                 if(virtstyle) {
882                         if(!strncmp(virtstyle, "none", 4)) {
883                                 s.virtstyle=VIRT_NONE;
884                         } else if(!strncmp(virtstyle, "ipliteral", 9)) {
885                                 s.virtstyle=VIRT_IPLIT;
886                         } else if(!strncmp(virtstyle, "iphash", 6)) {
887                                 s.virtstyle=VIRT_IPHASH;
888                         } else if(!strncmp(virtstyle, "cidrhash", 8)) {
889                                 s.virtstyle=VIRT_CIDR;
890                                 if(strlen(virtstyle)<10) {
891                                         g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
892                                         g_array_free(retval, TRUE);
893                                         g_key_file_free(cfile);
894                                         return NULL;
895                                 }
896                                 s.cidrlen=strtol(virtstyle+8, NULL, 0);
897                         } else {
898                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
899                                 g_array_free(retval, TRUE);
900                                 g_key_file_free(cfile);
901                                 return NULL;
902                         }
903                         if(s.port && !do_oldstyle) {
904                                 g_warning("A port was specified, but oldstyle exports were not requested. This may not do what you expect.");
905                                 g_warning("Please read 'man 5 nbd-server' and search for oldstyle for more info");
906                         }
907                 } else {
908                         s.virtstyle=VIRT_IPLIT;
909                 }
910                 /* Don't need to free this, it's not our string */
911                 virtstyle=NULL;
912                 /* Don't append values for the [generic] group */
913                 if(i>0) {
914                         s.socket_family = AF_UNSPEC;
915                         s.servename = groups[i];
916
917                         append_serve(&s, retval);
918                 } else {
919                         if(!do_oldstyle) {
920                                 lp[1].required = 0;
921                         }
922                 }
923 #ifndef WITH_SDP
924                 if(s.flags & F_SDP) {
925                         g_set_error(e, errdomain, CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
926                         g_array_free(retval, TRUE);
927                         g_key_file_free(cfile);
928                         return NULL;
929                 }
930 #endif
931         }
932         if(i==1) {
933                 g_set_error(e, errdomain, CFILE_NO_EXPORTS, "The config file does not specify any exports");
934         }
935         g_key_file_free(cfile);
936         return retval;
937 }
938
939 /**
940  * Signal handler for SIGCHLD
941  * @param s the signal we're handling (must be SIGCHLD, or something
942  * is severely wrong)
943  **/
944 void sigchld_handler(int s) {
945         int status;
946         int* i;
947         pid_t pid;
948
949         while((pid=waitpid(-1, &status, WNOHANG)) > 0) {
950                 if(WIFEXITED(status)) {
951                         msg3(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
952                 }
953                 i=g_hash_table_lookup(children, &pid);
954                 if(!i) {
955                         msg3(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
956                 } else {
957                         DEBUG("Removing %d from the list of children", pid);
958                         g_hash_table_remove(children, &pid);
959                 }
960         }
961 }
962
963 /**
964  * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
965  *
966  * @param key the key
967  * @param value the value corresponding to the above key
968  * @param user_data a pointer which we always set to 1, so that we know what
969  * will happen next.
970  **/
971 void killchild(gpointer key, gpointer value, gpointer user_data) {
972         pid_t *pid=value;
973         int *parent=user_data;
974
975         kill(*pid, SIGTERM);
976         *parent=1;
977 }
978
979 /**
980  * Handle SIGTERM and dispatch it to our children
981  * @param s the signal we're handling (must be SIGTERM, or something
982  * is severely wrong).
983  **/
984 void sigterm_handler(int s) {
985         int parent=0;
986
987         g_hash_table_foreach(children, killchild, &parent);
988
989         if(parent) {
990                 unlink(pidfname);
991         }
992
993         exit(EXIT_SUCCESS);
994 }
995
996 /**
997  * Detect the size of a file.
998  *
999  * @param fhandle An open filedescriptor
1000  * @return the size of the file, or OFFT_MAX if detection was
1001  * impossible.
1002  **/
1003 off_t size_autodetect(int fhandle) {
1004         off_t es;
1005         u64 bytes;
1006         struct stat stat_buf;
1007         int error;
1008
1009 #ifdef HAVE_SYS_MOUNT_H
1010 #ifdef HAVE_SYS_IOCTL_H
1011 #ifdef BLKGETSIZE64
1012         DEBUG("looking for export size with ioctl BLKGETSIZE64\n");
1013         if (!ioctl(fhandle, BLKGETSIZE64, &bytes) && bytes) {
1014                 return (off_t)bytes;
1015         }
1016 #endif /* BLKGETSIZE64 */
1017 #endif /* HAVE_SYS_IOCTL_H */
1018 #endif /* HAVE_SYS_MOUNT_H */
1019
1020         DEBUG("looking for fhandle size with fstat\n");
1021         stat_buf.st_size = 0;
1022         error = fstat(fhandle, &stat_buf);
1023         if (!error) {
1024                 if(stat_buf.st_size > 0)
1025                         return (off_t)stat_buf.st_size;
1026         } else {
1027                 err("fstat failed: %m");
1028         }
1029
1030         DEBUG("looking for fhandle size with lseek SEEK_END\n");
1031         es = lseek(fhandle, (off_t)0, SEEK_END);
1032         if (es > ((off_t)0)) {
1033                 return es;
1034         } else {
1035                 DEBUG("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4)));
1036         }
1037
1038         err("Could not find size of exported block device: %m");
1039         return OFFT_MAX;
1040 }
1041
1042 /**
1043  * Get the file handle and offset, given an export offset.
1044  *
1045  * @param export An array of export files
1046  * @param a The offset to get corresponding file/offset for
1047  * @param fhandle [out] File descriptor
1048  * @param foffset [out] Offset into fhandle
1049  * @param maxbytes [out] Tells how many bytes can be read/written
1050  * from fhandle starting at foffset (0 if there is no limit)
1051  * @return 0 on success, -1 on failure
1052  **/
1053 int get_filepos(GArray* export, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1054         /* Negative offset not allowed */
1055         if(a < 0)
1056                 return -1;
1057
1058         /* Binary search for last file with starting offset <= a */
1059         FILE_INFO fi;
1060         int start = 0;
1061         int end = export->len - 1;
1062         while( start <= end ) {
1063                 int mid = (start + end) / 2;
1064                 fi = g_array_index(export, FILE_INFO, mid);
1065                 if( fi.startoff < a ) {
1066                         start = mid + 1;
1067                 } else if( fi.startoff > a ) {
1068                         end = mid - 1;
1069                 } else {
1070                         start = end = mid;
1071                         break;
1072                 }
1073         }
1074
1075         /* end should never go negative, since first startoff is 0 and a >= 0 */
1076         g_assert(end >= 0);
1077
1078         fi = g_array_index(export, FILE_INFO, end);
1079         *fhandle = fi.fhandle;
1080         *foffset = a - fi.startoff;
1081         *maxbytes = 0;
1082         if( end+1 < export->len ) {
1083                 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1084                 *maxbytes = fi_next.startoff - a;
1085         }
1086
1087         return 0;
1088 }
1089
/**
 * Seek to an absolute position in a file, reporting a failure through err().
 * @param handle a filedescriptor
 * @param a position to seek to
 * @todo get rid of this; the original note says lastpoint is a global
 * variable that should rather be passed as a parameter (lastpoint is not
 * referenced by this function itself).
 **/
void myseek(int handle,off_t a) {
	const off_t reached = lseek(handle, a, SEEK_SET);

	if (reached < 0) {
		err("Can not seek locally!\n");
	}
}
1103
1104 /**
1105  * Write an amount of bytes at a given offset to the right file. This
1106  * abstracts the write-side of the multiple file option.
1107  *
1108  * @param a The offset where the write should start
1109  * @param buf The buffer to write from
1110  * @param len The length of buf
1111  * @param client The client we're serving for
1112  * @param fua Flag to indicate 'Force Unit Access'
1113  * @return The number of bytes actually written, or -1 in case of an error
1114  **/
1115 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1116         int fhandle;
1117         off_t foffset;
1118         size_t maxbytes;
1119         ssize_t retval;
1120
1121         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1122                 return -1;
1123         if(maxbytes && len > maxbytes)
1124                 len = maxbytes;
1125
1126         DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
1127
1128         myseek(fhandle, foffset);
1129         retval = write(fhandle, buf, len);
1130         if(client->server->flags & F_SYNC) {
1131                 fsync(fhandle);
1132         } else if (fua) {
1133
1134           /* This is where we would do the following
1135            *   #ifdef USE_SYNC_FILE_RANGE
1136            * However, we don't, for the reasons set out below
1137            * by Christoph Hellwig <hch@infradead.org>
1138            *
1139            * [BEGINS] 
1140            * fdatasync is equivalent to fsync except that it does not flush
1141            * non-essential metadata (basically just timestamps in practice), but it
1142            * does flush metadata requried to find the data again, e.g. allocation
1143            * information and extent maps.  sync_file_range does nothing but flush
1144            * out pagecache content - it means you basically won't get your data
1145            * back in case of a crash if you either:
1146            * 
1147            *  a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1148            *  b) are using a sparse file on a filesystem
1149            *  c) are using a fallocate-preallocated file on a filesystem
1150            *  d) use any file on a COW filesystem like btrfs
1151            * 
1152            * e.g. it only does anything useful for you if you do not have a volatile
1153            * write cache, and either use a raw block device node, or just overwrite
1154            * an already fully allocated (and not preallocated) file on a non-COW
1155            * filesystem.
1156            * [ENDS]
1157            *
1158            * What we should do is open a second FD with O_DSYNC set, then write to
1159            * that when appropriate. However, with a Linux client, every REQ_FUA
1160            * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1161            * problems.
1162            *
1163            */
1164 #if 0
1165                 sync_file_range(fhandle, foffset, len,
1166                                 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1167                                 SYNC_FILE_RANGE_WAIT_AFTER);
1168 #else
1169                 fdatasync(fhandle);
1170 #endif
1171         }
1172         return retval;
1173 }
1174
1175 /**
1176  * Call rawexpwrite repeatedly until all data has been written.
1177  *
1178  * @param a The offset where the write should start
1179  * @param buf The buffer to write from
1180  * @param len The length of buf
1181  * @param client The client we're serving for
1182  * @param fua Flag to indicate 'Force Unit Access'
1183  * @return 0 on success, nonzero on failure
1184  **/
1185 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1186         ssize_t ret=0;
1187
1188         while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
1189                 a += ret;
1190                 buf += ret;
1191                 len -= ret;
1192         }
1193         return (ret < 0 || len != 0);
1194 }
1195
1196 /**
1197  * Read an amount of bytes at a given offset from the right file. This
1198  * abstracts the read-side of the multiple files option.
1199  *
1200  * @param a The offset where the read should start
1201  * @param buf A buffer to read into
1202  * @param len The size of buf
1203  * @param client The client we're serving for
1204  * @return The number of bytes actually read, or -1 in case of an
1205  * error.
1206  **/
1207 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
1208         int fhandle;
1209         off_t foffset;
1210         size_t maxbytes;
1211
1212         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1213                 return -1;
1214         if(maxbytes && len > maxbytes)
1215                 len = maxbytes;
1216
1217         DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
1218
1219         myseek(fhandle, foffset);
1220         return read(fhandle, buf, len);
1221 }
1222
1223 /**
1224  * Call rawexpread repeatedly until all data has been read.
1225  * @return 0 on success, nonzero on failure
1226  **/
1227 int rawexpread_fully(off_t a, char *buf, size_t len, CLIENT *client) {
1228         ssize_t ret=0;
1229
1230         while(len > 0 && (ret=rawexpread(a, buf, len, client)) > 0 ) {
1231                 a += ret;
1232                 buf += ret;
1233                 len -= ret;
1234         }
1235         return (ret < 0 || len != 0);
1236 }
1237
1238 /**
1239  * Read an amount of bytes at a given offset from the right file. This
1240  * abstracts the read-side of the copyonwrite stuff, and calls
1241  * rawexpread() with the right parameters to do the actual work.
1242  * @param a The offset where the read should start
1243  * @param buf A buffer to read into
1244  * @param len The size of buf
1245  * @param client The client we're going to read for
1246  * @return 0 on success, nonzero on failure
1247  **/
1248 int expread(off_t a, char *buf, size_t len, CLIENT *client) {
1249         off_t rdlen, offset;
1250         off_t mapcnt, mapl, maph, pagestart;
1251
1252         if (!(client->server->flags & F_COPYONWRITE))
1253                 return(rawexpread_fully(a, buf, len, client));
1254         DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1255
1256         mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1257
1258         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1259                 pagestart=mapcnt*DIFFPAGESIZE;
1260                 offset=a-pagestart;
1261                 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1262                         len : (size_t)DIFFPAGESIZE-offset;
1263                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1264                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1265                                (unsigned long)(client->difmap[mapcnt]));
1266                         myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1267                         if (read(client->difffile, buf, rdlen) != rdlen) return -1;
1268                 } else { /* the block is not there */
1269                         DEBUG("Page %llu is not here, we read the original one\n",
1270                                (unsigned long long)mapcnt);
1271                         if(rawexpread_fully(a, buf, rdlen, client)) return -1;
1272                 }
1273                 len-=rdlen; a+=rdlen; buf+=rdlen;
1274         }
1275         return 0;
1276 }
1277
1278 /**
1279  * Write an amount of bytes at a given offset to the right file. This
1280  * abstracts the write-side of the copyonwrite option, and calls
1281  * rawexpwrite() with the right parameters to do the actual work.
1282  *
1283  * @param a The offset where the write should start
1284  * @param buf The buffer to write from
1285  * @param len The length of buf
1286  * @param client The client we're going to write for.
1287  * @param fua Flag to indicate 'Force Unit Access'
1288  * @return 0 on success, nonzero on failure
1289  **/
1290 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1291         char pagebuf[DIFFPAGESIZE];
1292         off_t mapcnt,mapl,maph;
1293         off_t wrlen,rdlen; 
1294         off_t pagestart;
1295         off_t offset;
1296
1297         if (!(client->server->flags & F_COPYONWRITE))
1298                 return(rawexpwrite_fully(a, buf, len, client, fua)); 
1299         DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1300
1301         mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1302
1303         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1304                 pagestart=mapcnt*DIFFPAGESIZE ;
1305                 offset=a-pagestart ;
1306                 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1307                         len : (size_t)DIFFPAGESIZE-offset;
1308
1309                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1310                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1311                                (unsigned long)(client->difmap[mapcnt])) ;
1312                         myseek(client->difffile,
1313                                         client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1314                         if (write(client->difffile, buf, wrlen) != wrlen) return -1 ;
1315                 } else { /* the block is not there */
1316                         myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ;
1317                         client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1318                         DEBUG("Page %llu is not here, we put it at %lu\n",
1319                                (unsigned long long)mapcnt,
1320                                (unsigned long)(client->difmap[mapcnt]));
1321                         rdlen=DIFFPAGESIZE ;
1322                         if (rawexpread_fully(pagestart, pagebuf, rdlen, client))
1323                                 return -1;
1324                         memcpy(pagebuf+offset,buf,wrlen) ;
1325                         if (write(client->difffile, pagebuf, DIFFPAGESIZE) !=
1326                                         DIFFPAGESIZE)
1327                                 return -1;
1328                 }                                                   
1329                 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
1330         }
1331         if (client->server->flags & F_SYNC) {
1332                 fsync(client->difffile);
1333         } else if (fua) {
1334                 /* open question: would it be cheaper to do multiple sync_file_ranges?
1335                    as we iterate through the above?
1336                  */
1337                 fdatasync(client->difffile);
1338         }
1339         return 0;
1340 }
1341
1342 /**
1343  * Flush data to a client
1344  *
1345  * @param client The client we're going to write for.
1346  * @return 0 on success, nonzero on failure
1347  **/
1348 int expflush(CLIENT *client) {
1349         gint i;
1350
1351         if (client->server->flags & F_COPYONWRITE) {
1352                 return fsync(client->difffile);
1353         }
1354         
1355         for (i = 0; i < client->export->len; i++) {
1356                 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1357                 if (fsync(fi.fhandle) < 0)
1358                         return -1;
1359         }
1360         
1361         return 0;
1362 }
1363
1364 /**
1365  * Do the initial negotiation.
1366  *
1367  * @param client The client we're negotiating with.
1368  **/
1369 CLIENT* negotiate(int net, CLIENT *client, GArray* servers, int phase) {
1370         char zeros[128];
1371         uint64_t size_host;
1372         uint32_t flags = NBD_FLAG_HAS_FLAGS;
1373         uint16_t smallflags = 0;
1374         uint64_t magic;
1375
1376         memset(zeros, '\0', sizeof(zeros));
1377         if(phase & NEG_INIT) {
1378                 /* common */
1379                 if (write(net, INIT_PASSWD, 8) < 0) {
1380                         err_nonfatal("Negotiation failed: %m");
1381                         if(client)
1382                                 exit(EXIT_FAILURE);
1383                 }
1384                 if(phase & NEG_MODERN) {
1385                         /* modern */
1386                         magic = htonll(opts_magic);
1387                 } else {
1388                         /* oldstyle */
1389                         magic = htonll(cliserv_magic);
1390                 }
1391                 if (write(net, &magic, sizeof(magic)) < 0) {
1392                         err_nonfatal("Negotiation failed: %m");
1393                         if(phase & NEG_OLD)
1394                                 exit(EXIT_FAILURE);
1395                 }
1396         }
1397         if ((phase & NEG_MODERN) && (phase & NEG_INIT)) {
1398                 /* modern */
1399                 uint32_t reserved;
1400                 uint32_t opt;
1401                 uint32_t namelen;
1402                 char* name;
1403                 int i;
1404
1405                 if(!servers)
1406                         err("programmer error");
1407                 if (write(net, &smallflags, sizeof(uint16_t)) < 0)
1408                         err("Negotiation failed: %m");
1409                 if (read(net, &reserved, sizeof(reserved)) < 0)
1410                         err("Negotiation failed: %m");
1411                 if (read(net, &magic, sizeof(magic)) < 0)
1412                         err("Negotiation failed: %m");
1413                 magic = ntohll(magic);
1414                 if(magic != opts_magic) {
1415                         close(net);
1416                         return NULL;
1417                 }
1418                 if (read(net, &opt, sizeof(opt)) < 0)
1419                         err("Negotiation failed: %m");
1420                 opt = ntohl(opt);
1421                 if(opt != NBD_OPT_EXPORT_NAME) {
1422                         close(net);
1423                         return NULL;
1424                 }
1425                 if (read(net, &namelen, sizeof(namelen)) < 0)
1426                         err("Negotiation failed: %m");
1427                 namelen = ntohl(namelen);
1428                 name = malloc(namelen+1);
1429                 name[namelen]=0;
1430                 if (read(net, name, namelen) < 0)
1431                         err("Negotiation failed: %m");
1432                 for(i=0; i<servers->len; i++) {
1433                         SERVER* serve = &(g_array_index(servers, SERVER, i));
1434                         if(!strcmp(serve->servename, name)) {
1435                                 CLIENT* client = g_new0(CLIENT, 1);
1436                                 client->server = serve;
1437                                 client->exportsize = OFFT_MAX;
1438                                 client->net = net;
1439                                 client->modern = TRUE;
1440                                 client->transactionlogfd = -1;
1441                                 free(name);
1442                                 return client;
1443                         }
1444                 }
1445                 free(name);
1446                 return NULL;
1447         }
1448         /* common */
1449         size_host = htonll((u64)(client->exportsize));
1450         if (write(net, &size_host, 8) < 0)
1451                 err("Negotiation failed: %m");
1452         if (client->server->flags & F_READONLY)
1453                 flags |= NBD_FLAG_READ_ONLY;
1454         if (client->server->flags & F_FLUSH)
1455                 flags |= NBD_FLAG_SEND_FLUSH;
1456         if (client->server->flags & F_FUA)
1457                 flags |= NBD_FLAG_SEND_FUA;
1458         if (client->server->flags & F_ROTATIONAL)
1459                 flags |= NBD_FLAG_ROTATIONAL;
1460         if (phase & NEG_OLD) {
1461                 /* oldstyle */
1462                 flags = htonl(flags);
1463                 if (write(client->net, &flags, 4) < 0)
1464                         err("Negotiation failed: %m");
1465         } else {
1466                 /* modern */
1467                 smallflags = (uint16_t)(flags & ~((uint16_t)0));
1468                 smallflags = htons(smallflags);
1469                 if (write(client->net, &smallflags, sizeof(smallflags)) < 0) {
1470                         err("Negotiation failed: %m");
1471                 }
1472         }
1473         /* common */
1474         if (write(client->net, zeros, 124) < 0)
1475                 err("Negotiation failed: %m");
1476         return NULL;
1477 }
1478
/**
 * Sending macro: write `reply' to the socket `net' and, when the client has
 * a transaction log open (transactionlogfd != -1), write the same bytes to
 * the log as well.
 *
 * Note that this macro reads a variable named `client' from the scope it is
 * expanded in; it is not one of the macro's arguments.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement,
 * including inside unbraced if/else bodies.
 */
#define SEND(net,reply) do { \
	writeit((net), &(reply), sizeof(reply)); \
	if (client->transactionlogfd != -1) \
		writeit(client->transactionlogfd, &(reply), sizeof(reply)); \
} while (0)
/**
 * Error macro: send `reply' with its error field set to `errcode'
 * (converted with htonl), then reset the error field to 0 so that the
 * same reply structure can be reused for the next request.
 */
#define ERROR(client,reply,errcode) do { \
	(reply).error = htonl(errcode); \
	SEND((client)->net, reply); \
	(reply).error = 0; \
} while (0)
1485 /**
1486  * Serve a file to a single client.
1487  *
1488  * @todo This beast needs to be split up in many tiny little manageable
1489  * pieces. Preferably with a chainsaw.
1490  *
1491  * @param client The client we're going to serve to.
1492  * @return when the client disconnects
1493  **/
1494 int mainloop(CLIENT *client) {
1495         struct nbd_request request;
1496         struct nbd_reply reply;
1497         gboolean go_on=TRUE;
1498 #ifdef DODBG
1499         int i = 0;
1500 #endif
1501         negotiate(client->net, client, NULL, client->modern ? NEG_MODERN : (NEG_OLD | NEG_INIT));
1502         DEBUG("Entering request loop!\n");
1503         reply.magic = htonl(NBD_REPLY_MAGIC);
1504         reply.error = 0;
1505         while (go_on) {
1506                 char buf[BUFSIZE];
1507                 char* p;
1508                 size_t len;
1509                 size_t currlen;
1510                 size_t writelen;
1511                 uint16_t command;
1512 #ifdef DODBG
1513                 i++;
1514                 printf("%d: ", i);
1515 #endif
1516                 readit(client->net, &request, sizeof(request));
1517                 if (client->transactionlogfd != -1)
1518                         writeit(client->transactionlogfd, &request, sizeof(request));
1519
1520                 request.from = ntohll(request.from);
1521                 request.type = ntohl(request.type);
1522                 command = request.type & NBD_CMD_MASK_COMMAND;
1523                 len = ntohl(request.len);
1524
1525                 DEBUG("%s from %llu (%llu) len %d, ", getcommandname(command),
1526                                 (unsigned long long)request.from,
1527                                 (unsigned long long)request.from / 512, (unsigned int)len);
1528
1529                 if (request.magic != htonl(NBD_REQUEST_MAGIC))
1530                         err("Not enough magic.");
1531
1532                 memcpy(reply.handle, request.handle, sizeof(reply.handle));
1533
1534                 if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) {
1535                         if ((request.from + len) > (OFFT_MAX)) {
1536                                 DEBUG("[Number too large!]");
1537                                 ERROR(client, reply, EINVAL);
1538                                 continue;
1539                         }
1540
1541                         if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
1542                                 DEBUG("[RANGE!]");
1543                                 ERROR(client, reply, EINVAL);
1544                                 continue;
1545                         }
1546
1547                         currlen = len;
1548                         if (currlen > BUFSIZE - sizeof(struct nbd_reply)) {
1549                                 currlen = BUFSIZE - sizeof(struct nbd_reply);
1550                                 msg2(LOG_INFO, "oversized request (this is not a problem)");
1551                         }
1552                 }
1553
1554                 switch (command) {
1555
1556                 case NBD_CMD_DISC:
1557                         msg2(LOG_INFO, "Disconnect request received.");
1558                         if (client->server->flags & F_COPYONWRITE) { 
1559                                 if (client->difmap) g_free(client->difmap) ;
1560                                 close(client->difffile);
1561                                 unlink(client->difffilename);
1562                                 free(client->difffilename);
1563                         }
1564                         go_on=FALSE;
1565                         continue;
1566
1567                 case NBD_CMD_WRITE:
1568                         DEBUG("wr: net->buf, ");
1569                         while(len > 0) {
1570                                 readit(client->net, buf, currlen);
1571                                 DEBUG("buf->exp, ");
1572                                 if ((client->server->flags & F_READONLY) ||
1573                                     (client->server->flags & F_AUTOREADONLY)) {
1574                                         DEBUG("[WRITE to READONLY!]");
1575                                         ERROR(client, reply, EPERM);
1576                                         consume(client->net, buf, len-currlen, BUFSIZE);
1577                                         continue;
1578                                 }
1579                                 if (expwrite(request.from, buf, currlen, client,
1580                                              request.type & NBD_CMD_FLAG_FUA)) {
1581                                         DEBUG("Write failed: %m" );
1582                                         ERROR(client, reply, errno);
1583                                         consume(client->net, buf, len-currlen, BUFSIZE);
1584                                         continue;
1585                                 }
1586                                 len -= currlen;
1587                                 request.from += currlen;
1588                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1589                         }
1590                         SEND(client->net, reply);
1591                         DEBUG("OK!\n");
1592                         continue;
1593
1594                 case NBD_CMD_FLUSH:
1595                         DEBUG("fl: ");
1596                         if (expflush(client)) {
1597                                 DEBUG("Flush failed: %m");
1598                                 ERROR(client, reply, errno);
1599                                 continue;
1600                         }
1601                         SEND(client->net, reply);
1602                         DEBUG("OK!\n");
1603                         continue;
1604
1605                 case NBD_CMD_READ:
1606                         DEBUG("exp->buf, ");
1607                         memcpy(buf, &reply, sizeof(struct nbd_reply));
1608                         if (client->transactionlogfd != -1)
1609                                 writeit(client->transactionlogfd, &reply, sizeof(reply));
1610                         p = buf + sizeof(struct nbd_reply);
1611                         writelen = currlen + sizeof(struct nbd_reply);
1612                         while(len > 0) {
1613                                 if (expread(request.from, p, currlen, client)) {
1614                                         DEBUG("Read failed: %m");
1615                                         ERROR(client, reply, errno);
1616                                         continue;
1617                                 }
1618                                 
1619                                 DEBUG("buf->net, ");
1620                                 writeit(client->net, buf, writelen);
1621                                 len -= currlen;
1622                                 request.from += currlen;
1623                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1624                                 p = buf;
1625                                 writelen = currlen;
1626                         }
1627                         DEBUG("OK!\n");
1628                         continue;
1629
1630                 default:
1631                         DEBUG ("Ignoring unknown command\n");
1632                         continue;
1633                 }
1634         }
1635         return 0;
1636 }
1637
1638 /**
1639  * Set up client export array, which is an array of FILE_INFO.
1640  * Also, split a single exportfile into multiple ones, if that was asked.
1641  * @param client information on the client which we want to setup export for
1642  **/
1643 void setupexport(CLIENT* client) {
1644         int i;
1645         off_t laststartoff = 0, lastsize = 0;
1646         int multifile = (client->server->flags & F_MULTIFILE);
1647
1648         client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
1649
1650         /* If multi-file, open as many files as we can.
1651          * If not, open exactly one file.
1652          * Calculate file sizes as we go to get total size. */
1653         for(i=0; ; i++) {
1654                 FILE_INFO fi;
1655                 gchar *tmpname;
1656                 gchar* error_string;
1657                 mode_t mode = (client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR;
1658
1659                 if(multifile) {
1660                         tmpname=g_strdup_printf("%s.%d", client->exportname, i);
1661                 } else {
1662                         tmpname=g_strdup(client->exportname);
1663                 }
1664                 DEBUG( "Opening %s\n", tmpname );
1665                 fi.fhandle = open(tmpname, mode);
1666                 if(fi.fhandle == -1 && mode == O_RDWR) {
1667                         /* Try again because maybe media was read-only */
1668                         fi.fhandle = open(tmpname, O_RDONLY);
1669                         if(fi.fhandle != -1) {
1670                                 /* Opening the base file in copyonwrite mode is
1671                                  * okay */
1672                                 if(!(client->server->flags & F_COPYONWRITE)) {
1673                                         client->server->flags |= F_AUTOREADONLY;
1674                                         client->server->flags |= F_READONLY;
1675                                 }
1676                         }
1677                 }
1678                 if(fi.fhandle == -1) {
1679                         if(multifile && i>0)
1680                                 break;
1681                         error_string=g_strdup_printf(
1682                                 "Could not open exported file %s: %%m",
1683                                 tmpname);
1684                         err(error_string);
1685                 }
1686                 fi.startoff = laststartoff + lastsize;
1687                 g_array_append_val(client->export, fi);
1688                 g_free(tmpname);
1689
1690                 /* Starting offset and size of this file will be used to
1691                  * calculate starting offset of next file */
1692                 laststartoff = fi.startoff;
1693                 lastsize = size_autodetect(fi.fhandle);
1694
1695                 if(!multifile)
1696                         break;
1697         }
1698
1699         /* Set export size to total calculated size */
1700         client->exportsize = laststartoff + lastsize;
1701
1702         /* Export size may be overridden */
1703         if(client->server->expected_size) {
1704                 /* desired size must be <= total calculated size */
1705                 if(client->server->expected_size > client->exportsize) {
1706                         err("Size of exported file is too big\n");
1707                 }
1708
1709                 client->exportsize = client->server->expected_size;
1710         }
1711
1712         msg3(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
1713         if(multifile) {
1714                 msg3(LOG_INFO, "Total number of files: %d", i);
1715         }
1716 }
1717
1718 int copyonwrite_prepare(CLIENT* client) {
1719         off_t i;
1720         if ((client->difffilename = malloc(1024))==NULL)
1721                 err("Failed to allocate string for diff file name");
1722         snprintf(client->difffilename, 1024, "%s-%s-%d.diff",client->exportname,client->clientname,
1723                 (int)getpid()) ;
1724         client->difffilename[1023]='\0';
1725         msg3(LOG_INFO,"About to create map and diff file %s",client->difffilename) ;
1726         client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
1727         if (client->difffile<0) err("Could not create diff file (%m)") ;
1728         if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL)
1729                 err("Could not allocate memory") ;
1730         for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1 ;
1731
1732         return 0;
1733 }
1734
1735 /**
1736  * Run a command. This is used for the ``prerun'' and ``postrun'' config file
1737  * options
1738  *
1739  * @param command the command to be ran. Read from the config file
1740  * @param file the file name we're about to export
1741  **/
1742 int do_run(gchar* command, gchar* file) {
1743         gchar* cmd;
1744         int retval=0;
1745
1746         if(command && *command) {
1747                 cmd = g_strdup_printf(command, file);
1748                 retval=system(cmd);
1749                 g_free(cmd);
1750         }
1751         return retval;
1752 }
1753
1754 /**
1755  * Serve a connection. 
1756  *
1757  * @todo allow for multithreading, perhaps use libevent. Not just yet, though;
1758  * follow the road map.
1759  *
1760  * @param client a connected client
1761  **/
1762 void serveconnection(CLIENT *client) {
1763         if (client->server->transactionlog && (client->transactionlogfd == -1))
1764         {
1765                 if (-1 == (client->transactionlogfd = open(client->server->transactionlog,
1766                                                            O_WRONLY | O_CREAT,
1767                                                            S_IRUSR | S_IWUSR)))
1768                         g_warning("Could not open transaction log %s",
1769                                   client->server->transactionlog);
1770         }
1771
1772         if(do_run(client->server->prerun, client->exportname)) {
1773                 exit(EXIT_FAILURE);
1774         }
1775         setupexport(client);
1776
1777         if (client->server->flags & F_COPYONWRITE) {
1778                 copyonwrite_prepare(client);
1779         }
1780
1781         setmysockopt(client->net);
1782
1783         mainloop(client);
1784         do_run(client->server->postrun, client->exportname);
1785
1786         if (-1 != client->transactionlogfd)
1787         {
1788                 close(client->transactionlogfd);
1789                 client->transactionlogfd = -1;
1790         }
1791 }
1792
1793 /**
1794  * Find the name of the file we have to serve. This will use g_strdup_printf
1795  * to put the IP address of the client inside a filename containing
1796  * "%s" (in the form as specified by the "virtstyle" option). That name
1797  * is then written to client->exportname.
1798  *
1799  * @param net A socket connected to an nbd client
1800  * @param client information about the client. The IP address in human-readable
1801  * format will be written to a new char* buffer, the address of which will be
1802  * stored in client->clientname.
1803  **/
1804 void set_peername(int net, CLIENT *client) {
1805         struct sockaddr_storage addrin;
1806         struct sockaddr_storage netaddr;
1807         struct sockaddr_in  *netaddr4 = NULL;
1808         struct sockaddr_in6 *netaddr6 = NULL;
1809         size_t addrinlen = sizeof( addrin );
1810         struct addrinfo hints;
1811         struct addrinfo *ai = NULL;
1812         char peername[NI_MAXHOST];
1813         char netname[NI_MAXHOST];
1814         char *tmp = NULL;
1815         int i;
1816         int e;
1817         int shift;
1818
1819         if (getpeername(net, (struct sockaddr *) &addrin, (socklen_t *)&addrinlen) < 0)
1820                 err("getsockname failed: %m");
1821
1822         getnameinfo((struct sockaddr *)&addrin, (socklen_t)addrinlen,
1823                 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST);
1824
1825         memset(&hints, '\0', sizeof (hints));
1826         hints.ai_flags = AI_ADDRCONFIG;
1827         e = getaddrinfo(peername, NULL, &hints, &ai);
1828
1829         if(e != 0) {
1830                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
1831                 freeaddrinfo(ai);
1832                 return;
1833         }
1834
1835         switch(client->server->virtstyle) {
1836                 case VIRT_NONE:
1837                         client->exportname=g_strdup(client->server->exportname);
1838                         break;
1839                 case VIRT_IPHASH:
1840                         for(i=0;i<strlen(peername);i++) {
1841                                 if(peername[i]=='.') {
1842                                         peername[i]='/';
1843                                 }
1844                         }
1845                 case VIRT_IPLIT:
1846                         client->exportname=g_strdup_printf(client->server->exportname, peername);
1847                         break;
1848                 case VIRT_CIDR:
1849                         memcpy(&netaddr, &addrin, addrinlen);
1850                         if(ai->ai_family == AF_INET) {
1851                                 netaddr4 = (struct sockaddr_in *)&netaddr;
1852                                 (netaddr4->sin_addr).s_addr>>=32-(client->server->cidrlen);
1853                                 (netaddr4->sin_addr).s_addr<<=32-(client->server->cidrlen);
1854
1855                                 getnameinfo((struct sockaddr *) netaddr4, (socklen_t) addrinlen,
1856                                                         netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1857                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1858                         }else if(ai->ai_family == AF_INET6) {
1859                                 netaddr6 = (struct sockaddr_in6 *)&netaddr;
1860
1861                                 shift = 128-(client->server->cidrlen);
1862                                 i = 3;
1863                                 while(shift >= 32) {
1864                                         ((netaddr6->sin6_addr).s6_addr32[i])=0;
1865                                         shift-=32;
1866                                         i--;
1867                                 }
1868                                 (netaddr6->sin6_addr).s6_addr32[i]>>=shift;
1869                                 (netaddr6->sin6_addr).s6_addr32[i]<<=shift;
1870
1871                                 getnameinfo((struct sockaddr *)netaddr6, (socklen_t)addrinlen,
1872                                             netname, sizeof(netname), NULL, 0, NI_NUMERICHOST);
1873                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1874                         }
1875
1876                         if(tmp != NULL)
1877                           client->exportname=g_strdup_printf(client->server->exportname, tmp);
1878
1879                         break;
1880         }
1881
1882         freeaddrinfo(ai);
1883         msg4(LOG_INFO, "connect from %s, assigned file is %s", 
1884              peername, client->exportname);
1885         client->clientname=g_strdup(peername);
1886 }
1887
1888 /**
1889  * Destroy a pid_t*
1890  * @param data a pointer to pid_t which should be freed
1891  **/
1892 void destroy_pid_t(gpointer data) {
1893         g_free(data);
1894 }
1895
1896 /**
1897  * Loop through the available servers, and serve them. Never returns.
1898  **/
1899 int serveloop(GArray* servers) {
1900         struct sockaddr_storage addrin;
1901         socklen_t addrinlen=sizeof(addrin);
1902         int i;
1903         int max;
1904         int sock;
1905         fd_set mset;
1906         fd_set rset;
1907
1908         /* 
1909          * Set up the master fd_set. The set of descriptors we need
1910          * to select() for never changes anyway and it buys us a *lot*
1911          * of time to only build this once. However, if we ever choose
1912          * to not fork() for clients anymore, we may have to revisit
1913          * this.
1914          */
1915         max=0;
1916         FD_ZERO(&mset);
1917         for(i=0;i<servers->len;i++) {
1918                 if((sock=(g_array_index(servers, SERVER, i)).socket)) {
1919                         FD_SET(sock, &mset);
1920                         max=sock>max?sock:max;
1921                 }
1922         }
1923         if(modernsock) {
1924                 FD_SET(modernsock, &mset);
1925                 max=modernsock>max?modernsock:max;
1926         }
1927         for(;;) {
1928                 CLIENT *client = NULL;
1929                 pid_t *pid;
1930
1931                 memcpy(&rset, &mset, sizeof(fd_set));
1932                 if(select(max+1, &rset, NULL, NULL, NULL)>0) {
1933                         int net = 0;
1934                         SERVER* serve=NULL;
1935
1936                         DEBUG("accept, ");
1937                         if(FD_ISSET(modernsock, &rset)) {
1938                                 if((net=accept(modernsock, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1939                                         err("accept: %m");
1940                                 client = negotiate(net, NULL, servers, NEG_INIT | NEG_MODERN);
1941                                 if(!client) {
1942                                         err_nonfatal("negotiation failed");
1943                                         close(net);
1944                                         net=0;
1945                                         continue;
1946                                 }
1947                                 serve = client->server;
1948                         }
1949                         for(i=0;i<servers->len && !net;i++) {
1950                                 serve=&(g_array_index(servers, SERVER, i));
1951                                 if(FD_ISSET(serve->socket, &rset)) {
1952                                         if ((net=accept(serve->socket, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1953                                                 err("accept: %m");
1954                                 }
1955                         }
1956                         if(net) {
1957                                 int sock_flags;
1958
1959                                 if(serve->max_connections > 0 &&
1960                                    g_hash_table_size(children) >= serve->max_connections) {
1961                                         msg2(LOG_INFO, "Max connections reached");
1962                                         close(net);
1963                                         continue;
1964                                 }
1965                                 if((sock_flags = fcntl(net, F_GETFL, 0))==-1) {
1966                                         err("fcntl F_GETFL");
1967                                 }
1968                                 if(fcntl(net, F_SETFL, sock_flags &~O_NONBLOCK)==-1) {
1969                                         err("fcntl F_SETFL ~O_NONBLOCK");
1970                                 }
1971                                 if(!client) {
1972                                         client = g_new0(CLIENT, 1);
1973                                         client->server=serve;
1974                                         client->exportsize=OFFT_MAX;
1975                                         client->net=net;
1976                                         client->transactionlogfd = -1;
1977                                 }
1978                                 set_peername(net, client);
1979                                 if (!authorized_client(client)) {
1980                                         msg2(LOG_INFO,"Unauthorized client") ;
1981                                         close(net);
1982                                         continue;
1983                                 }
1984                                 msg2(LOG_INFO,"Authorized client") ;
1985                                 pid=g_malloc(sizeof(pid_t));
1986
1987                                 if (!dontfork) {
1988                                         if ((*pid=fork())<0) {
1989                                                 msg3(LOG_INFO,"Could not fork (%s)",strerror(errno)) ;
1990                                                 close(net);
1991                                                 continue;
1992                                         }
1993                                         if (*pid>0) { /* parent */
1994                                                 close(net);
1995                                                 g_hash_table_insert(children, pid, pid);
1996                                                 continue;
1997                                         }
1998                                         /* child */
1999                                         g_hash_table_destroy(children);
2000                                         for(i=0;i<servers->len;i++) {
2001                                                 serve=&g_array_index(servers, SERVER, i);
2002                                                 close(serve->socket);
2003                                         }
2004                                         /* FALSE does not free the
2005                                            actual data. This is required,
2006                                            because the client has a
2007                                            direct reference into that
2008                                            data, and otherwise we get a
2009                                            segfault... */
2010                                         g_array_free(servers, FALSE);
2011                                 }
2012
2013                                 msg2(LOG_INFO,"Starting to serve");
2014                                 serveconnection(client);
2015                                 exit(EXIT_SUCCESS);
2016                         }
2017                 }
2018         }
2019 }
2020
/**
 * Apply our standard options to a listening socket: SO_REUSEADDR and
 * SO_KEEPALIVE are switched on and the descriptor is made non-blocking.
 * Any failure is reported through err().
 *
 * @param socket the socket descriptor to configure
 **/
void dosockopts(int socket) {
#ifndef sun
	int yes=1;
#else
	char yes='1';
#endif /* sun */
	int sock_flags;

	/* lose the pesky "Address already in use" error message */
	/* The option length is sizeof(yes), not sizeof(int): on sun the
	 * value is a single char and a larger length would read past it. */
	if (setsockopt(socket,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(yes)) == -1) {
		err("setsockopt SO_REUSEADDR");
	}
	if (setsockopt(socket,SOL_SOCKET,SO_KEEPALIVE,&yes,sizeof(yes)) == -1) {
		err("setsockopt SO_KEEPALIVE");
	}

	/* make the listening socket non-blocking */
	if ((sock_flags = fcntl(socket, F_GETFL, 0)) == -1) {
		err("fcntl F_GETFL");
	}
	if (fcntl(socket, F_SETFL, sock_flags | O_NONBLOCK) == -1) {
		err("fcntl F_SETFL O_NONBLOCK");
	}
}
2045
2046 /**
2047  * Connect a server's socket.
2048  *
2049  * @param serve the server we want to connect.
2050  **/
2051 int setup_serve(SERVER *serve) {
2052         struct addrinfo hints;
2053         struct addrinfo *ai = NULL;
2054         gchar *port = NULL;
2055         int e;
2056
2057         if(!do_oldstyle) {
2058                 return serve->servename ? 1 : 0;
2059         }
2060         memset(&hints,'\0',sizeof(hints));
2061         hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG | AI_NUMERICSERV;
2062         hints.ai_socktype = SOCK_STREAM;
2063         hints.ai_family = serve->socket_family;
2064
2065         port = g_strdup_printf ("%d", serve->port);
2066         if (port == NULL)
2067                 return 0;
2068
2069         e = getaddrinfo(serve->listenaddr,port,&hints,&ai);
2070
2071         g_free(port);
2072
2073         if(e != 0) {
2074                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
2075                 serve->socket = -1;
2076                 freeaddrinfo(ai);
2077                 exit(EXIT_FAILURE);
2078         }
2079
2080         if(serve->socket_family == AF_UNSPEC)
2081                 serve->socket_family = ai->ai_family;
2082
2083 #ifdef WITH_SDP
2084         if ((serve->flags) && F_SDP) {
2085                 if (ai->ai_family == AF_INET)
2086                         ai->ai_family = AF_INET_SDP;
2087                 else (ai->ai_family == AF_INET6)
2088                         ai->ai_family = AF_INET6_SDP;
2089         }
2090 #endif
2091         if ((serve->socket = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) < 0)
2092                 err("socket: %m");
2093
2094         dosockopts(serve->socket);
2095
2096         DEBUG("Waiting for connections... bind, ");
2097         e = bind(serve->socket, ai->ai_addr, ai->ai_addrlen);
2098         if (e != 0 && errno != EADDRINUSE)
2099                 err("bind: %m");
2100         DEBUG("listen, ");
2101         if (listen(serve->socket, 1) < 0)
2102                 err("listen: %m");
2103
2104         freeaddrinfo (ai);
2105         if(serve->servename) {
2106                 return 1;
2107         } else {
2108                 return 0;
2109         }
2110 }
2111
/**
 * Open the shared listening socket: resolve modern_listen/modernport,
 * create a TCP socket for the first result, configure it with
 * dosockopts(), bind it and start listening. The descriptor is stored in
 * the global modernsock. A resolution failure exits the process; other
 * failures are reported through err().
 **/
void open_modern(void) {
	struct addrinfo hints;
	struct addrinfo* ai = NULL;
	int e;

	memset(&hints, '\0', sizeof(hints));
	hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_family = AF_UNSPEC;
	hints.ai_protocol = IPPROTO_TCP;
	e = getaddrinfo(modern_listen, modernport, &hints, &ai);
	if(e != 0) {
		fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
		exit(EXIT_FAILURE);
	}
	if((modernsock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol))<0) {
		err("socket: %m");
	}

	dosockopts(modernsock);

	if(bind(modernsock, ai->ai_addr, ai->ai_addrlen)) {
		err("bind: %m");
	}
	if(listen(modernsock, 10) <0) {
		err("listen: %m");
	}

	freeaddrinfo(ai);
}
2143
2144 /**
2145  * Connect our servers.
2146  **/
2147 void setup_servers(GArray* servers) {
2148         int i;
2149         struct sigaction sa;
2150         int want_modern=0;
2151
2152         for(i=0;i<servers->len;i++) {
2153                 want_modern |= setup_serve(&(g_array_index(servers, SERVER, i)));
2154         }
2155         if(want_modern) {
2156                 open_modern();
2157         }
2158         children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
2159
2160         sa.sa_handler = sigchld_handler;
2161         sigemptyset(&sa.sa_mask);
2162         sa.sa_flags = SA_RESTART;
2163         if(sigaction(SIGCHLD, &sa, NULL) == -1)
2164                 err("sigaction: %m");
2165         sa.sa_handler = sigterm_handler;
2166         sigemptyset(&sa.sa_mask);
2167         sa.sa_flags = SA_RESTART;
2168         if(sigaction(SIGTERM, &sa, NULL) == -1)
2169                 err("sigaction: %m");
2170 }
2171
2172 /**
2173  * Go daemon (unless we specified at compile time that we didn't want this)
2174  * @param serve the first server of our configuration. If its port is zero,
2175  *      then do not daemonize, because we're doing inetd then. This parameter
2176  *      is only used to create a PID file of the form
2177  *      /var/run/nbd-server.&lt;port&gt;.pid; it's not modified in any way.
2178  **/
2179 #if !defined(NODAEMON)
2180 void daemonize(SERVER* serve) {
2181         FILE*pidf;
2182
2183         if(serve && !(serve->port)) {
2184                 return;
2185         }
2186         if(daemon(0,0)<0) {
2187                 err("daemon");
2188         }
2189         if(!*pidftemplate) {
2190                 if(serve) {
2191                         strncpy(pidftemplate, "/var/run/nbd-server.%d.pid", 255);
2192                 } else {
2193                         strncpy(pidftemplate, "/var/run/nbd-server.pid", 255);
2194                 }
2195         }
2196         snprintf(pidfname, 255, pidftemplate, serve ? serve->port : 0);
2197         pidf=fopen(pidfname, "w");
2198         if(pidf) {
2199                 fprintf(pidf,"%d\n", (int)getpid());
2200                 fclose(pidf);
2201         } else {
2202                 perror("fopen");
2203                 fprintf(stderr, "Not fatal; continuing");
2204         }
2205 }
2206 #else
2207 #define daemonize(serve)
2208 #endif /* !defined(NODAEMON) */
2209
2210 /*
2211  * Everything beyond this point (in the file) is run in non-daemon mode.
2212  * The stuff above daemonize() isn't.
2213  */
2214
2215 void serve_err(SERVER* serve, const char* msg) G_GNUC_NORETURN;
2216
2217 void serve_err(SERVER* serve, const char* msg) {
2218         g_message("Export of %s on port %d failed:", serve->exportname,
2219                         serve->port);
2220         err(msg);
2221 }
2222
2223 /**
2224  * Set up user-ID and/or group-ID
2225  **/
2226 void dousers(void) {
2227         struct passwd *pw;
2228         struct group *gr;
2229         gchar* str;
2230         if(rungroup) {
2231                 gr=getgrnam(rungroup);
2232                 if(!gr) {
2233                         str = g_strdup_printf("Invalid group name: %s", rungroup);
2234                         err(str);
2235                 }
2236                 if(setgid(gr->gr_gid)<0) {
2237                         err("Could not set GID: %m"); 
2238                 }
2239         }
2240         if(runuser) {
2241                 pw=getpwnam(runuser);
2242                 if(!pw) {
2243                         str = g_strdup_printf("Invalid user name: %s", runuser);
2244                         err(str);
2245                 }
2246                 if(setuid(pw->pw_uid)<0) {
2247                         err("Could not set UID: %m");
2248                 }
2249         }
2250 }
2251
2252 #ifndef ISSERVER
2253 void glib_message_syslog_redirect(const gchar *log_domain,
2254                                   GLogLevelFlags log_level,
2255                                   const gchar *message,
2256                                   gpointer user_data)
2257 {
2258     int level=LOG_DEBUG;
2259     
2260     switch( log_level )
2261     {
2262       case G_LOG_FLAG_FATAL:
2263       case G_LOG_LEVEL_CRITICAL:
2264       case G_LOG_LEVEL_ERROR:    
2265         level=LOG_ERR; 
2266         break;
2267       case G_LOG_LEVEL_WARNING:
2268         level=LOG_WARNING;
2269         break;
2270       case G_LOG_LEVEL_MESSAGE:
2271       case G_LOG_LEVEL_INFO:
2272         level=LOG_INFO;
2273         break;
2274       case G_LOG_LEVEL_DEBUG:
2275         level=LOG_DEBUG;
2276       default:
2277         level=LOG_ERR;
2278     }
2279     syslog(level, "%s", message);
2280 }
2281 #endif
2282
2283 /**
2284  * Main entry point...
2285  **/
2286 int main(int argc, char *argv[]) {
2287         SERVER *serve;
2288         GArray *servers;
2289         GError *err=NULL;
2290
2291         if (sizeof( struct nbd_request )!=28) {
2292                 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
2293                 exit(EXIT_FAILURE) ;
2294         }
2295
2296         memset(pidftemplate, '\0', 256);
2297
2298         logging();
2299         config_file_pos = g_strdup(CFILE);
2300         serve=cmdline(argc, argv);
2301         servers = parse_cfile(config_file_pos, &err);
2302         
2303         if(serve) {
2304                 serve->socket_family = AF_UNSPEC;
2305
2306                 append_serve(serve, servers);
2307      
2308                 if (!(serve->port)) {
2309                         CLIENT *client;
2310 #ifndef ISSERVER
2311                         /* You really should define ISSERVER if you're going to use
2312                          * inetd mode, but if you don't, closing stdout and stderr
2313                          * (which inetd had connected to the client socket) will let it
2314                          * work. */
2315                         close(1);
2316                         close(2);
2317                         open("/dev/null", O_WRONLY);
2318                         open("/dev/null", O_WRONLY);
2319                         g_log_set_default_handler( glib_message_syslog_redirect, NULL );
2320 #endif
2321                         client=g_malloc(sizeof(CLIENT));
2322                         client->server=serve;
2323                         client->net=0;
2324                         client->exportsize=OFFT_MAX;
2325                         set_peername(0,client);
2326                         serveconnection(client);
2327                         return 0;
2328                 }
2329         }
2330     
2331         if(!servers || !servers->len) {
2332                 if(err && !(err->domain == g_quark_from_string("parse_cfile")
2333                                 && err->code == CFILE_NOTFOUND)) {
2334                         g_warning("Could not parse config file: %s", 
2335                                         err ? err->message : "Unknown error");
2336                 }
2337         }
2338         if(serve) {
2339                 g_warning("Specifying an export on the command line is deprecated.");
2340                 g_warning("Please use a configuration file instead.");
2341         }
2342
2343         if((!serve) && (!servers||!servers->len)) {
2344                 g_message("No configured exports; quitting.");
2345                 exit(EXIT_FAILURE);
2346         }
2347         if (!dontfork)
2348                 daemonize(serve);
2349         setup_servers(servers);
2350         dousers();
2351         serveloop(servers);
2352         return 0 ;
2353 }