Release 2.9.23
[nbd.git] / nbd-server.c
1 /*
2  * Network Block Device - server
3  *
4  * Copyright 1996-1998 Pavel Machek, distribute under GPL
5  *  <pavel@atrey.karlin.mff.cuni.cz>
6  * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7  * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
8  *
9  * Version 1.0 - hopefully 64-bit-clean
10  * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11  * Version 1.2 - autodetect size of block devices, thanx to Peter T. Breuer" <ptb@it.uc3m.es>
12  * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13  *      type, or don't have 64 bit file offsets by defining FS_32BIT
14  *      in compile options for nbd-server *only*. This can be done
15  *      with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16  *      original autoconf input file, or I would make it a configure
17  *      option.) Ken Yap <ken@nlc.net.au>.
18  * Version 1.6 - fix autodetection of block device size and really make 64 bit
19  *      clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20  * Version 2.0 - Version synchronised with client
21  * Version 2.1 - Reap zombie client processes when they exit. Removed
22  *      (uncommented) the _IO magic, it's no longer necessary. Wouter
23  *      Verhelst <wouter@debian.org>
24  * Version 2.2 - Auto switch to read-only mode (useful for floppies).
25  * Version 2.3 - Fixed code so that Large File Support works. This
26  *      removes the FS_32BIT compile-time directive; define
27  *      _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28  *      using FS_32BIT. This will allow you to use files >2GB instead of
29  *      having to use the -m option. Wouter Verhelst <wouter@debian.org>
30  * Version 2.4 - Added code to keep track of children, so that we can
31  *      properly kill them from initscripts. Add a call to daemon(),
32  *      so that processes don't think they have to wait for us, which is
33  *      interesting for initscripts as well. Wouter Verhelst
34  *      <wouter@debian.org>
35  * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36  *      zero after fork()ing, resulting in nbd-server going berserk
37  *      when it receives a signal with at least one child open. Wouter
38  *      Verhelst <wouter@debian.org>
39  * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40  *      rectified type of mainloop::size_host (sf.net bugs 814435 and
41  *      817385); close the PID file after writing to it, so that the
42  *      daemon can actually be found. Wouter Verhelst
43  *      <wouter@debian.org>
44  * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45  *      correctly put in network endianness. Many types were corrected
46  *      (size_t and off_t instead of int).  <vspaceg@sourceforge.net>
47  * Version 2.6 - Some code cleanup.
48  * Version 2.7 - Better build system.
49  * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a 
50  *      lot more work, but this is a start. Wouter Verhelst
51  *      <wouter@debian.org>
52  * 16/03/2010 - Add IPv6 support.
53  *      Kitt Tientanopajai <kitt@kitty.in.th>
54  *      Neutron Soutmun <neo.neutron@gmail.com>
55  *      Suriya Soutmun <darksolar@gmail.com>
56  */
57
58 /* Includes LFS defines, which defines behaviours of some of the following
59  * headers, so must come before those */
60 #include "lfs.h"
61
62 #include <sys/types.h>
63 #include <sys/socket.h>
64 #include <sys/stat.h>
65 #include <sys/select.h>         /* select */
66 #include <sys/wait.h>           /* wait */
67 #ifdef HAVE_SYS_IOCTL_H
68 #include <sys/ioctl.h>
69 #endif
70 #include <sys/param.h>
71 #ifdef HAVE_SYS_MOUNT_H
72 #include <sys/mount.h>          /* For BLKGETSIZE */
73 #endif
74 #include <signal.h>             /* sigaction */
75 #include <errno.h>
76 #include <netinet/tcp.h>
77 #include <netinet/in.h>
78 #include <netdb.h>
79 #include <syslog.h>
80 #include <unistd.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 #include <fcntl.h>
85 #include <arpa/inet.h>
86 #include <strings.h>
87 #include <dirent.h>
88 #include <unistd.h>
89 #include <getopt.h>
90 #include <pwd.h>
91 #include <grp.h>
92
93 #include <glib.h>
94
95 /* used in cliserv.h, so must come first */
96 #define MY_NAME "nbd_server"
97 #include "cliserv.h"
98
99 #ifdef WITH_SDP
100 #include <sdp_inet.h>
101 #endif
102
/**
 * Default position of the config file. SYSCONFDIR may be supplied by the
 * build; when it is not, "/etc" is used.
 */
#ifndef SYSCONFDIR
#define SYSCONFDIR "/etc"
#endif
#define CFILE SYSCONFDIR "/nbd-server/config"

/** Where our config file actually is (replaced by the -C option) */
gchar* config_file_pos;

/** What user we're running as */
gchar* runuser=NULL;
/** What group we're running as */
gchar* rungroup=NULL;
/** Whether to export using the old negotiation protocol (port-based) */
gboolean do_oldstyle=FALSE;

/** Whether we should avoid forking (set by the -d option) */
int dontfork = 0;

/**
 * Logging macros; nothing goes to syslog unless ISSERVER is defined.
 * Without ISSERVER the priority argument (a) is dropped and the remaining
 * arguments are handed to g_message(), with (b) used as the format string.
 */
#ifdef ISSERVER
#define msg2(a,b) syslog(a,b)
#define msg3(a,b,c) syslog(a,b,c)
#define msg4(a,b,c,d) syslog(a,b,c,d)
#else
#define msg2(a,b) g_message(b)
#define msg3(a,b,c) g_message(b,c)
#define msg4(a,b,c,d) g_message(b,c,d)
#endif

/* Debugging macros: DEBUG() is printf() when DODBG is defined and expands
 * to nothing otherwise. Uncomment the define below to enable it. */
//#define DODBG
#ifdef DODBG
#define DEBUG(...) printf(__VA_ARGS__)
#else
#define DEBUG(...)
#endif
/* Fall back to an empty version string when the build does not provide one */
#ifndef PACKAGE_VERSION
#define PACKAGE_VERSION ""
#endif
/**
 * The highest value a variable of type off_t can reach. off_t is a signed
 * integer, so this is every bit set except the leftmost one.
 *
 * The value is built in unsigned arithmetic and then converted: shifting a
 * signed 1 into the sign bit is undefined behaviour. The expansion is fully
 * parenthesized so it can be used inside any expression. Assumes off_t is
 * no wider than unsigned long long.
 **/
#define OFFT_MAX ((off_t)((1ULL<<(sizeof(off_t)*8-1))-1ULL))
#define LINELEN 256       /**< Size of static buffer used to read the
                               authorization file (yuck) */
#define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
#define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
#define F_READONLY 1      /**< flag to tell us a file is readonly */
#define F_MULTIFILE 2     /**< flag to tell us a file is exported using -m */
#define F_COPYONWRITE 4   /**< flag to tell us a file is exported using
                            copyonwrite */
#define F_AUTOREADONLY 8  /**< flag to tell us a file is set to autoreadonly */
#define F_SPARSE 16       /**< flag to tell us copyonwrite should use a sparse file */
#define F_SDP 32          /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */
#define F_SYNC 64         /**< Whether to fsync() after a write */
#define F_FLUSH 128       /**< Whether server wants FLUSH to be sent by the client */
#define F_FUA 256         /**< Whether server wants FUA to be sent by the client */
#define F_ROTATIONAL 512  /**< Whether server wants the client to implement the elevator algorithm */
#define F_TEMPORARY 1024  /**< Whether the backing file is temporary and should be created then unlinked */
GHashTable *children; /**< hash table of child processes; NOTE(review): key/value layout is not visible in this part of the file */
char pidfname[256]; /**< name of our PID file */
char pidftemplate[256]; /**< template to be used for the filename of the PID file */
char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */

/* Bit flags, one per negotiation variant. NOTE(review): their users are
 * outside this part of the file. */
#define NEG_INIT        (1 << 0)
#define NEG_OLD         (1 << 1)
#define NEG_MODERN      (1 << 2)

int modernsock=0;         /**< Socket for the modern handler. Not used
                               if a client was only specified on the
                               command line; only port used if
                               oldstyle is set to false (and then the
                               command-line client isn't used, gna gna) */
char* modern_listen;      /**< listenaddr value for modernsock */
char* modernport=NBD_DEFAULT_PORT; /**< Port number on which to listen for
                                      new-style nbd-client connections */

/**
 * Types of virtualization, i.e. how the address of the connecting client
 * is worked into the name of the exported file.
 **/
typedef enum {
        VIRT_NONE=0,    /**< No virtualization */
        VIRT_IPLIT,     /**< Literal IP address as part of the filename */
        VIRT_IPHASH,    /**< Replacing all dots in an ip address by a / before
                             doing the same as in IPLIT */
        VIRT_CIDR,      /**< Every subnet in its own directory */
} VIRT_STYLE;
192
/**
 * Variables associated with a server, i.e. one configured export.
 * The string members are heap-allocated when filled in by cmdline() and
 * dup_serve(), and released again by remove_server().
 **/
typedef struct {
        gchar* exportname;    /**< (unprocessed) filename of the file we're exporting */
        off_t expected_size; /**< size of the exported file as it was told to
                               us through configuration */
        gchar* listenaddr;   /**< The IP address we're listening on; NULL
                                  when none was given */
        unsigned int port;   /**< port we're exporting this file at */
        char* authname;      /**< filename of the authorization file */
        int flags;           /**< flags associated with this exported file
                                  (bitwise OR of the F_* values) */
        int socket;          /**< The socket of this server. */
        int socket_family;   /**< family of the socket */
        VIRT_STYLE virtstyle;/**< The style of virtualization, if any */
        uint8_t cidrlen;     /**< The length of the mask when we use
                                  CIDR-style virtualization */
        gchar* prerun;       /**< command to be run after connecting a client,
                                  but before starting to serve */
        gchar* postrun;      /**< command that will be run after the client
                                  disconnects */
        gchar* servename;    /**< name of the export as selected by nbd-client */
        int max_connections; /**< maximum number of opened connections */
        gchar* transactionlog;/**< filename for transaction log */
} SERVER;
217
/**
 * One file backing an export: its descriptor and the offset at which it
 * starts. CLIENT keeps an array of these in its "export" member.
 **/
typedef struct {
        int fhandle;      /**< file descriptor */
        off_t startoff;   /**< starting offset of this file */
} FILE_INFO;
225
/**
 * Variables associated with a client socket.
 **/
typedef struct {
        off_t exportsize;    /**< size of the file we're exporting */
        char *clientname;    /**< peer */
        char *exportname;    /**< (processed) filename of the file we're exporting */
        GArray *export;    /**< array of FILE_INFO of exported files;
                               array size is always 1 unless we're
                               doing the multiple file option */
        int net;             /**< The actual client socket */
        SERVER *server;      /**< The server this client is getting data from */
        char* difffilename;  /**< filename of the copy-on-write file, if any */
        int difffile;        /**< filedescriptor of copyonwrite file. @todo
                               shouldn't this be an array too? (cfr export) Or
                               make -m and -c mutually exclusive */
        u32 difffilelen;     /**< number of pages in difffile */
        u32 *difmap;         /**< see comment on the global difmap for this one */
        gboolean modern;     /**< client was negotiated using modern negotiation protocol */
        int transactionlogfd;/**< fd for transaction log */
} CLIENT;
244
/**
 * Type of configuration file values. Selects how the target of a PARAM
 * entry is to be interpreted.
 **/
typedef enum {
        PARAM_INT,              /**< This parameter is an integer */
        PARAM_STRING,           /**< This parameter is a string */
        PARAM_BOOL,             /**< This parameter is a boolean */
} PARAM_TYPE;
253
/**
 * Configuration file values: one entry describes a single key that may
 * appear in the config file and where its value has to be stored.
 **/
typedef struct {
        gchar *paramname;       /**< Name of the parameter, as it appears in
                                  the config file */
        gboolean required;      /**< Whether this is a required (as opposed to
                                  optional) parameter */
        PARAM_TYPE ptype;       /**< Type of the parameter. */
        gpointer target;        /**< Pointer to where the data of this
                                  parameter should be written. If ptype is
                                  PARAM_BOOL, the data is or'ed rather than
                                  overwritten. */
        gint flagval;           /**< Flag mask for this parameter in case ptype
                                  is PARAM_BOOL. */
} PARAM;
270
271 /**
272  * Translate a command name into human readable form
273  *
274  * @param command The command number (after applying NBD_CMD_MASK_COMMAND)
275  * @return pointer to the command name
276  **/
277 static inline const char * getcommandname(uint64_t command) {
278         switch (command) {
279         case NBD_CMD_READ:
280                 return "NBD_CMD_READ";
281         case NBD_CMD_WRITE:
282                 return "NBD_CMD_WRITE";
283         case NBD_CMD_DISC:
284                 return "NBD_CMD_DISC";
285         case NBD_CMD_FLUSH:
286                 return "NBD_CMD_FLUSH";
287         default:
288                 break;
289         }
290         return "UNKNOWN";
291 }
292
293 /**
294  * Check whether a client is allowed to connect. Works with an authorization
295  * file which contains one line per machine, no wildcards.
296  *
297  * @param opts The client who's trying to connect.
298  * @return 0 - authorization refused, 1 - OK
299  **/
300 int authorized_client(CLIENT *opts) {
301         const char *ERRMSG="Invalid entry '%s' in authfile '%s', so, refusing all connections.";
302         FILE *f ;
303         char line[LINELEN]; 
304         char *tmp;
305         struct in_addr addr;
306         struct in_addr client;
307         struct in_addr cltemp;
308         int len;
309
310         if ((f=fopen(opts->server->authname,"r"))==NULL) {
311                 msg4(LOG_INFO,"Can't open authorization file %s (%s).",
312                      opts->server->authname,strerror(errno)) ;
313                 return 1 ; 
314         }
315   
316         inet_aton(opts->clientname, &client);
317         while (fgets(line,LINELEN,f)!=NULL) {
318                 if((tmp=index(line, '/'))) {
319                         if(strlen(line)<=tmp-line) {
320                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
321                                 return 0;
322                         }
323                         *(tmp++)=0;
324                         if(!inet_aton(line,&addr)) {
325                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
326                                 return 0;
327                         }
328                         len=strtol(tmp, NULL, 0);
329                         addr.s_addr>>=32-len;
330                         addr.s_addr<<=32-len;
331                         memcpy(&cltemp,&client,sizeof(client));
332                         cltemp.s_addr>>=32-len;
333                         cltemp.s_addr<<=32-len;
334                         if(addr.s_addr == cltemp.s_addr) {
335                                 return 1;
336                         }
337                 }
338                 if (strncmp(line,opts->clientname,strlen(opts->clientname))==0) {
339                         fclose(f);
340                         return 1;
341                 }
342         }
343         fclose(f);
344         return 0;
345 }
346
/**
 * Read data from a file descriptor into a buffer. Keeps reading until
 * exactly len bytes have arrived.
 *
 * A read that fails with EAGAIN is retried. Any other failure, and an
 * end-of-file before len bytes were received, is reported through err().
 *
 * @param f a file descriptor
 * @param buf a buffer
 * @param len the number of bytes to be read
 **/
static inline void readit(int f, void *buf, size_t len) {
        char *p = buf;  /* byte pointer: arithmetic on void* is not standard C */
        ssize_t res;
        while (len > 0) {
                DEBUG("*");
                res = read(f, p, len);
                if (res > 0) {
                        len -= res;
                        p += res;
                } else if (res == 0) {
                        /* End of file: read() does not set errno here, so
                         * looking at errno would act on a stale value and
                         * could retry forever. */
                        err("Read failed: End of file");
                        return;
                } else if (errno != EAGAIN) {
                        err("Read failed: %m");
                }
        }
}
368
/**
 * Read and throw away data from a file descriptor. The data is pulled in
 * pieces of at most bufsiz bytes, each piece overwriting the start of buf.
 *
 * @param f a file descriptor
 * @param buf a scratch buffer
 * @param len the number of bytes to consume
 * @param bufsiz the size of the buffer
 **/
static inline void consume(int f, void * buf, size_t len, size_t bufsiz) {
        size_t remaining = len;

        while (remaining > 0) {
                size_t chunk = remaining;
                if (chunk > bufsiz) {
                        chunk = bufsiz;
                }
                readit(f, buf, chunk);
                remaining -= chunk;
        }
}
385

/**
 * Write the whole of a buffer to a file descriptor, calling write() again
 * for as long as bytes are left. A write() that returns zero or less is
 * reported through err().
 *
 * @param f a file descriptor
 * @param buf a buffer containing data
 * @param len the number of bytes to be written
 **/
static inline void writeit(int f, void *buf, size_t len) {
        char *cursor = buf;
        size_t remaining = len;

        while (remaining > 0) {
                ssize_t written;

                DEBUG("+");
                written = write(f, cursor, remaining);
                if (written <= 0)
                        err("Send failed: %m");
                remaining -= written;
                cursor += written;
        }
}
404
405 /**
406  * Print out a message about how to use nbd-server. Split out to a separate
407  * function so that we can call it from multiple places
408  */
409 void usage() {
410         printf("This is nbd-server version " VERSION "\n");
411         printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections]\n"
412                "\t-r|--read-only\t\tread only\n"
413                "\t-m|--multi-file\t\tmultiple file\n"
414                "\t-c|--copy-on-write\tcopy on write\n"
415                "\t-C|--config-file\tspecify an alternate configuration file\n"
416                "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
417                "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
418                "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
419                "\t-M|--max-connections\tspecify the maximum number of opened connections\n\n"
420                "\tif port is set to 0, stdin is used (for running from inetd).\n"
421                "\tif file_to_export contains '%%s', it is substituted with the IP\n"
422                "\t\taddress of the machine trying to connect\n" 
423                "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
424         printf("Using configuration file %s\n", CFILE);
425 }
426
427 /* Dumps a config file section of the given SERVER*, and exits. */
428 void dump_section(SERVER* serve, gchar* section_header) {
429         printf("[%s]\n", section_header);
430         printf("\texportname = %s\n", serve->exportname);
431         printf("\tlistenaddr = %s\n", serve->listenaddr);
432         printf("\tport = %d\n", serve->port);
433         if(serve->flags & F_READONLY) {
434                 printf("\treadonly = true\n");
435         }
436         if(serve->flags & F_MULTIFILE) {
437                 printf("\tmultifile = true\n");
438         }
439         if(serve->flags & F_COPYONWRITE) {
440                 printf("\tcopyonwrite = true\n");
441         }
442         if(serve->expected_size) {
443                 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
444         }
445         if(serve->authname) {
446                 printf("\tauthfile = %s\n", serve->authname);
447         }
448         exit(EXIT_SUCCESS);
449 }
450
451 /**
452  * Parse the command line.
453  *
454  * @param argc the argc argument to main()
455  * @param argv the argv argument to main()
456  **/
457 SERVER* cmdline(int argc, char *argv[]) {
458         int i=0;
459         int nonspecial=0;
460         int c;
461         struct option long_options[] = {
462                 {"read-only", no_argument, NULL, 'r'},
463                 {"multi-file", no_argument, NULL, 'm'},
464                 {"copy-on-write", no_argument, NULL, 'c'},
465                 {"dont-fork", no_argument, NULL, 'd'},
466                 {"authorize-file", required_argument, NULL, 'l'},
467                 {"config-file", required_argument, NULL, 'C'},
468                 {"pid-file", required_argument, NULL, 'p'},
469                 {"output-config", required_argument, NULL, 'o'},
470                 {"max-connection", required_argument, NULL, 'M'},
471                 {0,0,0,0}
472         };
473         SERVER *serve;
474         off_t es;
475         size_t last;
476         char suffix;
477         gboolean do_output=FALSE;
478         gchar* section_header="";
479         gchar** addr_port;
480
481         if(argc==1) {
482                 return NULL;
483         }
484         serve=g_new0(SERVER, 1);
485         serve->authname = g_strdup(default_authname);
486         serve->virtstyle=VIRT_IPLIT;
487         while((c=getopt_long(argc, argv, "-C:cdl:mo:rp:M:", long_options, &i))>=0) {
488                 switch (c) {
489                 case 1:
490                         /* non-option argument */
491                         switch(nonspecial++) {
492                         case 0:
493                                 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
494                                         addr_port=g_strsplit(optarg, ":", 2);
495
496                                         /* Check for "@" - maybe user using this separator
497                                                  for IPv4 address */
498                                         if(!addr_port[1]) {
499                                                 g_strfreev(addr_port);
500                                                 addr_port=g_strsplit(optarg, "@", 2);
501                                         }
502                                 } else {
503                                         addr_port=g_strsplit(optarg, "@", 2);
504                                 }
505
506                                 if(addr_port[1]) {
507                                         serve->port=strtol(addr_port[1], NULL, 0);
508                                         serve->listenaddr=g_strdup(addr_port[0]);
509                                 } else {
510                                         serve->listenaddr=NULL;
511                                         serve->port=strtol(addr_port[0], NULL, 0);
512                                 }
513                                 g_strfreev(addr_port);
514                                 break;
515                         case 1:
516                                 serve->exportname = g_strdup(optarg);
517                                 if(serve->exportname[0] != '/') {
518                                         fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
519                                         exit(EXIT_FAILURE);
520                                 }
521                                 break;
522                         case 2:
523                                 last=strlen(optarg)-1;
524                                 suffix=optarg[last];
525                                 if (suffix == 'k' || suffix == 'K' ||
526                                     suffix == 'm' || suffix == 'M')
527                                         optarg[last] = '\0';
528                                 es = (off_t)atoll(optarg);
529                                 switch (suffix) {
530                                         case 'm':
531                                         case 'M':  es <<= 10;
532                                         case 'k':
533                                         case 'K':  es <<= 10;
534                                         default :  break;
535                                 }
536                                 serve->expected_size = es;
537                                 break;
538                         }
539                         break;
540                 case 'r':
541                         serve->flags |= F_READONLY;
542                         break;
543                 case 'm':
544                         serve->flags |= F_MULTIFILE;
545                         break;
546                 case 'o':
547                         do_output = TRUE;
548                         section_header = g_strdup(optarg);
549                         break;
550                 case 'p':
551                         strncpy(pidftemplate, optarg, 256);
552                         break;
553                 case 'c': 
554                         serve->flags |=F_COPYONWRITE;
555                         break;
556                 case 'd': 
557                         dontfork = 1;
558                         break;
559                 case 'C':
560                         g_free(config_file_pos);
561                         config_file_pos=g_strdup(optarg);
562                         break;
563                 case 'l':
564                         g_free(serve->authname);
565                         serve->authname=g_strdup(optarg);
566                         break;
567                 case 'M':
568                         serve->max_connections = strtol(optarg, NULL, 0);
569                         break;
570                 default:
571                         usage();
572                         exit(EXIT_FAILURE);
573                         break;
574                 }
575         }
576         /* What's left: the port to export, the name of the to be exported
577          * file, and, optionally, the size of the file, in that order. */
578         if(nonspecial<2) {
579                 g_free(serve);
580                 serve=NULL;
581         } else {
582                 do_oldstyle = TRUE;
583         }
584         if(do_output) {
585                 if(!serve) {
586                         g_critical("Need a complete configuration on the command line to output a config file section!");
587                         exit(EXIT_FAILURE);
588                 }
589                 dump_section(serve, section_header);
590         }
591         return serve;
592 }
593
/**
 * Error codes for config file parsing, reported through the GError that
 * parse_cfile() fills in.
 **/
typedef enum {
        CFILE_NOTFOUND,         /**< The configuration file is not found */
        CFILE_MISSING_GENERIC,  /**< The (required) group "generic" is missing */
        CFILE_KEY_MISSING,      /**< A (required) key is missing */
        CFILE_VALUE_INVALID,    /**< A value is syntactically invalid */
        CFILE_VALUE_UNSUPPORTED,/**< A value is not supported in this build */
        CFILE_PROGERR,          /**< Programmer error */
        CFILE_NO_EXPORTS,       /**< A config file was specified that does not
                                     define any exports */
        CFILE_INCORRECT_PORT,   /**< The reserved port was specified for an
                                     old-style export. */
} CFILE_ERRORS;
609
610 /**
611  * Remove a SERVER from memory. Used from the hash table
612  **/
613 void remove_server(gpointer s) {
614         SERVER *server;
615
616         server=(SERVER*)s;
617         g_free(server->exportname);
618         if(server->authname)
619                 g_free(server->authname);
620         if(server->listenaddr)
621                 g_free(server->listenaddr);
622         if(server->prerun)
623                 g_free(server->prerun);
624         if(server->postrun)
625                 g_free(server->postrun);
626         if(server->transactionlog)
627                 g_free(server->transactionlog);
628         g_free(server);
629 }
630
631 /**
632  * duplicate server
633  * @param s the old server we want to duplicate
634  * @return new duplicated server
635  **/
636 SERVER* dup_serve(SERVER *s) {
637         SERVER *serve = NULL;
638
639         serve=g_new0(SERVER, 1);
640         if(serve == NULL)
641                 return NULL;
642
643         if(s->exportname)
644                 serve->exportname = g_strdup(s->exportname);
645
646         serve->expected_size = s->expected_size;
647
648         if(s->listenaddr)
649                 serve->listenaddr = g_strdup(s->listenaddr);
650
651         serve->port = s->port;
652
653         if(s->authname)
654                 serve->authname = strdup(s->authname);
655
656         serve->flags = s->flags;
657         serve->socket = s->socket;
658         serve->socket_family = s->socket_family;
659         serve->virtstyle = s->virtstyle;
660         serve->cidrlen = s->cidrlen;
661
662         if(s->prerun)
663                 serve->prerun = g_strdup(s->prerun);
664
665         if(s->postrun)
666                 serve->postrun = g_strdup(s->postrun);
667
668         if(s->transactionlog)
669                 serve->transactionlog = g_strdup(s->transactionlog);
670         
671         if(s->servename)
672                 serve->servename = g_strdup(s->servename);
673
674         serve->max_connections = s->max_connections;
675
676         return serve;
677 }
678
679 /**
680  * append new server to array
681  * @param s server
682  * @param a server array
683  * @return 0 success, -1 error
684  */
685 int append_serve(SERVER *s, GArray *a) {
686         SERVER *ns = NULL;
687         struct addrinfo hints;
688         struct addrinfo *ai = NULL;
689         struct addrinfo *rp = NULL;
690         char   host[NI_MAXHOST];
691         gchar  *port = NULL;
692         int e;
693         int ret;
694
695         if(!s) {
696                 err("Invalid parsing server");
697                 return -1;
698         }
699
700         port = g_strdup_printf("%d", s->port);
701
702         memset(&hints,'\0',sizeof(hints));
703         hints.ai_family = AF_UNSPEC;
704         hints.ai_socktype = SOCK_STREAM;
705         hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE;
706         hints.ai_protocol = IPPROTO_TCP;
707
708         e = getaddrinfo(s->listenaddr, port, &hints, &ai);
709
710         if (port)
711                 g_free(port);
712
713         if(e == 0) {
714                 for (rp = ai; rp != NULL; rp = rp->ai_next) {
715                         e = getnameinfo(rp->ai_addr, rp->ai_addrlen, host, sizeof(host), NULL, 0, NI_NUMERICHOST);
716
717                         if (e != 0) { // error
718                                 fprintf(stderr, "getnameinfo: %s\n", gai_strerror(e));
719                                 continue;
720                         }
721
722                         // duplicate server and set listenaddr to resolved IP address
723                         ns = dup_serve (s);
724                         if (ns) {
725                                 ns->listenaddr = g_strdup(host);
726                                 ns->socket_family = rp->ai_family;
727                                 g_array_append_val(a, *ns);
728                                 free(ns);
729                                 ns = NULL;
730                         }
731                 }
732
733                 ret = 0;
734         } else {
735                 fprintf(stderr, "getaddrinfo failed on listen host/address: %s (%s)\n", s->listenaddr ? s->listenaddr : "any", gai_strerror(e));
736                 ret = -1;
737         }
738
739         if (ai)
740                 freeaddrinfo(ai);
741
742         return ret;
743 }
744
745 /**
746  * Parse the config file.
747  *
748  * @param f the name of the config file
749  * @param e a GError. @see CFILE_ERRORS for what error values this function can
750  *      return.
751  * @return a Array of SERVER* pointers, If the config file is empty or does not
752  *      exist, returns an empty GHashTable; if the config file contains an
753  *      error, returns NULL, and e is set appropriately
754  **/
755 GArray* parse_cfile(gchar* f, GError** e) {
756         const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
757         const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
758         SERVER s;
759         gchar *virtstyle=NULL;
760         PARAM lp[] = {
761                 { "exportname", TRUE,   PARAM_STRING,   &(s.exportname),        0 },
762                 { "port",       TRUE,   PARAM_INT,      &(s.port),              0 },
763                 { "authfile",   FALSE,  PARAM_STRING,   &(s.authname),          0 },
764                 { "filesize",   FALSE,  PARAM_INT,      &(s.expected_size),     0 },
765                 { "virtstyle",  FALSE,  PARAM_STRING,   &(virtstyle),           0 },
766                 { "prerun",     FALSE,  PARAM_STRING,   &(s.prerun),            0 },
767                 { "postrun",    FALSE,  PARAM_STRING,   &(s.postrun),           0 },
768                 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog),   0 },
769                 { "readonly",   FALSE,  PARAM_BOOL,     &(s.flags),             F_READONLY },
770                 { "multifile",  FALSE,  PARAM_BOOL,     &(s.flags),             F_MULTIFILE },
771                 { "copyonwrite", FALSE, PARAM_BOOL,     &(s.flags),             F_COPYONWRITE },
772                 { "sparse_cow", FALSE,  PARAM_BOOL,     &(s.flags),             F_SPARSE },
773                 { "sdp",        FALSE,  PARAM_BOOL,     &(s.flags),             F_SDP },
774                 { "sync",       FALSE,  PARAM_BOOL,     &(s.flags),             F_SYNC },
775                 { "flush",      FALSE,  PARAM_BOOL,     &(s.flags),             F_FLUSH },
776                 { "fua",        FALSE,  PARAM_BOOL,     &(s.flags),             F_FUA },
777                 { "rotational", FALSE,  PARAM_BOOL,     &(s.flags),             F_ROTATIONAL },
778                 { "temporary",  FALSE,  PARAM_BOOL,     &(s.flags),             F_TEMPORARY },
779                 { "listenaddr", FALSE,  PARAM_STRING,   &(s.listenaddr),        0 },
780                 { "maxconnections", FALSE, PARAM_INT,   &(s.max_connections),   0 },
781         };
782         const int lp_size=sizeof(lp)/sizeof(PARAM);
783         PARAM gp[] = {
784                 { "user",       FALSE, PARAM_STRING,    &runuser,       0 },
785                 { "group",      FALSE, PARAM_STRING,    &rungroup,      0 },
786                 { "oldstyle",   FALSE, PARAM_BOOL,      &do_oldstyle,   1 },
787                 { "listenaddr", FALSE, PARAM_STRING,    &modern_listen, 0 },
788                 { "port",       FALSE, PARAM_STRING,    &modernport,    0 },
789         };
790         PARAM* p=gp;
791         int p_size=sizeof(gp)/sizeof(PARAM);
792         GKeyFile *cfile;
793         GError *err = NULL;
794         const char *err_msg=NULL;
795         GQuark errdomain;
796         GArray *retval=NULL;
797         gchar **groups;
798         gboolean bval;
799         gint ival;
800         gchar* sval;
801         gchar* startgroup;
802         gint i;
803         gint j;
804
805         errdomain = g_quark_from_string("parse_cfile");
806         cfile = g_key_file_new();
807         retval = g_array_new(FALSE, TRUE, sizeof(SERVER));
808         if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
809                         G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
810                 g_set_error(e, errdomain, CFILE_NOTFOUND, "Could not open config file %s.", f);
811                 g_key_file_free(cfile);
812                 return retval;
813         }
814         startgroup = g_key_file_get_start_group(cfile);
815         if(!startgroup || strcmp(startgroup, "generic")) {
816                 g_set_error(e, errdomain, CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
817                 g_key_file_free(cfile);
818                 return NULL;
819         }
820         groups = g_key_file_get_groups(cfile, NULL);
821         for(i=0;groups[i];i++) {
822                 memset(&s, '\0', sizeof(SERVER));
823
824                 /* After the [generic] group, start parsing exports */
825                 if(i==1) {
826                         p=lp;
827                         p_size=lp_size;
828                 } 
829                 for(j=0;j<p_size;j++) {
830                         g_assert(p[j].target != NULL);
831                         g_assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL);
832                         switch(p[j].ptype) {
833                                 case PARAM_INT:
834                                         ival = g_key_file_get_integer(cfile,
835                                                                 groups[i],
836                                                                 p[j].paramname,
837                                                                 &err);
838                                         if(!err) {
839                                                 *((gint*)p[j].target) = ival;
840                                         }
841                                         break;
842                                 case PARAM_STRING:
843                                         sval = g_key_file_get_string(cfile,
844                                                                 groups[i],
845                                                                 p[j].paramname,
846                                                                 &err);
847                                         if(!err) {
848                                                 *((gchar**)p[j].target) = sval;
849                                         }
850                                         break;
851                                 case PARAM_BOOL:
852                                         bval = g_key_file_get_boolean(cfile,
853                                                         groups[i],
854                                                         p[j].paramname, &err);
855                                         if(!err) {
856                                                 if(bval) {
857                                                         *((gint*)p[j].target) |= p[j].flagval;
858                                                 } else {
859                                                         *((gint*)p[j].target) &= ~(p[j].flagval);
860                                                 }
861                                         }
862                                         break;
863                         }
864                         if(err) {
865                                 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
866                                         if(!p[j].required) {
867                                                 /* Ignore not-found error for optional values */
868                                                 g_clear_error(&err);
869                                                 continue;
870                                         } else {
871                                                 err_msg = MISSING_REQUIRED_ERROR;
872                                         }
873                                 } else {
874                                         err_msg = DEFAULT_ERROR;
875                                 }
876                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
877                                 g_array_free(retval, TRUE);
878                                 g_error_free(err);
879                                 g_key_file_free(cfile);
880                                 return NULL;
881                         }
882                 }
883                 if(virtstyle) {
884                         if(!strncmp(virtstyle, "none", 4)) {
885                                 s.virtstyle=VIRT_NONE;
886                         } else if(!strncmp(virtstyle, "ipliteral", 9)) {
887                                 s.virtstyle=VIRT_IPLIT;
888                         } else if(!strncmp(virtstyle, "iphash", 6)) {
889                                 s.virtstyle=VIRT_IPHASH;
890                         } else if(!strncmp(virtstyle, "cidrhash", 8)) {
891                                 s.virtstyle=VIRT_CIDR;
892                                 if(strlen(virtstyle)<10) {
893                                         g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
894                                         g_array_free(retval, TRUE);
895                                         g_key_file_free(cfile);
896                                         return NULL;
897                                 }
898                                 s.cidrlen=strtol(virtstyle+8, NULL, 0);
899                         } else {
900                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
901                                 g_array_free(retval, TRUE);
902                                 g_key_file_free(cfile);
903                                 return NULL;
904                         }
905                         if(s.port && !do_oldstyle) {
906                                 g_warning("A port was specified, but oldstyle exports were not requested. This may not do what you expect.");
907                                 g_warning("Please read 'man 5 nbd-server' and search for oldstyle for more info");
908                         }
909                 } else {
910                         s.virtstyle=VIRT_IPLIT;
911                 }
912                 /* Don't need to free this, it's not our string */
913                 virtstyle=NULL;
914                 /* Don't append values for the [generic] group */
915                 if(i>0) {
916                         s.socket_family = AF_UNSPEC;
917                         s.servename = groups[i];
918
919                         append_serve(&s, retval);
920                 } else {
921                         if(!do_oldstyle) {
922                                 lp[1].required = 0;
923                         }
924                 }
925 #ifndef WITH_SDP
926                 if(s.flags & F_SDP) {
927                         g_set_error(e, errdomain, CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
928                         g_array_free(retval, TRUE);
929                         g_key_file_free(cfile);
930                         return NULL;
931                 }
932 #endif
933         }
934         if(i==1) {
935                 g_set_error(e, errdomain, CFILE_NO_EXPORTS, "The config file does not specify any exports");
936         }
937         g_key_file_free(cfile);
938         return retval;
939 }
940
941 /**
942  * Signal handler for SIGCHLD
943  * @param s the signal we're handling (must be SIGCHLD, or something
944  * is severely wrong)
945  **/
946 void sigchld_handler(int s) {
947         int status;
948         int* i;
949         pid_t pid;
950
951         while((pid=waitpid(-1, &status, WNOHANG)) > 0) {
952                 if(WIFEXITED(status)) {
953                         msg3(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
954                 }
955                 i=g_hash_table_lookup(children, &pid);
956                 if(!i) {
957                         msg3(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
958                 } else {
959                         DEBUG("Removing %d from the list of children", pid);
960                         g_hash_table_remove(children, &pid);
961                 }
962         }
963 }
964
965 /**
966  * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
967  *
968  * @param key the key
969  * @param value the value corresponding to the above key
970  * @param user_data a pointer which we always set to 1, so that we know what
971  * will happen next.
972  **/
973 void killchild(gpointer key, gpointer value, gpointer user_data) {
974         pid_t *pid=value;
975         int *parent=user_data;
976
977         kill(*pid, SIGTERM);
978         *parent=1;
979 }
980
981 /**
982  * Handle SIGTERM and dispatch it to our children
983  * @param s the signal we're handling (must be SIGTERM, or something
984  * is severely wrong).
985  **/
986 void sigterm_handler(int s) {
987         int parent=0;
988
989         g_hash_table_foreach(children, killchild, &parent);
990
991         if(parent) {
992                 unlink(pidfname);
993         }
994
995         exit(EXIT_SUCCESS);
996 }
997
998 /**
999  * Detect the size of a file.
1000  *
1001  * @param fhandle An open filedescriptor
1002  * @return the size of the file, or OFFT_MAX if detection was
1003  * impossible.
1004  **/
1005 off_t size_autodetect(int fhandle) {
1006         off_t es;
1007         u64 bytes;
1008         struct stat stat_buf;
1009         int error;
1010
1011 #ifdef HAVE_SYS_MOUNT_H
1012 #ifdef HAVE_SYS_IOCTL_H
1013 #ifdef BLKGETSIZE64
1014         DEBUG("looking for export size with ioctl BLKGETSIZE64\n");
1015         if (!ioctl(fhandle, BLKGETSIZE64, &bytes) && bytes) {
1016                 return (off_t)bytes;
1017         }
1018 #endif /* BLKGETSIZE64 */
1019 #endif /* HAVE_SYS_IOCTL_H */
1020 #endif /* HAVE_SYS_MOUNT_H */
1021
1022         DEBUG("looking for fhandle size with fstat\n");
1023         stat_buf.st_size = 0;
1024         error = fstat(fhandle, &stat_buf);
1025         if (!error) {
1026                 /* always believe stat if a regular file as it might really
1027                  * be zero length */
1028                 if (S_ISREG(stat_buf.st_mode) || (stat_buf.st_size > 0))
1029                         return (off_t)stat_buf.st_size;
1030         } else {
1031                 err("fstat failed: %m");
1032         }
1033
1034         DEBUG("looking for fhandle size with lseek SEEK_END\n");
1035         es = lseek(fhandle, (off_t)0, SEEK_END);
1036         if (es > ((off_t)0)) {
1037                 return es;
1038         } else {
1039                 DEBUG("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4)));
1040         }
1041
1042         err("Could not find size of exported block device: %m");
1043         return OFFT_MAX;
1044 }
1045
1046 /**
1047  * Get the file handle and offset, given an export offset.
1048  *
1049  * @param export An array of export files
1050  * @param a The offset to get corresponding file/offset for
1051  * @param fhandle [out] File descriptor
1052  * @param foffset [out] Offset into fhandle
1053  * @param maxbytes [out] Tells how many bytes can be read/written
1054  * from fhandle starting at foffset (0 if there is no limit)
1055  * @return 0 on success, -1 on failure
1056  **/
1057 int get_filepos(GArray* export, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1058         /* Negative offset not allowed */
1059         if(a < 0)
1060                 return -1;
1061
1062         /* Binary search for last file with starting offset <= a */
1063         FILE_INFO fi;
1064         int start = 0;
1065         int end = export->len - 1;
1066         while( start <= end ) {
1067                 int mid = (start + end) / 2;
1068                 fi = g_array_index(export, FILE_INFO, mid);
1069                 if( fi.startoff < a ) {
1070                         start = mid + 1;
1071                 } else if( fi.startoff > a ) {
1072                         end = mid - 1;
1073                 } else {
1074                         start = end = mid;
1075                         break;
1076                 }
1077         }
1078
1079         /* end should never go negative, since first startoff is 0 and a >= 0 */
1080         g_assert(end >= 0);
1081
1082         fi = g_array_index(export, FILE_INFO, end);
1083         *fhandle = fi.fhandle;
1084         *foffset = a - fi.startoff;
1085         *maxbytes = 0;
1086         if( end+1 < export->len ) {
1087                 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1088                 *maxbytes = fi_next.startoff - a;
1089         }
1090
1091         return 0;
1092 }
1093
/**
 * Seek to an absolute position in a file, reporting a failure through
 * err().
 * @param handle a filedescriptor
 * @param a position to seek to (from the start of the file)
 * @todo get rid of this; lastpoint is a global variable right now, but it
 * shouldn't be. If we pass it on as a parameter, that makes things a *lot*
 * easier. (NOTE(review): lastpoint is not referenced by this function as
 * written; this todo may be stale.)
 **/
void myseek(int handle,off_t a) {
	const off_t where = lseek(handle, a, SEEK_SET);

	if (where < 0) {
		err("Can not seek locally!\n");
	}
}
1107
1108 /**
1109  * Write an amount of bytes at a given offset to the right file. This
1110  * abstracts the write-side of the multiple file option.
1111  *
1112  * @param a The offset where the write should start
1113  * @param buf The buffer to write from
1114  * @param len The length of buf
1115  * @param client The client we're serving for
1116  * @param fua Flag to indicate 'Force Unit Access'
1117  * @return The number of bytes actually written, or -1 in case of an error
1118  **/
1119 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1120         int fhandle;
1121         off_t foffset;
1122         size_t maxbytes;
1123         ssize_t retval;
1124
1125         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1126                 return -1;
1127         if(maxbytes && len > maxbytes)
1128                 len = maxbytes;
1129
1130         DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
1131
1132         myseek(fhandle, foffset);
1133         retval = write(fhandle, buf, len);
1134         if(client->server->flags & F_SYNC) {
1135                 fsync(fhandle);
1136         } else if (fua) {
1137
1138           /* This is where we would do the following
1139            *   #ifdef USE_SYNC_FILE_RANGE
1140            * However, we don't, for the reasons set out below
1141            * by Christoph Hellwig <hch@infradead.org>
1142            *
1143            * [BEGINS] 
1144            * fdatasync is equivalent to fsync except that it does not flush
1145            * non-essential metadata (basically just timestamps in practice), but it
1146            * does flush metadata requried to find the data again, e.g. allocation
1147            * information and extent maps.  sync_file_range does nothing but flush
1148            * out pagecache content - it means you basically won't get your data
1149            * back in case of a crash if you either:
1150            * 
1151            *  a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1152            *  b) are using a sparse file on a filesystem
1153            *  c) are using a fallocate-preallocated file on a filesystem
1154            *  d) use any file on a COW filesystem like btrfs
1155            * 
1156            * e.g. it only does anything useful for you if you do not have a volatile
1157            * write cache, and either use a raw block device node, or just overwrite
1158            * an already fully allocated (and not preallocated) file on a non-COW
1159            * filesystem.
1160            * [ENDS]
1161            *
1162            * What we should do is open a second FD with O_DSYNC set, then write to
1163            * that when appropriate. However, with a Linux client, every REQ_FUA
1164            * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1165            * problems.
1166            *
1167            */
1168 #if 0
1169                 sync_file_range(fhandle, foffset, len,
1170                                 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1171                                 SYNC_FILE_RANGE_WAIT_AFTER);
1172 #else
1173                 fdatasync(fhandle);
1174 #endif
1175         }
1176         return retval;
1177 }
1178
1179 /**
1180  * Call rawexpwrite repeatedly until all data has been written.
1181  *
1182  * @param a The offset where the write should start
1183  * @param buf The buffer to write from
1184  * @param len The length of buf
1185  * @param client The client we're serving for
1186  * @param fua Flag to indicate 'Force Unit Access'
1187  * @return 0 on success, nonzero on failure
1188  **/
1189 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1190         ssize_t ret=0;
1191
1192         while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
1193                 a += ret;
1194                 buf += ret;
1195                 len -= ret;
1196         }
1197         return (ret < 0 || len != 0);
1198 }
1199
1200 /**
1201  * Read an amount of bytes at a given offset from the right file. This
1202  * abstracts the read-side of the multiple files option.
1203  *
1204  * @param a The offset where the read should start
1205  * @param buf A buffer to read into
1206  * @param len The size of buf
1207  * @param client The client we're serving for
1208  * @return The number of bytes actually read, or -1 in case of an
1209  * error.
1210  **/
1211 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
1212         int fhandle;
1213         off_t foffset;
1214         size_t maxbytes;
1215
1216         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1217                 return -1;
1218         if(maxbytes && len > maxbytes)
1219                 len = maxbytes;
1220
1221         DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
1222
1223         myseek(fhandle, foffset);
1224         return read(fhandle, buf, len);
1225 }
1226
1227 /**
1228  * Call rawexpread repeatedly until all data has been read.
1229  * @return 0 on success, nonzero on failure
1230  **/
1231 int rawexpread_fully(off_t a, char *buf, size_t len, CLIENT *client) {
1232         ssize_t ret=0;
1233
1234         while(len > 0 && (ret=rawexpread(a, buf, len, client)) > 0 ) {
1235                 a += ret;
1236                 buf += ret;
1237                 len -= ret;
1238         }
1239         return (ret < 0 || len != 0);
1240 }
1241
1242 /**
1243  * Read an amount of bytes at a given offset from the right file. This
1244  * abstracts the read-side of the copyonwrite stuff, and calls
1245  * rawexpread() with the right parameters to do the actual work.
1246  * @param a The offset where the read should start
1247  * @param buf A buffer to read into
1248  * @param len The size of buf
1249  * @param client The client we're going to read for
1250  * @return 0 on success, nonzero on failure
1251  **/
1252 int expread(off_t a, char *buf, size_t len, CLIENT *client) {
1253         off_t rdlen, offset;
1254         off_t mapcnt, mapl, maph, pagestart;
1255
1256         if (!(client->server->flags & F_COPYONWRITE))
1257                 return(rawexpread_fully(a, buf, len, client));
1258         DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1259
1260         mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1261
1262         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1263                 pagestart=mapcnt*DIFFPAGESIZE;
1264                 offset=a-pagestart;
1265                 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1266                         len : (size_t)DIFFPAGESIZE-offset;
1267                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1268                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1269                                (unsigned long)(client->difmap[mapcnt]));
1270                         myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1271                         if (read(client->difffile, buf, rdlen) != rdlen) return -1;
1272                 } else { /* the block is not there */
1273                         DEBUG("Page %llu is not here, we read the original one\n",
1274                                (unsigned long long)mapcnt);
1275                         if(rawexpread_fully(a, buf, rdlen, client)) return -1;
1276                 }
1277                 len-=rdlen; a+=rdlen; buf+=rdlen;
1278         }
1279         return 0;
1280 }
1281
1282 /**
1283  * Write an amount of bytes at a given offset to the right file. This
1284  * abstracts the write-side of the copyonwrite option, and calls
1285  * rawexpwrite() with the right parameters to do the actual work.
1286  *
1287  * @param a The offset where the write should start
1288  * @param buf The buffer to write from
1289  * @param len The length of buf
1290  * @param client The client we're going to write for.
1291  * @param fua Flag to indicate 'Force Unit Access'
1292  * @return 0 on success, nonzero on failure
1293  **/
1294 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1295         char pagebuf[DIFFPAGESIZE];
1296         off_t mapcnt,mapl,maph;
1297         off_t wrlen,rdlen; 
1298         off_t pagestart;
1299         off_t offset;
1300
1301         if (!(client->server->flags & F_COPYONWRITE))
1302                 return(rawexpwrite_fully(a, buf, len, client, fua)); 
1303         DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1304
1305         mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1306
1307         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1308                 pagestart=mapcnt*DIFFPAGESIZE ;
1309                 offset=a-pagestart ;
1310                 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1311                         len : (size_t)DIFFPAGESIZE-offset;
1312
1313                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1314                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1315                                (unsigned long)(client->difmap[mapcnt])) ;
1316                         myseek(client->difffile,
1317                                         client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1318                         if (write(client->difffile, buf, wrlen) != wrlen) return -1 ;
1319                 } else { /* the block is not there */
1320                         myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ;
1321                         client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1322                         DEBUG("Page %llu is not here, we put it at %lu\n",
1323                                (unsigned long long)mapcnt,
1324                                (unsigned long)(client->difmap[mapcnt]));
1325                         rdlen=DIFFPAGESIZE ;
1326                         if (rawexpread_fully(pagestart, pagebuf, rdlen, client))
1327                                 return -1;
1328                         memcpy(pagebuf+offset,buf,wrlen) ;
1329                         if (write(client->difffile, pagebuf, DIFFPAGESIZE) !=
1330                                         DIFFPAGESIZE)
1331                                 return -1;
1332                 }                                                   
1333                 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
1334         }
1335         if (client->server->flags & F_SYNC) {
1336                 fsync(client->difffile);
1337         } else if (fua) {
1338                 /* open question: would it be cheaper to do multiple sync_file_ranges?
1339                    as we iterate through the above?
1340                  */
1341                 fdatasync(client->difffile);
1342         }
1343         return 0;
1344 }
1345
1346 /**
1347  * Flush data to a client
1348  *
1349  * @param client The client we're going to write for.
1350  * @return 0 on success, nonzero on failure
1351  **/
1352 int expflush(CLIENT *client) {
1353         gint i;
1354
1355         if (client->server->flags & F_COPYONWRITE) {
1356                 return fsync(client->difffile);
1357         }
1358         
1359         for (i = 0; i < client->export->len; i++) {
1360                 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1361                 if (fsync(fi.fhandle) < 0)
1362                         return -1;
1363         }
1364         
1365         return 0;
1366 }
1367
1368 /**
1369  * Do the initial negotiation.
1370  *
1371  * @param client The client we're negotiating with.
1372  **/
1373 CLIENT* negotiate(int net, CLIENT *client, GArray* servers, int phase) {
1374         char zeros[128];
1375         uint64_t size_host;
1376         uint32_t flags = NBD_FLAG_HAS_FLAGS;
1377         uint16_t smallflags = 0;
1378         uint64_t magic;
1379
1380         memset(zeros, '\0', sizeof(zeros));
1381         if(phase & NEG_INIT) {
1382                 /* common */
1383                 if (write(net, INIT_PASSWD, 8) < 0) {
1384                         err_nonfatal("Negotiation failed: %m");
1385                         if(client)
1386                                 exit(EXIT_FAILURE);
1387                 }
1388                 if(phase & NEG_MODERN) {
1389                         /* modern */
1390                         magic = htonll(opts_magic);
1391                 } else {
1392                         /* oldstyle */
1393                         magic = htonll(cliserv_magic);
1394                 }
1395                 if (write(net, &magic, sizeof(magic)) < 0) {
1396                         err_nonfatal("Negotiation failed: %m");
1397                         if(phase & NEG_OLD)
1398                                 exit(EXIT_FAILURE);
1399                 }
1400         }
1401         if ((phase & NEG_MODERN) && (phase & NEG_INIT)) {
1402                 /* modern */
1403                 uint32_t reserved;
1404                 uint32_t opt;
1405                 uint32_t namelen;
1406                 char* name;
1407                 int i;
1408
1409                 if(!servers)
1410                         err("programmer error");
1411                 if (write(net, &smallflags, sizeof(uint16_t)) < 0)
1412                         err("Negotiation failed: %m");
1413                 if (read(net, &reserved, sizeof(reserved)) < 0)
1414                         err("Negotiation failed: %m");
1415                 if (read(net, &magic, sizeof(magic)) < 0)
1416                         err("Negotiation failed: %m");
1417                 magic = ntohll(magic);
1418                 if(magic != opts_magic) {
1419                         close(net);
1420                         return NULL;
1421                 }
1422                 if (read(net, &opt, sizeof(opt)) < 0)
1423                         err("Negotiation failed: %m");
1424                 opt = ntohl(opt);
1425                 if(opt != NBD_OPT_EXPORT_NAME) {
1426                         close(net);
1427                         return NULL;
1428                 }
1429                 if (read(net, &namelen, sizeof(namelen)) < 0)
1430                         err("Negotiation failed: %m");
1431                 namelen = ntohl(namelen);
1432                 name = malloc(namelen+1);
1433                 name[namelen]=0;
1434                 if (read(net, name, namelen) < 0)
1435                         err("Negotiation failed: %m");
1436                 for(i=0; i<servers->len; i++) {
1437                         SERVER* serve = &(g_array_index(servers, SERVER, i));
1438                         if(!strcmp(serve->servename, name)) {
1439                                 CLIENT* client = g_new0(CLIENT, 1);
1440                                 client->server = serve;
1441                                 client->exportsize = OFFT_MAX;
1442                                 client->net = net;
1443                                 client->modern = TRUE;
1444                                 client->transactionlogfd = -1;
1445                                 free(name);
1446                                 return client;
1447                         }
1448                 }
1449                 free(name);
1450                 return NULL;
1451         }
1452         /* common */
1453         size_host = htonll((u64)(client->exportsize));
1454         if (write(net, &size_host, 8) < 0)
1455                 err("Negotiation failed: %m");
1456         if (client->server->flags & F_READONLY)
1457                 flags |= NBD_FLAG_READ_ONLY;
1458         if (client->server->flags & F_FLUSH)
1459                 flags |= NBD_FLAG_SEND_FLUSH;
1460         if (client->server->flags & F_FUA)
1461                 flags |= NBD_FLAG_SEND_FUA;
1462         if (client->server->flags & F_ROTATIONAL)
1463                 flags |= NBD_FLAG_ROTATIONAL;
1464         if (phase & NEG_OLD) {
1465                 /* oldstyle */
1466                 flags = htonl(flags);
1467                 if (write(client->net, &flags, 4) < 0)
1468                         err("Negotiation failed: %m");
1469         } else {
1470                 /* modern */
1471                 smallflags = (uint16_t)(flags & ~((uint16_t)0));
1472                 smallflags = htons(smallflags);
1473                 if (write(client->net, &smallflags, sizeof(smallflags)) < 0) {
1474                         err("Negotiation failed: %m");
1475                 }
1476         }
1477         /* common */
1478         if (write(client->net, zeros, 124) < 0)
1479                 err("Negotiation failed: %m");
1480         return NULL;
1481 }
1482
/**
 * Sending macro: write `reply' to `net' and, when a transaction log is open,
 * mirror it into the log. A variable named `client' (CLIENT*) must be in
 * scope where the macro is used. Wrapped in do/while(0) so that it behaves
 * as a single statement, e.g. inside an unbraced if/else.
 */
#define SEND(net,reply) do { writeit( (net), &(reply), sizeof( reply )); \
	if (client->transactionlogfd != -1) \
		writeit(client->transactionlogfd, &(reply), sizeof(reply)); } while(0)
/**
 * Error macro: send `reply' with its error field set to `errcode' (converted
 * with htonl), then reset the error field to 0 so the reply can be reused.
 */
#define ERROR(client,reply,errcode) do { (reply).error = htonl(errcode); SEND((client)->net,reply); (reply).error = 0; } while(0)
1489 /**
1490  * Serve a file to a single client.
1491  *
1492  * @todo This beast needs to be split up in many tiny little manageable
1493  * pieces. Preferably with a chainsaw.
1494  *
1495  * @param client The client we're going to serve to.
1496  * @return when the client disconnects
1497  **/
1498 int mainloop(CLIENT *client) {
1499         struct nbd_request request;
1500         struct nbd_reply reply;
1501         gboolean go_on=TRUE;
1502 #ifdef DODBG
1503         int i = 0;
1504 #endif
1505         negotiate(client->net, client, NULL, client->modern ? NEG_MODERN : (NEG_OLD | NEG_INIT));
1506         DEBUG("Entering request loop!\n");
1507         reply.magic = htonl(NBD_REPLY_MAGIC);
1508         reply.error = 0;
1509         while (go_on) {
1510                 char buf[BUFSIZE];
1511                 char* p;
1512                 size_t len;
1513                 size_t currlen;
1514                 size_t writelen;
1515                 uint16_t command;
1516 #ifdef DODBG
1517                 i++;
1518                 printf("%d: ", i);
1519 #endif
1520                 readit(client->net, &request, sizeof(request));
1521                 if (client->transactionlogfd != -1)
1522                         writeit(client->transactionlogfd, &request, sizeof(request));
1523
1524                 request.from = ntohll(request.from);
1525                 request.type = ntohl(request.type);
1526                 command = request.type & NBD_CMD_MASK_COMMAND;
1527                 len = ntohl(request.len);
1528
1529                 DEBUG("%s from %llu (%llu) len %d, ", getcommandname(command),
1530                                 (unsigned long long)request.from,
1531                                 (unsigned long long)request.from / 512, (unsigned int)len);
1532
1533                 if (request.magic != htonl(NBD_REQUEST_MAGIC))
1534                         err("Not enough magic.");
1535
1536                 memcpy(reply.handle, request.handle, sizeof(reply.handle));
1537
1538                 if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) {
1539                         if ((request.from + len) > (OFFT_MAX)) {
1540                                 DEBUG("[Number too large!]");
1541                                 ERROR(client, reply, EINVAL);
1542                                 continue;
1543                         }
1544
1545                         if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
1546                                 DEBUG("[RANGE!]");
1547                                 ERROR(client, reply, EINVAL);
1548                                 continue;
1549                         }
1550
1551                         currlen = len;
1552                         if (currlen > BUFSIZE - sizeof(struct nbd_reply)) {
1553                                 currlen = BUFSIZE - sizeof(struct nbd_reply);
1554                                 msg2(LOG_INFO, "oversized request (this is not a problem)");
1555                         }
1556                 }
1557
1558                 switch (command) {
1559
1560                 case NBD_CMD_DISC:
1561                         msg2(LOG_INFO, "Disconnect request received.");
1562                         if (client->server->flags & F_COPYONWRITE) { 
1563                                 if (client->difmap) g_free(client->difmap) ;
1564                                 close(client->difffile);
1565                                 unlink(client->difffilename);
1566                                 free(client->difffilename);
1567                         }
1568                         go_on=FALSE;
1569                         continue;
1570
1571                 case NBD_CMD_WRITE:
1572                         DEBUG("wr: net->buf, ");
1573                         while(len > 0) {
1574                                 readit(client->net, buf, currlen);
1575                                 DEBUG("buf->exp, ");
1576                                 if ((client->server->flags & F_READONLY) ||
1577                                     (client->server->flags & F_AUTOREADONLY)) {
1578                                         DEBUG("[WRITE to READONLY!]");
1579                                         ERROR(client, reply, EPERM);
1580                                         consume(client->net, buf, len-currlen, BUFSIZE);
1581                                         continue;
1582                                 }
1583                                 if (expwrite(request.from, buf, currlen, client,
1584                                              request.type & NBD_CMD_FLAG_FUA)) {
1585                                         DEBUG("Write failed: %m" );
1586                                         ERROR(client, reply, errno);
1587                                         consume(client->net, buf, len-currlen, BUFSIZE);
1588                                         continue;
1589                                 }
1590                                 len -= currlen;
1591                                 request.from += currlen;
1592                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1593                         }
1594                         SEND(client->net, reply);
1595                         DEBUG("OK!\n");
1596                         continue;
1597
1598                 case NBD_CMD_FLUSH:
1599                         DEBUG("fl: ");
1600                         if (expflush(client)) {
1601                                 DEBUG("Flush failed: %m");
1602                                 ERROR(client, reply, errno);
1603                                 continue;
1604                         }
1605                         SEND(client->net, reply);
1606                         DEBUG("OK!\n");
1607                         continue;
1608
1609                 case NBD_CMD_READ:
1610                         DEBUG("exp->buf, ");
1611                         memcpy(buf, &reply, sizeof(struct nbd_reply));
1612                         if (client->transactionlogfd != -1)
1613                                 writeit(client->transactionlogfd, &reply, sizeof(reply));
1614                         p = buf + sizeof(struct nbd_reply);
1615                         writelen = currlen + sizeof(struct nbd_reply);
1616                         while(len > 0) {
1617                                 if (expread(request.from, p, currlen, client)) {
1618                                         DEBUG("Read failed: %m");
1619                                         ERROR(client, reply, errno);
1620                                         continue;
1621                                 }
1622                                 
1623                                 DEBUG("buf->net, ");
1624                                 writeit(client->net, buf, writelen);
1625                                 len -= currlen;
1626                                 request.from += currlen;
1627                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1628                                 p = buf;
1629                                 writelen = currlen;
1630                         }
1631                         DEBUG("OK!\n");
1632                         continue;
1633
1634                 default:
1635                         DEBUG ("Ignoring unknown command\n");
1636                         continue;
1637                 }
1638         }
1639         return 0;
1640 }
1641
1642 /**
1643  * Set up client export array, which is an array of FILE_INFO.
1644  * Also, split a single exportfile into multiple ones, if that was asked.
1645  * @param client information on the client which we want to setup export for
1646  **/
1647 void setupexport(CLIENT* client) {
1648         int i;
1649         off_t laststartoff = 0, lastsize = 0;
1650         int multifile = (client->server->flags & F_MULTIFILE);
1651         int temporary = (client->server->flags & F_TEMPORARY) && !multifile;
1652         int cancreate = (client->server->expected_size) && !multifile;
1653
1654         client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
1655
1656         /* If multi-file, open as many files as we can.
1657          * If not, open exactly one file.
1658          * Calculate file sizes as we go to get total size. */
1659         for(i=0; ; i++) {
1660                 FILE_INFO fi;
1661                 gchar *tmpname;
1662                 gchar* error_string;
1663
1664                 if (i)
1665                   cancreate = 0;
1666                 /* if expected_size is specified, and this is the first file, we can create the file */
1667                 mode_t mode = (client->server->flags & F_READONLY) ?
1668                   O_RDONLY : (O_RDWR | (cancreate?O_CREAT:0));
1669
1670                 if (temporary) {
1671                         tmpname=g_strdup_printf("%s.%d-XXXXXX", client->exportname, i);
1672                         DEBUG( "Opening %s\n", tmpname );
1673                         fi.fhandle = mkstemp(tmpname);
1674                 } else {
1675                         if(multifile) {
1676                                 tmpname=g_strdup_printf("%s.%d", client->exportname, i);
1677                         } else {
1678                                 tmpname=g_strdup(client->exportname);
1679                         }
1680                         DEBUG( "Opening %s\n", tmpname );
1681                         fi.fhandle = open(tmpname, mode, 0x600);
1682                         if(fi.fhandle == -1 && mode == O_RDWR) {
1683                                 /* Try again because maybe media was read-only */
1684                                 fi.fhandle = open(tmpname, O_RDONLY);
1685                                 if(fi.fhandle != -1) {
1686                                         /* Opening the base file in copyonwrite mode is
1687                                          * okay */
1688                                         if(!(client->server->flags & F_COPYONWRITE)) {
1689                                                 client->server->flags |= F_AUTOREADONLY;
1690                                                 client->server->flags |= F_READONLY;
1691                                         }
1692                                 }
1693                         }
1694                 }
1695                 if(fi.fhandle == -1) {
1696                         if(multifile && i>0)
1697                                 break;
1698                         error_string=g_strdup_printf(
1699                                 "Could not open exported file %s: %%m",
1700                                 tmpname);
1701                         err(error_string);
1702                 }
1703
1704                 if (temporary)
1705                         unlink(tmpname); /* File will stick around whilst FD open */
1706
1707                 fi.startoff = laststartoff + lastsize;
1708                 g_array_append_val(client->export, fi);
1709                 g_free(tmpname);
1710
1711                 /* Starting offset and size of this file will be used to
1712                  * calculate starting offset of next file */
1713                 laststartoff = fi.startoff;
1714                 lastsize = size_autodetect(fi.fhandle);
1715
1716                 /* If we created the file, it will be length zero */
1717                 if (!lastsize && cancreate) {
1718                         /* we can ignore errors as we recalculate the size */
1719                         ftruncate (fi.fhandle, client->server->expected_size);
1720                         lastsize = size_autodetect(fi.fhandle);
1721                         if (lastsize != client->server->expected_size)
1722                                 err("Could not expand file");
1723                         break; /* don't look for any more files */
1724                 }
1725
1726                 if(!multifile || temporary)
1727                         break;
1728         }
1729
1730         /* Set export size to total calculated size */
1731         client->exportsize = laststartoff + lastsize;
1732
1733         /* Export size may be overridden */
1734         if(client->server->expected_size) {
1735                 /* desired size must be <= total calculated size */
1736                 if(client->server->expected_size > client->exportsize) {
1737                         err("Size of exported file is too big\n");
1738                 }
1739
1740                 client->exportsize = client->server->expected_size;
1741         }
1742
1743         msg3(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
1744         if(multifile) {
1745                 msg3(LOG_INFO, "Total number of files: %d", i);
1746         }
1747 }
1748
1749 int copyonwrite_prepare(CLIENT* client) {
1750         off_t i;
1751         if ((client->difffilename = malloc(1024))==NULL)
1752                 err("Failed to allocate string for diff file name");
1753         snprintf(client->difffilename, 1024, "%s-%s-%d.diff",client->exportname,client->clientname,
1754                 (int)getpid()) ;
1755         client->difffilename[1023]='\0';
1756         msg3(LOG_INFO,"About to create map and diff file %s",client->difffilename) ;
1757         client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
1758         if (client->difffile<0) err("Could not create diff file (%m)") ;
1759         if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL)
1760                 err("Could not allocate memory") ;
1761         for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1 ;
1762
1763         return 0;
1764 }
1765
1766 /**
1767  * Run a command. This is used for the ``prerun'' and ``postrun'' config file
1768  * options
1769  *
1770  * @param command the command to be ran. Read from the config file
1771  * @param file the file name we're about to export
1772  **/
1773 int do_run(gchar* command, gchar* file) {
1774         gchar* cmd;
1775         int retval=0;
1776
1777         if(command && *command) {
1778                 cmd = g_strdup_printf(command, file);
1779                 retval=system(cmd);
1780                 g_free(cmd);
1781         }
1782         return retval;
1783 }
1784
1785 /**
1786  * Serve a connection. 
1787  *
1788  * @todo allow for multithreading, perhaps use libevent. Not just yet, though;
1789  * follow the road map.
1790  *
1791  * @param client a connected client
1792  **/
1793 void serveconnection(CLIENT *client) {
1794         if (client->server->transactionlog && (client->transactionlogfd == -1))
1795         {
1796                 if (-1 == (client->transactionlogfd = open(client->server->transactionlog,
1797                                                            O_WRONLY | O_CREAT,
1798                                                            S_IRUSR | S_IWUSR)))
1799                         g_warning("Could not open transaction log %s",
1800                                   client->server->transactionlog);
1801         }
1802
1803         if(do_run(client->server->prerun, client->exportname)) {
1804                 exit(EXIT_FAILURE);
1805         }
1806         setupexport(client);
1807
1808         if (client->server->flags & F_COPYONWRITE) {
1809                 copyonwrite_prepare(client);
1810         }
1811
1812         setmysockopt(client->net);
1813
1814         mainloop(client);
1815         do_run(client->server->postrun, client->exportname);
1816
1817         if (-1 != client->transactionlogfd)
1818         {
1819                 close(client->transactionlogfd);
1820                 client->transactionlogfd = -1;
1821         }
1822 }
1823
1824 /**
1825  * Find the name of the file we have to serve. This will use g_strdup_printf
1826  * to put the IP address of the client inside a filename containing
1827  * "%s" (in the form as specified by the "virtstyle" option). That name
1828  * is then written to client->exportname.
1829  *
1830  * @param net A socket connected to an nbd client
1831  * @param client information about the client. The IP address in human-readable
1832  * format will be written to a new char* buffer, the address of which will be
1833  * stored in client->clientname.
1834  **/
1835 void set_peername(int net, CLIENT *client) {
1836         struct sockaddr_storage addrin;
1837         struct sockaddr_storage netaddr;
1838         struct sockaddr_in  *netaddr4 = NULL;
1839         struct sockaddr_in6 *netaddr6 = NULL;
1840         size_t addrinlen = sizeof( addrin );
1841         struct addrinfo hints;
1842         struct addrinfo *ai = NULL;
1843         char peername[NI_MAXHOST];
1844         char netname[NI_MAXHOST];
1845         char *tmp = NULL;
1846         int i;
1847         int e;
1848         int shift;
1849
1850         if (getpeername(net, (struct sockaddr *) &addrin, (socklen_t *)&addrinlen) < 0)
1851                 err("getsockname failed: %m");
1852
1853         getnameinfo((struct sockaddr *)&addrin, (socklen_t)addrinlen,
1854                 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST);
1855
1856         memset(&hints, '\0', sizeof (hints));
1857         hints.ai_flags = AI_ADDRCONFIG;
1858         e = getaddrinfo(peername, NULL, &hints, &ai);
1859
1860         if(e != 0) {
1861                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
1862                 freeaddrinfo(ai);
1863                 return;
1864         }
1865
1866         switch(client->server->virtstyle) {
1867                 case VIRT_NONE:
1868                         client->exportname=g_strdup(client->server->exportname);
1869                         break;
1870                 case VIRT_IPHASH:
1871                         for(i=0;i<strlen(peername);i++) {
1872                                 if(peername[i]=='.') {
1873                                         peername[i]='/';
1874                                 }
1875                         }
1876                 case VIRT_IPLIT:
1877                         client->exportname=g_strdup_printf(client->server->exportname, peername);
1878                         break;
1879                 case VIRT_CIDR:
1880                         memcpy(&netaddr, &addrin, addrinlen);
1881                         if(ai->ai_family == AF_INET) {
1882                                 netaddr4 = (struct sockaddr_in *)&netaddr;
1883                                 (netaddr4->sin_addr).s_addr>>=32-(client->server->cidrlen);
1884                                 (netaddr4->sin_addr).s_addr<<=32-(client->server->cidrlen);
1885
1886                                 getnameinfo((struct sockaddr *) netaddr4, (socklen_t) addrinlen,
1887                                                         netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1888                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1889                         }else if(ai->ai_family == AF_INET6) {
1890                                 netaddr6 = (struct sockaddr_in6 *)&netaddr;
1891
1892                                 shift = 128-(client->server->cidrlen);
1893                                 i = 3;
1894                                 while(shift >= 32) {
1895                                         ((netaddr6->sin6_addr).s6_addr32[i])=0;
1896                                         shift-=32;
1897                                         i--;
1898                                 }
1899                                 (netaddr6->sin6_addr).s6_addr32[i]>>=shift;
1900                                 (netaddr6->sin6_addr).s6_addr32[i]<<=shift;
1901
1902                                 getnameinfo((struct sockaddr *)netaddr6, (socklen_t)addrinlen,
1903                                             netname, sizeof(netname), NULL, 0, NI_NUMERICHOST);
1904                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1905                         }
1906
1907                         if(tmp != NULL)
1908                           client->exportname=g_strdup_printf(client->server->exportname, tmp);
1909
1910                         break;
1911         }
1912
1913         freeaddrinfo(ai);
1914         msg4(LOG_INFO, "connect from %s, assigned file is %s", 
1915              peername, client->exportname);
1916         client->clientname=g_strdup(peername);
1917 }
1918
1919 /**
1920  * Destroy a pid_t*
1921  * @param data a pointer to pid_t which should be freed
1922  **/
1923 void destroy_pid_t(gpointer data) {
1924         g_free(data);
1925 }
1926
1927 /**
1928  * Loop through the available servers, and serve them. Never returns.
1929  **/
1930 int serveloop(GArray* servers) {
1931         struct sockaddr_storage addrin;
1932         socklen_t addrinlen=sizeof(addrin);
1933         int i;
1934         int max;
1935         int sock;
1936         fd_set mset;
1937         fd_set rset;
1938
1939         /* 
1940          * Set up the master fd_set. The set of descriptors we need
1941          * to select() for never changes anyway and it buys us a *lot*
1942          * of time to only build this once. However, if we ever choose
1943          * to not fork() for clients anymore, we may have to revisit
1944          * this.
1945          */
1946         max=0;
1947         FD_ZERO(&mset);
1948         for(i=0;i<servers->len;i++) {
1949                 if((sock=(g_array_index(servers, SERVER, i)).socket)) {
1950                         FD_SET(sock, &mset);
1951                         max=sock>max?sock:max;
1952                 }
1953         }
1954         if(modernsock) {
1955                 FD_SET(modernsock, &mset);
1956                 max=modernsock>max?modernsock:max;
1957         }
1958         for(;;) {
1959                 CLIENT *client = NULL;
1960                 pid_t *pid;
1961
1962                 memcpy(&rset, &mset, sizeof(fd_set));
1963                 if(select(max+1, &rset, NULL, NULL, NULL)>0) {
1964                         int net = 0;
1965                         SERVER* serve=NULL;
1966
1967                         DEBUG("accept, ");
1968                         if(FD_ISSET(modernsock, &rset)) {
1969                                 if((net=accept(modernsock, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1970                                         err("accept: %m");
1971                                 client = negotiate(net, NULL, servers, NEG_INIT | NEG_MODERN);
1972                                 if(!client) {
1973                                         err_nonfatal("negotiation failed");
1974                                         close(net);
1975                                         net=0;
1976                                         continue;
1977                                 }
1978                                 serve = client->server;
1979                         }
1980                         for(i=0;i<servers->len && !net;i++) {
1981                                 serve=&(g_array_index(servers, SERVER, i));
1982                                 if(FD_ISSET(serve->socket, &rset)) {
1983                                         if ((net=accept(serve->socket, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1984                                                 err("accept: %m");
1985                                 }
1986                         }
1987                         if(net) {
1988                                 int sock_flags;
1989
1990                                 if(serve->max_connections > 0 &&
1991                                    g_hash_table_size(children) >= serve->max_connections) {
1992                                         msg2(LOG_INFO, "Max connections reached");
1993                                         close(net);
1994                                         continue;
1995                                 }
1996                                 if((sock_flags = fcntl(net, F_GETFL, 0))==-1) {
1997                                         err("fcntl F_GETFL");
1998                                 }
1999                                 if(fcntl(net, F_SETFL, sock_flags &~O_NONBLOCK)==-1) {
2000                                         err("fcntl F_SETFL ~O_NONBLOCK");
2001                                 }
2002                                 if(!client) {
2003                                         client = g_new0(CLIENT, 1);
2004                                         client->server=serve;
2005                                         client->exportsize=OFFT_MAX;
2006                                         client->net=net;
2007                                         client->transactionlogfd = -1;
2008                                 }
2009                                 set_peername(net, client);
2010                                 if (!authorized_client(client)) {
2011                                         msg2(LOG_INFO,"Unauthorized client") ;
2012                                         close(net);
2013                                         continue;
2014                                 }
2015                                 msg2(LOG_INFO,"Authorized client") ;
2016                                 pid=g_malloc(sizeof(pid_t));
2017
2018                                 if (!dontfork) {
2019                                         if ((*pid=fork())<0) {
2020                                                 msg3(LOG_INFO,"Could not fork (%s)",strerror(errno)) ;
2021                                                 close(net);
2022                                                 continue;
2023                                         }
2024                                         if (*pid>0) { /* parent */
2025                                                 close(net);
2026                                                 g_hash_table_insert(children, pid, pid);
2027                                                 continue;
2028                                         }
2029                                         /* child */
2030                                         g_hash_table_destroy(children);
2031                                         for(i=0;i<servers->len;i++) {
2032                                                 serve=&g_array_index(servers, SERVER, i);
2033                                                 close(serve->socket);
2034                                         }
2035                                         /* FALSE does not free the
2036                                            actual data. This is required,
2037                                            because the client has a
2038                                            direct reference into that
2039                                            data, and otherwise we get a
2040                                            segfault... */
2041                                         g_array_free(servers, FALSE);
2042                                 }
2043
2044                                 msg2(LOG_INFO,"Starting to serve");
2045                                 serveconnection(client);
2046                                 exit(EXIT_SUCCESS);
2047                         }
2048                 }
2049         }
2050 }
2051
/**
 * Apply the options every listening socket needs: SO_REUSEADDR,
 * SO_KEEPALIVE, and non-blocking mode. Calls err() if any step fails.
 *
 * @param socket the listening socket to configure
 **/
void dosockopts(int socket) {
#ifndef sun
	int yes=1;
#else
	char yes='1';
#endif /* sun */
	int sock_flags;

	/* lose the pesky "Address already in use" error message */
	/* The option length must be the size of `yes' itself; it is a char
	 * on sun, where sizeof(int) would read past the variable. */
	if (setsockopt(socket,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(yes)) == -1) {
		err("setsockopt SO_REUSEADDR");
	}
	if (setsockopt(socket,SOL_SOCKET,SO_KEEPALIVE,&yes,sizeof(yes)) == -1) {
		err("setsockopt SO_KEEPALIVE");
	}

	/* make the listening socket non-blocking */
	if ((sock_flags = fcntl(socket, F_GETFL, 0)) == -1) {
		err("fcntl F_GETFL");
	}
	if (fcntl(socket, F_SETFL, sock_flags | O_NONBLOCK) == -1) {
		err("fcntl F_SETFL O_NONBLOCK");
	}
}
2076
2077 /**
2078  * Connect a server's socket.
2079  *
2080  * @param serve the server we want to connect.
2081  **/
2082 int setup_serve(SERVER *serve) {
2083         struct addrinfo hints;
2084         struct addrinfo *ai = NULL;
2085         gchar *port = NULL;
2086         int e;
2087
2088         if(!do_oldstyle) {
2089                 return serve->servename ? 1 : 0;
2090         }
2091         memset(&hints,'\0',sizeof(hints));
2092         hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG | AI_NUMERICSERV;
2093         hints.ai_socktype = SOCK_STREAM;
2094         hints.ai_family = serve->socket_family;
2095
2096         port = g_strdup_printf ("%d", serve->port);
2097         if (port == NULL)
2098                 return 0;
2099
2100         e = getaddrinfo(serve->listenaddr,port,&hints,&ai);
2101
2102         g_free(port);
2103
2104         if(e != 0) {
2105                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
2106                 serve->socket = -1;
2107                 freeaddrinfo(ai);
2108                 exit(EXIT_FAILURE);
2109         }
2110
2111         if(serve->socket_family == AF_UNSPEC)
2112                 serve->socket_family = ai->ai_family;
2113
2114 #ifdef WITH_SDP
2115         if ((serve->flags) && F_SDP) {
2116                 if (ai->ai_family == AF_INET)
2117                         ai->ai_family = AF_INET_SDP;
2118                 else (ai->ai_family == AF_INET6)
2119                         ai->ai_family = AF_INET6_SDP;
2120         }
2121 #endif
2122         if ((serve->socket = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) < 0)
2123                 err("socket: %m");
2124
2125         dosockopts(serve->socket);
2126
2127         DEBUG("Waiting for connections... bind, ");
2128         e = bind(serve->socket, ai->ai_addr, ai->ai_addrlen);
2129         if (e != 0 && errno != EADDRINUSE)
2130                 err("bind: %m");
2131         DEBUG("listen, ");
2132         if (listen(serve->socket, 1) < 0)
2133                 err("listen: %m");
2134
2135         freeaddrinfo (ai);
2136         if(serve->servename) {
2137                 return 1;
2138         } else {
2139                 return 0;
2140         }
2141 }
2142
/**
 * Open the listening socket for new-style connections.
 *
 * Resolves modern_listen / modernport with getaddrinfo() and creates a TCP
 * socket from the first result only. dosockopts() is applied to it, then
 * it is bound and put into listening state with a backlog of 10. The
 * descriptor is stored in modernsock, which is declared outside this
 * function.
 *
 * A getaddrinfo() failure terminates the process with EXIT_FAILURE;
 * socket(), bind() and listen() failures are reported through err().
 **/
void open_modern(void) {
        struct addrinfo hints;
        struct addrinfo* ai = NULL;
        int e;

        memset(&hints, '\0', sizeof(hints));
        hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
        hints.ai_socktype = SOCK_STREAM;
        hints.ai_family = AF_UNSPEC;
        hints.ai_protocol = IPPROTO_TCP;
        e = getaddrinfo(modern_listen, modernport, &hints, &ai);
        if(e != 0) {
                fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
                exit(EXIT_FAILURE);
        }
        if((modernsock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol))<0) {
                err("socket: %m");
        }

        dosockopts(modernsock);

        if(bind(modernsock, ai->ai_addr, ai->ai_addrlen)) {
                err("bind: %m");
        }
        if(listen(modernsock, 10) <0) {
                err("listen: %m");
        }

        freeaddrinfo(ai);
}
2174
2175 /**
2176  * Connect our servers.
2177  **/
2178 void setup_servers(GArray* servers) {
2179         int i;
2180         struct sigaction sa;
2181         int want_modern=0;
2182
2183         for(i=0;i<servers->len;i++) {
2184                 want_modern |= setup_serve(&(g_array_index(servers, SERVER, i)));
2185         }
2186         if(want_modern) {
2187                 open_modern();
2188         }
2189         children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
2190
2191         sa.sa_handler = sigchld_handler;
2192         sigemptyset(&sa.sa_mask);
2193         sa.sa_flags = SA_RESTART;
2194         if(sigaction(SIGCHLD, &sa, NULL) == -1)
2195                 err("sigaction: %m");
2196         sa.sa_handler = sigterm_handler;
2197         sigemptyset(&sa.sa_mask);
2198         sa.sa_flags = SA_RESTART;
2199         if(sigaction(SIGTERM, &sa, NULL) == -1)
2200                 err("sigaction: %m");
2201 }
2202
2203 /**
2204  * Go daemon (unless we specified at compile time that we didn't want this)
2205  * @param serve the first server of our configuration. If its port is zero,
2206  *      then do not daemonize, because we're doing inetd then. This parameter
2207  *      is only used to create a PID file of the form
2208  *      /var/run/nbd-server.&lt;port&gt;.pid; it's not modified in any way.
2209  **/
2210 #if !defined(NODAEMON)
2211 void daemonize(SERVER* serve) {
2212         FILE*pidf;
2213
2214         if(serve && !(serve->port)) {
2215                 return;
2216         }
2217         if(daemon(0,0)<0) {
2218                 err("daemon");
2219         }
2220         if(!*pidftemplate) {
2221                 if(serve) {
2222                         strncpy(pidftemplate, "/var/run/nbd-server.%d.pid", 255);
2223                 } else {
2224                         strncpy(pidftemplate, "/var/run/nbd-server.pid", 255);
2225                 }
2226         }
2227         snprintf(pidfname, 255, pidftemplate, serve ? serve->port : 0);
2228         pidf=fopen(pidfname, "w");
2229         if(pidf) {
2230                 fprintf(pidf,"%d\n", (int)getpid());
2231                 fclose(pidf);
2232         } else {
2233                 perror("fopen");
2234                 fprintf(stderr, "Not fatal; continuing");
2235         }
2236 }
2237 #else
2238 #define daemonize(serve)
2239 #endif /* !defined(NODAEMON) */
2240
2241 /*
2242  * Everything beyond this point (in the file) is run in non-daemon mode.
2243  * The stuff above daemonize() isn't.
2244  */
2245
2246 void serve_err(SERVER* serve, const char* msg) G_GNUC_NORETURN;
2247
2248 void serve_err(SERVER* serve, const char* msg) {
2249         g_message("Export of %s on port %d failed:", serve->exportname,
2250                         serve->port);
2251         err(msg);
2252 }
2253
2254 /**
2255  * Set up user-ID and/or group-ID
2256  **/
2257 void dousers(void) {
2258         struct passwd *pw;
2259         struct group *gr;
2260         gchar* str;
2261         if(rungroup) {
2262                 gr=getgrnam(rungroup);
2263                 if(!gr) {
2264                         str = g_strdup_printf("Invalid group name: %s", rungroup);
2265                         err(str);
2266                 }
2267                 if(setgid(gr->gr_gid)<0) {
2268                         err("Could not set GID: %m"); 
2269                 }
2270         }
2271         if(runuser) {
2272                 pw=getpwnam(runuser);
2273                 if(!pw) {
2274                         str = g_strdup_printf("Invalid user name: %s", runuser);
2275                         err(str);
2276                 }
2277                 if(setuid(pw->pw_uid)<0) {
2278                         err("Could not set UID: %m");
2279                 }
2280         }
2281 }
2282
2283 #ifndef ISSERVER
2284 void glib_message_syslog_redirect(const gchar *log_domain,
2285                                   GLogLevelFlags log_level,
2286                                   const gchar *message,
2287                                   gpointer user_data)
2288 {
2289     int level=LOG_DEBUG;
2290     
2291     switch( log_level )
2292     {
2293       case G_LOG_FLAG_FATAL:
2294       case G_LOG_LEVEL_CRITICAL:
2295       case G_LOG_LEVEL_ERROR:    
2296         level=LOG_ERR; 
2297         break;
2298       case G_LOG_LEVEL_WARNING:
2299         level=LOG_WARNING;
2300         break;
2301       case G_LOG_LEVEL_MESSAGE:
2302       case G_LOG_LEVEL_INFO:
2303         level=LOG_INFO;
2304         break;
2305       case G_LOG_LEVEL_DEBUG:
2306         level=LOG_DEBUG;
2307         break;
2308       default:
2309         level=LOG_ERR;
2310     }
2311     syslog(level, "%s", message);
2312 }
2313 #endif
2314
2315 /**
2316  * Main entry point...
2317  **/
2318 int main(int argc, char *argv[]) {
2319         SERVER *serve;
2320         GArray *servers;
2321         GError *err=NULL;
2322
2323         if (sizeof( struct nbd_request )!=28) {
2324                 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
2325                 exit(EXIT_FAILURE) ;
2326         }
2327
2328         memset(pidftemplate, '\0', 256);
2329
2330         logging();
2331         config_file_pos = g_strdup(CFILE);
2332         serve=cmdline(argc, argv);
2333         servers = parse_cfile(config_file_pos, &err);
2334         
2335         if(serve) {
2336                 serve->socket_family = AF_UNSPEC;
2337
2338                 append_serve(serve, servers);
2339      
2340                 if (!(serve->port)) {
2341                         CLIENT *client;
2342 #ifndef ISSERVER
2343                         /* You really should define ISSERVER if you're going to use
2344                          * inetd mode, but if you don't, closing stdout and stderr
2345                          * (which inetd had connected to the client socket) will let it
2346                          * work. */
2347                         close(1);
2348                         close(2);
2349                         open("/dev/null", O_WRONLY);
2350                         open("/dev/null", O_WRONLY);
2351                         g_log_set_default_handler( glib_message_syslog_redirect, NULL );
2352 #endif
2353                         client=g_malloc(sizeof(CLIENT));
2354                         client->server=serve;
2355                         client->net=0;
2356                         client->exportsize=OFFT_MAX;
2357                         set_peername(0,client);
2358                         serveconnection(client);
2359                         return 0;
2360                 }
2361         }
2362     
2363         if(!servers || !servers->len) {
2364                 if(err && !(err->domain == g_quark_from_string("parse_cfile")
2365                                 && err->code == CFILE_NOTFOUND)) {
2366                         g_warning("Could not parse config file: %s", 
2367                                         err ? err->message : "Unknown error");
2368                 }
2369         }
2370         if(serve) {
2371                 g_warning("Specifying an export on the command line is deprecated.");
2372                 g_warning("Please use a configuration file instead.");
2373         }
2374
2375         if((!serve) && (!servers||!servers->len)) {
2376                 g_message("No configured exports; quitting.");
2377                 exit(EXIT_FAILURE);
2378         }
2379         if (!dontfork)
2380                 daemonize(serve);
2381         setup_servers(servers);
2382         dousers();
2383         serveloop(servers);
2384         return 0 ;
2385 }