Update proto.txt
[nbd.git] / nbd-server.c
1 /*
2  * Network Block Device - server
3  *
4  * Copyright 1996-1998 Pavel Machek, distribute under GPL
5  *  <pavel@atrey.karlin.mff.cuni.cz>
6  * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7  * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
8  *
9  * Version 1.0 - hopefully 64-bit-clean
10  * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11  * Version 1.2 - autodetect size of block devices, thanx to Peter T. Breuer" <ptb@it.uc3m.es>
12  * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13  *      type, or don't have 64 bit file offsets by defining FS_32BIT
14  *      in compile options for nbd-server *only*. This can be done
15  *      with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16  *      original autoconf input file, or I would make it a configure
17  *      option.) Ken Yap <ken@nlc.net.au>.
18  * Version 1.6 - fix autodetection of block device size and really make 64 bit
19  *      clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20  * Version 2.0 - Version synchronised with client
21  * Version 2.1 - Reap zombie client processes when they exit. Removed
22  *      (uncommented) the _IO magic, it's no longer necessary. Wouter
23  *      Verhelst <wouter@debian.org>
24  * Version 2.2 - Auto switch to read-only mode (useful for floppies).
25  * Version 2.3 - Fixed code so that Large File Support works. This
26  *      removes the FS_32BIT compile-time directive; define
27  *      _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28  *      using FS_32BIT. This will allow you to use files >2GB instead of
29  *      having to use the -m option. Wouter Verhelst <wouter@debian.org>
30  * Version 2.4 - Added code to keep track of children, so that we can
31  *      properly kill them from initscripts. Add a call to daemon(),
32  *      so that processes don't think they have to wait for us, which is
33  *      interesting for initscripts as well. Wouter Verhelst
34  *      <wouter@debian.org>
35  * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36  *      zero after fork()ing, resulting in nbd-server going berserk
37  *      when it receives a signal with at least one child open. Wouter
38  *      Verhelst <wouter@debian.org>
39  * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40  *      rectified type of mainloop::size_host (sf.net bugs 814435 and
41  *      817385); close the PID file after writing to it, so that the
42  *      daemon can actually be found. Wouter Verhelst
43  *      <wouter@debian.org>
44  * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45  *      correctly put in network endianness. Many types were corrected
46  *      (size_t and off_t instead of int).  <vspaceg@sourceforge.net>
47  * Version 2.6 - Some code cleanup.
48  * Version 2.7 - Better build system.
49  * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a 
50  *      lot more work, but this is a start. Wouter Verhelst
51  *      <wouter@debian.org>
52  * 16/03/2010 - Add IPv6 support.
53  *      Kitt Tientanopajai <kitt@kitty.in.th>
54  *      Neutron Soutmun <neo.neutron@gmail.com>
55  *      Suriya Soutmun <darksolar@gmail.com>
56  */
57
58 /* Includes LFS defines, which defines behaviours of some of the following
59  * headers, so must come before those */
60 #include "lfs.h"
61
62 #include <sys/types.h>
63 #include <sys/socket.h>
64 #include <sys/stat.h>
65 #include <sys/select.h>         /* select */
66 #include <sys/wait.h>           /* wait */
67 #ifdef HAVE_SYS_IOCTL_H
68 #include <sys/ioctl.h>
69 #endif
70 #include <sys/param.h>
71 #ifdef HAVE_SYS_MOUNT_H
72 #include <sys/mount.h>          /* For BLKGETSIZE */
73 #endif
74 #include <signal.h>             /* sigaction */
75 #include <errno.h>
76 #include <netinet/tcp.h>
77 #include <netinet/in.h>
78 #include <netdb.h>
79 #include <syslog.h>
80 #include <unistd.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 #include <fcntl.h>
85 #include <arpa/inet.h>
86 #include <strings.h>
87 #include <dirent.h>
88 #include <unistd.h>
89 #include <getopt.h>
90 #include <pwd.h>
91 #include <grp.h>
92
93 #include <glib.h>
94
95 /* used in cliserv.h, so must come first */
96 #define MY_NAME "nbd_server"
97 #include "cliserv.h"
98
99 #ifdef WITH_SDP
100 #include <sdp_inet.h>
101 #endif
102
103 /** Default position of the config file */
104 #ifndef SYSCONFDIR
105 #define SYSCONFDIR "/etc"
106 #endif
107 #define CFILE SYSCONFDIR "/nbd-server/config"
108
109 /** Where our config file actually is */
110 gchar* config_file_pos;
111
112 /** What user we're running as */
113 gchar* runuser=NULL;
114 /** What group we're running as */
115 gchar* rungroup=NULL;
116 /** whether to export using the old negotiation protocol (port-based) */
117 gboolean do_oldstyle=FALSE;
118
119 /* Whether we should avoid forking */
120 int dontfork = 0;
121
/**
 * Logging macros. The first argument is a syslog priority; it is only
 * honoured when ISSERVER is defined, in which case the message goes to
 * syslog(). Otherwise the priority is dropped and the message is handed to
 * g_message().
 */
#if defined(ISSERVER)
#define msg2(prio,fmt) syslog(prio,fmt)
#define msg3(prio,fmt,x) syslog(prio,fmt,x)
#define msg4(prio,fmt,x,y) syslog(prio,fmt,x,y)
#else
#define msg2(prio,fmt) g_message(fmt)
#define msg3(prio,fmt,x) g_message(fmt,x)
#define msg4(prio,fmt,x,y) g_message(fmt,x,y)
#endif

/* Debugging macros: DEBUG() is a printf() when DODBG is defined, and
 * expands to nothing otherwise */
//#define DODBG
#if defined(DODBG)
#define DEBUG(...) printf(__VA_ARGS__)
#else
#define DEBUG(...)
#endif
/* Fall back to an empty version string if the build did not provide one */
#if !defined(PACKAGE_VERSION)
#define PACKAGE_VERSION ""
#endif
/**
 * The highest value a variable of type off_t can reach. off_t is a signed
 * integer, so this is "all bits set except the leftmost one".
 *
 * The value is built as (2^(N-2) - 1) * 2 + 1 rather than by shifting a 1
 * into the sign bit and inverting: left-shifting into the sign bit of a
 * signed type is undefined behaviour in C. The expansion is fully
 * parenthesized so it can be used inside larger expressions.
 **/
#define OFFT_MAX ((off_t)(((((off_t)1) << (sizeof(off_t)*8-2)) - 1) * 2 + 1))
#define LINELEN 256       /**< Size of static buffer used to read the
                               authorization file (yuck) */
#define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
#define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
#define F_READONLY 1      /**< flag to tell us a file is readonly */
#define F_MULTIFILE 2     /**< flag to tell us a file is exported using -m */
#define F_COPYONWRITE 4   /**< flag to tell us a file is exported using
                            copyonwrite */
#define F_AUTOREADONLY 8  /**< flag to tell us a file is set to autoreadonly */
#define F_SPARSE 16       /**< flag to tell us copyonwrite should use a sparse file */
#define F_SDP 32          /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */
#define F_SYNC 64         /**< Whether to fsync() after a write */
#define F_FLUSH 128       /**< Whether server wants FLUSH to be sent by the client */
#define F_FUA 256         /**< Whether server wants FUA to be sent by the client */
#define F_ROTATIONAL 512  /**< Whether server wants the client to implement the elevator algorithm */
163 GHashTable *children;
164 char pidfname[256]; /**< name of our PID file */
165 char pidftemplate[256]; /**< template to be used for the filename of the PID file */
166 char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
167
168 int modernsock=0;         /**< Socket for the modern handler. Not used
169                                if a client was only specified on the
170                                command line; only port used if
171                                oldstyle is set to false (and then the
172                                command-line client isn't used, gna gna) */
173 char* modern_listen;      /**< listenaddr value for modernsock */
174 char* modernport=NBD_DEFAULT_PORT; /**< Port number on which to listen for
175                                       new-style nbd-client connections */
176
/**
 * Types of virtualization
 **/
typedef enum {
        /** No virtualization */
        VIRT_NONE = 0,
        /** Literal IP address as part of the filename */
        VIRT_IPLIT = 1,
        /** Replacing all dots in an ip address by a / before doing the same
         *  as in IPLIT */
        VIRT_IPHASH = 2,
        /** Every subnet in its own directory */
        VIRT_CIDR = 3,
} VIRT_STYLE;
187
188 /**
189  * Variables associated with a server.
190  **/
191 typedef struct {
192         gchar* exportname;    /**< (unprocessed) filename of the file we're exporting */
193         off_t expected_size; /**< size of the exported file as it was told to
194                                us through configuration */
195         gchar* listenaddr;   /**< The IP address we're listening on */
196         unsigned int port;   /**< port we're exporting this file at */
197         char* authname;      /**< filename of the authorization file */
198         int flags;           /**< flags associated with this exported file */
199         int socket;          /**< The socket of this server. */
200         int socket_family;   /**< family of the socket */
201         VIRT_STYLE virtstyle;/**< The style of virtualization, if any */
202         uint8_t cidrlen;     /**< The length of the mask when we use
203                                   CIDR-style virtualization */
204         gchar* prerun;       /**< command to be ran after connecting a client,
205                                   but before starting to serve */
206         gchar* postrun;      /**< command that will be ran after the client
207                                   disconnects */
208         gchar* servename;    /**< name of the export as selected by nbd-client */
209         int max_connections; /**< maximum number of opened connections */
210         gchar* transactionlog;/**< filename for transaction log */
211 } SERVER;
212
/**
 * One exported file: its descriptor and the offset at which it starts.
 * CLIENT keeps an array of these.
 **/
typedef struct {
        /** file descriptor */
        int fhandle;
        /** starting offset of this file */
        off_t startoff;
} FILE_INFO;
220
221 typedef struct {
222         off_t exportsize;    /**< size of the file we're exporting */
223         char *clientname;    /**< peer */
224         char *exportname;    /**< (processed) filename of the file we're exporting */
225         GArray *export;    /**< array of FILE_INFO of exported files;
226                                array size is always 1 unless we're
227                                doing the multiple file option */
228         int net;             /**< The actual client socket */
229         SERVER *server;      /**< The server this client is getting data from */
230         char* difffilename;  /**< filename of the copy-on-write file, if any */
231         int difffile;        /**< filedescriptor of copyonwrite file. @todo
232                                shouldn't this be an array too? (cfr export) Or
233                                make -m and -c mutually exclusive */
234         u32 difffilelen;     /**< number of pages in difffile */
235         u32 *difmap;         /**< see comment on the global difmap for this one */
236         gboolean modern;     /**< client was negotiated using modern negotiation protocol */
237         int transactionlogfd;/**< fd for transaction log */
238 } CLIENT;
239
/**
 * Type of configuration file values
 **/
typedef enum {
        /** This parameter is an integer */
        PARAM_INT = 0,
        /** This parameter is a string */
        PARAM_STRING = 1,
        /** This parameter is a boolean */
        PARAM_BOOL = 2,
} PARAM_TYPE;
248
249 /**
250  * Configuration file values
251  **/
252 typedef struct {
253         gchar *paramname;       /**< Name of the parameter, as it appears in
254                                   the config file */
255         gboolean required;      /**< Whether this is a required (as opposed to
256                                   optional) parameter */
257         PARAM_TYPE ptype;       /**< Type of the parameter. */
258         gpointer target;        /**< Pointer to where the data of this
259                                   parameter should be written. If ptype is
260                                   PARAM_BOOL, the data is or'ed rather than
261                                   overwritten. */
262         gint flagval;           /**< Flag mask for this parameter in case ptype
263                                   is PARAM_BOOL. */
264 } PARAM;
265
266 /**
267  * Translate a command name into human readable form
268  *
269  * @param command The command number (after applying NBD_CMD_MASK_COMMAND)
270  * @return pointer to the command name
271  **/
272 static inline const char * getcommandname(uint64_t command) {
273         switch (command) {
274         case NBD_CMD_READ:
275                 return "NBD_CMD_READ";
276         case NBD_CMD_WRITE:
277                 return "NBD_CMD_WRITE";
278         case NBD_CMD_DISC:
279                 return "NBD_CMD_DISC";
280         case NBD_CMD_FLUSH:
281                 return "NBD_CMD_FLUSH";
282         default:
283                 break;
284         }
285         return "UNKNOWN";
286 }
287
288 /**
289  * Check whether a client is allowed to connect. Works with an authorization
290  * file which contains one line per machine, no wildcards.
291  *
292  * @param opts The client who's trying to connect.
293  * @return 0 - authorization refused, 1 - OK
294  **/
295 int authorized_client(CLIENT *opts) {
296         const char *ERRMSG="Invalid entry '%s' in authfile '%s', so, refusing all connections.";
297         FILE *f ;
298         char line[LINELEN]; 
299         char *tmp;
300         struct in_addr addr;
301         struct in_addr client;
302         struct in_addr cltemp;
303         int len;
304
305         if ((f=fopen(opts->server->authname,"r"))==NULL) {
306                 msg4(LOG_INFO,"Can't open authorization file %s (%s).",
307                      opts->server->authname,strerror(errno)) ;
308                 return 1 ; 
309         }
310   
311         inet_aton(opts->clientname, &client);
312         while (fgets(line,LINELEN,f)!=NULL) {
313                 if((tmp=index(line, '/'))) {
314                         if(strlen(line)<=tmp-line) {
315                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
316                                 return 0;
317                         }
318                         *(tmp++)=0;
319                         if(!inet_aton(line,&addr)) {
320                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
321                                 return 0;
322                         }
323                         len=strtol(tmp, NULL, 0);
324                         addr.s_addr>>=32-len;
325                         addr.s_addr<<=32-len;
326                         memcpy(&cltemp,&client,sizeof(client));
327                         cltemp.s_addr>>=32-len;
328                         cltemp.s_addr<<=32-len;
329                         if(addr.s_addr == cltemp.s_addr) {
330                                 return 1;
331                         }
332                 }
333                 if (strncmp(line,opts->clientname,strlen(opts->clientname))==0) {
334                         fclose(f);
335                         return 1;
336                 }
337         }
338         fclose(f);
339         return 0;
340 }
341
/**
 * Read data from a file descriptor into a buffer. Keeps calling read()
 * until exactly len bytes have been stored; short reads are continued and
 * EAGAIN/EINTR are retried. Any other read error, and end-of-file before
 * len bytes arrived, is reported through err().
 *
 * @param f a file descriptor
 * @param buf a buffer of at least len bytes
 * @param len the number of bytes to be read
 **/
static inline void readit(int f, void *buf, size_t len) {
        char *cur = buf;
        ssize_t res;
        while (len > 0) {
                DEBUG("*");
                res = read(f, cur, len);
                if (res > 0) {
                        len -= res;
                        cur += res;
                } else if (res == 0) {
                        /* End of file: read() does not touch errno here, so
                         * it must not be consulted (a stale EAGAIN would
                         * make this loop spin forever). */
                        err("Read failed: End of file");
                } else if (errno != EAGAIN && errno != EINTR) {
                        err("Read failed: %m");
                }
        }
}
363
/**
 * Read and throw away data from an FD. The data is pulled through the
 * caller's scratch buffer in pieces of at most bufsiz bytes, each piece
 * overwriting the previous one.
 *
 * @param f a file descriptor
 * @param buf a scratch buffer
 * @param len the number of bytes to consume
 * @param bufsiz the size of the buffer
 **/
static inline void consume(int f, void * buf, size_t len, size_t bufsiz) {
        size_t remaining = len;
        while (remaining != 0) {
                size_t chunk = remaining;
                if (chunk > bufsiz) {
                        chunk = bufsiz;
                }
                readit(f, buf, chunk);
                remaining -= chunk;
        }
}
380
381
/**
 * Write a whole buffer to a file descriptor, continuing after short
 * writes. A write() result of zero or less is reported through err().
 *
 * @param f a file descriptor
 * @param buf a buffer containing data
 * @param len the number of bytes to be written
 **/
static inline void writeit(int f, void *buf, size_t len) {
        char *cur = buf;
        ssize_t res;
        for (; len > 0; len -= res, cur += res) {
                DEBUG("+");
                res = write(f, cur, len);
                if (res <= 0) {
                        err("Send failed: %m");
                }
        }
}
399
400 /**
401  * Print out a message about how to use nbd-server. Split out to a separate
402  * function so that we can call it from multiple places
403  */
404 void usage() {
405         printf("This is nbd-server version " VERSION "\n");
406         printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections]\n"
407                "\t-r|--read-only\t\tread only\n"
408                "\t-m|--multi-file\t\tmultiple file\n"
409                "\t-c|--copy-on-write\tcopy on write\n"
410                "\t-C|--config-file\tspecify an alternate configuration file\n"
411                "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
412                "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
413                "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
414                "\t-M|--max-connections\tspecify the maximum number of opened connections\n\n"
415                "\tif port is set to 0, stdin is used (for running from inetd).\n"
416                "\tif file_to_export contains '%%s', it is substituted with the IP\n"
417                "\t\taddress of the machine trying to connect\n" 
418                "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
419         printf("Using configuration file %s\n", CFILE);
420 }
421
422 /* Dumps a config file section of the given SERVER*, and exits. */
423 void dump_section(SERVER* serve, gchar* section_header) {
424         printf("[%s]\n", section_header);
425         printf("\texportname = %s\n", serve->exportname);
426         printf("\tlistenaddr = %s\n", serve->listenaddr);
427         printf("\tport = %d\n", serve->port);
428         if(serve->flags & F_READONLY) {
429                 printf("\treadonly = true\n");
430         }
431         if(serve->flags & F_MULTIFILE) {
432                 printf("\tmultifile = true\n");
433         }
434         if(serve->flags & F_COPYONWRITE) {
435                 printf("\tcopyonwrite = true\n");
436         }
437         if(serve->expected_size) {
438                 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
439         }
440         if(serve->authname) {
441                 printf("\tauthfile = %s\n", serve->authname);
442         }
443         exit(EXIT_SUCCESS);
444 }
445
446 /**
447  * Parse the command line.
448  *
449  * @param argc the argc argument to main()
450  * @param argv the argv argument to main()
451  **/
452 SERVER* cmdline(int argc, char *argv[]) {
453         int i=0;
454         int nonspecial=0;
455         int c;
456         struct option long_options[] = {
457                 {"read-only", no_argument, NULL, 'r'},
458                 {"multi-file", no_argument, NULL, 'm'},
459                 {"copy-on-write", no_argument, NULL, 'c'},
460                 {"dont-fork", no_argument, NULL, 'd'},
461                 {"authorize-file", required_argument, NULL, 'l'},
462                 {"config-file", required_argument, NULL, 'C'},
463                 {"pid-file", required_argument, NULL, 'p'},
464                 {"output-config", required_argument, NULL, 'o'},
465                 {"max-connection", required_argument, NULL, 'M'},
466                 {0,0,0,0}
467         };
468         SERVER *serve;
469         off_t es;
470         size_t last;
471         char suffix;
472         gboolean do_output=FALSE;
473         gchar* section_header="";
474         gchar** addr_port;
475
476         if(argc==1) {
477                 return NULL;
478         }
479         serve=g_new0(SERVER, 1);
480         serve->authname = g_strdup(default_authname);
481         serve->virtstyle=VIRT_IPLIT;
482         while((c=getopt_long(argc, argv, "-C:cdl:mo:rp:M:", long_options, &i))>=0) {
483                 switch (c) {
484                 case 1:
485                         /* non-option argument */
486                         switch(nonspecial++) {
487                         case 0:
488                                 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
489                                         addr_port=g_strsplit(optarg, ":", 2);
490
491                                         /* Check for "@" - maybe user using this separator
492                                                  for IPv4 address */
493                                         if(!addr_port[1]) {
494                                                 g_strfreev(addr_port);
495                                                 addr_port=g_strsplit(optarg, "@", 2);
496                                         }
497                                 } else {
498                                         addr_port=g_strsplit(optarg, "@", 2);
499                                 }
500
501                                 if(addr_port[1]) {
502                                         serve->port=strtol(addr_port[1], NULL, 0);
503                                         serve->listenaddr=g_strdup(addr_port[0]);
504                                 } else {
505                                         serve->listenaddr=NULL;
506                                         serve->port=strtol(addr_port[0], NULL, 0);
507                                 }
508                                 g_strfreev(addr_port);
509                                 break;
510                         case 1:
511                                 serve->exportname = g_strdup(optarg);
512                                 if(serve->exportname[0] != '/') {
513                                         fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
514                                         exit(EXIT_FAILURE);
515                                 }
516                                 break;
517                         case 2:
518                                 last=strlen(optarg)-1;
519                                 suffix=optarg[last];
520                                 if (suffix == 'k' || suffix == 'K' ||
521                                     suffix == 'm' || suffix == 'M')
522                                         optarg[last] = '\0';
523                                 es = (off_t)atoll(optarg);
524                                 switch (suffix) {
525                                         case 'm':
526                                         case 'M':  es <<= 10;
527                                         case 'k':
528                                         case 'K':  es <<= 10;
529                                         default :  break;
530                                 }
531                                 serve->expected_size = es;
532                                 break;
533                         }
534                         break;
535                 case 'r':
536                         serve->flags |= F_READONLY;
537                         break;
538                 case 'm':
539                         serve->flags |= F_MULTIFILE;
540                         break;
541                 case 'o':
542                         do_output = TRUE;
543                         section_header = g_strdup(optarg);
544                         break;
545                 case 'p':
546                         strncpy(pidftemplate, optarg, 256);
547                         break;
548                 case 'c': 
549                         serve->flags |=F_COPYONWRITE;
550                         break;
551                 case 'd': 
552                         dontfork = 1;
553                         break;
554                 case 'C':
555                         g_free(config_file_pos);
556                         config_file_pos=g_strdup(optarg);
557                         break;
558                 case 'l':
559                         g_free(serve->authname);
560                         serve->authname=g_strdup(optarg);
561                         break;
562                 case 'M':
563                         serve->max_connections = strtol(optarg, NULL, 0);
564                         break;
565                 default:
566                         usage();
567                         exit(EXIT_FAILURE);
568                         break;
569                 }
570         }
571         /* What's left: the port to export, the name of the to be exported
572          * file, and, optionally, the size of the file, in that order. */
573         if(nonspecial<2) {
574                 g_free(serve);
575                 serve=NULL;
576         } else {
577                 do_oldstyle = TRUE;
578         }
579         if(do_output) {
580                 if(!serve) {
581                         g_critical("Need a complete configuration on the command line to output a config file section!");
582                         exit(EXIT_FAILURE);
583                 }
584                 dump_section(serve, section_header);
585         }
586         return serve;
587 }
588
/**
 * Error codes for config file parsing
 **/
typedef enum {
        /** The configuration file is not found */
        CFILE_NOTFOUND = 0,
        /** The (required) group "generic" is missing */
        CFILE_MISSING_GENERIC = 1,
        /** A (required) key is missing */
        CFILE_KEY_MISSING = 2,
        /** A value is syntactically invalid */
        CFILE_VALUE_INVALID = 3,
        /** A value is not supported in this build */
        CFILE_VALUE_UNSUPPORTED = 4,
        /** Programmer error */
        CFILE_PROGERR = 5,
        /** A config file was specified that does not define any exports */
        CFILE_NO_EXPORTS = 6,
        /** The reserved port was specified for an old-style export. */
        CFILE_INCORRECT_PORT = 7,
} CFILE_ERRORS;
604
605 /**
606  * Remove a SERVER from memory. Used from the hash table
607  **/
608 void remove_server(gpointer s) {
609         SERVER *server;
610
611         server=(SERVER*)s;
612         g_free(server->exportname);
613         if(server->authname)
614                 g_free(server->authname);
615         if(server->listenaddr)
616                 g_free(server->listenaddr);
617         if(server->prerun)
618                 g_free(server->prerun);
619         if(server->postrun)
620                 g_free(server->postrun);
621         if(server->transactionlog)
622                 g_free(server->transactionlog);
623         g_free(server);
624 }
625
626 /**
627  * duplicate server
628  * @param s the old server we want to duplicate
629  * @return new duplicated server
630  **/
631 SERVER* dup_serve(SERVER *s) {
632         SERVER *serve = NULL;
633
634         serve=g_new0(SERVER, 1);
635         if(serve == NULL)
636                 return NULL;
637
638         if(s->exportname)
639                 serve->exportname = g_strdup(s->exportname);
640
641         serve->expected_size = s->expected_size;
642
643         if(s->listenaddr)
644                 serve->listenaddr = g_strdup(s->listenaddr);
645
646         serve->port = s->port;
647
648         if(s->authname)
649                 serve->authname = strdup(s->authname);
650
651         serve->flags = s->flags;
652         serve->socket = s->socket;
653         serve->socket_family = s->socket_family;
654         serve->virtstyle = s->virtstyle;
655         serve->cidrlen = s->cidrlen;
656
657         if(s->prerun)
658                 serve->prerun = g_strdup(s->prerun);
659
660         if(s->postrun)
661                 serve->postrun = g_strdup(s->postrun);
662
663         if(s->transactionlog)
664                 serve->transactionlog = g_strdup(s->transactionlog);
665         
666         if(s->servename)
667                 serve->servename = g_strdup(s->servename);
668
669         serve->max_connections = s->max_connections;
670
671         return serve;
672 }
673
674 /**
675  * append new server to array
676  * @param s server
677  * @param a server array
678  * @return 0 success, -1 error
679  */
680 int append_serve(SERVER *s, GArray *a) {
681         SERVER *ns = NULL;
682         struct addrinfo hints;
683         struct addrinfo *ai = NULL;
684         struct addrinfo *rp = NULL;
685         char   host[NI_MAXHOST];
686         gchar  *port = NULL;
687         int e;
688         int ret;
689
690         if(!s) {
691                 err("Invalid parsing server");
692                 return -1;
693         }
694
695         port = g_strdup_printf("%d", s->port);
696
697         memset(&hints,'\0',sizeof(hints));
698         hints.ai_family = AF_UNSPEC;
699         hints.ai_socktype = SOCK_STREAM;
700         hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE;
701         hints.ai_protocol = IPPROTO_TCP;
702
703         e = getaddrinfo(s->listenaddr, port, &hints, &ai);
704
705         if (port)
706                 g_free(port);
707
708         if(e == 0) {
709                 for (rp = ai; rp != NULL; rp = rp->ai_next) {
710                         e = getnameinfo(rp->ai_addr, rp->ai_addrlen, host, sizeof(host), NULL, 0, NI_NUMERICHOST);
711
712                         if (e != 0) { // error
713                                 fprintf(stderr, "getnameinfo: %s\n", gai_strerror(e));
714                                 continue;
715                         }
716
717                         // duplicate server and set listenaddr to resolved IP address
718                         ns = dup_serve (s);
719                         if (ns) {
720                                 ns->listenaddr = g_strdup(host);
721                                 ns->socket_family = rp->ai_family;
722                                 g_array_append_val(a, *ns);
723                                 free(ns);
724                                 ns = NULL;
725                         }
726                 }
727
728                 ret = 0;
729         } else {
730                 fprintf(stderr, "getaddrinfo failed on listen host/address: %s (%s)\n", s->listenaddr ? s->listenaddr : "any", gai_strerror(e));
731                 ret = -1;
732         }
733
734         if (ai)
735                 freeaddrinfo(ai);
736
737         return ret;
738 }
739
740 /**
741  * Parse the config file.
742  *
743  * @param f the name of the config file
744  * @param e a GError. @see CFILE_ERRORS for what error values this function can
745  *      return.
746  * @return a Array of SERVER* pointers, If the config file is empty or does not
747  *      exist, returns an empty GHashTable; if the config file contains an
748  *      error, returns NULL, and e is set appropriately
749  **/
750 GArray* parse_cfile(gchar* f, GError** e) {
751         const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
752         const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
753         SERVER s;
754         gchar *virtstyle=NULL;
755         PARAM lp[] = {
756                 { "exportname", TRUE,   PARAM_STRING,   &(s.exportname),        0 },
757                 { "port",       TRUE,   PARAM_INT,      &(s.port),              0 },
758                 { "authfile",   FALSE,  PARAM_STRING,   &(s.authname),          0 },
759                 { "filesize",   FALSE,  PARAM_INT,      &(s.expected_size),     0 },
760                 { "virtstyle",  FALSE,  PARAM_STRING,   &(virtstyle),           0 },
761                 { "prerun",     FALSE,  PARAM_STRING,   &(s.prerun),            0 },
762                 { "postrun",    FALSE,  PARAM_STRING,   &(s.postrun),           0 },
763                 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog),   0 },
764                 { "readonly",   FALSE,  PARAM_BOOL,     &(s.flags),             F_READONLY },
765                 { "multifile",  FALSE,  PARAM_BOOL,     &(s.flags),             F_MULTIFILE },
766                 { "copyonwrite", FALSE, PARAM_BOOL,     &(s.flags),             F_COPYONWRITE },
767                 { "sparse_cow", FALSE,  PARAM_BOOL,     &(s.flags),             F_SPARSE },
768                 { "sdp",        FALSE,  PARAM_BOOL,     &(s.flags),             F_SDP },
769                 { "sync",       FALSE,  PARAM_BOOL,     &(s.flags),             F_SYNC },
770                 { "flush",      FALSE,  PARAM_BOOL,     &(s.flags),             F_FLUSH },
771                 { "fua",        FALSE,  PARAM_BOOL,     &(s.flags),             F_FUA },
772                 { "rotational", FALSE,  PARAM_BOOL,     &(s.flags),             F_ROTATIONAL },
773                 { "listenaddr", FALSE,  PARAM_STRING,   &(s.listenaddr),        0 },
774                 { "maxconnections", FALSE, PARAM_INT,   &(s.max_connections),   0 },
775         };
776         const int lp_size=sizeof(lp)/sizeof(PARAM);
777         PARAM gp[] = {
778                 { "user",       FALSE, PARAM_STRING,    &runuser,       0 },
779                 { "group",      FALSE, PARAM_STRING,    &rungroup,      0 },
780                 { "oldstyle",   FALSE, PARAM_BOOL,      &do_oldstyle,   1 },
781                 { "listenaddr", FALSE, PARAM_STRING,    &modern_listen, 0 },
782                 { "port",       FALSE, PARAM_STRING,    &modernport,    0 },
783         };
784         PARAM* p=gp;
785         int p_size=sizeof(gp)/sizeof(PARAM);
786         GKeyFile *cfile;
787         GError *err = NULL;
788         const char *err_msg=NULL;
789         GQuark errdomain;
790         GArray *retval=NULL;
791         gchar **groups;
792         gboolean value;
793         gchar* startgroup;
794         gint i;
795         gint j;
796
797         errdomain = g_quark_from_string("parse_cfile");
798         cfile = g_key_file_new();
799         retval = g_array_new(FALSE, TRUE, sizeof(SERVER));
800         if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
801                         G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
802                 g_set_error(e, errdomain, CFILE_NOTFOUND, "Could not open config file %s.", f);
803                 g_key_file_free(cfile);
804                 return retval;
805         }
806         startgroup = g_key_file_get_start_group(cfile);
807         if(!startgroup || strcmp(startgroup, "generic")) {
808                 g_set_error(e, errdomain, CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
809                 g_key_file_free(cfile);
810                 return NULL;
811         }
812         groups = g_key_file_get_groups(cfile, NULL);
813         for(i=0;groups[i];i++) {
814                 memset(&s, '\0', sizeof(SERVER));
815
816                 /* After the [generic] group, start parsing exports */
817                 if(i==1) {
818                         p=lp;
819                         p_size=lp_size;
820                 } 
821                 for(j=0;j<p_size;j++) {
822                         g_assert(p[j].target != NULL);
823                         g_assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL);
824                         switch(p[j].ptype) {
825                                 case PARAM_INT:
826                                         *((gint*)p[j].target) =
827                                                 g_key_file_get_integer(cfile,
828                                                                 groups[i],
829                                                                 p[j].paramname,
830                                                                 &err);
831                                         break;
832                                 case PARAM_STRING:
833                                         *((gchar**)p[j].target) =
834                                                 g_key_file_get_string(cfile,
835                                                                 groups[i],
836                                                                 p[j].paramname,
837                                                                 &err);
838                                         break;
839                                 case PARAM_BOOL:
840                                         value = g_key_file_get_boolean(cfile,
841                                                         groups[i],
842                                                         p[j].paramname, &err);
843                                         if(!err) {
844                                                 if(value) {
845                                                         *((gint*)p[j].target) |= p[j].flagval;
846                                                 } else {
847                                                         *((gint*)p[j].target) &= ~(p[j].flagval);
848                                                 }
849                                         }
850                                         break;
851                         }
852                         if(!strcmp(p[j].paramname, "port") && !strcmp(p[j].target, modernport)) {
853                                 g_set_error(e, errdomain, CFILE_INCORRECT_PORT, "Config file specifies new-style port for oldstyle export");
854                                 g_key_file_free(cfile);
855                                 return NULL;
856                         }
857                         if(err) {
858                                 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
859                                         if(!p[j].required) {
860                                                 /* Ignore not-found error for optional values */
861                                                 g_clear_error(&err);
862                                                 continue;
863                                         } else {
864                                                 err_msg = MISSING_REQUIRED_ERROR;
865                                         }
866                                 } else {
867                                         err_msg = DEFAULT_ERROR;
868                                 }
869                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
870                                 g_array_free(retval, TRUE);
871                                 g_error_free(err);
872                                 g_key_file_free(cfile);
873                                 return NULL;
874                         }
875                 }
876                 if(virtstyle) {
877                         if(!strncmp(virtstyle, "none", 4)) {
878                                 s.virtstyle=VIRT_NONE;
879                         } else if(!strncmp(virtstyle, "ipliteral", 9)) {
880                                 s.virtstyle=VIRT_IPLIT;
881                         } else if(!strncmp(virtstyle, "iphash", 6)) {
882                                 s.virtstyle=VIRT_IPHASH;
883                         } else if(!strncmp(virtstyle, "cidrhash", 8)) {
884                                 s.virtstyle=VIRT_CIDR;
885                                 if(strlen(virtstyle)<10) {
886                                         g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
887                                         g_array_free(retval, TRUE);
888                                         g_key_file_free(cfile);
889                                         return NULL;
890                                 }
891                                 s.cidrlen=strtol(virtstyle+8, NULL, 0);
892                         } else {
893                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
894                                 g_array_free(retval, TRUE);
895                                 g_key_file_free(cfile);
896                                 return NULL;
897                         }
898                         if(s.port && !do_oldstyle) {
899                                 g_warning("A port was specified, but oldstyle exports were not requested. This may not do what you expect.");
900                                 g_warning("Please read 'man 5 nbd-server' and search for oldstyle for more info");
901                         }
902                 } else {
903                         s.virtstyle=VIRT_IPLIT;
904                 }
905                 /* Don't need to free this, it's not our string */
906                 virtstyle=NULL;
907                 /* Don't append values for the [generic] group */
908                 if(i>0) {
909                         s.socket_family = AF_UNSPEC;
910                         s.servename = groups[i];
911
912                         append_serve(&s, retval);
913                 } else {
914                         if(!do_oldstyle) {
915                                 lp[1].required = 0;
916                         }
917                 }
918 #ifndef WITH_SDP
919                 if(s.flags & F_SDP) {
920                         g_set_error(e, errdomain, CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
921                         g_array_free(retval, TRUE);
922                         g_key_file_free(cfile);
923                         return NULL;
924                 }
925 #endif
926         }
927         if(i==1) {
928                 g_set_error(e, errdomain, CFILE_NO_EXPORTS, "The config file does not specify any exports");
929         }
930         g_key_file_free(cfile);
931         return retval;
932 }
933
934 /**
935  * Signal handler for SIGCHLD
936  * @param s the signal we're handling (must be SIGCHLD, or something
937  * is severely wrong)
938  **/
939 void sigchld_handler(int s) {
940         int status;
941         int* i;
942         pid_t pid;
943
944         while((pid=waitpid(-1, &status, WNOHANG)) > 0) {
945                 if(WIFEXITED(status)) {
946                         msg3(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
947                 }
948                 i=g_hash_table_lookup(children, &pid);
949                 if(!i) {
950                         msg3(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
951                 } else {
952                         DEBUG("Removing %d from the list of children", pid);
953                         g_hash_table_remove(children, &pid);
954                 }
955         }
956 }
957
958 /**
959  * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
960  *
961  * @param key the key
962  * @param value the value corresponding to the above key
963  * @param user_data a pointer which we always set to 1, so that we know what
964  * will happen next.
965  **/
966 void killchild(gpointer key, gpointer value, gpointer user_data) {
967         pid_t *pid=value;
968         int *parent=user_data;
969
970         kill(*pid, SIGTERM);
971         *parent=1;
972 }
973
974 /**
975  * Handle SIGTERM and dispatch it to our children
976  * @param s the signal we're handling (must be SIGTERM, or something
977  * is severely wrong).
978  **/
979 void sigterm_handler(int s) {
980         int parent=0;
981
982         g_hash_table_foreach(children, killchild, &parent);
983
984         if(parent) {
985                 unlink(pidfname);
986         }
987
988         exit(EXIT_SUCCESS);
989 }
990
991 /**
992  * Detect the size of a file.
993  *
994  * @param fhandle An open filedescriptor
995  * @return the size of the file, or OFFT_MAX if detection was
996  * impossible.
997  **/
998 off_t size_autodetect(int fhandle) {
999         off_t es;
1000         u64 bytes;
1001         struct stat stat_buf;
1002         int error;
1003
1004 #ifdef HAVE_SYS_MOUNT_H
1005 #ifdef HAVE_SYS_IOCTL_H
1006 #ifdef BLKGETSIZE64
1007         DEBUG("looking for export size with ioctl BLKGETSIZE64\n");
1008         if (!ioctl(fhandle, BLKGETSIZE64, &bytes) && bytes) {
1009                 return (off_t)bytes;
1010         }
1011 #endif /* BLKGETSIZE64 */
1012 #endif /* HAVE_SYS_IOCTL_H */
1013 #endif /* HAVE_SYS_MOUNT_H */
1014
1015         DEBUG("looking for fhandle size with fstat\n");
1016         stat_buf.st_size = 0;
1017         error = fstat(fhandle, &stat_buf);
1018         if (!error) {
1019                 if(stat_buf.st_size > 0)
1020                         return (off_t)stat_buf.st_size;
1021         } else {
1022                 err("fstat failed: %m");
1023         }
1024
1025         DEBUG("looking for fhandle size with lseek SEEK_END\n");
1026         es = lseek(fhandle, (off_t)0, SEEK_END);
1027         if (es > ((off_t)0)) {
1028                 return es;
1029         } else {
1030                 DEBUG("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4)));
1031         }
1032
1033         err("Could not find size of exported block device: %m");
1034         return OFFT_MAX;
1035 }
1036
1037 /**
1038  * Get the file handle and offset, given an export offset.
1039  *
1040  * @param export An array of export files
1041  * @param a The offset to get corresponding file/offset for
1042  * @param fhandle [out] File descriptor
1043  * @param foffset [out] Offset into fhandle
1044  * @param maxbytes [out] Tells how many bytes can be read/written
1045  * from fhandle starting at foffset (0 if there is no limit)
1046  * @return 0 on success, -1 on failure
1047  **/
1048 int get_filepos(GArray* export, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1049         /* Negative offset not allowed */
1050         if(a < 0)
1051                 return -1;
1052
1053         /* Binary search for last file with starting offset <= a */
1054         FILE_INFO fi;
1055         int start = 0;
1056         int end = export->len - 1;
1057         while( start <= end ) {
1058                 int mid = (start + end) / 2;
1059                 fi = g_array_index(export, FILE_INFO, mid);
1060                 if( fi.startoff < a ) {
1061                         start = mid + 1;
1062                 } else if( fi.startoff > a ) {
1063                         end = mid - 1;
1064                 } else {
1065                         start = end = mid;
1066                         break;
1067                 }
1068         }
1069
1070         /* end should never go negative, since first startoff is 0 and a >= 0 */
1071         g_assert(end >= 0);
1072
1073         fi = g_array_index(export, FILE_INFO, end);
1074         *fhandle = fi.fhandle;
1075         *foffset = a - fi.startoff;
1076         *maxbytes = 0;
1077         if( end+1 < export->len ) {
1078                 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1079                 *maxbytes = fi_next.startoff - a;
1080         }
1081
1082         return 0;
1083 }
1084
/**
 * Seek to an absolute position in a file; a failing lseek() is reported
 * through err().
 * @param handle a filedescriptor
 * @param a position to seek to
 * @todo get rid of this; lastpoint is a global variable right now, but it
 * shouldn't be. If we pass it on as a parameter, that makes things a *lot*
 * easier.
 **/
void myseek(int handle,off_t a) {
	off_t where = lseek(handle, a, SEEK_SET);

	if (where < 0) {
		err("Can not seek locally!\n");
	}
}
1098
1099 /**
1100  * Write an amount of bytes at a given offset to the right file. This
1101  * abstracts the write-side of the multiple file option.
1102  *
1103  * @param a The offset where the write should start
1104  * @param buf The buffer to write from
1105  * @param len The length of buf
1106  * @param client The client we're serving for
1107  * @param fua Flag to indicate 'Force Unit Access'
1108  * @return The number of bytes actually written, or -1 in case of an error
1109  **/
1110 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1111         int fhandle;
1112         off_t foffset;
1113         size_t maxbytes;
1114         ssize_t retval;
1115
1116         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1117                 return -1;
1118         if(maxbytes && len > maxbytes)
1119                 len = maxbytes;
1120
1121         DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
1122
1123         myseek(fhandle, foffset);
1124         retval = write(fhandle, buf, len);
1125         if(client->server->flags & F_SYNC) {
1126                 fsync(fhandle);
1127         } else if (fua) {
1128
1129           /* This is where we would do the following
1130            *   #ifdef USE_SYNC_FILE_RANGE
1131            * However, we don't, for the reasons set out below
1132            * by Christoph Hellwig <hch@infradead.org>
1133            *
1134            * [BEGINS] 
1135            * fdatasync is equivalent to fsync except that it does not flush
1136            * non-essential metadata (basically just timestamps in practice), but it
1137            * does flush metadata requried to find the data again, e.g. allocation
1138            * information and extent maps.  sync_file_range does nothing but flush
1139            * out pagecache content - it means you basically won't get your data
1140            * back in case of a crash if you either:
1141            * 
1142            *  a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1143            *  b) are using a sparse file on a filesystem
1144            *  c) are using a fallocate-preallocated file on a filesystem
1145            *  d) use any file on a COW filesystem like btrfs
1146            * 
1147            * e.g. it only does anything useful for you if you do not have a volatile
1148            * write cache, and either use a raw block device node, or just overwrite
1149            * an already fully allocated (and not preallocated) file on a non-COW
1150            * filesystem.
1151            * [ENDS]
1152            *
1153            * What we should do is open a second FD with O_DSYNC set, then write to
1154            * that when appropriate. However, with a Linux client, every REQ_FUA
1155            * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1156            * problems.
1157            *
1158            */
1159 #if 0
1160                 sync_file_range(fhandle, foffset, len,
1161                                 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1162                                 SYNC_FILE_RANGE_WAIT_AFTER);
1163 #else
1164                 fdatasync(fhandle);
1165 #endif
1166         }
1167         return retval;
1168 }
1169
1170 /**
1171  * Call rawexpwrite repeatedly until all data has been written.
1172  *
1173  * @param a The offset where the write should start
1174  * @param buf The buffer to write from
1175  * @param len The length of buf
1176  * @param client The client we're serving for
1177  * @param fua Flag to indicate 'Force Unit Access'
1178  * @return 0 on success, nonzero on failure
1179  **/
1180 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1181         ssize_t ret=0;
1182
1183         while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
1184                 a += ret;
1185                 buf += ret;
1186                 len -= ret;
1187         }
1188         return (ret < 0 || len != 0);
1189 }
1190
1191 /**
1192  * Read an amount of bytes at a given offset from the right file. This
1193  * abstracts the read-side of the multiple files option.
1194  *
1195  * @param a The offset where the read should start
1196  * @param buf A buffer to read into
1197  * @param len The size of buf
1198  * @param client The client we're serving for
1199  * @return The number of bytes actually read, or -1 in case of an
1200  * error.
1201  **/
1202 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
1203         int fhandle;
1204         off_t foffset;
1205         size_t maxbytes;
1206
1207         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1208                 return -1;
1209         if(maxbytes && len > maxbytes)
1210                 len = maxbytes;
1211
1212         DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
1213
1214         myseek(fhandle, foffset);
1215         return read(fhandle, buf, len);
1216 }
1217
1218 /**
1219  * Call rawexpread repeatedly until all data has been read.
1220  * @return 0 on success, nonzero on failure
1221  **/
1222 int rawexpread_fully(off_t a, char *buf, size_t len, CLIENT *client) {
1223         ssize_t ret=0;
1224
1225         while(len > 0 && (ret=rawexpread(a, buf, len, client)) > 0 ) {
1226                 a += ret;
1227                 buf += ret;
1228                 len -= ret;
1229         }
1230         return (ret < 0 || len != 0);
1231 }
1232
1233 /**
1234  * Read an amount of bytes at a given offset from the right file. This
1235  * abstracts the read-side of the copyonwrite stuff, and calls
1236  * rawexpread() with the right parameters to do the actual work.
1237  * @param a The offset where the read should start
1238  * @param buf A buffer to read into
1239  * @param len The size of buf
1240  * @param client The client we're going to read for
1241  * @return 0 on success, nonzero on failure
1242  **/
1243 int expread(off_t a, char *buf, size_t len, CLIENT *client) {
1244         off_t rdlen, offset;
1245         off_t mapcnt, mapl, maph, pagestart;
1246
1247         if (!(client->server->flags & F_COPYONWRITE))
1248                 return(rawexpread_fully(a, buf, len, client));
1249         DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1250
1251         mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1252
1253         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1254                 pagestart=mapcnt*DIFFPAGESIZE;
1255                 offset=a-pagestart;
1256                 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1257                         len : (size_t)DIFFPAGESIZE-offset;
1258                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1259                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1260                                (unsigned long)(client->difmap[mapcnt]));
1261                         myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1262                         if (read(client->difffile, buf, rdlen) != rdlen) return -1;
1263                 } else { /* the block is not there */
1264                         DEBUG("Page %llu is not here, we read the original one\n",
1265                                (unsigned long long)mapcnt);
1266                         if(rawexpread_fully(a, buf, rdlen, client)) return -1;
1267                 }
1268                 len-=rdlen; a+=rdlen; buf+=rdlen;
1269         }
1270         return 0;
1271 }
1272
1273 /**
1274  * Write an amount of bytes at a given offset to the right file. This
1275  * abstracts the write-side of the copyonwrite option, and calls
1276  * rawexpwrite() with the right parameters to do the actual work.
1277  *
1278  * @param a The offset where the write should start
1279  * @param buf The buffer to write from
1280  * @param len The length of buf
1281  * @param client The client we're going to write for.
1282  * @param fua Flag to indicate 'Force Unit Access'
1283  * @return 0 on success, nonzero on failure
1284  **/
1285 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1286         char pagebuf[DIFFPAGESIZE];
1287         off_t mapcnt,mapl,maph;
1288         off_t wrlen,rdlen; 
1289         off_t pagestart;
1290         off_t offset;
1291
1292         if (!(client->server->flags & F_COPYONWRITE))
1293                 return(rawexpwrite_fully(a, buf, len, client, fua)); 
1294         DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1295
1296         mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1297
1298         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1299                 pagestart=mapcnt*DIFFPAGESIZE ;
1300                 offset=a-pagestart ;
1301                 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1302                         len : (size_t)DIFFPAGESIZE-offset;
1303
1304                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1305                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1306                                (unsigned long)(client->difmap[mapcnt])) ;
1307                         myseek(client->difffile,
1308                                         client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1309                         if (write(client->difffile, buf, wrlen) != wrlen) return -1 ;
1310                 } else { /* the block is not there */
1311                         myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ;
1312                         client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1313                         DEBUG("Page %llu is not here, we put it at %lu\n",
1314                                (unsigned long long)mapcnt,
1315                                (unsigned long)(client->difmap[mapcnt]));
1316                         rdlen=DIFFPAGESIZE ;
1317                         if (rawexpread_fully(pagestart, pagebuf, rdlen, client))
1318                                 return -1;
1319                         memcpy(pagebuf+offset,buf,wrlen) ;
1320                         if (write(client->difffile, pagebuf, DIFFPAGESIZE) !=
1321                                         DIFFPAGESIZE)
1322                                 return -1;
1323                 }                                                   
1324                 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
1325         }
1326         if (client->server->flags & F_SYNC) {
1327                 fsync(client->difffile);
1328         } else if (fua) {
1329                 /* open question: would it be cheaper to do multiple sync_file_ranges?
1330                    as we iterate through the above?
1331                  */
1332                 fdatasync(client->difffile);
1333         }
1334         return 0;
1335 }
1336
1337 /**
1338  * Flush data to a client
1339  *
1340  * @param client The client we're going to write for.
1341  * @return 0 on success, nonzero on failure
1342  **/
1343 int expflush(CLIENT *client) {
1344         gint i;
1345
1346         if (client->server->flags & F_COPYONWRITE) {
1347                 return fsync(client->difffile);
1348         }
1349         
1350         for (i = 0; i < client->export->len; i++) {
1351                 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1352                 if (fsync(fi.fhandle) < 0)
1353                         return -1;
1354         }
1355         
1356         return 0;
1357 }
1358
1359 /**
1360  * Do the initial negotiation.
1361  *
1362  * @param client The client we're negotiating with.
1363  **/
1364 CLIENT* negotiate(int net, CLIENT *client, GArray* servers) {
1365         char zeros[128];
1366         uint64_t size_host;
1367         uint32_t flags = NBD_FLAG_HAS_FLAGS;
1368         uint16_t smallflags = 0;
1369         uint64_t magic;
1370
1371         memset(zeros, '\0', sizeof(zeros));
1372         if(!client || !client->modern) {
1373                 /* common */
1374                 if (write(net, INIT_PASSWD, 8) < 0) {
1375                         err_nonfatal("Negotiation failed: %m");
1376                         if(client)
1377                                 exit(EXIT_FAILURE);
1378                 }
1379                 if(!client || client->modern) {
1380                         /* modern */
1381                         magic = htonll(opts_magic);
1382                 } else {
1383                         /* oldstyle */
1384                         magic = htonll(cliserv_magic);
1385                 }
1386                 if (write(net, &magic, sizeof(magic)) < 0) {
1387                         err_nonfatal("Negotiation failed: %m");
1388                         if(client)
1389                                 exit(EXIT_FAILURE);
1390                 }
1391         }
1392         if(!client) {
1393                 /* modern */
1394                 uint32_t reserved;
1395                 uint32_t opt;
1396                 uint32_t namelen;
1397                 char* name;
1398                 int i;
1399
1400                 if(!servers)
1401                         err("programmer error");
1402                 if (write(net, &smallflags, sizeof(uint16_t)) < 0)
1403                         err("Negotiation failed: %m");
1404                 if (read(net, &reserved, sizeof(reserved)) < 0)
1405                         err("Negotiation failed: %m");
1406                 if (read(net, &magic, sizeof(magic)) < 0)
1407                         err("Negotiation failed: %m");
1408                 magic = ntohll(magic);
1409                 if(magic != opts_magic) {
1410                         close(net);
1411                         return NULL;
1412                 }
1413                 if (read(net, &opt, sizeof(opt)) < 0)
1414                         err("Negotiation failed: %m");
1415                 opt = ntohl(opt);
1416                 if(opt != NBD_OPT_EXPORT_NAME) {
1417                         close(net);
1418                         return NULL;
1419                 }
1420                 if (read(net, &namelen, sizeof(namelen)) < 0)
1421                         err("Negotiation failed: %m");
1422                 namelen = ntohl(namelen);
1423                 name = malloc(namelen+1);
1424                 name[namelen]=0;
1425                 if (read(net, name, namelen) < 0)
1426                         err("Negotiation failed: %m");
1427                 for(i=0; i<servers->len; i++) {
1428                         SERVER* serve = &(g_array_index(servers, SERVER, i));
1429                         if(!strcmp(serve->servename, name)) {
1430                                 CLIENT* client = g_new0(CLIENT, 1);
1431                                 client->server = serve;
1432                                 client->exportsize = OFFT_MAX;
1433                                 client->net = net;
1434                                 client->modern = TRUE;
1435                                 client->transactionlogfd = -1;
1436                                 free(name);
1437                                 return client;
1438                         }
1439                 }
1440                 free(name);
1441                 return NULL;
1442         }
1443         /* common */
1444         size_host = htonll((u64)(client->exportsize));
1445         if (write(net, &size_host, 8) < 0)
1446                 err("Negotiation failed: %m");
1447         if (client->server->flags & F_READONLY)
1448                 flags |= NBD_FLAG_READ_ONLY;
1449         if (client->server->flags & F_FLUSH)
1450                 flags |= NBD_FLAG_SEND_FLUSH;
1451         if (client->server->flags & F_FUA)
1452                 flags |= NBD_FLAG_SEND_FUA;
1453         if (client->server->flags & F_ROTATIONAL)
1454                 flags |= NBD_FLAG_ROTATIONAL;
1455         if (!client->modern) {
1456                 /* oldstyle */
1457                 flags = htonl(flags);
1458                 if (write(client->net, &flags, 4) < 0)
1459                         err("Negotiation failed: %m");
1460         } else {
1461                 /* modern */
1462                 smallflags = (uint16_t)(flags & ~((uint16_t)0));
1463                 smallflags = htons(smallflags);
1464                 if (write(client->net, &smallflags, sizeof(smallflags)) < 0) {
1465                         err("Negotiation failed: %m");
1466                 }
1467         }
1468         /* common */
1469         if (write(client->net, zeros, 124) < 0)
1470                 err("Negotiation failed: %m");
1471         return NULL;
1472 }
1473
/**
 * Sending macro.
 *
 * Writes sizeof(reply) bytes of `reply` to `net` with writeit(). When
 * client->transactionlogfd is not -1, the same bytes are also written to
 * that descriptor.
 *
 * Note that `client` is NOT a macro parameter here: it is picked up from
 * the scope in which the macro is expanded.
 */
#define SEND(net,reply) { writeit((net), &(reply), sizeof(reply)); \
	if (client->transactionlogfd != -1) \
		writeit(client->transactionlogfd, &(reply), sizeof(reply)); }
/**
 * Error macro.
 *
 * Stores htonl(errcode) in reply.error, sends the reply to client->net via
 * SEND, then resets reply.error to 0 so the structure can be reused for the
 * next reply. The transaction-log check inside SEND still refers to the
 * `client` visible at the expansion site.
 */
#define ERROR(client,reply,errcode) { (reply).error = htonl(errcode); SEND((client)->net,reply); (reply).error = 0; }
1480 /**
1481  * Serve a file to a single client.
1482  *
1483  * @todo This beast needs to be split up in many tiny little manageable
1484  * pieces. Preferably with a chainsaw.
1485  *
1486  * @param client The client we're going to serve to.
1487  * @return when the client disconnects
1488  **/
1489 int mainloop(CLIENT *client) {
1490         struct nbd_request request;
1491         struct nbd_reply reply;
1492         gboolean go_on=TRUE;
1493 #ifdef DODBG
1494         int i = 0;
1495 #endif
1496         negotiate(client->net, client, NULL);
1497         DEBUG("Entering request loop!\n");
1498         reply.magic = htonl(NBD_REPLY_MAGIC);
1499         reply.error = 0;
1500         while (go_on) {
1501                 char buf[BUFSIZE];
1502                 char* p;
1503                 size_t len;
1504                 size_t currlen;
1505                 size_t writelen;
1506                 uint16_t command;
1507 #ifdef DODBG
1508                 i++;
1509                 printf("%d: ", i);
1510 #endif
1511                 readit(client->net, &request, sizeof(request));
1512                 if (client->transactionlogfd != -1)
1513                         writeit(client->transactionlogfd, &request, sizeof(request));
1514
1515                 request.from = ntohll(request.from);
1516                 request.type = ntohl(request.type);
1517                 command = request.type & NBD_CMD_MASK_COMMAND;
1518                 len = ntohl(request.len);
1519
1520                 DEBUG("%s from %llu (%llu) len %d, ", getcommandname(command),
1521                                 (unsigned long long)request.from,
1522                                 (unsigned long long)request.from / 512, (unsigned int)len);
1523
1524                 if (request.magic != htonl(NBD_REQUEST_MAGIC))
1525                         err("Not enough magic.");
1526
1527                 memcpy(reply.handle, request.handle, sizeof(reply.handle));
1528
1529                 if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) {
1530                         if ((request.from + len) > (OFFT_MAX)) {
1531                                 DEBUG("[Number too large!]");
1532                                 ERROR(client, reply, EINVAL);
1533                                 continue;
1534                         }
1535
1536                         if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
1537                                 DEBUG("[RANGE!]");
1538                                 ERROR(client, reply, EINVAL);
1539                                 continue;
1540                         }
1541
1542                         currlen = len;
1543                         if (currlen > BUFSIZE - sizeof(struct nbd_reply)) {
1544                                 currlen = BUFSIZE - sizeof(struct nbd_reply);
1545                                 msg2(LOG_INFO, "oversized request (this is not a problem)");
1546                         }
1547                 }
1548
1549                 switch (command) {
1550
1551                 case NBD_CMD_DISC:
1552                         msg2(LOG_INFO, "Disconnect request received.");
1553                         if (client->server->flags & F_COPYONWRITE) { 
1554                                 if (client->difmap) g_free(client->difmap) ;
1555                                 close(client->difffile);
1556                                 unlink(client->difffilename);
1557                                 free(client->difffilename);
1558                         }
1559                         go_on=FALSE;
1560                         continue;
1561
1562                 case NBD_CMD_WRITE:
1563                         DEBUG("wr: net->buf, ");
1564                         while(len > 0) {
1565                                 readit(client->net, buf, currlen);
1566                                 DEBUG("buf->exp, ");
1567                                 if ((client->server->flags & F_READONLY) ||
1568                                     (client->server->flags & F_AUTOREADONLY)) {
1569                                         DEBUG("[WRITE to READONLY!]");
1570                                         ERROR(client, reply, EPERM);
1571                                         consume(client->net, buf, len-currlen, BUFSIZE);
1572                                         continue;
1573                                 }
1574                                 if (expwrite(request.from, buf, currlen, client,
1575                                              request.type & NBD_CMD_FLAG_FUA)) {
1576                                         DEBUG("Write failed: %m" );
1577                                         ERROR(client, reply, errno);
1578                                         consume(client->net, buf, len-currlen, BUFSIZE);
1579                                         continue;
1580                                 }
1581                                 len -= currlen;
1582                                 request.from += currlen;
1583                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1584                         }
1585                         SEND(client->net, reply);
1586                         DEBUG("OK!\n");
1587                         continue;
1588
1589                 case NBD_CMD_FLUSH:
1590                         DEBUG("fl: ");
1591                         if (expflush(client)) {
1592                                 DEBUG("Flush failed: %m");
1593                                 ERROR(client, reply, errno);
1594                                 continue;
1595                         }
1596                         SEND(client->net, reply);
1597                         DEBUG("OK!\n");
1598                         continue;
1599
1600                 case NBD_CMD_READ:
1601                         DEBUG("exp->buf, ");
1602                         memcpy(buf, &reply, sizeof(struct nbd_reply));
1603                         if (client->transactionlogfd != -1)
1604                                 writeit(client->transactionlogfd, &reply, sizeof(reply));
1605                         p = buf + sizeof(struct nbd_reply);
1606                         writelen = currlen + sizeof(struct nbd_reply);
1607                         while(len > 0) {
1608                                 if (expread(request.from, p, currlen, client)) {
1609                                         DEBUG("Read failed: %m");
1610                                         ERROR(client, reply, errno);
1611                                         continue;
1612                                 }
1613                                 
1614                                 DEBUG("buf->net, ");
1615                                 writeit(client->net, buf, writelen);
1616                                 len -= currlen;
1617                                 request.from += currlen;
1618                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1619                                 p = buf;
1620                                 writelen = currlen;
1621                         }
1622                         DEBUG("OK!\n");
1623                         continue;
1624
1625                 default:
1626                         DEBUG ("Ignoring unknown command\n");
1627                         continue;
1628                 }
1629         }
1630         return 0;
1631 }
1632
1633 /**
1634  * Set up client export array, which is an array of FILE_INFO.
1635  * Also, split a single exportfile into multiple ones, if that was asked.
1636  * @param client information on the client which we want to setup export for
1637  **/
1638 void setupexport(CLIENT* client) {
1639         int i;
1640         off_t laststartoff = 0, lastsize = 0;
1641         int multifile = (client->server->flags & F_MULTIFILE);
1642
1643         client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
1644
1645         /* If multi-file, open as many files as we can.
1646          * If not, open exactly one file.
1647          * Calculate file sizes as we go to get total size. */
1648         for(i=0; ; i++) {
1649                 FILE_INFO fi;
1650                 gchar *tmpname;
1651                 gchar* error_string;
1652                 mode_t mode = (client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR;
1653
1654                 if(multifile) {
1655                         tmpname=g_strdup_printf("%s.%d", client->exportname, i);
1656                 } else {
1657                         tmpname=g_strdup(client->exportname);
1658                 }
1659                 DEBUG( "Opening %s\n", tmpname );
1660                 fi.fhandle = open(tmpname, mode);
1661                 if(fi.fhandle == -1 && mode == O_RDWR) {
1662                         /* Try again because maybe media was read-only */
1663                         fi.fhandle = open(tmpname, O_RDONLY);
1664                         if(fi.fhandle != -1) {
1665                                 /* Opening the base file in copyonwrite mode is
1666                                  * okay */
1667                                 if(!(client->server->flags & F_COPYONWRITE)) {
1668                                         client->server->flags |= F_AUTOREADONLY;
1669                                         client->server->flags |= F_READONLY;
1670                                 }
1671                         }
1672                 }
1673                 if(fi.fhandle == -1) {
1674                         if(multifile && i>0)
1675                                 break;
1676                         error_string=g_strdup_printf(
1677                                 "Could not open exported file %s: %%m",
1678                                 tmpname);
1679                         err(error_string);
1680                 }
1681                 fi.startoff = laststartoff + lastsize;
1682                 g_array_append_val(client->export, fi);
1683                 g_free(tmpname);
1684
1685                 /* Starting offset and size of this file will be used to
1686                  * calculate starting offset of next file */
1687                 laststartoff = fi.startoff;
1688                 lastsize = size_autodetect(fi.fhandle);
1689
1690                 if(!multifile)
1691                         break;
1692         }
1693
1694         /* Set export size to total calculated size */
1695         client->exportsize = laststartoff + lastsize;
1696
1697         /* Export size may be overridden */
1698         if(client->server->expected_size) {
1699                 /* desired size must be <= total calculated size */
1700                 if(client->server->expected_size > client->exportsize) {
1701                         err("Size of exported file is too big\n");
1702                 }
1703
1704                 client->exportsize = client->server->expected_size;
1705         }
1706
1707         msg3(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
1708         if(multifile) {
1709                 msg3(LOG_INFO, "Total number of files: %d", i);
1710         }
1711 }
1712
1713 int copyonwrite_prepare(CLIENT* client) {
1714         off_t i;
1715         if ((client->difffilename = malloc(1024))==NULL)
1716                 err("Failed to allocate string for diff file name");
1717         snprintf(client->difffilename, 1024, "%s-%s-%d.diff",client->exportname,client->clientname,
1718                 (int)getpid()) ;
1719         client->difffilename[1023]='\0';
1720         msg3(LOG_INFO,"About to create map and diff file %s",client->difffilename) ;
1721         client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
1722         if (client->difffile<0) err("Could not create diff file (%m)") ;
1723         if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL)
1724                 err("Could not allocate memory") ;
1725         for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1 ;
1726
1727         return 0;
1728 }
1729
1730 /**
1731  * Run a command. This is used for the ``prerun'' and ``postrun'' config file
1732  * options
1733  *
1734  * @param command the command to be ran. Read from the config file
1735  * @param file the file name we're about to export
1736  **/
1737 int do_run(gchar* command, gchar* file) {
1738         gchar* cmd;
1739         int retval=0;
1740
1741         if(command && *command) {
1742                 cmd = g_strdup_printf(command, file);
1743                 retval=system(cmd);
1744                 g_free(cmd);
1745         }
1746         return retval;
1747 }
1748
1749 /**
1750  * Serve a connection. 
1751  *
1752  * @todo allow for multithreading, perhaps use libevent. Not just yet, though;
1753  * follow the road map.
1754  *
1755  * @param client a connected client
1756  **/
1757 void serveconnection(CLIENT *client) {
1758         if (client->server->transactionlog && (client->transactionlogfd == -1))
1759         {
1760                 if (-1 == (client->transactionlogfd = open(client->server->transactionlog,
1761                                                            O_WRONLY | O_CREAT,
1762                                                            S_IRUSR | S_IWUSR)))
1763                         g_warning("Could not open transaction log %s",
1764                                   client->server->transactionlog);
1765         }
1766
1767         if(do_run(client->server->prerun, client->exportname)) {
1768                 exit(EXIT_FAILURE);
1769         }
1770         setupexport(client);
1771
1772         if (client->server->flags & F_COPYONWRITE) {
1773                 copyonwrite_prepare(client);
1774         }
1775
1776         setmysockopt(client->net);
1777
1778         mainloop(client);
1779         do_run(client->server->postrun, client->exportname);
1780
1781         if (-1 != client->transactionlogfd)
1782         {
1783                 close(client->transactionlogfd);
1784                 client->transactionlogfd = -1;
1785         }
1786 }
1787
1788 /**
1789  * Find the name of the file we have to serve. This will use g_strdup_printf
1790  * to put the IP address of the client inside a filename containing
1791  * "%s" (in the form as specified by the "virtstyle" option). That name
1792  * is then written to client->exportname.
1793  *
1794  * @param net A socket connected to an nbd client
1795  * @param client information about the client. The IP address in human-readable
1796  * format will be written to a new char* buffer, the address of which will be
1797  * stored in client->clientname.
1798  **/
1799 void set_peername(int net, CLIENT *client) {
1800         struct sockaddr_storage addrin;
1801         struct sockaddr_storage netaddr;
1802         struct sockaddr_in  *netaddr4 = NULL;
1803         struct sockaddr_in6 *netaddr6 = NULL;
1804         size_t addrinlen = sizeof( addrin );
1805         struct addrinfo hints;
1806         struct addrinfo *ai = NULL;
1807         char peername[NI_MAXHOST];
1808         char netname[NI_MAXHOST];
1809         char *tmp = NULL;
1810         int i;
1811         int e;
1812         int shift;
1813
1814         if (getpeername(net, (struct sockaddr *) &addrin, (socklen_t *)&addrinlen) < 0)
1815                 err("getsockname failed: %m");
1816
1817         getnameinfo((struct sockaddr *)&addrin, (socklen_t)addrinlen,
1818                 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST);
1819
1820         memset(&hints, '\0', sizeof (hints));
1821         hints.ai_flags = AI_ADDRCONFIG;
1822         e = getaddrinfo(peername, NULL, &hints, &ai);
1823
1824         if(e != 0) {
1825                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
1826                 freeaddrinfo(ai);
1827                 return;
1828         }
1829
1830         switch(client->server->virtstyle) {
1831                 case VIRT_NONE:
1832                         client->exportname=g_strdup(client->server->exportname);
1833                         break;
1834                 case VIRT_IPHASH:
1835                         for(i=0;i<strlen(peername);i++) {
1836                                 if(peername[i]=='.') {
1837                                         peername[i]='/';
1838                                 }
1839                         }
1840                 case VIRT_IPLIT:
1841                         client->exportname=g_strdup_printf(client->server->exportname, peername);
1842                         break;
1843                 case VIRT_CIDR:
1844                         memcpy(&netaddr, &addrin, addrinlen);
1845                         if(ai->ai_family == AF_INET) {
1846                                 netaddr4 = (struct sockaddr_in *)&netaddr;
1847                                 (netaddr4->sin_addr).s_addr>>=32-(client->server->cidrlen);
1848                                 (netaddr4->sin_addr).s_addr<<=32-(client->server->cidrlen);
1849
1850                                 getnameinfo((struct sockaddr *) netaddr4, (socklen_t) addrinlen,
1851                                                         netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1852                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1853                         }else if(ai->ai_family == AF_INET6) {
1854                                 netaddr6 = (struct sockaddr_in6 *)&netaddr;
1855
1856                                 shift = 128-(client->server->cidrlen);
1857                                 i = 3;
1858                                 while(shift >= 32) {
1859                                         ((netaddr6->sin6_addr).s6_addr32[i])=0;
1860                                         shift-=32;
1861                                         i--;
1862                                 }
1863                                 (netaddr6->sin6_addr).s6_addr32[i]>>=shift;
1864                                 (netaddr6->sin6_addr).s6_addr32[i]<<=shift;
1865
1866                                 getnameinfo((struct sockaddr *)netaddr6, (socklen_t)addrinlen,
1867                                             netname, sizeof(netname), NULL, 0, NI_NUMERICHOST);
1868                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1869                         }
1870
1871                         if(tmp != NULL)
1872                           client->exportname=g_strdup_printf(client->server->exportname, tmp);
1873
1874                         break;
1875         }
1876
1877         freeaddrinfo(ai);
1878         msg4(LOG_INFO, "connect from %s, assigned file is %s", 
1879              peername, client->exportname);
1880         client->clientname=g_strdup(peername);
1881 }
1882
1883 /**
1884  * Destroy a pid_t*
1885  * @param data a pointer to pid_t which should be freed
1886  **/
1887 void destroy_pid_t(gpointer data) {
1888         g_free(data);
1889 }
1890
1891 /**
1892  * Loop through the available servers, and serve them. Never returns.
1893  **/
1894 int serveloop(GArray* servers) {
1895         struct sockaddr_storage addrin;
1896         socklen_t addrinlen=sizeof(addrin);
1897         int i;
1898         int max;
1899         int sock;
1900         fd_set mset;
1901         fd_set rset;
1902
1903         /* 
1904          * Set up the master fd_set. The set of descriptors we need
1905          * to select() for never changes anyway and it buys us a *lot*
1906          * of time to only build this once. However, if we ever choose
1907          * to not fork() for clients anymore, we may have to revisit
1908          * this.
1909          */
1910         max=0;
1911         FD_ZERO(&mset);
1912         for(i=0;i<servers->len;i++) {
1913                 if((sock=(g_array_index(servers, SERVER, i)).socket)) {
1914                         FD_SET(sock, &mset);
1915                         max=sock>max?sock:max;
1916                 }
1917         }
1918         if(modernsock) {
1919                 FD_SET(modernsock, &mset);
1920                 max=modernsock>max?modernsock:max;
1921         }
1922         for(;;) {
1923                 CLIENT *client = NULL;
1924                 pid_t *pid;
1925
1926                 memcpy(&rset, &mset, sizeof(fd_set));
1927                 if(select(max+1, &rset, NULL, NULL, NULL)>0) {
1928                         int net = 0;
1929                         SERVER* serve=NULL;
1930
1931                         DEBUG("accept, ");
1932                         if(FD_ISSET(modernsock, &rset)) {
1933                                 if((net=accept(modernsock, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1934                                         err("accept: %m");
1935                                 client = negotiate(net, NULL, servers);
1936                                 if(!client) {
1937                                         err_nonfatal("negotiation failed");
1938                                         close(net);
1939                                         net=0;
1940                                         continue;
1941                                 }
1942                                 serve = client->server;
1943                         }
1944                         for(i=0;i<servers->len && !net;i++) {
1945                                 serve=&(g_array_index(servers, SERVER, i));
1946                                 if(FD_ISSET(serve->socket, &rset)) {
1947                                         if ((net=accept(serve->socket, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1948                                                 err("accept: %m");
1949                                 }
1950                         }
1951                         if(net) {
1952                                 int sock_flags;
1953
1954                                 if(serve->max_connections > 0 &&
1955                                    g_hash_table_size(children) >= serve->max_connections) {
1956                                         msg2(LOG_INFO, "Max connections reached");
1957                                         close(net);
1958                                         continue;
1959                                 }
1960                                 if((sock_flags = fcntl(net, F_GETFL, 0))==-1) {
1961                                         err("fcntl F_GETFL");
1962                                 }
1963                                 if(fcntl(net, F_SETFL, sock_flags &~O_NONBLOCK)==-1) {
1964                                         err("fcntl F_SETFL ~O_NONBLOCK");
1965                                 }
1966                                 if(!client) {
1967                                         client = g_new0(CLIENT, 1);
1968                                         client->server=serve;
1969                                         client->exportsize=OFFT_MAX;
1970                                         client->net=net;
1971                                         client->transactionlogfd = -1;
1972                                 }
1973                                 set_peername(net, client);
1974                                 if (!authorized_client(client)) {
1975                                         msg2(LOG_INFO,"Unauthorized client") ;
1976                                         close(net);
1977                                         continue;
1978                                 }
1979                                 msg2(LOG_INFO,"Authorized client") ;
1980                                 pid=g_malloc(sizeof(pid_t));
1981
1982                                 if (!dontfork) {
1983                                         if ((*pid=fork())<0) {
1984                                                 msg3(LOG_INFO,"Could not fork (%s)",strerror(errno)) ;
1985                                                 close(net);
1986                                                 continue;
1987                                         }
1988                                         if (*pid>0) { /* parent */
1989                                                 close(net);
1990                                                 g_hash_table_insert(children, pid, pid);
1991                                                 continue;
1992                                         }
1993                                         /* child */
1994                                         g_hash_table_destroy(children);
1995                                         for(i=0;i<servers->len;i++) {
1996                                                 serve=&g_array_index(servers, SERVER, i);
1997                                                 close(serve->socket);
1998                                         }
1999                                         /* FALSE does not free the
2000                                            actual data. This is required,
2001                                            because the client has a
2002                                            direct reference into that
2003                                            data, and otherwise we get a
2004                                            segfault... */
2005                                         g_array_free(servers, FALSE);
2006                                 }
2007
2008                                 msg2(LOG_INFO,"Starting to serve");
2009                                 serveconnection(client);
2010                                 exit(EXIT_SUCCESS);
2011                         }
2012                 }
2013         }
2014 }
2015
/**
 * Apply our standard options to a listening socket: SO_REUSEADDR,
 * SO_KEEPALIVE, and non-blocking mode. Any failure is reported through
 * err().
 *
 * @param socket the socket descriptor to configure
 **/
void dosockopts(int socket) {
#ifndef sun
	int yes=1;
#else
	char yes='1';
#endif /* sun */
	int sock_flags;

	/* lose the pesky "Address already in use" error message */
	/* The option length must match the declared type of `yes`, which
	 * differs between the two branches above. */
	if (setsockopt(socket,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(yes)) == -1) {
		err("setsockopt SO_REUSEADDR");
	}
	if (setsockopt(socket,SOL_SOCKET,SO_KEEPALIVE,&yes,sizeof(yes)) == -1) {
		err("setsockopt SO_KEEPALIVE");
	}

	/* make the listening socket non-blocking */
	if ((sock_flags = fcntl(socket, F_GETFL, 0)) == -1) {
		err("fcntl F_GETFL");
	}
	if (fcntl(socket, F_SETFL, sock_flags | O_NONBLOCK) == -1) {
		err("fcntl F_SETFL O_NONBLOCK");
	}
}
2040
2041 /**
2042  * Connect a server's socket.
2043  *
2044  * @param serve the server we want to connect.
2045  **/
2046 int setup_serve(SERVER *serve) {
2047         struct addrinfo hints;
2048         struct addrinfo *ai = NULL;
2049         gchar *port = NULL;
2050         int e;
2051
2052         if(!do_oldstyle) {
2053                 return serve->servename ? 1 : 0;
2054         }
2055         memset(&hints,'\0',sizeof(hints));
2056         hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG | AI_NUMERICSERV;
2057         hints.ai_socktype = SOCK_STREAM;
2058         hints.ai_family = serve->socket_family;
2059
2060         port = g_strdup_printf ("%d", serve->port);
2061         if (port == NULL)
2062                 return 0;
2063
2064         e = getaddrinfo(serve->listenaddr,port,&hints,&ai);
2065
2066         g_free(port);
2067
2068         if(e != 0) {
2069                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
2070                 serve->socket = -1;
2071                 freeaddrinfo(ai);
2072                 exit(EXIT_FAILURE);
2073         }
2074
2075         if(serve->socket_family == AF_UNSPEC)
2076                 serve->socket_family = ai->ai_family;
2077
2078 #ifdef WITH_SDP
2079         if ((serve->flags) && F_SDP) {
2080                 if (ai->ai_family == AF_INET)
2081                         ai->ai_family = AF_INET_SDP;
2082                 else (ai->ai_family == AF_INET6)
2083                         ai->ai_family = AF_INET6_SDP;
2084         }
2085 #endif
2086         if ((serve->socket = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) < 0)
2087                 err("socket: %m");
2088
2089         dosockopts(serve->socket);
2090
2091         DEBUG("Waiting for connections... bind, ");
2092         e = bind(serve->socket, ai->ai_addr, ai->ai_addrlen);
2093         if (e != 0 && errno != EADDRINUSE)
2094                 err("bind: %m");
2095         DEBUG("listen, ");
2096         if (listen(serve->socket, 1) < 0)
2097                 err("listen: %m");
2098
2099         freeaddrinfo (ai);
2100         if(serve->servename) {
2101                 return 1;
2102         } else {
2103                 return 0;
2104         }
2105 }
2106
/**
 * Open the listening socket used for new-style negotiation.
 *
 * Resolves modern_listen / modernport for TCP, creates the socket,
 * configures it with dosockopts(), binds it and starts listening with a
 * backlog of 10. The descriptor is stored in the global modernsock.
 * A resolver failure terminates the process; other failures are reported
 * through err().
 **/
void open_modern(void) {
	struct addrinfo hints;
	struct addrinfo* ai = NULL;
	int e;

	memset(&hints, '\0', sizeof(hints));
	hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_family = AF_UNSPEC;
	hints.ai_protocol = IPPROTO_TCP;
	e = getaddrinfo(modern_listen, modernport, &hints, &ai);
	if(e != 0) {
		fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
		exit(EXIT_FAILURE);
	}
	/* Only the first address returned by the resolver is used. */
	if((modernsock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol))<0) {
		err("socket: %m");
	}

	dosockopts(modernsock);

	if(bind(modernsock, ai->ai_addr, ai->ai_addrlen)) {
		err("bind: %m");
	}
	if(listen(modernsock, 10) <0) {
		err("listen: %m");
	}

	freeaddrinfo(ai);
}
2138
2139 /**
2140  * Connect our servers.
2141  **/
2142 void setup_servers(GArray* servers) {
2143         int i;
2144         struct sigaction sa;
2145         int want_modern=0;
2146
2147         for(i=0;i<servers->len;i++) {
2148                 want_modern |= setup_serve(&(g_array_index(servers, SERVER, i)));
2149         }
2150         if(want_modern) {
2151                 open_modern();
2152         }
2153         children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
2154
2155         sa.sa_handler = sigchld_handler;
2156         sigemptyset(&sa.sa_mask);
2157         sa.sa_flags = SA_RESTART;
2158         if(sigaction(SIGCHLD, &sa, NULL) == -1)
2159                 err("sigaction: %m");
2160         sa.sa_handler = sigterm_handler;
2161         sigemptyset(&sa.sa_mask);
2162         sa.sa_flags = SA_RESTART;
2163         if(sigaction(SIGTERM, &sa, NULL) == -1)
2164                 err("sigaction: %m");
2165 }
2166
2167 /**
2168  * Go daemon (unless we specified at compile time that we didn't want this)
2169  * @param serve the first server of our configuration. If its port is zero,
2170  *      then do not daemonize, because we're doing inetd then. This parameter
2171  *      is only used to create a PID file of the form
2172  *      /var/run/nbd-server.&lt;port&gt;.pid; it's not modified in any way.
2173  **/
2174 #if !defined(NODAEMON)
2175 void daemonize(SERVER* serve) {
2176         FILE*pidf;
2177
2178         if(serve && !(serve->port)) {
2179                 return;
2180         }
2181         if(daemon(0,0)<0) {
2182                 err("daemon");
2183         }
2184         if(!*pidftemplate) {
2185                 if(serve) {
2186                         strncpy(pidftemplate, "/var/run/nbd-server.%d.pid", 255);
2187                 } else {
2188                         strncpy(pidftemplate, "/var/run/nbd-server.pid", 255);
2189                 }
2190         }
2191         snprintf(pidfname, 255, pidftemplate, serve ? serve->port : 0);
2192         pidf=fopen(pidfname, "w");
2193         if(pidf) {
2194                 fprintf(pidf,"%d\n", (int)getpid());
2195                 fclose(pidf);
2196         } else {
2197                 perror("fopen");
2198                 fprintf(stderr, "Not fatal; continuing");
2199         }
2200 }
2201 #else
2202 #define daemonize(serve)
2203 #endif /* !defined(NODAEMON) */
2204
2205 /*
2206  * Everything beyond this point (in the file) is run in non-daemon mode.
2207  * The stuff above daemonize() isn't.
2208  */
2209
2210 void serve_err(SERVER* serve, const char* msg) G_GNUC_NORETURN;
2211
2212 void serve_err(SERVER* serve, const char* msg) {
2213         g_message("Export of %s on port %d failed:", serve->exportname,
2214                         serve->port);
2215         err(msg);
2216 }
2217
2218 /**
2219  * Set up user-ID and/or group-ID
2220  **/
2221 void dousers(void) {
2222         struct passwd *pw;
2223         struct group *gr;
2224         gchar* str;
2225         if(rungroup) {
2226                 gr=getgrnam(rungroup);
2227                 if(!gr) {
2228                         str = g_strdup_printf("Invalid group name: %s", rungroup);
2229                         err(str);
2230                 }
2231                 if(setgid(gr->gr_gid)<0) {
2232                         err("Could not set GID: %m"); 
2233                 }
2234         }
2235         if(runuser) {
2236                 pw=getpwnam(runuser);
2237                 if(!pw) {
2238                         str = g_strdup_printf("Invalid user name: %s", runuser);
2239                         err(str);
2240                 }
2241                 if(setuid(pw->pw_uid)<0) {
2242                         err("Could not set UID: %m");
2243                 }
2244         }
2245 }
2246
2247 #ifndef ISSERVER
2248 void glib_message_syslog_redirect(const gchar *log_domain,
2249                                   GLogLevelFlags log_level,
2250                                   const gchar *message,
2251                                   gpointer user_data)
2252 {
2253     int level=LOG_DEBUG;
2254     
2255     switch( log_level )
2256     {
2257       case G_LOG_FLAG_FATAL:
2258       case G_LOG_LEVEL_CRITICAL:
2259       case G_LOG_LEVEL_ERROR:    
2260         level=LOG_ERR; 
2261         break;
2262       case G_LOG_LEVEL_WARNING:
2263         level=LOG_WARNING;
2264         break;
2265       case G_LOG_LEVEL_MESSAGE:
2266       case G_LOG_LEVEL_INFO:
2267         level=LOG_INFO;
2268         break;
2269       case G_LOG_LEVEL_DEBUG:
2270         level=LOG_DEBUG;
2271       default:
2272         level=LOG_ERR;
2273     }
2274     syslog(level, "%s", message);
2275 }
2276 #endif
2277
2278 /**
2279  * Main entry point...
2280  **/
2281 int main(int argc, char *argv[]) {
2282         SERVER *serve;
2283         GArray *servers;
2284         GError *err=NULL;
2285
2286         if (sizeof( struct nbd_request )!=28) {
2287                 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
2288                 exit(EXIT_FAILURE) ;
2289         }
2290
2291         memset(pidftemplate, '\0', 256);
2292
2293         logging();
2294         config_file_pos = g_strdup(CFILE);
2295         serve=cmdline(argc, argv);
2296         servers = parse_cfile(config_file_pos, &err);
2297         
2298         if(serve) {
2299                 serve->socket_family = AF_UNSPEC;
2300
2301                 append_serve(serve, servers);
2302      
2303                 if (!(serve->port)) {
2304                         CLIENT *client;
2305 #ifndef ISSERVER
2306                         /* You really should define ISSERVER if you're going to use
2307                          * inetd mode, but if you don't, closing stdout and stderr
2308                          * (which inetd had connected to the client socket) will let it
2309                          * work. */
2310                         close(1);
2311                         close(2);
2312                         open("/dev/null", O_WRONLY);
2313                         open("/dev/null", O_WRONLY);
2314                         g_log_set_default_handler( glib_message_syslog_redirect, NULL );
2315 #endif
2316                         client=g_malloc(sizeof(CLIENT));
2317                         client->server=serve;
2318                         client->net=0;
2319                         client->exportsize=OFFT_MAX;
2320                         set_peername(0,client);
2321                         serveconnection(client);
2322                         return 0;
2323                 }
2324         }
2325     
2326         if(!servers || !servers->len) {
2327                 if(err && !(err->domain == g_quark_from_string("parse_cfile")
2328                                 && err->code == CFILE_NOTFOUND)) {
2329                         g_warning("Could not parse config file: %s", 
2330                                         err ? err->message : "Unknown error");
2331                 }
2332         }
2333         if(serve) {
2334                 g_warning("Specifying an export on the command line is deprecated.");
2335                 g_warning("Please use a configuration file instead.");
2336         }
2337
2338         if((!serve) && (!servers||!servers->len)) {
2339                 g_message("No configured exports; quitting.");
2340                 exit(EXIT_FAILURE);
2341         }
2342         if (!dontfork)
2343                 daemonize(serve);
2344         setup_servers(servers);
2345         dousers();
2346         serveloop(servers);
2347         return 0 ;
2348 }