e733c3ef5ab9fdf9f8ee40605ac15c51cd7ecd3b
[nbd.git] / nbd-server.c
1 /*
2  * Network Block Device - server
3  *
4  * Copyright 1996-1998 Pavel Machek, distribute under GPL
5  *  <pavel@atrey.karlin.mff.cuni.cz>
6  * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7  * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
8  *
9  * Version 1.0 - hopefully 64-bit-clean
10  * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11  * Version 1.2 - autodetect size of block devices, thanx to Peter T. Breuer" <ptb@it.uc3m.es>
12  * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13  *      type, or don't have 64 bit file offsets by defining FS_32BIT
14  *      in compile options for nbd-server *only*. This can be done
15  *      with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16  *      original autoconf input file, or I would make it a configure
17  *      option.) Ken Yap <ken@nlc.net.au>.
18  * Version 1.6 - fix autodetection of block device size and really make 64 bit
19  *      clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20  * Version 2.0 - Version synchronised with client
21  * Version 2.1 - Reap zombie client processes when they exit. Removed
22  *      (uncommented) the _IO magic, it's no longer necessary. Wouter
23  *      Verhelst <wouter@debian.org>
24  * Version 2.2 - Auto switch to read-only mode (useful for floppies).
25  * Version 2.3 - Fixed code so that Large File Support works. This
26  *      removes the FS_32BIT compile-time directive; define
27  *      _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28  *      using FS_32BIT. This will allow you to use files >2GB instead of
29  *      having to use the -m option. Wouter Verhelst <wouter@debian.org>
30  * Version 2.4 - Added code to keep track of children, so that we can
31  *      properly kill them from initscripts. Add a call to daemon(),
32  *      so that processes don't think they have to wait for us, which is
33  *      interesting for initscripts as well. Wouter Verhelst
34  *      <wouter@debian.org>
35  * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36  *      zero after fork()ing, resulting in nbd-server going berserk
37  *      when it receives a signal with at least one child open. Wouter
38  *      Verhelst <wouter@debian.org>
39  * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40  *      rectified type of mainloop::size_host (sf.net bugs 814435 and
41  *      817385); close the PID file after writing to it, so that the
42  *      daemon can actually be found. Wouter Verhelst
43  *      <wouter@debian.org>
44  * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45  *      correctly put in network endianness. Many types were corrected
46  *      (size_t and off_t instead of int).  <vspaceg@sourceforge.net>
47  * Version 2.6 - Some code cleanup.
48  * Version 2.7 - Better build system.
49  * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a 
50  *      lot more work, but this is a start. Wouter Verhelst
51  *      <wouter@debian.org>
52  * 16/03/2010 - Add IPv6 support.
53  *      Kitt Tientanopajai <kitt@kitty.in.th>
54  *      Neutron Soutmun <neo.neutron@gmail.com>
55  *      Suriya Soutmun <darksolar@gmail.com>
56  */
57
58 /* Includes LFS defines, which defines behaviours of some of the following
59  * headers, so must come before those */
60 #include "lfs.h"
61
62 #include <sys/types.h>
63 #include <sys/socket.h>
64 #include <sys/stat.h>
65 #include <sys/select.h>         /* select */
66 #include <sys/wait.h>           /* wait */
67 #ifdef HAVE_SYS_IOCTL_H
68 #include <sys/ioctl.h>
69 #endif
70 #include <sys/param.h>
71 #ifdef HAVE_SYS_MOUNT_H
72 #include <sys/mount.h>          /* For BLKGETSIZE */
73 #endif
74 #include <signal.h>             /* sigaction */
75 #include <errno.h>
76 #include <netinet/tcp.h>
77 #include <netinet/in.h>
78 #include <netdb.h>
79 #include <syslog.h>
80 #include <unistd.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 #include <fcntl.h>
85 #include <arpa/inet.h>
86 #include <strings.h>
87 #include <dirent.h>
88 #include <unistd.h>
89 #include <getopt.h>
90 #include <pwd.h>
91 #include <grp.h>
92
93 #include <glib.h>
94
95 /* used in cliserv.h, so must come first */
96 #define MY_NAME "nbd_server"
97 #include "cliserv.h"
98
99 #ifdef WITH_SDP
100 #include <sdp_inet.h>
101 #endif
102
/**
 * Default position of the config file. SYSCONFDIR can be supplied from the
 * outside (e.g. by the build system); when it is not, "/etc" is used.
 */
#if !defined(SYSCONFDIR)
#define SYSCONFDIR "/etc"
#endif
#define CFILE SYSCONFDIR "/nbd-server/config"
108
109 /** Where our config file actually is */
110 gchar* config_file_pos;
111
112 /** What user we're running as */
113 gchar* runuser=NULL;
114 /** What group we're running as */
115 gchar* rungroup=NULL;
116 /** whether to export using the old negotiation protocol (port-based) */
117 gboolean do_oldstyle=FALSE;
118
119 /* Whether we should avoid forking */
120 int dontfork = 0;
121
/**
 * Logging macros. The first argument is a syslog priority; it is only
 * honoured when ISSERVER is defined. Without ISSERVER the priority is dropped
 * and the message is handed to g_message() instead of syslog().
 */
#ifndef ISSERVER
#define msg2(a,b) g_message(b)
#define msg3(a,b,c) g_message(b,c)
#define msg4(a,b,c,d) g_message(b,c,d)
#else
#define msg2(a,b) syslog(a,b)
#define msg3(a,b,c) syslog(a,b,c)
#define msg4(a,b,c,d) syslog(a,b,c,d)
#endif
132
/*
 * Debugging macros: when DODBG is defined, DEBUG() prints through printf();
 * otherwise every DEBUG() invocation expands to nothing.
 */
#if defined(DODBG)
#define DEBUG(...) printf(__VA_ARGS__)
#else
#define DEBUG(...)
#endif

/* Fall back to an empty version string when none was provided. */
#if !defined(PACKAGE_VERSION)
#define PACKAGE_VERSION ""
#endif
/**
 * The highest value a variable of type off_t can reach. off_t is a signed
 * integer, so this is "all bits set except for the leftmost one".
 *
 * The value is built in unsigned arithmetic and then converted: shifting a
 * signed 1 into the sign bit (as a plain (off_t)1<<N would) is undefined
 * behaviour. The expansion is fully parenthesised so it can be used inside
 * any expression. Assumes off_t is no wider than unsigned long long.
 **/
#define OFFT_MAX ((off_t)((1ULL << (sizeof(off_t) * 8 - 1)) - 1))
#define LINELEN 256       /**< Size of static buffer used to read the
			       authorization file (yuck) */
#define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
#define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
#define F_READONLY 1      /**< flag to tell us a file is readonly */
#define F_MULTIFILE 2     /**< flag to tell us a file is exported using -m */
#define F_COPYONWRITE 4   /**< flag to tell us a file is exported using
			    copyonwrite */
#define F_AUTOREADONLY 8  /**< flag to tell us a file is set to autoreadonly */
#define F_SPARSE 16       /**< flag to tell us copyonwrite should use a sparse file */
#define F_SDP 32          /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */
#define F_SYNC 64         /**< Whether to fsync() after a write */
#define F_FLUSH 128       /**< Whether server wants FLUSH to be sent by the client */
#define F_FUA 256         /**< Whether server wants FUA to be sent by the client */
#define F_ROTATIONAL 512  /**< Whether server wants the client to implement the elevator algorithm */
163 GHashTable *children;
164 char pidfname[256]; /**< name of our PID file */
165 char pidftemplate[256]; /**< template to be used for the filename of the PID file */
166 char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
167
168 #define NEG_INIT        (1 << 0)
169 #define NEG_OLD         (1 << 1)
170 #define NEG_MODERN      (1 << 2)
171
172 int modernsock=0;         /**< Socket for the modern handler. Not used
173                                if a client was only specified on the
174                                command line; only port used if
175                                oldstyle is set to false (and then the
176                                command-line client isn't used, gna gna) */
177 char* modern_listen;      /**< listenaddr value for modernsock */
178 char* modernport=NBD_DEFAULT_PORT; /**< Port number on which to listen for
179                                       new-style nbd-client connections */
180
/**
 * Types of virtualization
 **/
typedef enum {
	/** No virtualization */
	VIRT_NONE = 0,
	/** Literal IP address as part of the filename */
	VIRT_IPLIT,
	/** Replacing all dots in an ip address by a / before doing the same
	 *  as in IPLIT */
	VIRT_IPHASH,
	/** Every subnet in its own directory */
	VIRT_CIDR,
} VIRT_STYLE;
191
192 /**
193  * Variables associated with a server.
194  **/
195 typedef struct {
196         gchar* exportname;    /**< (unprocessed) filename of the file we're exporting */
197         off_t expected_size; /**< size of the exported file as it was told to
198                                us through configuration */
199         gchar* listenaddr;   /**< The IP address we're listening on */
200         unsigned int port;   /**< port we're exporting this file at */
201         char* authname;      /**< filename of the authorization file */
202         int flags;           /**< flags associated with this exported file */
203         int socket;          /**< The socket of this server. */
204         int socket_family;   /**< family of the socket */
205         VIRT_STYLE virtstyle;/**< The style of virtualization, if any */
206         uint8_t cidrlen;     /**< The length of the mask when we use
207                                   CIDR-style virtualization */
208         gchar* prerun;       /**< command to be ran after connecting a client,
209                                   but before starting to serve */
210         gchar* postrun;      /**< command that will be ran after the client
211                                   disconnects */
212         gchar* servename;    /**< name of the export as selected by nbd-client */
213         int max_connections; /**< maximum number of opened connections */
214         gchar* transactionlog;/**< filename for transaction log */
215 } SERVER;
216
/**
 * One exported file: its file descriptor together with the offset at which
 * this file starts.
 **/
typedef struct {
	/** file descriptor */
	int fhandle;
	/** starting offset of this file */
	off_t startoff;
} FILE_INFO;
224
225 typedef struct {
226         off_t exportsize;    /**< size of the file we're exporting */
227         char *clientname;    /**< peer */
228         char *exportname;    /**< (processed) filename of the file we're exporting */
229         GArray *export;    /**< array of FILE_INFO of exported files;
230                                array size is always 1 unless we're
231                                doing the multiple file option */
232         int net;             /**< The actual client socket */
233         SERVER *server;      /**< The server this client is getting data from */
234         char* difffilename;  /**< filename of the copy-on-write file, if any */
235         int difffile;        /**< filedescriptor of copyonwrite file. @todo
236                                shouldn't this be an array too? (cfr export) Or
237                                make -m and -c mutually exclusive */
238         u32 difffilelen;     /**< number of pages in difffile */
239         u32 *difmap;         /**< see comment on the global difmap for this one */
240         gboolean modern;     /**< client was negotiated using modern negotiation protocol */
241         int transactionlogfd;/**< fd for transaction log */
242 } CLIENT;
243
/**
 * Type of configuration file values
 **/
typedef enum {
	/** This parameter is an integer */
	PARAM_INT,
	/** This parameter is a string */
	PARAM_STRING,
	/** This parameter is a boolean */
	PARAM_BOOL,
} PARAM_TYPE;
252
253 /**
254  * Configuration file values
255  **/
256 typedef struct {
257         gchar *paramname;       /**< Name of the parameter, as it appears in
258                                   the config file */
259         gboolean required;      /**< Whether this is a required (as opposed to
260                                   optional) parameter */
261         PARAM_TYPE ptype;       /**< Type of the parameter. */
262         gpointer target;        /**< Pointer to where the data of this
263                                   parameter should be written. If ptype is
264                                   PARAM_BOOL, the data is or'ed rather than
265                                   overwritten. */
266         gint flagval;           /**< Flag mask for this parameter in case ptype
267                                   is PARAM_BOOL. */
268 } PARAM;
269
270 /**
271  * Translate a command name into human readable form
272  *
273  * @param command The command number (after applying NBD_CMD_MASK_COMMAND)
274  * @return pointer to the command name
275  **/
276 static inline const char * getcommandname(uint64_t command) {
277         switch (command) {
278         case NBD_CMD_READ:
279                 return "NBD_CMD_READ";
280         case NBD_CMD_WRITE:
281                 return "NBD_CMD_WRITE";
282         case NBD_CMD_DISC:
283                 return "NBD_CMD_DISC";
284         case NBD_CMD_FLUSH:
285                 return "NBD_CMD_FLUSH";
286         default:
287                 break;
288         }
289         return "UNKNOWN";
290 }
291
292 /**
293  * Check whether a client is allowed to connect. Works with an authorization
294  * file which contains one line per machine, no wildcards.
295  *
296  * @param opts The client who's trying to connect.
297  * @return 0 - authorization refused, 1 - OK
298  **/
299 int authorized_client(CLIENT *opts) {
300         const char *ERRMSG="Invalid entry '%s' in authfile '%s', so, refusing all connections.";
301         FILE *f ;
302         char line[LINELEN]; 
303         char *tmp;
304         struct in_addr addr;
305         struct in_addr client;
306         struct in_addr cltemp;
307         int len;
308
309         if ((f=fopen(opts->server->authname,"r"))==NULL) {
310                 msg4(LOG_INFO,"Can't open authorization file %s (%s).",
311                      opts->server->authname,strerror(errno)) ;
312                 return 1 ; 
313         }
314   
315         inet_aton(opts->clientname, &client);
316         while (fgets(line,LINELEN,f)!=NULL) {
317                 if((tmp=index(line, '/'))) {
318                         if(strlen(line)<=tmp-line) {
319                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
320                                 return 0;
321                         }
322                         *(tmp++)=0;
323                         if(!inet_aton(line,&addr)) {
324                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
325                                 return 0;
326                         }
327                         len=strtol(tmp, NULL, 0);
328                         addr.s_addr>>=32-len;
329                         addr.s_addr<<=32-len;
330                         memcpy(&cltemp,&client,sizeof(client));
331                         cltemp.s_addr>>=32-len;
332                         cltemp.s_addr<<=32-len;
333                         if(addr.s_addr == cltemp.s_addr) {
334                                 return 1;
335                         }
336                 }
337                 if (strncmp(line,opts->clientname,strlen(opts->clientname))==0) {
338                         fclose(f);
339                         return 1;
340                 }
341         }
342         fclose(f);
343         return 0;
344 }
345
/**
 * Read data from a file descriptor into a buffer. Keeps reading until exactly
 * len bytes have been stored.
 *
 * read() failing with EAGAIN or EINTR is retried. Any other failure, and
 * end-of-file before len bytes arrived, is reported through err().
 * NOTE(review): err() is declared outside this part of the file; it is
 * presumably fatal (does not return) -- confirm.
 *
 * @param f a file descriptor
 * @param buf a buffer
 * @param len the number of bytes to be read
 **/
static inline void readit(int f, void *buf, size_t len) {
	/* Arithmetic on void* is a GNU extension; walk a char* instead. */
	char *dst = buf;

	while (len > 0) {
		ssize_t res;

		DEBUG("*");
		res = read(f, dst, len);
		if (res > 0) {
			len -= (size_t)res;
			dst += res;
		} else if (res == 0) {
			/* End of file: errno is not set by read() here, so it
			 * must not be consulted. */
			err("Read failed: End of file");
		} else if (errno != EAGAIN && errno != EINTR) {
			err("Read failed: %m");
		}
	}
}
367
/**
 * Read and throw away data from a file descriptor. The data is pulled in
 * through readit(), at most bufsiz bytes per call, each call overwriting the
 * start of buf.
 *
 * @param f a file descriptor
 * @param buf a scratch buffer
 * @param len the number of bytes to consume
 * @param bufsiz the size of the buffer
 **/
static inline void consume(int f, void * buf, size_t len, size_t bufsiz) {
	size_t remaining;
	size_t chunk;

	for (remaining = len; remaining > 0; remaining -= chunk) {
		chunk = remaining;
		if (chunk > bufsiz) {
			chunk = bufsiz;
		}
		readit(f, buf, chunk);
	}
}
384
385
/**
 * Write data from a buffer into a filedescriptor. Keeps writing until all
 * len bytes have been handed to write().
 *
 * write() failing with EAGAIN or EINTR is retried (matching readit()). Any
 * other failure, or write() returning 0, is reported through err().
 * NOTE(review): err() is declared outside this part of the file; it is
 * presumably fatal (does not return) -- confirm.
 *
 * @param f a file descriptor
 * @param buf a buffer containing data
 * @param len the number of bytes to be written
 **/
static inline void writeit(int f, void *buf, size_t len) {
	/* Arithmetic on void* is a GNU extension; walk a char* instead. */
	const char *src = buf;

	while (len > 0) {
		ssize_t res;

		DEBUG("+");
		res = write(f, src, len);
		if (res > 0) {
			len -= (size_t)res;
			src += res;
		} else if (res < 0 && (errno == EAGAIN || errno == EINTR)) {
			continue;
		} else {
			/* Never adjust len/src with a non-positive result. */
			err("Send failed: %m");
		}
	}
}
403
404 /**
405  * Print out a message about how to use nbd-server. Split out to a separate
406  * function so that we can call it from multiple places
407  */
408 void usage() {
409         printf("This is nbd-server version " VERSION "\n");
410         printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections]\n"
411                "\t-r|--read-only\t\tread only\n"
412                "\t-m|--multi-file\t\tmultiple file\n"
413                "\t-c|--copy-on-write\tcopy on write\n"
414                "\t-C|--config-file\tspecify an alternate configuration file\n"
415                "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
416                "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
417                "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
418                "\t-M|--max-connections\tspecify the maximum number of opened connections\n\n"
419                "\tif port is set to 0, stdin is used (for running from inetd).\n"
420                "\tif file_to_export contains '%%s', it is substituted with the IP\n"
421                "\t\taddress of the machine trying to connect\n" 
422                "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
423         printf("Using configuration file %s\n", CFILE);
424 }
425
426 /* Dumps a config file section of the given SERVER*, and exits. */
427 void dump_section(SERVER* serve, gchar* section_header) {
428         printf("[%s]\n", section_header);
429         printf("\texportname = %s\n", serve->exportname);
430         printf("\tlistenaddr = %s\n", serve->listenaddr);
431         printf("\tport = %d\n", serve->port);
432         if(serve->flags & F_READONLY) {
433                 printf("\treadonly = true\n");
434         }
435         if(serve->flags & F_MULTIFILE) {
436                 printf("\tmultifile = true\n");
437         }
438         if(serve->flags & F_COPYONWRITE) {
439                 printf("\tcopyonwrite = true\n");
440         }
441         if(serve->expected_size) {
442                 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
443         }
444         if(serve->authname) {
445                 printf("\tauthfile = %s\n", serve->authname);
446         }
447         exit(EXIT_SUCCESS);
448 }
449
450 /**
451  * Parse the command line.
452  *
453  * @param argc the argc argument to main()
454  * @param argv the argv argument to main()
455  **/
456 SERVER* cmdline(int argc, char *argv[]) {
457         int i=0;
458         int nonspecial=0;
459         int c;
460         struct option long_options[] = {
461                 {"read-only", no_argument, NULL, 'r'},
462                 {"multi-file", no_argument, NULL, 'm'},
463                 {"copy-on-write", no_argument, NULL, 'c'},
464                 {"dont-fork", no_argument, NULL, 'd'},
465                 {"authorize-file", required_argument, NULL, 'l'},
466                 {"config-file", required_argument, NULL, 'C'},
467                 {"pid-file", required_argument, NULL, 'p'},
468                 {"output-config", required_argument, NULL, 'o'},
469                 {"max-connection", required_argument, NULL, 'M'},
470                 {0,0,0,0}
471         };
472         SERVER *serve;
473         off_t es;
474         size_t last;
475         char suffix;
476         gboolean do_output=FALSE;
477         gchar* section_header="";
478         gchar** addr_port;
479
480         if(argc==1) {
481                 return NULL;
482         }
483         serve=g_new0(SERVER, 1);
484         serve->authname = g_strdup(default_authname);
485         serve->virtstyle=VIRT_IPLIT;
486         while((c=getopt_long(argc, argv, "-C:cdl:mo:rp:M:", long_options, &i))>=0) {
487                 switch (c) {
488                 case 1:
489                         /* non-option argument */
490                         switch(nonspecial++) {
491                         case 0:
492                                 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
493                                         addr_port=g_strsplit(optarg, ":", 2);
494
495                                         /* Check for "@" - maybe user using this separator
496                                                  for IPv4 address */
497                                         if(!addr_port[1]) {
498                                                 g_strfreev(addr_port);
499                                                 addr_port=g_strsplit(optarg, "@", 2);
500                                         }
501                                 } else {
502                                         addr_port=g_strsplit(optarg, "@", 2);
503                                 }
504
505                                 if(addr_port[1]) {
506                                         serve->port=strtol(addr_port[1], NULL, 0);
507                                         serve->listenaddr=g_strdup(addr_port[0]);
508                                 } else {
509                                         serve->listenaddr=NULL;
510                                         serve->port=strtol(addr_port[0], NULL, 0);
511                                 }
512                                 g_strfreev(addr_port);
513                                 break;
514                         case 1:
515                                 serve->exportname = g_strdup(optarg);
516                                 if(serve->exportname[0] != '/') {
517                                         fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
518                                         exit(EXIT_FAILURE);
519                                 }
520                                 break;
521                         case 2:
522                                 last=strlen(optarg)-1;
523                                 suffix=optarg[last];
524                                 if (suffix == 'k' || suffix == 'K' ||
525                                     suffix == 'm' || suffix == 'M')
526                                         optarg[last] = '\0';
527                                 es = (off_t)atoll(optarg);
528                                 switch (suffix) {
529                                         case 'm':
530                                         case 'M':  es <<= 10;
531                                         case 'k':
532                                         case 'K':  es <<= 10;
533                                         default :  break;
534                                 }
535                                 serve->expected_size = es;
536                                 break;
537                         }
538                         break;
539                 case 'r':
540                         serve->flags |= F_READONLY;
541                         break;
542                 case 'm':
543                         serve->flags |= F_MULTIFILE;
544                         break;
545                 case 'o':
546                         do_output = TRUE;
547                         section_header = g_strdup(optarg);
548                         break;
549                 case 'p':
550                         strncpy(pidftemplate, optarg, 256);
551                         break;
552                 case 'c': 
553                         serve->flags |=F_COPYONWRITE;
554                         break;
555                 case 'd': 
556                         dontfork = 1;
557                         break;
558                 case 'C':
559                         g_free(config_file_pos);
560                         config_file_pos=g_strdup(optarg);
561                         break;
562                 case 'l':
563                         g_free(serve->authname);
564                         serve->authname=g_strdup(optarg);
565                         break;
566                 case 'M':
567                         serve->max_connections = strtol(optarg, NULL, 0);
568                         break;
569                 default:
570                         usage();
571                         exit(EXIT_FAILURE);
572                         break;
573                 }
574         }
575         /* What's left: the port to export, the name of the to be exported
576          * file, and, optionally, the size of the file, in that order. */
577         if(nonspecial<2) {
578                 g_free(serve);
579                 serve=NULL;
580         } else {
581                 do_oldstyle = TRUE;
582         }
583         if(do_output) {
584                 if(!serve) {
585                         g_critical("Need a complete configuration on the command line to output a config file section!");
586                         exit(EXIT_FAILURE);
587                 }
588                 dump_section(serve, section_header);
589         }
590         return serve;
591 }
592
/**
 * Error codes for config file parsing
 **/
typedef enum {
	/** The configuration file is not found */
	CFILE_NOTFOUND,
	/** The (required) group "generic" is missing */
	CFILE_MISSING_GENERIC,
	/** A (required) key is missing */
	CFILE_KEY_MISSING,
	/** A value is syntactically invalid */
	CFILE_VALUE_INVALID,
	/** A value is not supported in this build */
	CFILE_VALUE_UNSUPPORTED,
	/** Programmer error */
	CFILE_PROGERR,
	/** A config file was specified that does not define any exports */
	CFILE_NO_EXPORTS,
	/** The reserved port was specified for an old-style export. */
	CFILE_INCORRECT_PORT,
} CFILE_ERRORS;
608
609 /**
610  * Remove a SERVER from memory. Used from the hash table
611  **/
612 void remove_server(gpointer s) {
613         SERVER *server;
614
615         server=(SERVER*)s;
616         g_free(server->exportname);
617         if(server->authname)
618                 g_free(server->authname);
619         if(server->listenaddr)
620                 g_free(server->listenaddr);
621         if(server->prerun)
622                 g_free(server->prerun);
623         if(server->postrun)
624                 g_free(server->postrun);
625         if(server->transactionlog)
626                 g_free(server->transactionlog);
627         g_free(server);
628 }
629
630 /**
631  * duplicate server
632  * @param s the old server we want to duplicate
633  * @return new duplicated server
634  **/
635 SERVER* dup_serve(SERVER *s) {
636         SERVER *serve = NULL;
637
638         serve=g_new0(SERVER, 1);
639         if(serve == NULL)
640                 return NULL;
641
642         if(s->exportname)
643                 serve->exportname = g_strdup(s->exportname);
644
645         serve->expected_size = s->expected_size;
646
647         if(s->listenaddr)
648                 serve->listenaddr = g_strdup(s->listenaddr);
649
650         serve->port = s->port;
651
652         if(s->authname)
653                 serve->authname = strdup(s->authname);
654
655         serve->flags = s->flags;
656         serve->socket = s->socket;
657         serve->socket_family = s->socket_family;
658         serve->virtstyle = s->virtstyle;
659         serve->cidrlen = s->cidrlen;
660
661         if(s->prerun)
662                 serve->prerun = g_strdup(s->prerun);
663
664         if(s->postrun)
665                 serve->postrun = g_strdup(s->postrun);
666
667         if(s->transactionlog)
668                 serve->transactionlog = g_strdup(s->transactionlog);
669         
670         if(s->servename)
671                 serve->servename = g_strdup(s->servename);
672
673         serve->max_connections = s->max_connections;
674
675         return serve;
676 }
677
678 /**
679  * append new server to array
680  * @param s server
681  * @param a server array
682  * @return 0 success, -1 error
683  */
684 int append_serve(SERVER *s, GArray *a) {
685         SERVER *ns = NULL;
686         struct addrinfo hints;
687         struct addrinfo *ai = NULL;
688         struct addrinfo *rp = NULL;
689         char   host[NI_MAXHOST];
690         gchar  *port = NULL;
691         int e;
692         int ret;
693
694         if(!s) {
695                 err("Invalid parsing server");
696                 return -1;
697         }
698
699         port = g_strdup_printf("%d", s->port);
700
701         memset(&hints,'\0',sizeof(hints));
702         hints.ai_family = AF_UNSPEC;
703         hints.ai_socktype = SOCK_STREAM;
704         hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE;
705         hints.ai_protocol = IPPROTO_TCP;
706
707         e = getaddrinfo(s->listenaddr, port, &hints, &ai);
708
709         if (port)
710                 g_free(port);
711
712         if(e == 0) {
713                 for (rp = ai; rp != NULL; rp = rp->ai_next) {
714                         e = getnameinfo(rp->ai_addr, rp->ai_addrlen, host, sizeof(host), NULL, 0, NI_NUMERICHOST);
715
716                         if (e != 0) { // error
717                                 fprintf(stderr, "getnameinfo: %s\n", gai_strerror(e));
718                                 continue;
719                         }
720
721                         // duplicate server and set listenaddr to resolved IP address
722                         ns = dup_serve (s);
723                         if (ns) {
724                                 ns->listenaddr = g_strdup(host);
725                                 ns->socket_family = rp->ai_family;
726                                 g_array_append_val(a, *ns);
727                                 free(ns);
728                                 ns = NULL;
729                         }
730                 }
731
732                 ret = 0;
733         } else {
734                 fprintf(stderr, "getaddrinfo failed on listen host/address: %s (%s)\n", s->listenaddr ? s->listenaddr : "any", gai_strerror(e));
735                 ret = -1;
736         }
737
738         if (ai)
739                 freeaddrinfo(ai);
740
741         return ret;
742 }
743
744 /**
745  * Parse the config file.
746  *
747  * @param f the name of the config file
748  * @param e a GError. @see CFILE_ERRORS for what error values this function can
749  *      return.
750  * @return a Array of SERVER* pointers, If the config file is empty or does not
751  *      exist, returns an empty GHashTable; if the config file contains an
752  *      error, returns NULL, and e is set appropriately
753  **/
754 GArray* parse_cfile(gchar* f, GError** e) {
755         const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
756         const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
757         SERVER s;
758         gchar *virtstyle=NULL;
759         PARAM lp[] = {
760                 { "exportname", TRUE,   PARAM_STRING,   &(s.exportname),        0 },
761                 { "port",       TRUE,   PARAM_INT,      &(s.port),              0 },
762                 { "authfile",   FALSE,  PARAM_STRING,   &(s.authname),          0 },
763                 { "filesize",   FALSE,  PARAM_INT,      &(s.expected_size),     0 },
764                 { "virtstyle",  FALSE,  PARAM_STRING,   &(virtstyle),           0 },
765                 { "prerun",     FALSE,  PARAM_STRING,   &(s.prerun),            0 },
766                 { "postrun",    FALSE,  PARAM_STRING,   &(s.postrun),           0 },
767                 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog),   0 },
768                 { "readonly",   FALSE,  PARAM_BOOL,     &(s.flags),             F_READONLY },
769                 { "multifile",  FALSE,  PARAM_BOOL,     &(s.flags),             F_MULTIFILE },
770                 { "copyonwrite", FALSE, PARAM_BOOL,     &(s.flags),             F_COPYONWRITE },
771                 { "sparse_cow", FALSE,  PARAM_BOOL,     &(s.flags),             F_SPARSE },
772                 { "sdp",        FALSE,  PARAM_BOOL,     &(s.flags),             F_SDP },
773                 { "sync",       FALSE,  PARAM_BOOL,     &(s.flags),             F_SYNC },
774                 { "flush",      FALSE,  PARAM_BOOL,     &(s.flags),             F_FLUSH },
775                 { "fua",        FALSE,  PARAM_BOOL,     &(s.flags),             F_FUA },
776                 { "rotational", FALSE,  PARAM_BOOL,     &(s.flags),             F_ROTATIONAL },
777                 { "listenaddr", FALSE,  PARAM_STRING,   &(s.listenaddr),        0 },
778                 { "maxconnections", FALSE, PARAM_INT,   &(s.max_connections),   0 },
779         };
780         const int lp_size=sizeof(lp)/sizeof(PARAM);
781         PARAM gp[] = {
782                 { "user",       FALSE, PARAM_STRING,    &runuser,       0 },
783                 { "group",      FALSE, PARAM_STRING,    &rungroup,      0 },
784                 { "oldstyle",   FALSE, PARAM_BOOL,      &do_oldstyle,   1 },
785                 { "listenaddr", FALSE, PARAM_STRING,    &modern_listen, 0 },
786                 { "port",       FALSE, PARAM_STRING,    &modernport,    0 },
787         };
788         PARAM* p=gp;
789         int p_size=sizeof(gp)/sizeof(PARAM);
790         GKeyFile *cfile;
791         GError *err = NULL;
792         const char *err_msg=NULL;
793         GQuark errdomain;
794         GArray *retval=NULL;
795         gchar **groups;
796         gboolean value;
797         gchar* startgroup;
798         gint i;
799         gint j;
800
801         errdomain = g_quark_from_string("parse_cfile");
802         cfile = g_key_file_new();
803         retval = g_array_new(FALSE, TRUE, sizeof(SERVER));
804         if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
805                         G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
806                 g_set_error(e, errdomain, CFILE_NOTFOUND, "Could not open config file %s.", f);
807                 g_key_file_free(cfile);
808                 return retval;
809         }
810         startgroup = g_key_file_get_start_group(cfile);
811         if(!startgroup || strcmp(startgroup, "generic")) {
812                 g_set_error(e, errdomain, CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
813                 g_key_file_free(cfile);
814                 return NULL;
815         }
816         groups = g_key_file_get_groups(cfile, NULL);
817         for(i=0;groups[i];i++) {
818                 memset(&s, '\0', sizeof(SERVER));
819
820                 /* After the [generic] group, start parsing exports */
821                 if(i==1) {
822                         p=lp;
823                         p_size=lp_size;
824                 } 
825                 for(j=0;j<p_size;j++) {
826                         g_assert(p[j].target != NULL);
827                         g_assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL);
828                         switch(p[j].ptype) {
829                                 case PARAM_INT:
830                                         *((gint*)p[j].target) =
831                                                 g_key_file_get_integer(cfile,
832                                                                 groups[i],
833                                                                 p[j].paramname,
834                                                                 &err);
835                                         break;
836                                 case PARAM_STRING:
837                                         *((gchar**)p[j].target) =
838                                                 g_key_file_get_string(cfile,
839                                                                 groups[i],
840                                                                 p[j].paramname,
841                                                                 &err);
842                                         break;
843                                 case PARAM_BOOL:
844                                         value = g_key_file_get_boolean(cfile,
845                                                         groups[i],
846                                                         p[j].paramname, &err);
847                                         if(!err) {
848                                                 if(value) {
849                                                         *((gint*)p[j].target) |= p[j].flagval;
850                                                 } else {
851                                                         *((gint*)p[j].target) &= ~(p[j].flagval);
852                                                 }
853                                         }
854                                         break;
855                         }
856                         if(!strcmp(p[j].paramname, "port") && !strcmp(p[j].target, modernport)) {
857                                 g_set_error(e, errdomain, CFILE_INCORRECT_PORT, "Config file specifies new-style port for oldstyle export");
858                                 g_key_file_free(cfile);
859                                 return NULL;
860                         }
861                         if(err) {
862                                 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
863                                         if(!p[j].required) {
864                                                 /* Ignore not-found error for optional values */
865                                                 g_clear_error(&err);
866                                                 continue;
867                                         } else {
868                                                 err_msg = MISSING_REQUIRED_ERROR;
869                                         }
870                                 } else {
871                                         err_msg = DEFAULT_ERROR;
872                                 }
873                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
874                                 g_array_free(retval, TRUE);
875                                 g_error_free(err);
876                                 g_key_file_free(cfile);
877                                 return NULL;
878                         }
879                 }
880                 if(virtstyle) {
881                         if(!strncmp(virtstyle, "none", 4)) {
882                                 s.virtstyle=VIRT_NONE;
883                         } else if(!strncmp(virtstyle, "ipliteral", 9)) {
884                                 s.virtstyle=VIRT_IPLIT;
885                         } else if(!strncmp(virtstyle, "iphash", 6)) {
886                                 s.virtstyle=VIRT_IPHASH;
887                         } else if(!strncmp(virtstyle, "cidrhash", 8)) {
888                                 s.virtstyle=VIRT_CIDR;
889                                 if(strlen(virtstyle)<10) {
890                                         g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
891                                         g_array_free(retval, TRUE);
892                                         g_key_file_free(cfile);
893                                         return NULL;
894                                 }
895                                 s.cidrlen=strtol(virtstyle+8, NULL, 0);
896                         } else {
897                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
898                                 g_array_free(retval, TRUE);
899                                 g_key_file_free(cfile);
900                                 return NULL;
901                         }
902                         if(s.port && !do_oldstyle) {
903                                 g_warning("A port was specified, but oldstyle exports were not requested. This may not do what you expect.");
904                                 g_warning("Please read 'man 5 nbd-server' and search for oldstyle for more info");
905                         }
906                 } else {
907                         s.virtstyle=VIRT_IPLIT;
908                 }
909                 /* Don't need to free this, it's not our string */
910                 virtstyle=NULL;
911                 /* Don't append values for the [generic] group */
912                 if(i>0) {
913                         s.socket_family = AF_UNSPEC;
914                         s.servename = groups[i];
915
916                         append_serve(&s, retval);
917                 } else {
918                         if(!do_oldstyle) {
919                                 lp[1].required = 0;
920                         }
921                 }
922 #ifndef WITH_SDP
923                 if(s.flags & F_SDP) {
924                         g_set_error(e, errdomain, CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
925                         g_array_free(retval, TRUE);
926                         g_key_file_free(cfile);
927                         return NULL;
928                 }
929 #endif
930         }
931         if(i==1) {
932                 g_set_error(e, errdomain, CFILE_NO_EXPORTS, "The config file does not specify any exports");
933         }
934         g_key_file_free(cfile);
935         return retval;
936 }
937
938 /**
939  * Signal handler for SIGCHLD
940  * @param s the signal we're handling (must be SIGCHLD, or something
941  * is severely wrong)
942  **/
943 void sigchld_handler(int s) {
944         int status;
945         int* i;
946         pid_t pid;
947
948         while((pid=waitpid(-1, &status, WNOHANG)) > 0) {
949                 if(WIFEXITED(status)) {
950                         msg3(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
951                 }
952                 i=g_hash_table_lookup(children, &pid);
953                 if(!i) {
954                         msg3(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
955                 } else {
956                         DEBUG("Removing %d from the list of children", pid);
957                         g_hash_table_remove(children, &pid);
958                 }
959         }
960 }
961
962 /**
963  * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
964  *
965  * @param key the key
966  * @param value the value corresponding to the above key
967  * @param user_data a pointer which we always set to 1, so that we know what
968  * will happen next.
969  **/
970 void killchild(gpointer key, gpointer value, gpointer user_data) {
971         pid_t *pid=value;
972         int *parent=user_data;
973
974         kill(*pid, SIGTERM);
975         *parent=1;
976 }
977
978 /**
979  * Handle SIGTERM and dispatch it to our children
980  * @param s the signal we're handling (must be SIGTERM, or something
981  * is severely wrong).
982  **/
983 void sigterm_handler(int s) {
984         int parent=0;
985
986         g_hash_table_foreach(children, killchild, &parent);
987
988         if(parent) {
989                 unlink(pidfname);
990         }
991
992         exit(EXIT_SUCCESS);
993 }
994
995 /**
996  * Detect the size of a file.
997  *
998  * @param fhandle An open filedescriptor
999  * @return the size of the file, or OFFT_MAX if detection was
1000  * impossible.
1001  **/
1002 off_t size_autodetect(int fhandle) {
1003         off_t es;
1004         u64 bytes;
1005         struct stat stat_buf;
1006         int error;
1007
1008 #ifdef HAVE_SYS_MOUNT_H
1009 #ifdef HAVE_SYS_IOCTL_H
1010 #ifdef BLKGETSIZE64
1011         DEBUG("looking for export size with ioctl BLKGETSIZE64\n");
1012         if (!ioctl(fhandle, BLKGETSIZE64, &bytes) && bytes) {
1013                 return (off_t)bytes;
1014         }
1015 #endif /* BLKGETSIZE64 */
1016 #endif /* HAVE_SYS_IOCTL_H */
1017 #endif /* HAVE_SYS_MOUNT_H */
1018
1019         DEBUG("looking for fhandle size with fstat\n");
1020         stat_buf.st_size = 0;
1021         error = fstat(fhandle, &stat_buf);
1022         if (!error) {
1023                 if(stat_buf.st_size > 0)
1024                         return (off_t)stat_buf.st_size;
1025         } else {
1026                 err("fstat failed: %m");
1027         }
1028
1029         DEBUG("looking for fhandle size with lseek SEEK_END\n");
1030         es = lseek(fhandle, (off_t)0, SEEK_END);
1031         if (es > ((off_t)0)) {
1032                 return es;
1033         } else {
1034                 DEBUG("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4)));
1035         }
1036
1037         err("Could not find size of exported block device: %m");
1038         return OFFT_MAX;
1039 }
1040
1041 /**
1042  * Get the file handle and offset, given an export offset.
1043  *
1044  * @param export An array of export files
1045  * @param a The offset to get corresponding file/offset for
1046  * @param fhandle [out] File descriptor
1047  * @param foffset [out] Offset into fhandle
1048  * @param maxbytes [out] Tells how many bytes can be read/written
1049  * from fhandle starting at foffset (0 if there is no limit)
1050  * @return 0 on success, -1 on failure
1051  **/
1052 int get_filepos(GArray* export, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1053         /* Negative offset not allowed */
1054         if(a < 0)
1055                 return -1;
1056
1057         /* Binary search for last file with starting offset <= a */
1058         FILE_INFO fi;
1059         int start = 0;
1060         int end = export->len - 1;
1061         while( start <= end ) {
1062                 int mid = (start + end) / 2;
1063                 fi = g_array_index(export, FILE_INFO, mid);
1064                 if( fi.startoff < a ) {
1065                         start = mid + 1;
1066                 } else if( fi.startoff > a ) {
1067                         end = mid - 1;
1068                 } else {
1069                         start = end = mid;
1070                         break;
1071                 }
1072         }
1073
1074         /* end should never go negative, since first startoff is 0 and a >= 0 */
1075         g_assert(end >= 0);
1076
1077         fi = g_array_index(export, FILE_INFO, end);
1078         *fhandle = fi.fhandle;
1079         *foffset = a - fi.startoff;
1080         *maxbytes = 0;
1081         if( end+1 < export->len ) {
1082                 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1083                 *maxbytes = fi_next.startoff - a;
1084         }
1085
1086         return 0;
1087 }
1088
/**
 * Position a file descriptor at an absolute offset; a failing lseek() is
 * reported through err().
 * @param handle a filedescriptor
 * @param a position to seek to
 * @todo get rid of this; lastpoint is a global variable right now, but it
 * shouldn't be. If we pass it on as a parameter, that makes things a *lot*
 * easier.
 **/
void myseek(int handle,off_t a) {
        const off_t reached = lseek(handle, a, SEEK_SET);

        if (reached < 0) {
                err("Can not seek locally!\n");
        }
}
1102
1103 /**
1104  * Write an amount of bytes at a given offset to the right file. This
1105  * abstracts the write-side of the multiple file option.
1106  *
1107  * @param a The offset where the write should start
1108  * @param buf The buffer to write from
1109  * @param len The length of buf
1110  * @param client The client we're serving for
1111  * @param fua Flag to indicate 'Force Unit Access'
1112  * @return The number of bytes actually written, or -1 in case of an error
1113  **/
1114 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1115         int fhandle;
1116         off_t foffset;
1117         size_t maxbytes;
1118         ssize_t retval;
1119
1120         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1121                 return -1;
1122         if(maxbytes && len > maxbytes)
1123                 len = maxbytes;
1124
1125         DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
1126
1127         myseek(fhandle, foffset);
1128         retval = write(fhandle, buf, len);
1129         if(client->server->flags & F_SYNC) {
1130                 fsync(fhandle);
1131         } else if (fua) {
1132
1133           /* This is where we would do the following
1134            *   #ifdef USE_SYNC_FILE_RANGE
1135            * However, we don't, for the reasons set out below
1136            * by Christoph Hellwig <hch@infradead.org>
1137            *
1138            * [BEGINS] 
1139            * fdatasync is equivalent to fsync except that it does not flush
1140            * non-essential metadata (basically just timestamps in practice), but it
1141            * does flush metadata requried to find the data again, e.g. allocation
1142            * information and extent maps.  sync_file_range does nothing but flush
1143            * out pagecache content - it means you basically won't get your data
1144            * back in case of a crash if you either:
1145            * 
1146            *  a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1147            *  b) are using a sparse file on a filesystem
1148            *  c) are using a fallocate-preallocated file on a filesystem
1149            *  d) use any file on a COW filesystem like btrfs
1150            * 
1151            * e.g. it only does anything useful for you if you do not have a volatile
1152            * write cache, and either use a raw block device node, or just overwrite
1153            * an already fully allocated (and not preallocated) file on a non-COW
1154            * filesystem.
1155            * [ENDS]
1156            *
1157            * What we should do is open a second FD with O_DSYNC set, then write to
1158            * that when appropriate. However, with a Linux client, every REQ_FUA
1159            * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1160            * problems.
1161            *
1162            */
1163 #if 0
1164                 sync_file_range(fhandle, foffset, len,
1165                                 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1166                                 SYNC_FILE_RANGE_WAIT_AFTER);
1167 #else
1168                 fdatasync(fhandle);
1169 #endif
1170         }
1171         return retval;
1172 }
1173
1174 /**
1175  * Call rawexpwrite repeatedly until all data has been written.
1176  *
1177  * @param a The offset where the write should start
1178  * @param buf The buffer to write from
1179  * @param len The length of buf
1180  * @param client The client we're serving for
1181  * @param fua Flag to indicate 'Force Unit Access'
1182  * @return 0 on success, nonzero on failure
1183  **/
1184 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1185         ssize_t ret=0;
1186
1187         while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
1188                 a += ret;
1189                 buf += ret;
1190                 len -= ret;
1191         }
1192         return (ret < 0 || len != 0);
1193 }
1194
1195 /**
1196  * Read an amount of bytes at a given offset from the right file. This
1197  * abstracts the read-side of the multiple files option.
1198  *
1199  * @param a The offset where the read should start
1200  * @param buf A buffer to read into
1201  * @param len The size of buf
1202  * @param client The client we're serving for
1203  * @return The number of bytes actually read, or -1 in case of an
1204  * error.
1205  **/
1206 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
1207         int fhandle;
1208         off_t foffset;
1209         size_t maxbytes;
1210
1211         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1212                 return -1;
1213         if(maxbytes && len > maxbytes)
1214                 len = maxbytes;
1215
1216         DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
1217
1218         myseek(fhandle, foffset);
1219         return read(fhandle, buf, len);
1220 }
1221
1222 /**
1223  * Call rawexpread repeatedly until all data has been read.
1224  * @return 0 on success, nonzero on failure
1225  **/
1226 int rawexpread_fully(off_t a, char *buf, size_t len, CLIENT *client) {
1227         ssize_t ret=0;
1228
1229         while(len > 0 && (ret=rawexpread(a, buf, len, client)) > 0 ) {
1230                 a += ret;
1231                 buf += ret;
1232                 len -= ret;
1233         }
1234         return (ret < 0 || len != 0);
1235 }
1236
1237 /**
1238  * Read an amount of bytes at a given offset from the right file. This
1239  * abstracts the read-side of the copyonwrite stuff, and calls
1240  * rawexpread() with the right parameters to do the actual work.
1241  * @param a The offset where the read should start
1242  * @param buf A buffer to read into
1243  * @param len The size of buf
1244  * @param client The client we're going to read for
1245  * @return 0 on success, nonzero on failure
1246  **/
1247 int expread(off_t a, char *buf, size_t len, CLIENT *client) {
1248         off_t rdlen, offset;
1249         off_t mapcnt, mapl, maph, pagestart;
1250
1251         if (!(client->server->flags & F_COPYONWRITE))
1252                 return(rawexpread_fully(a, buf, len, client));
1253         DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1254
1255         mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1256
1257         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1258                 pagestart=mapcnt*DIFFPAGESIZE;
1259                 offset=a-pagestart;
1260                 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1261                         len : (size_t)DIFFPAGESIZE-offset;
1262                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1263                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1264                                (unsigned long)(client->difmap[mapcnt]));
1265                         myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1266                         if (read(client->difffile, buf, rdlen) != rdlen) return -1;
1267                 } else { /* the block is not there */
1268                         DEBUG("Page %llu is not here, we read the original one\n",
1269                                (unsigned long long)mapcnt);
1270                         if(rawexpread_fully(a, buf, rdlen, client)) return -1;
1271                 }
1272                 len-=rdlen; a+=rdlen; buf+=rdlen;
1273         }
1274         return 0;
1275 }
1276
1277 /**
1278  * Write an amount of bytes at a given offset to the right file. This
1279  * abstracts the write-side of the copyonwrite option, and calls
1280  * rawexpwrite() with the right parameters to do the actual work.
1281  *
1282  * @param a The offset where the write should start
1283  * @param buf The buffer to write from
1284  * @param len The length of buf
1285  * @param client The client we're going to write for.
1286  * @param fua Flag to indicate 'Force Unit Access'
1287  * @return 0 on success, nonzero on failure
1288  **/
1289 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1290         char pagebuf[DIFFPAGESIZE];
1291         off_t mapcnt,mapl,maph;
1292         off_t wrlen,rdlen; 
1293         off_t pagestart;
1294         off_t offset;
1295
1296         if (!(client->server->flags & F_COPYONWRITE))
1297                 return(rawexpwrite_fully(a, buf, len, client, fua)); 
1298         DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1299
1300         mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1301
1302         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1303                 pagestart=mapcnt*DIFFPAGESIZE ;
1304                 offset=a-pagestart ;
1305                 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1306                         len : (size_t)DIFFPAGESIZE-offset;
1307
1308                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1309                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1310                                (unsigned long)(client->difmap[mapcnt])) ;
1311                         myseek(client->difffile,
1312                                         client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1313                         if (write(client->difffile, buf, wrlen) != wrlen) return -1 ;
1314                 } else { /* the block is not there */
1315                         myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ;
1316                         client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1317                         DEBUG("Page %llu is not here, we put it at %lu\n",
1318                                (unsigned long long)mapcnt,
1319                                (unsigned long)(client->difmap[mapcnt]));
1320                         rdlen=DIFFPAGESIZE ;
1321                         if (rawexpread_fully(pagestart, pagebuf, rdlen, client))
1322                                 return -1;
1323                         memcpy(pagebuf+offset,buf,wrlen) ;
1324                         if (write(client->difffile, pagebuf, DIFFPAGESIZE) !=
1325                                         DIFFPAGESIZE)
1326                                 return -1;
1327                 }                                                   
1328                 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
1329         }
1330         if (client->server->flags & F_SYNC) {
1331                 fsync(client->difffile);
1332         } else if (fua) {
1333                 /* open question: would it be cheaper to do multiple sync_file_ranges?
1334                    as we iterate through the above?
1335                  */
1336                 fdatasync(client->difffile);
1337         }
1338         return 0;
1339 }
1340
1341 /**
1342  * Flush data to a client
1343  *
1344  * @param client The client we're going to write for.
1345  * @return 0 on success, nonzero on failure
1346  **/
1347 int expflush(CLIENT *client) {
1348         gint i;
1349
1350         if (client->server->flags & F_COPYONWRITE) {
1351                 return fsync(client->difffile);
1352         }
1353         
1354         for (i = 0; i < client->export->len; i++) {
1355                 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1356                 if (fsync(fi.fhandle) < 0)
1357                         return -1;
1358         }
1359         
1360         return 0;
1361 }
1362
1363 /**
1364  * Do the initial negotiation.
1365  *
1366  * @param client The client we're negotiating with.
1367  **/
1368 CLIENT* negotiate(int net, CLIENT *client, GArray* servers, int phase) {
1369         char zeros[128];
1370         uint64_t size_host;
1371         uint32_t flags = NBD_FLAG_HAS_FLAGS;
1372         uint16_t smallflags = 0;
1373         uint64_t magic;
1374
1375         memset(zeros, '\0', sizeof(zeros));
1376         if(phase & NEG_INIT) {
1377                 /* common */
1378                 if (write(net, INIT_PASSWD, 8) < 0) {
1379                         err_nonfatal("Negotiation failed: %m");
1380                         if(client)
1381                                 exit(EXIT_FAILURE);
1382                 }
1383                 if(phase & NEG_MODERN) {
1384                         /* modern */
1385                         magic = htonll(opts_magic);
1386                 } else {
1387                         /* oldstyle */
1388                         magic = htonll(cliserv_magic);
1389                 }
1390                 if (write(net, &magic, sizeof(magic)) < 0) {
1391                         err_nonfatal("Negotiation failed: %m");
1392                         if(client)
1393                                 exit(EXIT_FAILURE);
1394                 }
1395         }
1396         if(phase & NEG_MODERN) {
1397                 /* modern */
1398                 uint32_t reserved;
1399                 uint32_t opt;
1400                 uint32_t namelen;
1401                 char* name;
1402                 int i;
1403
1404                 if(!servers)
1405                         err("programmer error");
1406                 if (write(net, &smallflags, sizeof(uint16_t)) < 0)
1407                         err("Negotiation failed: %m");
1408                 if (read(net, &reserved, sizeof(reserved)) < 0)
1409                         err("Negotiation failed: %m");
1410                 if (read(net, &magic, sizeof(magic)) < 0)
1411                         err("Negotiation failed: %m");
1412                 magic = ntohll(magic);
1413                 if(magic != opts_magic) {
1414                         close(net);
1415                         return NULL;
1416                 }
1417                 if (read(net, &opt, sizeof(opt)) < 0)
1418                         err("Negotiation failed: %m");
1419                 opt = ntohl(opt);
1420                 if(opt != NBD_OPT_EXPORT_NAME) {
1421                         close(net);
1422                         return NULL;
1423                 }
1424                 if (read(net, &namelen, sizeof(namelen)) < 0)
1425                         err("Negotiation failed: %m");
1426                 namelen = ntohl(namelen);
1427                 name = malloc(namelen+1);
1428                 name[namelen]=0;
1429                 if (read(net, name, namelen) < 0)
1430                         err("Negotiation failed: %m");
1431                 for(i=0; i<servers->len; i++) {
1432                         SERVER* serve = &(g_array_index(servers, SERVER, i));
1433                         if(!strcmp(serve->servename, name)) {
1434                                 CLIENT* client = g_new0(CLIENT, 1);
1435                                 client->server = serve;
1436                                 client->exportsize = OFFT_MAX;
1437                                 client->net = net;
1438                                 client->modern = TRUE;
1439                                 client->transactionlogfd = -1;
1440                                 free(name);
1441                                 return client;
1442                         }
1443                 }
1444                 free(name);
1445                 return NULL;
1446         }
1447         /* common */
1448         size_host = htonll((u64)(client->exportsize));
1449         if (write(net, &size_host, 8) < 0)
1450                 err("Negotiation failed: %m");
1451         if (client->server->flags & F_READONLY)
1452                 flags |= NBD_FLAG_READ_ONLY;
1453         if (client->server->flags & F_FLUSH)
1454                 flags |= NBD_FLAG_SEND_FLUSH;
1455         if (client->server->flags & F_FUA)
1456                 flags |= NBD_FLAG_SEND_FUA;
1457         if (client->server->flags & F_ROTATIONAL)
1458                 flags |= NBD_FLAG_ROTATIONAL;
1459         if (phase & NEG_OLD) {
1460                 /* oldstyle */
1461                 flags = htonl(flags);
1462                 if (write(client->net, &flags, 4) < 0)
1463                         err("Negotiation failed: %m");
1464         } else {
1465                 /* modern */
1466                 smallflags = (uint16_t)(flags & ~((uint16_t)0));
1467                 smallflags = htons(smallflags);
1468                 if (write(client->net, &smallflags, sizeof(smallflags)) < 0) {
1469                         err("Negotiation failed: %m");
1470                 }
1471         }
1472         /* common */
1473         if (write(client->net, zeros, 124) < 0)
1474                 err("Negotiation failed: %m");
1475         return NULL;
1476 }
1477
/**
 * Sending macro: write `reply` to `net` and, when a transaction log is open,
 * mirror it there as well.
 *
 * Relies on a variable named `client` being in scope at the point of use.
 * Wrapped in do { } while (0) so that it behaves as a single statement, e.g.
 * as the unbraced body of an if/else.
 */
#define SEND(net,reply) do { \
	writeit((net), &(reply), sizeof(reply)); \
	if (client->transactionlogfd != -1) \
		writeit(client->transactionlogfd, &(reply), sizeof(reply)); \
} while (0)
/**
 * Error macro: send `reply` with its error field set to `errcode` (converted
 * to network byte order), then reset the field to 0 so the structure can be
 * reused for the next reply.
 */
#define ERROR(client,reply,errcode) do { \
	(reply).error = htonl(errcode); \
	SEND((client)->net, reply); \
	(reply).error = 0; \
} while (0)
1484 /**
1485  * Serve a file to a single client.
1486  *
1487  * @todo This beast needs to be split up in many tiny little manageable
1488  * pieces. Preferably with a chainsaw.
1489  *
1490  * @param client The client we're going to serve to.
1491  * @return when the client disconnects
1492  **/
1493 int mainloop(CLIENT *client) {
1494         struct nbd_request request;
1495         struct nbd_reply reply;
1496         gboolean go_on=TRUE;
1497 #ifdef DODBG
1498         int i = 0;
1499 #endif
1500         negotiate(client->net, client, NULL, client->modern ? NEG_MODERN : (NEG_OLD | NEG_INIT));
1501         DEBUG("Entering request loop!\n");
1502         reply.magic = htonl(NBD_REPLY_MAGIC);
1503         reply.error = 0;
1504         while (go_on) {
1505                 char buf[BUFSIZE];
1506                 char* p;
1507                 size_t len;
1508                 size_t currlen;
1509                 size_t writelen;
1510                 uint16_t command;
1511 #ifdef DODBG
1512                 i++;
1513                 printf("%d: ", i);
1514 #endif
1515                 readit(client->net, &request, sizeof(request));
1516                 if (client->transactionlogfd != -1)
1517                         writeit(client->transactionlogfd, &request, sizeof(request));
1518
1519                 request.from = ntohll(request.from);
1520                 request.type = ntohl(request.type);
1521                 command = request.type & NBD_CMD_MASK_COMMAND;
1522                 len = ntohl(request.len);
1523
1524                 DEBUG("%s from %llu (%llu) len %d, ", getcommandname(command),
1525                                 (unsigned long long)request.from,
1526                                 (unsigned long long)request.from / 512, (unsigned int)len);
1527
1528                 if (request.magic != htonl(NBD_REQUEST_MAGIC))
1529                         err("Not enough magic.");
1530
1531                 memcpy(reply.handle, request.handle, sizeof(reply.handle));
1532
1533                 if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) {
1534                         if ((request.from + len) > (OFFT_MAX)) {
1535                                 DEBUG("[Number too large!]");
1536                                 ERROR(client, reply, EINVAL);
1537                                 continue;
1538                         }
1539
1540                         if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
1541                                 DEBUG("[RANGE!]");
1542                                 ERROR(client, reply, EINVAL);
1543                                 continue;
1544                         }
1545
1546                         currlen = len;
1547                         if (currlen > BUFSIZE - sizeof(struct nbd_reply)) {
1548                                 currlen = BUFSIZE - sizeof(struct nbd_reply);
1549                                 msg2(LOG_INFO, "oversized request (this is not a problem)");
1550                         }
1551                 }
1552
1553                 switch (command) {
1554
1555                 case NBD_CMD_DISC:
1556                         msg2(LOG_INFO, "Disconnect request received.");
1557                         if (client->server->flags & F_COPYONWRITE) { 
1558                                 if (client->difmap) g_free(client->difmap) ;
1559                                 close(client->difffile);
1560                                 unlink(client->difffilename);
1561                                 free(client->difffilename);
1562                         }
1563                         go_on=FALSE;
1564                         continue;
1565
1566                 case NBD_CMD_WRITE:
1567                         DEBUG("wr: net->buf, ");
1568                         while(len > 0) {
1569                                 readit(client->net, buf, currlen);
1570                                 DEBUG("buf->exp, ");
1571                                 if ((client->server->flags & F_READONLY) ||
1572                                     (client->server->flags & F_AUTOREADONLY)) {
1573                                         DEBUG("[WRITE to READONLY!]");
1574                                         ERROR(client, reply, EPERM);
1575                                         consume(client->net, buf, len-currlen, BUFSIZE);
1576                                         continue;
1577                                 }
1578                                 if (expwrite(request.from, buf, currlen, client,
1579                                              request.type & NBD_CMD_FLAG_FUA)) {
1580                                         DEBUG("Write failed: %m" );
1581                                         ERROR(client, reply, errno);
1582                                         consume(client->net, buf, len-currlen, BUFSIZE);
1583                                         continue;
1584                                 }
1585                                 len -= currlen;
1586                                 request.from += currlen;
1587                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1588                         }
1589                         SEND(client->net, reply);
1590                         DEBUG("OK!\n");
1591                         continue;
1592
1593                 case NBD_CMD_FLUSH:
1594                         DEBUG("fl: ");
1595                         if (expflush(client)) {
1596                                 DEBUG("Flush failed: %m");
1597                                 ERROR(client, reply, errno);
1598                                 continue;
1599                         }
1600                         SEND(client->net, reply);
1601                         DEBUG("OK!\n");
1602                         continue;
1603
1604                 case NBD_CMD_READ:
1605                         DEBUG("exp->buf, ");
1606                         memcpy(buf, &reply, sizeof(struct nbd_reply));
1607                         if (client->transactionlogfd != -1)
1608                                 writeit(client->transactionlogfd, &reply, sizeof(reply));
1609                         p = buf + sizeof(struct nbd_reply);
1610                         writelen = currlen + sizeof(struct nbd_reply);
1611                         while(len > 0) {
1612                                 if (expread(request.from, p, currlen, client)) {
1613                                         DEBUG("Read failed: %m");
1614                                         ERROR(client, reply, errno);
1615                                         continue;
1616                                 }
1617                                 
1618                                 DEBUG("buf->net, ");
1619                                 writeit(client->net, buf, writelen);
1620                                 len -= currlen;
1621                                 request.from += currlen;
1622                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1623                                 p = buf;
1624                                 writelen = currlen;
1625                         }
1626                         DEBUG("OK!\n");
1627                         continue;
1628
1629                 default:
1630                         DEBUG ("Ignoring unknown command\n");
1631                         continue;
1632                 }
1633         }
1634         return 0;
1635 }
1636
1637 /**
1638  * Set up client export array, which is an array of FILE_INFO.
1639  * Also, split a single exportfile into multiple ones, if that was asked.
1640  * @param client information on the client which we want to setup export for
1641  **/
1642 void setupexport(CLIENT* client) {
1643         int i;
1644         off_t laststartoff = 0, lastsize = 0;
1645         int multifile = (client->server->flags & F_MULTIFILE);
1646
1647         client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
1648
1649         /* If multi-file, open as many files as we can.
1650          * If not, open exactly one file.
1651          * Calculate file sizes as we go to get total size. */
1652         for(i=0; ; i++) {
1653                 FILE_INFO fi;
1654                 gchar *tmpname;
1655                 gchar* error_string;
1656                 mode_t mode = (client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR;
1657
1658                 if(multifile) {
1659                         tmpname=g_strdup_printf("%s.%d", client->exportname, i);
1660                 } else {
1661                         tmpname=g_strdup(client->exportname);
1662                 }
1663                 DEBUG( "Opening %s\n", tmpname );
1664                 fi.fhandle = open(tmpname, mode);
1665                 if(fi.fhandle == -1 && mode == O_RDWR) {
1666                         /* Try again because maybe media was read-only */
1667                         fi.fhandle = open(tmpname, O_RDONLY);
1668                         if(fi.fhandle != -1) {
1669                                 /* Opening the base file in copyonwrite mode is
1670                                  * okay */
1671                                 if(!(client->server->flags & F_COPYONWRITE)) {
1672                                         client->server->flags |= F_AUTOREADONLY;
1673                                         client->server->flags |= F_READONLY;
1674                                 }
1675                         }
1676                 }
1677                 if(fi.fhandle == -1) {
1678                         if(multifile && i>0)
1679                                 break;
1680                         error_string=g_strdup_printf(
1681                                 "Could not open exported file %s: %%m",
1682                                 tmpname);
1683                         err(error_string);
1684                 }
1685                 fi.startoff = laststartoff + lastsize;
1686                 g_array_append_val(client->export, fi);
1687                 g_free(tmpname);
1688
1689                 /* Starting offset and size of this file will be used to
1690                  * calculate starting offset of next file */
1691                 laststartoff = fi.startoff;
1692                 lastsize = size_autodetect(fi.fhandle);
1693
1694                 if(!multifile)
1695                         break;
1696         }
1697
1698         /* Set export size to total calculated size */
1699         client->exportsize = laststartoff + lastsize;
1700
1701         /* Export size may be overridden */
1702         if(client->server->expected_size) {
1703                 /* desired size must be <= total calculated size */
1704                 if(client->server->expected_size > client->exportsize) {
1705                         err("Size of exported file is too big\n");
1706                 }
1707
1708                 client->exportsize = client->server->expected_size;
1709         }
1710
1711         msg3(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
1712         if(multifile) {
1713                 msg3(LOG_INFO, "Total number of files: %d", i);
1714         }
1715 }
1716
1717 int copyonwrite_prepare(CLIENT* client) {
1718         off_t i;
1719         if ((client->difffilename = malloc(1024))==NULL)
1720                 err("Failed to allocate string for diff file name");
1721         snprintf(client->difffilename, 1024, "%s-%s-%d.diff",client->exportname,client->clientname,
1722                 (int)getpid()) ;
1723         client->difffilename[1023]='\0';
1724         msg3(LOG_INFO,"About to create map and diff file %s",client->difffilename) ;
1725         client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
1726         if (client->difffile<0) err("Could not create diff file (%m)") ;
1727         if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL)
1728                 err("Could not allocate memory") ;
1729         for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1 ;
1730
1731         return 0;
1732 }
1733
1734 /**
1735  * Run a command. This is used for the ``prerun'' and ``postrun'' config file
1736  * options
1737  *
1738  * @param command the command to be ran. Read from the config file
1739  * @param file the file name we're about to export
1740  **/
1741 int do_run(gchar* command, gchar* file) {
1742         gchar* cmd;
1743         int retval=0;
1744
1745         if(command && *command) {
1746                 cmd = g_strdup_printf(command, file);
1747                 retval=system(cmd);
1748                 g_free(cmd);
1749         }
1750         return retval;
1751 }
1752
1753 /**
1754  * Serve a connection. 
1755  *
1756  * @todo allow for multithreading, perhaps use libevent. Not just yet, though;
1757  * follow the road map.
1758  *
1759  * @param client a connected client
1760  **/
1761 void serveconnection(CLIENT *client) {
1762         if (client->server->transactionlog && (client->transactionlogfd == -1))
1763         {
1764                 if (-1 == (client->transactionlogfd = open(client->server->transactionlog,
1765                                                            O_WRONLY | O_CREAT,
1766                                                            S_IRUSR | S_IWUSR)))
1767                         g_warning("Could not open transaction log %s",
1768                                   client->server->transactionlog);
1769         }
1770
1771         if(do_run(client->server->prerun, client->exportname)) {
1772                 exit(EXIT_FAILURE);
1773         }
1774         setupexport(client);
1775
1776         if (client->server->flags & F_COPYONWRITE) {
1777                 copyonwrite_prepare(client);
1778         }
1779
1780         setmysockopt(client->net);
1781
1782         mainloop(client);
1783         do_run(client->server->postrun, client->exportname);
1784
1785         if (-1 != client->transactionlogfd)
1786         {
1787                 close(client->transactionlogfd);
1788                 client->transactionlogfd = -1;
1789         }
1790 }
1791
1792 /**
1793  * Find the name of the file we have to serve. This will use g_strdup_printf
1794  * to put the IP address of the client inside a filename containing
1795  * "%s" (in the form as specified by the "virtstyle" option). That name
1796  * is then written to client->exportname.
1797  *
1798  * @param net A socket connected to an nbd client
1799  * @param client information about the client. The IP address in human-readable
1800  * format will be written to a new char* buffer, the address of which will be
1801  * stored in client->clientname.
1802  **/
1803 void set_peername(int net, CLIENT *client) {
1804         struct sockaddr_storage addrin;
1805         struct sockaddr_storage netaddr;
1806         struct sockaddr_in  *netaddr4 = NULL;
1807         struct sockaddr_in6 *netaddr6 = NULL;
1808         size_t addrinlen = sizeof( addrin );
1809         struct addrinfo hints;
1810         struct addrinfo *ai = NULL;
1811         char peername[NI_MAXHOST];
1812         char netname[NI_MAXHOST];
1813         char *tmp = NULL;
1814         int i;
1815         int e;
1816         int shift;
1817
1818         if (getpeername(net, (struct sockaddr *) &addrin, (socklen_t *)&addrinlen) < 0)
1819                 err("getsockname failed: %m");
1820
1821         getnameinfo((struct sockaddr *)&addrin, (socklen_t)addrinlen,
1822                 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST);
1823
1824         memset(&hints, '\0', sizeof (hints));
1825         hints.ai_flags = AI_ADDRCONFIG;
1826         e = getaddrinfo(peername, NULL, &hints, &ai);
1827
1828         if(e != 0) {
1829                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
1830                 freeaddrinfo(ai);
1831                 return;
1832         }
1833
1834         switch(client->server->virtstyle) {
1835                 case VIRT_NONE:
1836                         client->exportname=g_strdup(client->server->exportname);
1837                         break;
1838                 case VIRT_IPHASH:
1839                         for(i=0;i<strlen(peername);i++) {
1840                                 if(peername[i]=='.') {
1841                                         peername[i]='/';
1842                                 }
1843                         }
1844                 case VIRT_IPLIT:
1845                         client->exportname=g_strdup_printf(client->server->exportname, peername);
1846                         break;
1847                 case VIRT_CIDR:
1848                         memcpy(&netaddr, &addrin, addrinlen);
1849                         if(ai->ai_family == AF_INET) {
1850                                 netaddr4 = (struct sockaddr_in *)&netaddr;
1851                                 (netaddr4->sin_addr).s_addr>>=32-(client->server->cidrlen);
1852                                 (netaddr4->sin_addr).s_addr<<=32-(client->server->cidrlen);
1853
1854                                 getnameinfo((struct sockaddr *) netaddr4, (socklen_t) addrinlen,
1855                                                         netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1856                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1857                         }else if(ai->ai_family == AF_INET6) {
1858                                 netaddr6 = (struct sockaddr_in6 *)&netaddr;
1859
1860                                 shift = 128-(client->server->cidrlen);
1861                                 i = 3;
1862                                 while(shift >= 32) {
1863                                         ((netaddr6->sin6_addr).s6_addr32[i])=0;
1864                                         shift-=32;
1865                                         i--;
1866                                 }
1867                                 (netaddr6->sin6_addr).s6_addr32[i]>>=shift;
1868                                 (netaddr6->sin6_addr).s6_addr32[i]<<=shift;
1869
1870                                 getnameinfo((struct sockaddr *)netaddr6, (socklen_t)addrinlen,
1871                                             netname, sizeof(netname), NULL, 0, NI_NUMERICHOST);
1872                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1873                         }
1874
1875                         if(tmp != NULL)
1876                           client->exportname=g_strdup_printf(client->server->exportname, tmp);
1877
1878                         break;
1879         }
1880
1881         freeaddrinfo(ai);
1882         msg4(LOG_INFO, "connect from %s, assigned file is %s", 
1883              peername, client->exportname);
1884         client->clientname=g_strdup(peername);
1885 }
1886
1887 /**
1888  * Destroy a pid_t*
1889  * @param data a pointer to pid_t which should be freed
1890  **/
1891 void destroy_pid_t(gpointer data) {
1892         g_free(data);
1893 }
1894
1895 /**
1896  * Loop through the available servers, and serve them. Never returns.
1897  **/
1898 int serveloop(GArray* servers) {
1899         struct sockaddr_storage addrin;
1900         socklen_t addrinlen=sizeof(addrin);
1901         int i;
1902         int max;
1903         int sock;
1904         fd_set mset;
1905         fd_set rset;
1906
1907         /* 
1908          * Set up the master fd_set. The set of descriptors we need
1909          * to select() for never changes anyway and it buys us a *lot*
1910          * of time to only build this once. However, if we ever choose
1911          * to not fork() for clients anymore, we may have to revisit
1912          * this.
1913          */
1914         max=0;
1915         FD_ZERO(&mset);
1916         for(i=0;i<servers->len;i++) {
1917                 if((sock=(g_array_index(servers, SERVER, i)).socket)) {
1918                         FD_SET(sock, &mset);
1919                         max=sock>max?sock:max;
1920                 }
1921         }
1922         if(modernsock) {
1923                 FD_SET(modernsock, &mset);
1924                 max=modernsock>max?modernsock:max;
1925         }
1926         for(;;) {
1927                 CLIENT *client = NULL;
1928                 pid_t *pid;
1929
1930                 memcpy(&rset, &mset, sizeof(fd_set));
1931                 if(select(max+1, &rset, NULL, NULL, NULL)>0) {
1932                         int net = 0;
1933                         SERVER* serve=NULL;
1934
1935                         DEBUG("accept, ");
1936                         if(FD_ISSET(modernsock, &rset)) {
1937                                 if((net=accept(modernsock, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1938                                         err("accept: %m");
1939                                 client = negotiate(net, NULL, servers, NEG_INIT | NEG_MODERN);
1940                                 if(!client) {
1941                                         err_nonfatal("negotiation failed");
1942                                         close(net);
1943                                         net=0;
1944                                         continue;
1945                                 }
1946                                 serve = client->server;
1947                         }
1948                         for(i=0;i<servers->len && !net;i++) {
1949                                 serve=&(g_array_index(servers, SERVER, i));
1950                                 if(FD_ISSET(serve->socket, &rset)) {
1951                                         if ((net=accept(serve->socket, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1952                                                 err("accept: %m");
1953                                 }
1954                         }
1955                         if(net) {
1956                                 int sock_flags;
1957
1958                                 if(serve->max_connections > 0 &&
1959                                    g_hash_table_size(children) >= serve->max_connections) {
1960                                         msg2(LOG_INFO, "Max connections reached");
1961                                         close(net);
1962                                         continue;
1963                                 }
1964                                 if((sock_flags = fcntl(net, F_GETFL, 0))==-1) {
1965                                         err("fcntl F_GETFL");
1966                                 }
1967                                 if(fcntl(net, F_SETFL, sock_flags &~O_NONBLOCK)==-1) {
1968                                         err("fcntl F_SETFL ~O_NONBLOCK");
1969                                 }
1970                                 if(!client) {
1971                                         client = g_new0(CLIENT, 1);
1972                                         client->server=serve;
1973                                         client->exportsize=OFFT_MAX;
1974                                         client->net=net;
1975                                         client->transactionlogfd = -1;
1976                                 }
1977                                 set_peername(net, client);
1978                                 if (!authorized_client(client)) {
1979                                         msg2(LOG_INFO,"Unauthorized client") ;
1980                                         close(net);
1981                                         continue;
1982                                 }
1983                                 msg2(LOG_INFO,"Authorized client") ;
1984                                 pid=g_malloc(sizeof(pid_t));
1985
1986                                 if (!dontfork) {
1987                                         if ((*pid=fork())<0) {
1988                                                 msg3(LOG_INFO,"Could not fork (%s)",strerror(errno)) ;
1989                                                 close(net);
1990                                                 continue;
1991                                         }
1992                                         if (*pid>0) { /* parent */
1993                                                 close(net);
1994                                                 g_hash_table_insert(children, pid, pid);
1995                                                 continue;
1996                                         }
1997                                         /* child */
1998                                         g_hash_table_destroy(children);
1999                                         for(i=0;i<servers->len;i++) {
2000                                                 serve=&g_array_index(servers, SERVER, i);
2001                                                 close(serve->socket);
2002                                         }
2003                                         /* FALSE does not free the
2004                                            actual data. This is required,
2005                                            because the client has a
2006                                            direct reference into that
2007                                            data, and otherwise we get a
2008                                            segfault... */
2009                                         g_array_free(servers, FALSE);
2010                                 }
2011
2012                                 msg2(LOG_INFO,"Starting to serve");
2013                                 serveconnection(client);
2014                                 exit(EXIT_SUCCESS);
2015                         }
2016                 }
2017         }
2018 }
2019
/**
 * Apply the options every listening socket gets: SO_REUSEADDR and
 * SO_KEEPALIVE are switched on and the descriptor is put into non-blocking
 * mode. Any failure is reported through err().
 *
 * @param socket the socket to configure
 **/
void dosockopts(int socket) {
#ifndef sun
	int yes=1;
#else
	char yes='1';
#endif /* sun */
	const int wanted[] = { SO_REUSEADDR, SO_KEEPALIVE };
	const char *const complaint[] = {
		"setsockopt SO_REUSEADDR",
		"setsockopt SO_KEEPALIVE",
	};
	size_t k;
	int fl;

	/* SO_REUSEADDR avoids "Address already in use" when rebinding */
	for (k = 0; k < sizeof(wanted) / sizeof(wanted[0]); k++) {
		if (setsockopt(socket,SOL_SOCKET,wanted[k],&yes,sizeof(int)) == -1) {
			err(complaint[k]);
		}
	}

	/* add O_NONBLOCK on top of whatever flags are already set */
	fl = fcntl(socket, F_GETFL, 0);
	if (fl == -1) {
		err("fcntl F_GETFL");
	}
	if (fcntl(socket, F_SETFL, fl | O_NONBLOCK) == -1) {
		err("fcntl F_SETFL O_NONBLOCK");
	}
}
2044
2045 /**
2046  * Connect a server's socket.
2047  *
2048  * @param serve the server we want to connect.
2049  **/
2050 int setup_serve(SERVER *serve) {
2051         struct addrinfo hints;
2052         struct addrinfo *ai = NULL;
2053         gchar *port = NULL;
2054         int e;
2055
2056         if(!do_oldstyle) {
2057                 return serve->servename ? 1 : 0;
2058         }
2059         memset(&hints,'\0',sizeof(hints));
2060         hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG | AI_NUMERICSERV;
2061         hints.ai_socktype = SOCK_STREAM;
2062         hints.ai_family = serve->socket_family;
2063
2064         port = g_strdup_printf ("%d", serve->port);
2065         if (port == NULL)
2066                 return 0;
2067
2068         e = getaddrinfo(serve->listenaddr,port,&hints,&ai);
2069
2070         g_free(port);
2071
2072         if(e != 0) {
2073                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
2074                 serve->socket = -1;
2075                 freeaddrinfo(ai);
2076                 exit(EXIT_FAILURE);
2077         }
2078
2079         if(serve->socket_family == AF_UNSPEC)
2080                 serve->socket_family = ai->ai_family;
2081
2082 #ifdef WITH_SDP
2083         if ((serve->flags) && F_SDP) {
2084                 if (ai->ai_family == AF_INET)
2085                         ai->ai_family = AF_INET_SDP;
2086                 else (ai->ai_family == AF_INET6)
2087                         ai->ai_family = AF_INET6_SDP;
2088         }
2089 #endif
2090         if ((serve->socket = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) < 0)
2091                 err("socket: %m");
2092
2093         dosockopts(serve->socket);
2094
2095         DEBUG("Waiting for connections... bind, ");
2096         e = bind(serve->socket, ai->ai_addr, ai->ai_addrlen);
2097         if (e != 0 && errno != EADDRINUSE)
2098                 err("bind: %m");
2099         DEBUG("listen, ");
2100         if (listen(serve->socket, 1) < 0)
2101                 err("listen: %m");
2102
2103         freeaddrinfo (ai);
2104         if(serve->servename) {
2105                 return 1;
2106         } else {
2107                 return 0;
2108         }
2109 }
2110
/**
 * Open the listening socket for new-style (named export) connections.
 *
 * Resolves modern_listen / modernport for TCP over any address family and
 * uses the first result to create the socket, which is stored in the global
 * modernsock, configured with dosockopts(), bound and put into listening
 * state. A failed address lookup terminates the process; the other failures
 * are reported through err().
 **/
void open_modern(void) {
	struct addrinfo hints;
	struct addrinfo* ai = NULL;
	int e;

	memset(&hints, '\0', sizeof(hints));
	hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_family = AF_UNSPEC;
	hints.ai_protocol = IPPROTO_TCP;
	e = getaddrinfo(modern_listen, modernport, &hints, &ai);
	if(e != 0) {
		fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
		exit(EXIT_FAILURE);
	}
	if((modernsock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol))<0) {
		err("socket: %m");
	}

	dosockopts(modernsock);

	if(bind(modernsock, ai->ai_addr, ai->ai_addrlen)) {
		err("bind: %m");
	}
	if(listen(modernsock, 10) <0) {
		err("listen: %m");
	}

	freeaddrinfo(ai);
}
2142
2143 /**
2144  * Connect our servers.
2145  **/
2146 void setup_servers(GArray* servers) {
2147         int i;
2148         struct sigaction sa;
2149         int want_modern=0;
2150
2151         for(i=0;i<servers->len;i++) {
2152                 want_modern |= setup_serve(&(g_array_index(servers, SERVER, i)));
2153         }
2154         if(want_modern) {
2155                 open_modern();
2156         }
2157         children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
2158
2159         sa.sa_handler = sigchld_handler;
2160         sigemptyset(&sa.sa_mask);
2161         sa.sa_flags = SA_RESTART;
2162         if(sigaction(SIGCHLD, &sa, NULL) == -1)
2163                 err("sigaction: %m");
2164         sa.sa_handler = sigterm_handler;
2165         sigemptyset(&sa.sa_mask);
2166         sa.sa_flags = SA_RESTART;
2167         if(sigaction(SIGTERM, &sa, NULL) == -1)
2168                 err("sigaction: %m");
2169 }
2170
2171 /**
2172  * Go daemon (unless we specified at compile time that we didn't want this)
2173  * @param serve the first server of our configuration. If its port is zero,
2174  *      then do not daemonize, because we're doing inetd then. This parameter
2175  *      is only used to create a PID file of the form
2176  *      /var/run/nbd-server.&lt;port&gt;.pid; it's not modified in any way.
2177  **/
2178 #if !defined(NODAEMON)
2179 void daemonize(SERVER* serve) {
2180         FILE*pidf;
2181
2182         if(serve && !(serve->port)) {
2183                 return;
2184         }
2185         if(daemon(0,0)<0) {
2186                 err("daemon");
2187         }
2188         if(!*pidftemplate) {
2189                 if(serve) {
2190                         strncpy(pidftemplate, "/var/run/nbd-server.%d.pid", 255);
2191                 } else {
2192                         strncpy(pidftemplate, "/var/run/nbd-server.pid", 255);
2193                 }
2194         }
2195         snprintf(pidfname, 255, pidftemplate, serve ? serve->port : 0);
2196         pidf=fopen(pidfname, "w");
2197         if(pidf) {
2198                 fprintf(pidf,"%d\n", (int)getpid());
2199                 fclose(pidf);
2200         } else {
2201                 perror("fopen");
2202                 fprintf(stderr, "Not fatal; continuing");
2203         }
2204 }
2205 #else
2206 #define daemonize(serve)
2207 #endif /* !defined(NODAEMON) */
2208
2209 /*
2210  * Everything beyond this point (in the file) is run in non-daemon mode.
2211  * The stuff above daemonize() isn't.
2212  */
2213
2214 void serve_err(SERVER* serve, const char* msg) G_GNUC_NORETURN;
2215
2216 void serve_err(SERVER* serve, const char* msg) {
2217         g_message("Export of %s on port %d failed:", serve->exportname,
2218                         serve->port);
2219         err(msg);
2220 }
2221
2222 /**
2223  * Set up user-ID and/or group-ID
2224  **/
2225 void dousers(void) {
2226         struct passwd *pw;
2227         struct group *gr;
2228         gchar* str;
2229         if(rungroup) {
2230                 gr=getgrnam(rungroup);
2231                 if(!gr) {
2232                         str = g_strdup_printf("Invalid group name: %s", rungroup);
2233                         err(str);
2234                 }
2235                 if(setgid(gr->gr_gid)<0) {
2236                         err("Could not set GID: %m"); 
2237                 }
2238         }
2239         if(runuser) {
2240                 pw=getpwnam(runuser);
2241                 if(!pw) {
2242                         str = g_strdup_printf("Invalid user name: %s", runuser);
2243                         err(str);
2244                 }
2245                 if(setuid(pw->pw_uid)<0) {
2246                         err("Could not set UID: %m");
2247                 }
2248         }
2249 }
2250
2251 #ifndef ISSERVER
2252 void glib_message_syslog_redirect(const gchar *log_domain,
2253                                   GLogLevelFlags log_level,
2254                                   const gchar *message,
2255                                   gpointer user_data)
2256 {
2257     int level=LOG_DEBUG;
2258     
2259     switch( log_level )
2260     {
2261       case G_LOG_FLAG_FATAL:
2262       case G_LOG_LEVEL_CRITICAL:
2263       case G_LOG_LEVEL_ERROR:    
2264         level=LOG_ERR; 
2265         break;
2266       case G_LOG_LEVEL_WARNING:
2267         level=LOG_WARNING;
2268         break;
2269       case G_LOG_LEVEL_MESSAGE:
2270       case G_LOG_LEVEL_INFO:
2271         level=LOG_INFO;
2272         break;
2273       case G_LOG_LEVEL_DEBUG:
2274         level=LOG_DEBUG;
2275       default:
2276         level=LOG_ERR;
2277     }
2278     syslog(level, "%s", message);
2279 }
2280 #endif
2281
2282 /**
2283  * Main entry point...
2284  **/
2285 int main(int argc, char *argv[]) {
2286         SERVER *serve;
2287         GArray *servers;
2288         GError *err=NULL;
2289
2290         if (sizeof( struct nbd_request )!=28) {
2291                 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
2292                 exit(EXIT_FAILURE) ;
2293         }
2294
2295         memset(pidftemplate, '\0', 256);
2296
2297         logging();
2298         config_file_pos = g_strdup(CFILE);
2299         serve=cmdline(argc, argv);
2300         servers = parse_cfile(config_file_pos, &err);
2301         
2302         if(serve) {
2303                 serve->socket_family = AF_UNSPEC;
2304
2305                 append_serve(serve, servers);
2306      
2307                 if (!(serve->port)) {
2308                         CLIENT *client;
2309 #ifndef ISSERVER
2310                         /* You really should define ISSERVER if you're going to use
2311                          * inetd mode, but if you don't, closing stdout and stderr
2312                          * (which inetd had connected to the client socket) will let it
2313                          * work. */
2314                         close(1);
2315                         close(2);
2316                         open("/dev/null", O_WRONLY);
2317                         open("/dev/null", O_WRONLY);
2318                         g_log_set_default_handler( glib_message_syslog_redirect, NULL );
2319 #endif
2320                         client=g_malloc(sizeof(CLIENT));
2321                         client->server=serve;
2322                         client->net=0;
2323                         client->exportsize=OFFT_MAX;
2324                         set_peername(0,client);
2325                         serveconnection(client);
2326                         return 0;
2327                 }
2328         }
2329     
2330         if(!servers || !servers->len) {
2331                 if(err && !(err->domain == g_quark_from_string("parse_cfile")
2332                                 && err->code == CFILE_NOTFOUND)) {
2333                         g_warning("Could not parse config file: %s", 
2334                                         err ? err->message : "Unknown error");
2335                 }
2336         }
2337         if(serve) {
2338                 g_warning("Specifying an export on the command line is deprecated.");
2339                 g_warning("Please use a configuration file instead.");
2340         }
2341
2342         if((!serve) && (!servers||!servers->len)) {
2343                 g_message("No configured exports; quitting.");
2344                 exit(EXIT_FAILURE);
2345         }
2346         if (!dontfork)
2347                 daemonize(serve);
2348         setup_servers(servers);
2349         dousers();
2350         serveloop(servers);
2351         return 0 ;
2352 }