Document function parameters for doxygen
[nbd.git] / nbd-server.c
1 /*
2  * Network Block Device - server
3  *
4  * Copyright 1996-1998 Pavel Machek, distribute under GPL
5  *  <pavel@atrey.karlin.mff.cuni.cz>
6  * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7  * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
8  *
9  * Version 1.0 - hopefully 64-bit-clean
10  * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11  * Version 1.2 - autodetect size of block devices, thanks to Peter T. Breuer <ptb@it.uc3m.es>
12  * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13  *      type, or don't have 64 bit file offsets by defining FS_32BIT
14  *      in compile options for nbd-server *only*. This can be done
15  *      with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16  *      original autoconf input file, or I would make it a configure
17  *      option.) Ken Yap <ken@nlc.net.au>.
18  * Version 1.6 - fix autodetection of block device size and really make 64 bit
19  *      clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20  * Version 2.0 - Version synchronised with client
21  * Version 2.1 - Reap zombie client processes when they exit. Removed
22  *      (uncommented) the _IO magic, it's no longer necessary. Wouter
23  *      Verhelst <wouter@debian.org>
24  * Version 2.2 - Auto switch to read-only mode (useful for floppies).
25  * Version 2.3 - Fixed code so that Large File Support works. This
26  *      removes the FS_32BIT compile-time directive; define
27  *      _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28  *      using FS_32BIT. This will allow you to use files >2GB instead of
29  *      having to use the -m option. Wouter Verhelst <wouter@debian.org>
30  * Version 2.4 - Added code to keep track of children, so that we can
31  *      properly kill them from initscripts. Add a call to daemon(),
32  *      so that processes don't think they have to wait for us, which is
33  *      interesting for initscripts as well. Wouter Verhelst
34  *      <wouter@debian.org>
35  * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36  *      zero after fork()ing, resulting in nbd-server going berserk
37  *      when it receives a signal with at least one child open. Wouter
38  *      Verhelst <wouter@debian.org>
39  * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40  *      rectified type of mainloop::size_host (sf.net bugs 814435 and
41  *      817385); close the PID file after writing to it, so that the
42  *      daemon can actually be found. Wouter Verhelst
43  *      <wouter@debian.org>
44  * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45  *      correctly put in network endianness. Many types were corrected
46  *      (size_t and off_t instead of int).  <vspaceg@sourceforge.net>
47  * Version 2.6 - Some code cleanup.
48  * Version 2.7 - Better build system.
49  * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a 
50  *      lot more work, but this is a start. Wouter Verhelst
51  *      <wouter@debian.org>
52  * 16/03/2010 - Add IPv6 support.
53  *      Kitt Tientanopajai <kitt@kitty.in.th>
54  *      Neutron Soutmun <neo.neutron@gmail.com>
55  *      Suriya Soutmun <darksolar@gmail.com>
56  */
57
58 /* Includes LFS defines, which defines behaviours of some of the following
59  * headers, so must come before those */
60 #include "lfs.h"
61
62 #include <sys/types.h>
63 #include <sys/socket.h>
64 #include <sys/stat.h>
65 #include <sys/select.h>         /* select */
66 #include <sys/wait.h>           /* wait */
67 #ifdef HAVE_SYS_IOCTL_H
68 #include <sys/ioctl.h>
69 #endif
70 #include <sys/param.h>
71 #ifdef HAVE_SYS_MOUNT_H
72 #include <sys/mount.h>          /* For BLKGETSIZE */
73 #endif
74 #include <signal.h>             /* sigaction */
75 #include <errno.h>
76 #include <netinet/tcp.h>
77 #include <netinet/in.h>
78 #include <netdb.h>
79 #include <syslog.h>
80 #include <unistd.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 #include <fcntl.h>
85 #include <arpa/inet.h>
86 #include <strings.h>
87 #include <dirent.h>
88 #include <unistd.h>
89 #include <getopt.h>
90 #include <pwd.h>
91 #include <grp.h>
92
93 #include <glib.h>
94
95 /* used in cliserv.h, so must come first */
96 #define MY_NAME "nbd_server"
97 #include "cliserv.h"
98
99 #ifdef WITH_SDP
100 #include <sdp_inet.h>
101 #endif
102
/* Directory holding system configuration; may be predefined before this
 * point (e.g. on the compiler command line), otherwise "/etc" is used. */
#if !defined(SYSCONFDIR)
#define SYSCONFDIR "/etc"
#endif
/** Default position of the config file */
#define CFILE SYSCONFDIR "/nbd-server/config"
108
109 /** Where our config file actually is */
110 gchar* config_file_pos;
111
112 /** What user we're running as */
113 gchar* runuser=NULL;
114 /** What group we're running as */
115 gchar* rungroup=NULL;
116 /** whether to export using the old negotiation protocol (port-based) */
117 gboolean do_oldstyle=FALSE;
118
119 /* Whether we should avoid forking */
120 int dontfork = 0;
121
/**
 * Logging macros. The digit in each name is the total number of arguments:
 * a priority, a format string, and zero to two values for that format.
 * Nothing goes to syslog unless ISSERVER is defined; without it the message
 * is handed to g_message() and the priority argument is dropped.
 **/
#ifdef ISSERVER
#define msg2(prio,fmt) syslog(prio,fmt)
#define msg3(prio,fmt,x) syslog(prio,fmt,x)
#define msg4(prio,fmt,x,y) syslog(prio,fmt,x,y)
#else
#define msg2(prio,fmt) g_message(fmt)
#define msg3(prio,fmt,x) g_message(fmt,x)
#define msg4(prio,fmt,x,y) g_message(fmt,x,y)
#endif
132
/* Debugging macros: uncomment the DODBG define below to make DEBUG() print
 * its arguments through printf(); otherwise DEBUG() expands to nothing. */
//#define DODBG
#ifndef DODBG
#define DEBUG(...)
#else
#define DEBUG(...) printf(__VA_ARGS__)
#endif
/* Fall back to an empty version string when none has been defined yet */
#if !defined(PACKAGE_VERSION)
#define PACKAGE_VERSION ""
#endif
/**
 * The highest value a variable of type off_t can reach. off_t is a signed
 * integer, so this is every bit set except for the leftmost (sign) one.
 *
 * The value is computed in unsigned arithmetic and then converted, because
 * shifting a signed 1 into the sign bit is undefined behaviour. This relies
 * on off_t being no wider than unsigned long long. The whole expansion is
 * parenthesised so it can be used inside larger expressions.
 **/
#define OFFT_MAX ((off_t)((1ULL<<(sizeof(off_t)*8-1))-1))
#define LINELEN 256       /**< Size of static buffer used to read the
			       authorization file (yuck) */
#define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
#define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
#define F_READONLY 1      /**< flag to tell us a file is readonly */
#define F_MULTIFILE 2     /**< flag to tell us a file is exported using -m */
#define F_COPYONWRITE 4   /**< flag to tell us a file is exported using
			    copyonwrite */
#define F_AUTOREADONLY 8  /**< flag to tell us a file is set to autoreadonly */
#define F_SPARSE 16       /**< flag to tell us copyonwrite should use a sparse file */
#define F_SDP 32          /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */
#define F_SYNC 64         /**< Whether to fsync() after a write */
#define F_FLUSH 128       /**< Whether server wants FLUSH to be sent by the client */
#define F_FUA 256         /**< Whether server wants FUA to be sent by the client */
#define F_ROTATIONAL 512  /**< Whether server wants the client to implement the elevator algorithm */
163 GHashTable *children;
164 char pidfname[256]; /**< name of our PID file */
165 char pidftemplate[256]; /**< template to be used for the filename of the PID file */
166 char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
167
168 int modernsock=0;         /**< Socket for the modern handler. Not used
169                                if a client was only specified on the
170                                command line; only port used if
171                                oldstyle is set to false (and then the
172                                command-line client isn't used, gna gna) */
173 char* modern_listen;      /**< listenaddr value for modernsock */
174
/**
 * Types of virtualization
 **/
typedef enum {
	/** No virtualization */
	VIRT_NONE = 0,
	/** Literal IP address as part of the filename */
	VIRT_IPLIT = 1,
	/** Replacing all dots in an ip address by a / before doing the same
	 *  as in IPLIT */
	VIRT_IPHASH = 2,
	/** Every subnet in its own directory */
	VIRT_CIDR = 3,
} VIRT_STYLE;
185
186 /**
187  * Variables associated with a server.
188  **/
189 typedef struct {
190         gchar* exportname;    /**< (unprocessed) filename of the file we're exporting */
191         off_t expected_size; /**< size of the exported file as it was told to
192                                us through configuration */
193         gchar* listenaddr;   /**< The IP address we're listening on */
194         unsigned int port;   /**< port we're exporting this file at */
195         char* authname;      /**< filename of the authorization file */
196         int flags;           /**< flags associated with this exported file */
197         int socket;          /**< The socket of this server. */
198         int socket_family;   /**< family of the socket */
199         VIRT_STYLE virtstyle;/**< The style of virtualization, if any */
200         uint8_t cidrlen;     /**< The length of the mask when we use
201                                   CIDR-style virtualization */
202         gchar* prerun;       /**< command to be ran after connecting a client,
203                                   but before starting to serve */
204         gchar* postrun;      /**< command that will be ran after the client
205                                   disconnects */
206         gchar* servename;    /**< name of the export as selected by nbd-client */
207         int max_connections; /**< maximum number of opened connections */
208         gchar* transactionlog;/**< filename for transaction log */
209 } SERVER;
210
/**
 * Variables associated with a single open file.
 **/
typedef struct {
	/** file descriptor */
	int fhandle;
	/** starting offset of this file */
	off_t startoff;
} FILE_INFO;
218
219 typedef struct {
220         off_t exportsize;    /**< size of the file we're exporting */
221         char *clientname;    /**< peer */
222         char *exportname;    /**< (processed) filename of the file we're exporting */
223         GArray *export;    /**< array of FILE_INFO of exported files;
224                                array size is always 1 unless we're
225                                doing the multiple file option */
226         int net;             /**< The actual client socket */
227         SERVER *server;      /**< The server this client is getting data from */
228         char* difffilename;  /**< filename of the copy-on-write file, if any */
229         int difffile;        /**< filedescriptor of copyonwrite file. @todo
230                                shouldn't this be an array too? (cfr export) Or
231                                make -m and -c mutually exclusive */
232         u32 difffilelen;     /**< number of pages in difffile */
233         u32 *difmap;         /**< see comment on the global difmap for this one */
234         gboolean modern;     /**< client was negotiated using modern negotiation protocol */
235         int transactionlogfd;/**< fd for transaction log */
236 } CLIENT;
237
/**
 * Type of configuration file values
 **/
typedef enum {
	/** This parameter is an integer */
	PARAM_INT = 0,
	/** This parameter is a string */
	PARAM_STRING = 1,
	/** This parameter is a boolean */
	PARAM_BOOL = 2,
} PARAM_TYPE;
246
247 /**
248  * Configuration file values
249  **/
250 typedef struct {
251         gchar *paramname;       /**< Name of the parameter, as it appears in
252                                   the config file */
253         gboolean required;      /**< Whether this is a required (as opposed to
254                                   optional) parameter */
255         PARAM_TYPE ptype;       /**< Type of the parameter. */
256         gpointer target;        /**< Pointer to where the data of this
257                                   parameter should be written. If ptype is
258                                   PARAM_BOOL, the data is or'ed rather than
259                                   overwritten. */
260         gint flagval;           /**< Flag mask for this parameter in case ptype
261                                   is PARAM_BOOL. */
262 } PARAM;
263
264 /**
265  * Translate a command name into human readable form
266  *
267  * @param command The command number (after applying NBD_CMD_MASK_COMMAND)
268  * @return pointer to the command name
269  **/
270 static inline const char * getcommandname(uint64_t command) {
271         switch (command) {
272         case NBD_CMD_READ:
273                 return "NBD_CMD_READ";
274         case NBD_CMD_WRITE:
275                 return "NBD_CMD_WRITE";
276         case NBD_CMD_DISC:
277                 return "NBD_CMD_DISC";
278         case NBD_CMD_FLUSH:
279                 return "NBD_CMD_FLUSH";
280         default:
281                 break;
282         }
283         return "UNKNOWN";
284 }
285
286 /**
287  * Check whether a client is allowed to connect. Works with an authorization
288  * file which contains one line per machine, no wildcards.
289  *
290  * @param opts The client who's trying to connect.
291  * @return 0 - authorization refused, 1 - OK
292  **/
293 int authorized_client(CLIENT *opts) {
294         const char *ERRMSG="Invalid entry '%s' in authfile '%s', so, refusing all connections.";
295         FILE *f ;
296         char line[LINELEN]; 
297         char *tmp;
298         struct in_addr addr;
299         struct in_addr client;
300         struct in_addr cltemp;
301         int len;
302
303         if ((f=fopen(opts->server->authname,"r"))==NULL) {
304                 msg4(LOG_INFO,"Can't open authorization file %s (%s).",
305                      opts->server->authname,strerror(errno)) ;
306                 return 1 ; 
307         }
308   
309         inet_aton(opts->clientname, &client);
310         while (fgets(line,LINELEN,f)!=NULL) {
311                 if((tmp=index(line, '/'))) {
312                         if(strlen(line)<=tmp-line) {
313                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
314                                 return 0;
315                         }
316                         *(tmp++)=0;
317                         if(!inet_aton(line,&addr)) {
318                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
319                                 return 0;
320                         }
321                         len=strtol(tmp, NULL, 0);
322                         addr.s_addr>>=32-len;
323                         addr.s_addr<<=32-len;
324                         memcpy(&cltemp,&client,sizeof(client));
325                         cltemp.s_addr>>=32-len;
326                         cltemp.s_addr<<=32-len;
327                         if(addr.s_addr == cltemp.s_addr) {
328                                 return 1;
329                         }
330                 }
331                 if (strncmp(line,opts->clientname,strlen(opts->clientname))==0) {
332                         fclose(f);
333                         return 1;
334                 }
335         }
336         fclose(f);
337         return 0;
338 }
339
/**
 * Read data from a file descriptor into a buffer. Keeps reading until
 * exactly len bytes have arrived; a read that fails with EAGAIN is retried.
 * Any other failure, and end-of-file before len bytes were read, is
 * reported through err().
 *
 * @param f a file descriptor
 * @param buf a buffer
 * @param len the number of bytes to be read
 **/
static inline void readit(int f, void *buf, size_t len) {
	/* arithmetic on void* is not standard C, so step through a char* */
	char *pos = buf;
	ssize_t res;

	while (len > 0) {
		DEBUG("*");
		res = read(f, pos, len);
		if (res > 0) {
			len -= res;
			pos += res;
		} else if (res == 0) {
			/* End of file: read() does not set errno here, so it
			 * must not be consulted (a stale EAGAIN would make us
			 * spin forever). */
			err("Read failed: End of file");
			/* NOTE(review): err() is declared outside this block
			 * and is expected not to return; bail out if it does */
			return;
		} else if (errno != EAGAIN) {
			err("Read failed: %m");
			return;
		}
	}
}
361
/**
 * Write data from a buffer into a filedescriptor. Keeps writing until all
 * len bytes have been accepted; a write that does not make progress is
 * reported through err().
 *
 * @param f a file descriptor
 * @param buf a buffer containing data
 * @param len the number of bytes to be written
 **/
static inline void writeit(int f, void *buf, size_t len) {
	/* arithmetic on void* is not standard C, so step through a char* */
	const char *pos = buf;
	ssize_t res;

	while (len > 0) {
		DEBUG("+");
		res = write(f, pos, len);
		if (res <= 0) {
			err("Send failed: %m");
			/* NOTE(review): err() is declared outside this block
			 * and is expected not to return; if it does, stop
			 * rather than adjust len/pos with a non-positive
			 * count */
			return;
		}
		len -= res;
		pos += res;
	}
}
379
380 /**
381  * Print out a message about how to use nbd-server. Split out to a separate
382  * function so that we can call it from multiple places
383  */
384 void usage() {
385         printf("This is nbd-server version " VERSION "\n");
386         printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections]\n"
387                "\t-r|--read-only\t\tread only\n"
388                "\t-m|--multi-file\t\tmultiple file\n"
389                "\t-c|--copy-on-write\tcopy on write\n"
390                "\t-C|--config-file\tspecify an alternate configuration file\n"
391                "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
392                "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
393                "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
394                "\t-M|--max-connections\tspecify the maximum number of opened connections\n\n"
395                "\tif port is set to 0, stdin is used (for running from inetd)\n"
396                "\tif file_to_export contains '%%s', it is substituted with the IP\n"
397                "\t\taddress of the machine trying to connect\n" 
398                "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
399         printf("Using configuration file %s\n", CFILE);
400 }
401
402 /* Dumps a config file section of the given SERVER*, and exits. */
403 void dump_section(SERVER* serve, gchar* section_header) {
404         printf("[%s]\n", section_header);
405         printf("\texportname = %s\n", serve->exportname);
406         printf("\tlistenaddr = %s\n", serve->listenaddr);
407         printf("\tport = %d\n", serve->port);
408         if(serve->flags & F_READONLY) {
409                 printf("\treadonly = true\n");
410         }
411         if(serve->flags & F_MULTIFILE) {
412                 printf("\tmultifile = true\n");
413         }
414         if(serve->flags & F_COPYONWRITE) {
415                 printf("\tcopyonwrite = true\n");
416         }
417         if(serve->expected_size) {
418                 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
419         }
420         if(serve->authname) {
421                 printf("\tauthfile = %s\n", serve->authname);
422         }
423         exit(EXIT_SUCCESS);
424 }
425
426 /**
427  * Parse the command line.
428  *
429  * @param argc the argc argument to main()
430  * @param argv the argv argument to main()
431  **/
432 SERVER* cmdline(int argc, char *argv[]) {
433         int i=0;
434         int nonspecial=0;
435         int c;
436         struct option long_options[] = {
437                 {"read-only", no_argument, NULL, 'r'},
438                 {"multi-file", no_argument, NULL, 'm'},
439                 {"copy-on-write", no_argument, NULL, 'c'},
440                 {"dont-fork", no_argument, NULL, 'd'},
441                 {"authorize-file", required_argument, NULL, 'l'},
442                 {"config-file", required_argument, NULL, 'C'},
443                 {"pid-file", required_argument, NULL, 'p'},
444                 {"output-config", required_argument, NULL, 'o'},
445                 {"max-connection", required_argument, NULL, 'M'},
446                 {0,0,0,0}
447         };
448         SERVER *serve;
449         off_t es;
450         size_t last;
451         char suffix;
452         gboolean do_output=FALSE;
453         gchar* section_header="";
454         gchar** addr_port;
455
456         if(argc==1) {
457                 return NULL;
458         }
459         serve=g_new0(SERVER, 1);
460         serve->authname = g_strdup(default_authname);
461         serve->virtstyle=VIRT_IPLIT;
462         while((c=getopt_long(argc, argv, "-C:cdl:mo:rp:M:", long_options, &i))>=0) {
463                 switch (c) {
464                 case 1:
465                         /* non-option argument */
466                         switch(nonspecial++) {
467                         case 0:
468                                 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
469                                         addr_port=g_strsplit(optarg, ":", 2);
470
471                                         /* Check for "@" - maybe user using this separator
472                                                  for IPv4 address */
473                                         if(!addr_port[1]) {
474                                                 g_strfreev(addr_port);
475                                                 addr_port=g_strsplit(optarg, "@", 2);
476                                         }
477                                 } else {
478                                         addr_port=g_strsplit(optarg, "@", 2);
479                                 }
480
481                                 if(addr_port[1]) {
482                                         serve->port=strtol(addr_port[1], NULL, 0);
483                                         serve->listenaddr=g_strdup(addr_port[0]);
484                                 } else {
485                                         serve->listenaddr=NULL;
486                                         serve->port=strtol(addr_port[0], NULL, 0);
487                                 }
488                                 g_strfreev(addr_port);
489                                 break;
490                         case 1:
491                                 serve->exportname = g_strdup(optarg);
492                                 if(serve->exportname[0] != '/') {
493                                         fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
494                                         exit(EXIT_FAILURE);
495                                 }
496                                 break;
497                         case 2:
498                                 last=strlen(optarg)-1;
499                                 suffix=optarg[last];
500                                 if (suffix == 'k' || suffix == 'K' ||
501                                     suffix == 'm' || suffix == 'M')
502                                         optarg[last] = '\0';
503                                 es = (off_t)atoll(optarg);
504                                 switch (suffix) {
505                                         case 'm':
506                                         case 'M':  es <<= 10;
507                                         case 'k':
508                                         case 'K':  es <<= 10;
509                                         default :  break;
510                                 }
511                                 serve->expected_size = es;
512                                 break;
513                         }
514                         break;
515                 case 'r':
516                         serve->flags |= F_READONLY;
517                         break;
518                 case 'm':
519                         serve->flags |= F_MULTIFILE;
520                         break;
521                 case 'o':
522                         do_output = TRUE;
523                         section_header = g_strdup(optarg);
524                         break;
525                 case 'p':
526                         strncpy(pidftemplate, optarg, 256);
527                         break;
528                 case 'c': 
529                         serve->flags |=F_COPYONWRITE;
530                         break;
531                 case 'd': 
532                         dontfork = 1;
533                         break;
534                 case 'C':
535                         g_free(config_file_pos);
536                         config_file_pos=g_strdup(optarg);
537                         break;
538                 case 'l':
539                         g_free(serve->authname);
540                         serve->authname=g_strdup(optarg);
541                         break;
542                 case 'M':
543                         serve->max_connections = strtol(optarg, NULL, 0);
544                         break;
545                 default:
546                         usage();
547                         exit(EXIT_FAILURE);
548                         break;
549                 }
550         }
551         /* What's left: the port to export, the name of the to be exported
552          * file, and, optionally, the size of the file, in that order. */
553         if(nonspecial<2) {
554                 g_free(serve);
555                 serve=NULL;
556         } else {
557                 do_oldstyle = TRUE;
558         }
559         if(do_output) {
560                 if(!serve) {
561                         g_critical("Need a complete configuration on the command line to output a config file section!");
562                         exit(EXIT_FAILURE);
563                 }
564                 dump_section(serve, section_header);
565         }
566         return serve;
567 }
568
/**
 * Error codes for config file parsing
 **/
typedef enum {
	/** The configuration file is not found */
	CFILE_NOTFOUND = 0,
	/** The (required) group "generic" is missing */
	CFILE_MISSING_GENERIC = 1,
	/** A (required) key is missing */
	CFILE_KEY_MISSING = 2,
	/** A value is syntactically invalid */
	CFILE_VALUE_INVALID = 3,
	/** A value is not supported in this build */
	CFILE_VALUE_UNSUPPORTED = 4,
	/** Programmer error */
	CFILE_PROGERR = 5,
	/** A config file was specified that does not define any exports */
	CFILE_NO_EXPORTS = 6,
	/** The reserved port was specified for an old-style export. */
	CFILE_INCORRECT_PORT = 7,
} CFILE_ERRORS;
584
585 /**
586  * Remove a SERVER from memory. Used from the hash table
587  **/
588 void remove_server(gpointer s) {
589         SERVER *server;
590
591         server=(SERVER*)s;
592         g_free(server->exportname);
593         if(server->authname)
594                 g_free(server->authname);
595         if(server->listenaddr)
596                 g_free(server->listenaddr);
597         if(server->prerun)
598                 g_free(server->prerun);
599         if(server->postrun)
600                 g_free(server->postrun);
601         if(server->transactionlog)
602                 g_free(server->transactionlog);
603         g_free(server);
604 }
605
606 /**
607  * duplicate server
608  * @param s the old server we want to duplicate
609  * @return new duplicated server
610  **/
611 SERVER* dup_serve(SERVER *s) {
612         SERVER *serve = NULL;
613
614         serve=g_new0(SERVER, 1);
615         if(serve == NULL)
616                 return NULL;
617
618         if(s->exportname)
619                 serve->exportname = g_strdup(s->exportname);
620
621         serve->expected_size = s->expected_size;
622
623         if(s->listenaddr)
624                 serve->listenaddr = g_strdup(s->listenaddr);
625
626         serve->port = s->port;
627
628         if(s->authname)
629                 serve->authname = strdup(s->authname);
630
631         serve->flags = s->flags;
632         serve->socket = s->socket;
633         serve->socket_family = s->socket_family;
634         serve->virtstyle = s->virtstyle;
635         serve->cidrlen = s->cidrlen;
636
637         if(s->prerun)
638                 serve->prerun = g_strdup(s->prerun);
639
640         if(s->postrun)
641                 serve->postrun = g_strdup(s->postrun);
642
643         if(s->transactionlog)
644                 serve->transactionlog = g_strdup(s->transactionlog);
645         
646         if(s->servename)
647                 serve->servename = g_strdup(s->servename);
648
649         serve->max_connections = s->max_connections;
650
651         return serve;
652 }
653
654 /**
655  * append new server to array
656  * @param s server
657  * @param a server array
658  * @return 0 success, -1 error
659  */
660 int append_serve(SERVER *s, GArray *a) {
661         SERVER *ns = NULL;
662         struct addrinfo hints;
663         struct addrinfo *ai = NULL;
664         struct addrinfo *rp = NULL;
665         char   host[NI_MAXHOST];
666         gchar  *port = NULL;
667         int e;
668         int ret;
669
670         if(!s) {
671                 err("Invalid parsing server");
672                 return -1;
673         }
674
675         port = g_strdup_printf("%d", s->port);
676
677         memset(&hints,'\0',sizeof(hints));
678         hints.ai_family = AF_UNSPEC;
679         hints.ai_socktype = SOCK_STREAM;
680         hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE;
681         hints.ai_protocol = IPPROTO_TCP;
682
683         e = getaddrinfo(s->listenaddr, port, &hints, &ai);
684
685         if (port)
686                 g_free(port);
687
688         if(e == 0) {
689                 for (rp = ai; rp != NULL; rp = rp->ai_next) {
690                         e = getnameinfo(rp->ai_addr, rp->ai_addrlen, host, sizeof(host), NULL, 0, NI_NUMERICHOST);
691
692                         if (e != 0) { // error
693                                 fprintf(stderr, "getnameinfo: %s\n", gai_strerror(e));
694                                 continue;
695                         }
696
697                         // duplicate server and set listenaddr to resolved IP address
698                         ns = dup_serve (s);
699                         if (ns) {
700                                 ns->listenaddr = g_strdup(host);
701                                 ns->socket_family = rp->ai_family;
702                                 g_array_append_val(a, *ns);
703                                 free(ns);
704                                 ns = NULL;
705                         }
706                 }
707
708                 ret = 0;
709         } else {
710                 fprintf(stderr, "getaddrinfo failed on listen host/address: %s (%s)\n", s->listenaddr ? s->listenaddr : "any", gai_strerror(e));
711                 ret = -1;
712         }
713
714         if (ai)
715                 freeaddrinfo(ai);
716
717         return ret;
718 }
719
720 /**
721  * Parse the config file.
722  *
723  * @param f the name of the config file
724  * @param e a GError. @see CFILE_ERRORS for what error values this function can
725  *      return.
726  * @return an array of SERVER* pointers. If the config file is empty or does not
727  *      exist, returns an empty GArray; if the config file contains an
728  *      error, returns NULL, and e is set appropriately
729  **/
730 GArray* parse_cfile(gchar* f, GError** e) {
731         const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
732         const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
733         SERVER s;
734         gchar *virtstyle=NULL;
735         PARAM lp[] = {
736                 { "exportname", TRUE,   PARAM_STRING,   &(s.exportname),        0 },
737                 { "port",       TRUE,   PARAM_INT,      &(s.port),              0 },
738                 { "authfile",   FALSE,  PARAM_STRING,   &(s.authname),          0 },
739                 { "filesize",   FALSE,  PARAM_INT,      &(s.expected_size),     0 },
740                 { "virtstyle",  FALSE,  PARAM_STRING,   &(virtstyle),           0 },
741                 { "prerun",     FALSE,  PARAM_STRING,   &(s.prerun),            0 },
742                 { "postrun",    FALSE,  PARAM_STRING,   &(s.postrun),           0 },
743                 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog),   0 },
744                 { "readonly",   FALSE,  PARAM_BOOL,     &(s.flags),             F_READONLY },
745                 { "multifile",  FALSE,  PARAM_BOOL,     &(s.flags),             F_MULTIFILE },
746                 { "copyonwrite", FALSE, PARAM_BOOL,     &(s.flags),             F_COPYONWRITE },
747                 { "sparse_cow", FALSE,  PARAM_BOOL,     &(s.flags),             F_SPARSE },
748                 { "sdp",        FALSE,  PARAM_BOOL,     &(s.flags),             F_SDP },
749                 { "sync",       FALSE,  PARAM_BOOL,     &(s.flags),             F_SYNC },
750                 { "flush",      FALSE,  PARAM_BOOL,     &(s.flags),             F_FLUSH },
751                 { "fua",        FALSE,  PARAM_BOOL,     &(s.flags),             F_FUA },
752                 { "rotational", FALSE,  PARAM_BOOL,     &(s.flags),             F_ROTATIONAL },
753                 { "listenaddr", FALSE,  PARAM_STRING,   &(s.listenaddr),        0 },
754                 { "maxconnections", FALSE, PARAM_INT,   &(s.max_connections),   0 },
755         };
756         const int lp_size=sizeof(lp)/sizeof(PARAM);
757         PARAM gp[] = {
758                 { "user",       FALSE, PARAM_STRING,    &runuser,       0 },
759                 { "group",      FALSE, PARAM_STRING,    &rungroup,      0 },
760                 { "oldstyle",   FALSE, PARAM_BOOL,      &do_oldstyle,   1 },
761                 { "listenaddr", FALSE, PARAM_STRING,    &modern_listen, 0 },
762         };
763         PARAM* p=gp;
764         int p_size=sizeof(gp)/sizeof(PARAM);
765         GKeyFile *cfile;
766         GError *err = NULL;
767         const char *err_msg=NULL;
768         GQuark errdomain;
769         GArray *retval=NULL;
770         gchar **groups;
771         gboolean value;
772         gchar* startgroup;
773         gint i;
774         gint j;
775
776         errdomain = g_quark_from_string("parse_cfile");
777         cfile = g_key_file_new();
778         retval = g_array_new(FALSE, TRUE, sizeof(SERVER));
779         if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
780                         G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
781                 g_set_error(e, errdomain, CFILE_NOTFOUND, "Could not open config file %s.", f);
782                 g_key_file_free(cfile);
783                 return retval;
784         }
785         startgroup = g_key_file_get_start_group(cfile);
786         if(!startgroup || strcmp(startgroup, "generic")) {
787                 g_set_error(e, errdomain, CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
788                 g_key_file_free(cfile);
789                 return NULL;
790         }
791         groups = g_key_file_get_groups(cfile, NULL);
792         for(i=0;groups[i];i++) {
793                 memset(&s, '\0', sizeof(SERVER));
794
795                 /* After the [generic] group, start parsing exports */
796                 if(i==1) {
797                         p=lp;
798                         p_size=lp_size;
799                 } 
800                 for(j=0;j<p_size;j++) {
801                         g_assert(p[j].target != NULL);
802                         g_assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL);
803                         switch(p[j].ptype) {
804                                 case PARAM_INT:
805                                         *((gint*)p[j].target) =
806                                                 g_key_file_get_integer(cfile,
807                                                                 groups[i],
808                                                                 p[j].paramname,
809                                                                 &err);
810                                         break;
811                                 case PARAM_STRING:
812                                         *((gchar**)p[j].target) =
813                                                 g_key_file_get_string(cfile,
814                                                                 groups[i],
815                                                                 p[j].paramname,
816                                                                 &err);
817                                         break;
818                                 case PARAM_BOOL:
819                                         value = g_key_file_get_boolean(cfile,
820                                                         groups[i],
821                                                         p[j].paramname, &err);
822                                         if(!err) {
823                                                 if(value) {
824                                                         *((gint*)p[j].target) |= p[j].flagval;
825                                                 } else {
826                                                         *((gint*)p[j].target) &= ~(p[j].flagval);
827                                                 }
828                                         }
829                                         break;
830                         }
831                         if(!strcmp(p[j].paramname, "port") && !strcmp(p[j].target, NBD_DEFAULT_PORT)) {
832                                 g_set_error(e, errdomain, CFILE_INCORRECT_PORT, "Config file specifies default port for oldstyle export");
833                                 g_key_file_free(cfile);
834                                 return NULL;
835                         }
836                         if(err) {
837                                 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
838                                         if(!p[j].required) {
839                                                 /* Ignore not-found error for optional values */
840                                                 g_clear_error(&err);
841                                                 continue;
842                                         } else {
843                                                 err_msg = MISSING_REQUIRED_ERROR;
844                                         }
845                                 } else {
846                                         err_msg = DEFAULT_ERROR;
847                                 }
848                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
849                                 g_array_free(retval, TRUE);
850                                 g_error_free(err);
851                                 g_key_file_free(cfile);
852                                 return NULL;
853                         }
854                 }
855                 if(virtstyle) {
856                         if(!strncmp(virtstyle, "none", 4)) {
857                                 s.virtstyle=VIRT_NONE;
858                         } else if(!strncmp(virtstyle, "ipliteral", 9)) {
859                                 s.virtstyle=VIRT_IPLIT;
860                         } else if(!strncmp(virtstyle, "iphash", 6)) {
861                                 s.virtstyle=VIRT_IPHASH;
862                         } else if(!strncmp(virtstyle, "cidrhash", 8)) {
863                                 s.virtstyle=VIRT_CIDR;
864                                 if(strlen(virtstyle)<10) {
865                                         g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
866                                         g_array_free(retval, TRUE);
867                                         g_key_file_free(cfile);
868                                         return NULL;
869                                 }
870                                 s.cidrlen=strtol(virtstyle+8, NULL, 0);
871                         } else {
872                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
873                                 g_array_free(retval, TRUE);
874                                 g_key_file_free(cfile);
875                                 return NULL;
876                         }
877                         if(s.port && !do_oldstyle) {
878                                 g_warning("A port was specified, but oldstyle exports were not requested. This may not do what you expect.");
879                                 g_warning("Please read 'man 5 nbd-server' and search for oldstyle for more info");
880                         }
881                 } else {
882                         s.virtstyle=VIRT_IPLIT;
883                 }
884                 /* Don't need to free this, it's not our string */
885                 virtstyle=NULL;
886                 /* Don't append values for the [generic] group */
887                 if(i>0) {
888                         s.socket_family = AF_UNSPEC;
889                         s.servename = groups[i];
890
891                         append_serve(&s, retval);
892                 } else {
893                         if(!do_oldstyle) {
894                                 lp[1].required = 0;
895                         }
896                 }
897 #ifndef WITH_SDP
898                 if(s.flags & F_SDP) {
899                         g_set_error(e, errdomain, CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
900                         g_array_free(retval, TRUE);
901                         g_key_file_free(cfile);
902                         return NULL;
903                 }
904 #endif
905         }
906         if(i==1) {
907                 g_set_error(e, errdomain, CFILE_NO_EXPORTS, "The config file does not specify any exports");
908         }
909         g_key_file_free(cfile);
910         return retval;
911 }
912
913 /**
914  * Signal handler for SIGCHLD
915  * @param s the signal we're handling (must be SIGCHLD, or something
916  * is severely wrong)
917  **/
918 void sigchld_handler(int s) {
919         int status;
920         int* i;
921         pid_t pid;
922
923         while((pid=waitpid(-1, &status, WNOHANG)) > 0) {
924                 if(WIFEXITED(status)) {
925                         msg3(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
926                 }
927                 i=g_hash_table_lookup(children, &pid);
928                 if(!i) {
929                         msg3(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
930                 } else {
931                         DEBUG("Removing %d from the list of children", pid);
932                         g_hash_table_remove(children, &pid);
933                 }
934         }
935 }
936
937 /**
938  * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
939  *
940  * @param key the key
941  * @param value the value corresponding to the above key
942  * @param user_data a pointer which we always set to 1, so that we know what
943  * will happen next.
944  **/
945 void killchild(gpointer key, gpointer value, gpointer user_data) {
946         pid_t *pid=value;
947         int *parent=user_data;
948
949         kill(*pid, SIGTERM);
950         *parent=1;
951 }
952
953 /**
954  * Handle SIGTERM and dispatch it to our children
955  * @param s the signal we're handling (must be SIGTERM, or something
956  * is severely wrong).
957  **/
958 void sigterm_handler(int s) {
959         int parent=0;
960
961         g_hash_table_foreach(children, killchild, &parent);
962
963         if(parent) {
964                 unlink(pidfname);
965         }
966
967         exit(EXIT_SUCCESS);
968 }
969
970 /**
971  * Detect the size of a file.
972  *
973  * @param fhandle An open filedescriptor
974  * @return the size of the file, or OFFT_MAX if detection was
975  * impossible.
976  **/
977 off_t size_autodetect(int fhandle) {
978         off_t es;
979         u64 bytes;
980         struct stat stat_buf;
981         int error;
982
983 #ifdef HAVE_SYS_MOUNT_H
984 #ifdef HAVE_SYS_IOCTL_H
985 #ifdef BLKGETSIZE64
986         DEBUG("looking for export size with ioctl BLKGETSIZE64\n");
987         if (!ioctl(fhandle, BLKGETSIZE64, &bytes) && bytes) {
988                 return (off_t)bytes;
989         }
990 #endif /* BLKGETSIZE64 */
991 #endif /* HAVE_SYS_IOCTL_H */
992 #endif /* HAVE_SYS_MOUNT_H */
993
994         DEBUG("looking for fhandle size with fstat\n");
995         stat_buf.st_size = 0;
996         error = fstat(fhandle, &stat_buf);
997         if (!error) {
998                 if(stat_buf.st_size > 0)
999                         return (off_t)stat_buf.st_size;
1000         } else {
1001                 err("fstat failed: %m");
1002         }
1003
1004         DEBUG("looking for fhandle size with lseek SEEK_END\n");
1005         es = lseek(fhandle, (off_t)0, SEEK_END);
1006         if (es > ((off_t)0)) {
1007                 return es;
1008         } else {
1009                 DEBUG("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4)));
1010         }
1011
1012         err("Could not find size of exported block device: %m");
1013         return OFFT_MAX;
1014 }
1015
1016 /**
1017  * Get the file handle and offset, given an export offset.
1018  *
1019  * @param export An array of export files
1020  * @param a The offset to get corresponding file/offset for
1021  * @param fhandle [out] File descriptor
1022  * @param foffset [out] Offset into fhandle
1023  * @param maxbytes [out] Tells how many bytes can be read/written
1024  * from fhandle starting at foffset (0 if there is no limit)
1025  * @return 0 on success, -1 on failure
1026  **/
1027 int get_filepos(GArray* export, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1028         /* Negative offset not allowed */
1029         if(a < 0)
1030                 return -1;
1031
1032         /* Binary search for last file with starting offset <= a */
1033         FILE_INFO fi;
1034         int start = 0;
1035         int end = export->len - 1;
1036         while( start <= end ) {
1037                 int mid = (start + end) / 2;
1038                 fi = g_array_index(export, FILE_INFO, mid);
1039                 if( fi.startoff < a ) {
1040                         start = mid + 1;
1041                 } else if( fi.startoff > a ) {
1042                         end = mid - 1;
1043                 } else {
1044                         start = end = mid;
1045                         break;
1046                 }
1047         }
1048
1049         /* end should never go negative, since first startoff is 0 and a >= 0 */
1050         g_assert(end >= 0);
1051
1052         fi = g_array_index(export, FILE_INFO, end);
1053         *fhandle = fi.fhandle;
1054         *foffset = a - fi.startoff;
1055         *maxbytes = 0;
1056         if( end+1 < export->len ) {
1057                 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1058                 *maxbytes = fi_next.startoff - a;
1059         }
1060
1061         return 0;
1062 }
1063
/**
 * Seek to an absolute position in a file, reporting a failure through err().
 *
 * @param handle a filedescriptor
 * @param a position to seek to, measured from the start of the file
 * @todo get rid of this; lastpoint is a global variable right now, but it
 * shouldn't be. If we pass it on as a parameter, that makes things a *lot*
 * easier.
 **/
void myseek(int handle,off_t a) {
	const off_t where = lseek(handle, a, SEEK_SET);

	if (where < 0) {
		err("Can not seek locally!\n");
	}
}
1077
1078 /**
1079  * Write an amount of bytes at a given offset to the right file. This
1080  * abstracts the write-side of the multiple file option.
1081  *
1082  * @param a The offset where the write should start
1083  * @param buf The buffer to write from
1084  * @param len The length of buf
1085  * @param client The client we're serving for
1086  * @param fua Flag to indicate 'Force Unit Access'
1087  * @return The number of bytes actually written, or -1 in case of an error
1088  **/
1089 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1090         int fhandle;
1091         off_t foffset;
1092         size_t maxbytes;
1093         ssize_t retval;
1094
1095         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1096                 return -1;
1097         if(maxbytes && len > maxbytes)
1098                 len = maxbytes;
1099
1100         DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
1101
1102         myseek(fhandle, foffset);
1103         retval = write(fhandle, buf, len);
1104         if(client->server->flags & F_SYNC) {
1105                 fsync(fhandle);
1106         } else if (fua) {
1107
1108           /* This is where we would do the following
1109            *   #ifdef USE_SYNC_FILE_RANGE
1110            * However, we don't, for the reasons set out below
1111            * by Christoph Hellwig <hch@infradead.org>
1112            *
1113            * [BEGINS] 
1114            * fdatasync is equivalent to fsync except that it does not flush
1115            * non-essential metadata (basically just timestamps in practice), but it
1116            * does flush metadata requried to find the data again, e.g. allocation
1117            * information and extent maps.  sync_file_range does nothing but flush
1118            * out pagecache content - it means you basically won't get your data
1119            * back in case of a crash if you either:
1120            * 
1121            *  a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1122            *  b) are using a sparse file on a filesystem
1123            *  c) are using a fallocate-preallocated file on a filesystem
1124            *  d) use any file on a COW filesystem like btrfs
1125            * 
1126            * e.g. it only does anything useful for you if you do not have a volatile
1127            * write cache, and either use a raw block device node, or just overwrite
1128            * an already fully allocated (and not preallocated) file on a non-COW
1129            * filesystem.
1130            * [ENDS]
1131            *
1132            * What we should do is open a second FD with O_DSYNC set, then write to
1133            * that when appropriate. However, with a Linux client, every REQ_FUA
1134            * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1135            * problems.
1136            *
1137            */
1138 #if 0
1139                 sync_file_range(fhandle, foffset, len,
1140                                 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1141                                 SYNC_FILE_RANGE_WAIT_AFTER);
1142 #else
1143                 fdatasync(fhandle);
1144 #endif
1145         }
1146         return retval;
1147 }
1148
1149 /**
1150  * Call rawexpwrite repeatedly until all data has been written.
1151  *
1152  * @param a The offset where the write should start
1153  * @param buf The buffer to write from
1154  * @param len The length of buf
1155  * @param client The client we're serving for
1156  * @param fua Flag to indicate 'Force Unit Access'
1157  * @return 0 on success, nonzero on failure
1158  **/
1159 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1160         ssize_t ret=0;
1161
1162         while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
1163                 a += ret;
1164                 buf += ret;
1165                 len -= ret;
1166         }
1167         return (ret < 0 || len != 0);
1168 }
1169
1170 /**
1171  * Read an amount of bytes at a given offset from the right file. This
1172  * abstracts the read-side of the multiple files option.
1173  *
1174  * @param a The offset where the read should start
1175  * @param buf A buffer to read into
1176  * @param len The size of buf
1177  * @param client The client we're serving for
1178  * @return The number of bytes actually read, or -1 in case of an
1179  * error.
1180  **/
1181 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
1182         int fhandle;
1183         off_t foffset;
1184         size_t maxbytes;
1185
1186         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1187                 return -1;
1188         if(maxbytes && len > maxbytes)
1189                 len = maxbytes;
1190
1191         DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
1192
1193         myseek(fhandle, foffset);
1194         return read(fhandle, buf, len);
1195 }
1196
1197 /**
1198  * Call rawexpread repeatedly until all data has been read.
1199  * @return 0 on success, nonzero on failure
1200  **/
1201 int rawexpread_fully(off_t a, char *buf, size_t len, CLIENT *client) {
1202         ssize_t ret=0;
1203
1204         while(len > 0 && (ret=rawexpread(a, buf, len, client)) > 0 ) {
1205                 a += ret;
1206                 buf += ret;
1207                 len -= ret;
1208         }
1209         return (ret < 0 || len != 0);
1210 }
1211
1212 /**
1213  * Read an amount of bytes at a given offset from the right file. This
1214  * abstracts the read-side of the copyonwrite stuff, and calls
1215  * rawexpread() with the right parameters to do the actual work.
1216  * @param a The offset where the read should start
1217  * @param buf A buffer to read into
1218  * @param len The size of buf
1219  * @param client The client we're going to read for
1220  * @return 0 on success, nonzero on failure
1221  **/
1222 int expread(off_t a, char *buf, size_t len, CLIENT *client) {
1223         off_t rdlen, offset;
1224         off_t mapcnt, mapl, maph, pagestart;
1225
1226         if (!(client->server->flags & F_COPYONWRITE))
1227                 return(rawexpread_fully(a, buf, len, client));
1228         DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1229
1230         mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1231
1232         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1233                 pagestart=mapcnt*DIFFPAGESIZE;
1234                 offset=a-pagestart;
1235                 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1236                         len : (size_t)DIFFPAGESIZE-offset;
1237                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1238                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1239                                (unsigned long)(client->difmap[mapcnt]));
1240                         myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1241                         if (read(client->difffile, buf, rdlen) != rdlen) return -1;
1242                 } else { /* the block is not there */
1243                         DEBUG("Page %llu is not here, we read the original one\n",
1244                                (unsigned long long)mapcnt);
1245                         if(rawexpread_fully(a, buf, rdlen, client)) return -1;
1246                 }
1247                 len-=rdlen; a+=rdlen; buf+=rdlen;
1248         }
1249         return 0;
1250 }
1251
1252 /**
1253  * Write an amount of bytes at a given offset to the right file. This
1254  * abstracts the write-side of the copyonwrite option, and calls
1255  * rawexpwrite() with the right parameters to do the actual work.
1256  *
1257  * @param a The offset where the write should start
1258  * @param buf The buffer to write from
1259  * @param len The length of buf
1260  * @param client The client we're going to write for.
1261  * @param fua Flag to indicate 'Force Unit Access'
1262  * @return 0 on success, nonzero on failure
1263  **/
1264 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1265         char pagebuf[DIFFPAGESIZE];
1266         off_t mapcnt,mapl,maph;
1267         off_t wrlen,rdlen; 
1268         off_t pagestart;
1269         off_t offset;
1270
1271         if (!(client->server->flags & F_COPYONWRITE))
1272                 return(rawexpwrite_fully(a, buf, len, client, fua)); 
1273         DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1274
1275         mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1276
1277         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1278                 pagestart=mapcnt*DIFFPAGESIZE ;
1279                 offset=a-pagestart ;
1280                 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1281                         len : (size_t)DIFFPAGESIZE-offset;
1282
1283                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1284                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1285                                (unsigned long)(client->difmap[mapcnt])) ;
1286                         myseek(client->difffile,
1287                                         client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1288                         if (write(client->difffile, buf, wrlen) != wrlen) return -1 ;
1289                 } else { /* the block is not there */
1290                         myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ;
1291                         client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1292                         DEBUG("Page %llu is not here, we put it at %lu\n",
1293                                (unsigned long long)mapcnt,
1294                                (unsigned long)(client->difmap[mapcnt]));
1295                         rdlen=DIFFPAGESIZE ;
1296                         if (rawexpread_fully(pagestart, pagebuf, rdlen, client))
1297                                 return -1;
1298                         memcpy(pagebuf+offset,buf,wrlen) ;
1299                         if (write(client->difffile, pagebuf, DIFFPAGESIZE) !=
1300                                         DIFFPAGESIZE)
1301                                 return -1;
1302                 }                                                   
1303                 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
1304         }
1305         if (client->server->flags & F_SYNC) {
1306                 fsync(client->difffile);
1307         } else if (fua) {
1308                 /* open question: would it be cheaper to do multiple sync_file_ranges?
1309                    as we iterate through the above?
1310                  */
1311                 fdatasync(client->difffile);
1312         }
1313         return 0;
1314 }
1315
1316 /**
1317  * Flush data to a client
1318  *
1319  * @param client The client we're going to write for.
1320  * @return 0 on success, nonzero on failure
1321  **/
1322 int expflush(CLIENT *client) {
1323         gint i;
1324
1325         if (client->server->flags & F_COPYONWRITE) {
1326                 return fsync(client->difffile);
1327         }
1328         
1329         for (i = 0; i < client->export->len; i++) {
1330                 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1331                 if (fsync(fi.fhandle) < 0)
1332                         return -1;
1333         }
1334         
1335         return 0;
1336 }
1337
1338 /**
1339  * Do the initial negotiation.
1340  *
1341  * @param client The client we're negotiating with.
1342  **/
1343 CLIENT* negotiate(int net, CLIENT *client, GArray* servers) {
1344         char zeros[128];
1345         uint64_t size_host;
1346         uint32_t flags = NBD_FLAG_HAS_FLAGS;
1347         uint16_t smallflags = 0;
1348         uint64_t magic;
1349
1350         memset(zeros, '\0', sizeof(zeros));
1351         if(!client || !client->modern) {
1352                 /* common */
1353                 if (write(net, INIT_PASSWD, 8) < 0) {
1354                         err_nonfatal("Negotiation failed: %m");
1355                         if(client)
1356                                 exit(EXIT_FAILURE);
1357                 }
1358                 if(!client || client->modern) {
1359                         /* modern */
1360                         magic = htonll(opts_magic);
1361                 } else {
1362                         /* oldstyle */
1363                         magic = htonll(cliserv_magic);
1364                 }
1365                 if (write(net, &magic, sizeof(magic)) < 0) {
1366                         err_nonfatal("Negotiation failed: %m");
1367                         if(client)
1368                                 exit(EXIT_FAILURE);
1369                 }
1370         }
1371         if(!client) {
1372                 /* modern */
1373                 uint32_t reserved;
1374                 uint32_t opt;
1375                 uint32_t namelen;
1376                 char* name;
1377                 int i;
1378
1379                 if(!servers)
1380                         err("programmer error");
1381                 if (write(net, &smallflags, sizeof(uint16_t)) < 0)
1382                         err("Negotiation failed: %m");
1383                 if (read(net, &reserved, sizeof(reserved)) < 0)
1384                         err("Negotiation failed: %m");
1385                 if (read(net, &magic, sizeof(magic)) < 0)
1386                         err("Negotiation failed: %m");
1387                 magic = ntohll(magic);
1388                 if(magic != opts_magic) {
1389                         close(net);
1390                         return NULL;
1391                 }
1392                 if (read(net, &opt, sizeof(opt)) < 0)
1393                         err("Negotiation failed: %m");
1394                 opt = ntohl(opt);
1395                 if(opt != NBD_OPT_EXPORT_NAME) {
1396                         close(net);
1397                         return NULL;
1398                 }
1399                 if (read(net, &namelen, sizeof(namelen)) < 0)
1400                         err("Negotiation failed: %m");
1401                 namelen = ntohl(namelen);
1402                 name = malloc(namelen+1);
1403                 name[namelen]=0;
1404                 if (read(net, name, namelen) < 0)
1405                         err("Negotiation failed: %m");
1406                 for(i=0; i<servers->len; i++) {
1407                         SERVER* serve = &(g_array_index(servers, SERVER, i));
1408                         if(!strcmp(serve->servename, name)) {
1409                                 CLIENT* client = g_new0(CLIENT, 1);
1410                                 client->server = serve;
1411                                 client->exportsize = OFFT_MAX;
1412                                 client->net = net;
1413                                 client->modern = TRUE;
1414                                 client->transactionlogfd = -1;
1415                                 free(name);
1416                                 return client;
1417                         }
1418                 }
1419                 free(name);
1420                 return NULL;
1421         }
1422         /* common */
1423         size_host = htonll((u64)(client->exportsize));
1424         if (write(net, &size_host, 8) < 0)
1425                 err("Negotiation failed: %m");
1426         if (client->server->flags & F_READONLY)
1427                 flags |= NBD_FLAG_READ_ONLY;
1428         if (client->server->flags & F_FLUSH)
1429                 flags |= NBD_FLAG_SEND_FLUSH;
1430         if (client->server->flags & F_FUA)
1431                 flags |= NBD_FLAG_SEND_FUA;
1432         if (client->server->flags & F_ROTATIONAL)
1433                 flags |= NBD_FLAG_ROTATIONAL;
1434         if (!client->modern) {
1435                 /* oldstyle */
1436                 flags = htonl(flags);
1437                 if (write(client->net, &flags, 4) < 0)
1438                         err("Negotiation failed: %m");
1439         } else {
1440                 /* modern */
1441                 smallflags = (uint16_t)(flags & ~((uint16_t)0));
1442                 smallflags = htons(smallflags);
1443                 if (write(client->net, &smallflags, sizeof(smallflags)) < 0) {
1444                         err("Negotiation failed: %m");
1445                 }
1446         }
1447         /* common */
1448         if (write(client->net, zeros, 124) < 0)
1449                 err("Negotiation failed: %m");
1450         return NULL;
1451 }
1452
/**
 * Sending macro: write @p reply to @p net and, when a transaction log is
 * open, mirror it to the log as well.
 *
 * Note that this macro refers to a variable named `client' which must be
 * in scope at the point of use. The body is wrapped in do { } while (0)
 * so that the macro behaves as a single statement (e.g. in if/else).
 */
#define SEND(net,reply) do { writeit((net), &(reply), sizeof(reply)); \
	if (client->transactionlogfd != -1) \
		writeit(client->transactionlogfd, &(reply), sizeof(reply)); \
	} while (0)
/**
 * Error macro: send @p reply carrying @p errcode (converted to network
 * byte order), then reset the error field so the reply can be reused.
 */
#define ERROR(client,reply,errcode) do { (reply).error = htonl(errcode); \
	SEND((client)->net, reply); \
	(reply).error = 0; \
	} while (0)
1459 /**
1460  * Serve a file to a single client.
1461  *
1462  * @todo This beast needs to be split up in many tiny little manageable
1463  * pieces. Preferably with a chainsaw.
1464  *
1465  * @param client The client we're going to serve to.
1466  * @return when the client disconnects
1467  **/
1468 int mainloop(CLIENT *client) {
1469         struct nbd_request request;
1470         struct nbd_reply reply;
1471         gboolean go_on=TRUE;
1472 #ifdef DODBG
1473         int i = 0;
1474 #endif
1475         negotiate(client->net, client, NULL);
1476         DEBUG("Entering request loop!\n");
1477         reply.magic = htonl(NBD_REPLY_MAGIC);
1478         reply.error = 0;
1479         while (go_on) {
1480                 char buf[BUFSIZE];
1481                 char* p;
1482                 size_t len;
1483                 size_t currlen;
1484                 size_t writelen;
1485                 uint16_t command;
1486 #ifdef DODBG
1487                 i++;
1488                 printf("%d: ", i);
1489 #endif
1490                 readit(client->net, &request, sizeof(request));
1491                 if (client->transactionlogfd != -1)
1492                         writeit(client->transactionlogfd, &request, sizeof(request));
1493
1494                 request.from = ntohll(request.from);
1495                 request.type = ntohl(request.type);
1496                 command = request.type & NBD_CMD_MASK_COMMAND;
1497                 len = ntohl(request.len);
1498
1499                 DEBUG("%s from %llu (%llu) len %d, ", getcommandname(command),
1500                                 (unsigned long long)request.from,
1501                                 (unsigned long long)request.from / 512, (unsigned int)len);
1502
1503                 if (request.magic != htonl(NBD_REQUEST_MAGIC))
1504                         err("Not enough magic.");
1505
1506                 memcpy(reply.handle, request.handle, sizeof(reply.handle));
1507
1508                 if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) {
1509                         if ((request.from + len) > (OFFT_MAX)) {
1510                                 DEBUG("[Number too large!]");
1511                                 ERROR(client, reply, EINVAL);
1512                                 continue;
1513                         }
1514
1515                         if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
1516                                 DEBUG("[RANGE!]");
1517                                 ERROR(client, reply, EINVAL);
1518                                 continue;
1519                         }
1520
1521                         currlen = len;
1522                         if (currlen > BUFSIZE - sizeof(struct nbd_reply)) {
1523                                 currlen = BUFSIZE - sizeof(struct nbd_reply);
1524                                 msg2(LOG_INFO, "oversized request (this is not a problem)");
1525                         }
1526                 }
1527
1528                 switch (command) {
1529
1530                 case NBD_CMD_DISC:
1531                         msg2(LOG_INFO, "Disconnect request received.");
1532                         if (client->server->flags & F_COPYONWRITE) { 
1533                                 if (client->difmap) g_free(client->difmap) ;
1534                                 close(client->difffile);
1535                                 unlink(client->difffilename);
1536                                 free(client->difffilename);
1537                         }
1538                         go_on=FALSE;
1539                         continue;
1540
1541                 case NBD_CMD_WRITE:
1542                         DEBUG("wr: net->buf, ");
1543                         while(len > 0) {
1544                                 readit(client->net, buf, currlen);
1545                                 DEBUG("buf->exp, ");
1546                                 if ((client->server->flags & F_READONLY) ||
1547                                     (client->server->flags & F_AUTOREADONLY)) {
1548                                         DEBUG("[WRITE to READONLY!]");
1549                                         ERROR(client, reply, EPERM);
1550                                         continue;
1551                                 }
1552                                 if (expwrite(request.from, buf, len, client,
1553                                              request.type & NBD_CMD_FLAG_FUA)) {
1554                                         DEBUG("Write failed: %m" );
1555                                         ERROR(client, reply, errno);
1556                                         continue;
1557                                 }
1558                                 len -= currlen;
1559                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1560                         }
1561                         SEND(client->net, reply);
1562                         DEBUG("OK!\n");
1563                         continue;
1564
1565                 case NBD_CMD_FLUSH:
1566                         DEBUG("fl: ");
1567                         if (expflush(client)) {
1568                                 DEBUG("Flush failed: %m");
1569                                 ERROR(client, reply, errno);
1570                                 continue;
1571                         }
1572                         SEND(client->net, reply);
1573                         DEBUG("OK!\n");
1574                         continue;
1575
1576                 case NBD_CMD_READ:
1577                         DEBUG("exp->buf, ");
1578                         memcpy(buf, &reply, sizeof(struct nbd_reply));
1579                         if (client->transactionlogfd != -1)
1580                                 writeit(client->transactionlogfd, &reply, sizeof(reply));
1581                         p = buf + sizeof(struct nbd_reply);
1582                         writelen = currlen + sizeof(struct nbd_reply);
1583                         while(len > 0) {
1584                                 if (expread(request.from, p, currlen, client)) {
1585                                         DEBUG("Read failed: %m");
1586                                         ERROR(client, reply, errno);
1587                                         continue;
1588                                 }
1589                                 
1590                                 DEBUG("buf->net, ");
1591                                 writeit(client->net, buf, writelen);
1592                                 len -= currlen;
1593                                 request.from += currlen;
1594                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1595                                 p = buf;
1596                                 writelen = currlen;
1597                         }
1598                         DEBUG("OK!\n");
1599                         continue;
1600
1601                 default:
1602                         DEBUG ("Ignoring unknown command\n");
1603                         continue;
1604                 }
1605         }
1606         return 0;
1607 }
1608
1609 /**
1610  * Set up client export array, which is an array of FILE_INFO.
1611  * Also, split a single exportfile into multiple ones, if that was asked.
1612  * @param client information on the client which we want to setup export for
1613  **/
1614 void setupexport(CLIENT* client) {
1615         int i;
1616         off_t laststartoff = 0, lastsize = 0;
1617         int multifile = (client->server->flags & F_MULTIFILE);
1618
1619         client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
1620
1621         /* If multi-file, open as many files as we can.
1622          * If not, open exactly one file.
1623          * Calculate file sizes as we go to get total size. */
1624         for(i=0; ; i++) {
1625                 FILE_INFO fi;
1626                 gchar *tmpname;
1627                 gchar* error_string;
1628                 mode_t mode = (client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR;
1629
1630                 if(multifile) {
1631                         tmpname=g_strdup_printf("%s.%d", client->exportname, i);
1632                 } else {
1633                         tmpname=g_strdup(client->exportname);
1634                 }
1635                 DEBUG( "Opening %s\n", tmpname );
1636                 fi.fhandle = open(tmpname, mode);
1637                 if(fi.fhandle == -1 && mode == O_RDWR) {
1638                         /* Try again because maybe media was read-only */
1639                         fi.fhandle = open(tmpname, O_RDONLY);
1640                         if(fi.fhandle != -1) {
1641                                 /* Opening the base file in copyonwrite mode is
1642                                  * okay */
1643                                 if(!(client->server->flags & F_COPYONWRITE)) {
1644                                         client->server->flags |= F_AUTOREADONLY;
1645                                         client->server->flags |= F_READONLY;
1646                                 }
1647                         }
1648                 }
1649                 if(fi.fhandle == -1) {
1650                         if(multifile && i>0)
1651                                 break;
1652                         error_string=g_strdup_printf(
1653                                 "Could not open exported file %s: %%m",
1654                                 tmpname);
1655                         err(error_string);
1656                 }
1657                 fi.startoff = laststartoff + lastsize;
1658                 g_array_append_val(client->export, fi);
1659                 g_free(tmpname);
1660
1661                 /* Starting offset and size of this file will be used to
1662                  * calculate starting offset of next file */
1663                 laststartoff = fi.startoff;
1664                 lastsize = size_autodetect(fi.fhandle);
1665
1666                 if(!multifile)
1667                         break;
1668         }
1669
1670         /* Set export size to total calculated size */
1671         client->exportsize = laststartoff + lastsize;
1672
1673         /* Export size may be overridden */
1674         if(client->server->expected_size) {
1675                 /* desired size must be <= total calculated size */
1676                 if(client->server->expected_size > client->exportsize) {
1677                         err("Size of exported file is too big\n");
1678                 }
1679
1680                 client->exportsize = client->server->expected_size;
1681         }
1682
1683         msg3(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
1684         if(multifile) {
1685                 msg3(LOG_INFO, "Total number of files: %d", i);
1686         }
1687 }
1688
1689 int copyonwrite_prepare(CLIENT* client) {
1690         off_t i;
1691         if ((client->difffilename = malloc(1024))==NULL)
1692                 err("Failed to allocate string for diff file name");
1693         snprintf(client->difffilename, 1024, "%s-%s-%d.diff",client->exportname,client->clientname,
1694                 (int)getpid()) ;
1695         client->difffilename[1023]='\0';
1696         msg3(LOG_INFO,"About to create map and diff file %s",client->difffilename) ;
1697         client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
1698         if (client->difffile<0) err("Could not create diff file (%m)") ;
1699         if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL)
1700                 err("Could not allocate memory") ;
1701         for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1 ;
1702
1703         return 0;
1704 }
1705
1706 /**
1707  * Run a command. This is used for the ``prerun'' and ``postrun'' config file
1708  * options
1709  *
1710  * @param command the command to be ran. Read from the config file
1711  * @param file the file name we're about to export
1712  **/
1713 int do_run(gchar* command, gchar* file) {
1714         gchar* cmd;
1715         int retval=0;
1716
1717         if(command && *command) {
1718                 cmd = g_strdup_printf(command, file);
1719                 retval=system(cmd);
1720                 g_free(cmd);
1721         }
1722         return retval;
1723 }
1724
1725 /**
1726  * Serve a connection. 
1727  *
1728  * @todo allow for multithreading, perhaps use libevent. Not just yet, though;
1729  * follow the road map.
1730  *
1731  * @param client a connected client
1732  **/
1733 void serveconnection(CLIENT *client) {
1734         if (client->server->transactionlog && (client->transactionlogfd == -1))
1735         {
1736                 if (-1 == (client->transactionlogfd = open(client->server->transactionlog,
1737                                                            O_WRONLY | O_CREAT,
1738                                                            S_IRUSR | S_IWUSR)))
1739                         g_warning("Could not open transaction log %s",
1740                                   client->server->transactionlog);
1741         }
1742
1743         if(do_run(client->server->prerun, client->exportname)) {
1744                 exit(EXIT_FAILURE);
1745         }
1746         setupexport(client);
1747
1748         if (client->server->flags & F_COPYONWRITE) {
1749                 copyonwrite_prepare(client);
1750         }
1751
1752         setmysockopt(client->net);
1753
1754         mainloop(client);
1755         do_run(client->server->postrun, client->exportname);
1756
1757         if (-1 != client->transactionlogfd)
1758         {
1759                 close(client->transactionlogfd);
1760                 client->transactionlogfd = -1;
1761         }
1762 }
1763
1764 /**
1765  * Find the name of the file we have to serve. This will use g_strdup_printf
1766  * to put the IP address of the client inside a filename containing
1767  * "%s" (in the form as specified by the "virtstyle" option). That name
1768  * is then written to client->exportname.
1769  *
1770  * @param net A socket connected to an nbd client
1771  * @param client information about the client. The IP address in human-readable
1772  * format will be written to a new char* buffer, the address of which will be
1773  * stored in client->clientname.
1774  **/
1775 void set_peername(int net, CLIENT *client) {
1776         struct sockaddr_storage addrin;
1777         struct sockaddr_storage netaddr;
1778         struct sockaddr_in  *netaddr4 = NULL;
1779         struct sockaddr_in6 *netaddr6 = NULL;
1780         size_t addrinlen = sizeof( addrin );
1781         struct addrinfo hints;
1782         struct addrinfo *ai = NULL;
1783         char peername[NI_MAXHOST];
1784         char netname[NI_MAXHOST];
1785         char *tmp = NULL;
1786         int i;
1787         int e;
1788         int shift;
1789
1790         if (getpeername(net, (struct sockaddr *) &addrin, (socklen_t *)&addrinlen) < 0)
1791                 err("getsockname failed: %m");
1792
1793         getnameinfo((struct sockaddr *)&addrin, (socklen_t)addrinlen,
1794                 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST);
1795
1796         memset(&hints, '\0', sizeof (hints));
1797         hints.ai_flags = AI_ADDRCONFIG;
1798         e = getaddrinfo(peername, NULL, &hints, &ai);
1799
1800         if(e != 0) {
1801                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
1802                 freeaddrinfo(ai);
1803                 return;
1804         }
1805
1806         switch(client->server->virtstyle) {
1807                 case VIRT_NONE:
1808                         client->exportname=g_strdup(client->server->exportname);
1809                         break;
1810                 case VIRT_IPHASH:
1811                         for(i=0;i<strlen(peername);i++) {
1812                                 if(peername[i]=='.') {
1813                                         peername[i]='/';
1814                                 }
1815                         }
1816                 case VIRT_IPLIT:
1817                         client->exportname=g_strdup_printf(client->server->exportname, peername);
1818                         break;
1819                 case VIRT_CIDR:
1820                         memcpy(&netaddr, &addrin, addrinlen);
1821                         if(ai->ai_family == AF_INET) {
1822                                 netaddr4 = (struct sockaddr_in *)&netaddr;
1823                                 (netaddr4->sin_addr).s_addr>>=32-(client->server->cidrlen);
1824                                 (netaddr4->sin_addr).s_addr<<=32-(client->server->cidrlen);
1825
1826                                 getnameinfo((struct sockaddr *) netaddr4, (socklen_t) addrinlen,
1827                                                         netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1828                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1829                         }else if(ai->ai_family == AF_INET6) {
1830                                 netaddr6 = (struct sockaddr_in6 *)&netaddr;
1831
1832                                 shift = 128-(client->server->cidrlen);
1833                                 i = 3;
1834                                 while(shift >= 32) {
1835                                         ((netaddr6->sin6_addr).s6_addr32[i])=0;
1836                                         shift-=32;
1837                                         i--;
1838                                 }
1839                                 (netaddr6->sin6_addr).s6_addr32[i]>>=shift;
1840                                 (netaddr6->sin6_addr).s6_addr32[i]<<=shift;
1841
1842                                 getnameinfo((struct sockaddr *)netaddr6, (socklen_t)addrinlen,
1843                                             netname, sizeof(netname), NULL, 0, NI_NUMERICHOST);
1844                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1845                         }
1846
1847                         if(tmp != NULL)
1848                           client->exportname=g_strdup_printf(client->server->exportname, tmp);
1849
1850                         break;
1851         }
1852
1853         freeaddrinfo(ai);
1854         msg4(LOG_INFO, "connect from %s, assigned file is %s", 
1855              peername, client->exportname);
1856         client->clientname=g_strdup(peername);
1857 }
1858
1859 /**
1860  * Destroy a pid_t*
1861  * @param data a pointer to pid_t which should be freed
1862  **/
1863 void destroy_pid_t(gpointer data) {
1864         g_free(data);
1865 }
1866
1867 /**
1868  * Loop through the available servers, and serve them. Never returns.
1869  **/
1870 int serveloop(GArray* servers) {
1871         struct sockaddr_storage addrin;
1872         socklen_t addrinlen=sizeof(addrin);
1873         int i;
1874         int max;
1875         int sock;
1876         fd_set mset;
1877         fd_set rset;
1878
1879         /* 
1880          * Set up the master fd_set. The set of descriptors we need
1881          * to select() for never changes anyway and it buys us a *lot*
1882          * of time to only build this once. However, if we ever choose
1883          * to not fork() for clients anymore, we may have to revisit
1884          * this.
1885          */
1886         max=0;
1887         FD_ZERO(&mset);
1888         for(i=0;i<servers->len;i++) {
1889                 if((sock=(g_array_index(servers, SERVER, i)).socket)) {
1890                         FD_SET(sock, &mset);
1891                         max=sock>max?sock:max;
1892                 }
1893         }
1894         if(modernsock) {
1895                 FD_SET(modernsock, &mset);
1896                 max=modernsock>max?modernsock:max;
1897         }
1898         for(;;) {
1899                 CLIENT *client = NULL;
1900                 pid_t *pid;
1901
1902                 memcpy(&rset, &mset, sizeof(fd_set));
1903                 if(select(max+1, &rset, NULL, NULL, NULL)>0) {
1904                         int net = 0;
1905                         SERVER* serve=NULL;
1906
1907                         DEBUG("accept, ");
1908                         if(FD_ISSET(modernsock, &rset)) {
1909                                 if((net=accept(modernsock, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1910                                         err("accept: %m");
1911                                 client = negotiate(net, NULL, servers);
1912                                 if(!client) {
1913                                         err_nonfatal("negotiation failed");
1914                                         close(net);
1915                                         net=0;
1916                                         continue;
1917                                 }
1918                                 serve = client->server;
1919                         }
1920                         for(i=0;i<servers->len && !net;i++) {
1921                                 serve=&(g_array_index(servers, SERVER, i));
1922                                 if(FD_ISSET(serve->socket, &rset)) {
1923                                         if ((net=accept(serve->socket, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1924                                                 err("accept: %m");
1925                                 }
1926                         }
1927                         if(net) {
1928                                 int sock_flags;
1929
1930                                 if(serve->max_connections > 0 &&
1931                                    g_hash_table_size(children) >= serve->max_connections) {
1932                                         msg2(LOG_INFO, "Max connections reached");
1933                                         close(net);
1934                                         continue;
1935                                 }
1936                                 if((sock_flags = fcntl(net, F_GETFL, 0))==-1) {
1937                                         err("fcntl F_GETFL");
1938                                 }
1939                                 if(fcntl(net, F_SETFL, sock_flags &~O_NONBLOCK)==-1) {
1940                                         err("fcntl F_SETFL ~O_NONBLOCK");
1941                                 }
1942                                 if(!client) {
1943                                         client = g_new0(CLIENT, 1);
1944                                         client->server=serve;
1945                                         client->exportsize=OFFT_MAX;
1946                                         client->net=net;
1947                                         client->transactionlogfd = -1;
1948                                 }
1949                                 set_peername(net, client);
1950                                 if (!authorized_client(client)) {
1951                                         msg2(LOG_INFO,"Unauthorized client") ;
1952                                         close(net);
1953                                         continue;
1954                                 }
1955                                 msg2(LOG_INFO,"Authorized client") ;
1956                                 pid=g_malloc(sizeof(pid_t));
1957
1958                                 if (!dontfork) {
1959                                         if ((*pid=fork())<0) {
1960                                                 msg3(LOG_INFO,"Could not fork (%s)",strerror(errno)) ;
1961                                                 close(net);
1962                                                 continue;
1963                                         }
1964                                         if (*pid>0) { /* parent */
1965                                                 close(net);
1966                                                 g_hash_table_insert(children, pid, pid);
1967                                                 continue;
1968                                         }
1969                                         /* child */
1970                                         g_hash_table_destroy(children);
1971                                         for(i=0;i<servers->len;i++) {
1972                                                 serve=&g_array_index(servers, SERVER, i);
1973                                                 close(serve->socket);
1974                                         }
1975                                         /* FALSE does not free the
1976                                            actual data. This is required,
1977                                            because the client has a
1978                                            direct reference into that
1979                                            data, and otherwise we get a
1980                                            segfault... */
1981                                         g_array_free(servers, FALSE);
1982                                 }
1983
1984                                 msg2(LOG_INFO,"Starting to serve");
1985                                 serveconnection(client);
1986                                 exit(EXIT_SUCCESS);
1987                         }
1988                 }
1989         }
1990 }
1991
/**
 * Apply our standard options to a listening socket: allow address reuse,
 * enable keepalive, and switch the socket to non-blocking mode. Any
 * failure is reported through err().
 *
 * @param socket the socket to configure
 **/
void dosockopts(int socket) {
#ifndef sun
	int yes=1;
#else
	char yes='1';
#endif /* sun */
	int current_flags;

	/* lose the pesky "Address already in use" error message */
	if (-1 == setsockopt(socket, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(int))) {
		err("setsockopt SO_REUSEADDR");
	}
	if (-1 == setsockopt(socket, SOL_SOCKET, SO_KEEPALIVE, &yes, sizeof(int))) {
		err("setsockopt SO_KEEPALIVE");
	}

	/* make the listening socket non-blocking */
	current_flags = fcntl(socket, F_GETFL, 0);
	if (current_flags == -1) {
		err("fcntl F_GETFL");
	}
	if (-1 == fcntl(socket, F_SETFL, current_flags | O_NONBLOCK)) {
		err("fcntl F_SETFL O_NONBLOCK");
	}
}
2016
2017 /**
2018  * Connect a server's socket.
2019  *
2020  * @param serve the server we want to connect.
2021  **/
2022 int setup_serve(SERVER *serve) {
2023         struct addrinfo hints;
2024         struct addrinfo *ai = NULL;
2025         gchar *port = NULL;
2026         int e;
2027
2028         if(!do_oldstyle) {
2029                 return serve->servename ? 1 : 0;
2030         }
2031         memset(&hints,'\0',sizeof(hints));
2032         hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG | AI_NUMERICSERV;
2033         hints.ai_socktype = SOCK_STREAM;
2034         hints.ai_family = serve->socket_family;
2035
2036         port = g_strdup_printf ("%d", serve->port);
2037         if (port == NULL)
2038                 return 0;
2039
2040         e = getaddrinfo(serve->listenaddr,port,&hints,&ai);
2041
2042         g_free(port);
2043
2044         if(e != 0) {
2045                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
2046                 serve->socket = -1;
2047                 freeaddrinfo(ai);
2048                 exit(EXIT_FAILURE);
2049         }
2050
2051         if(serve->socket_family == AF_UNSPEC)
2052                 serve->socket_family = ai->ai_family;
2053
2054 #ifdef WITH_SDP
2055         if ((serve->flags) && F_SDP) {
2056                 if (ai->ai_family == AF_INET)
2057                         ai->ai_family = AF_INET_SDP;
2058                 else (ai->ai_family == AF_INET6)
2059                         ai->ai_family = AF_INET6_SDP;
2060         }
2061 #endif
2062         if ((serve->socket = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) < 0)
2063                 err("socket: %m");
2064
2065         dosockopts(serve->socket);
2066
2067         DEBUG("Waiting for connections... bind, ");
2068         e = bind(serve->socket, ai->ai_addr, ai->ai_addrlen);
2069         if (e != 0 && errno != EADDRINUSE)
2070                 err("bind: %m");
2071         DEBUG("listen, ");
2072         if (listen(serve->socket, 1) < 0)
2073                 err("listen: %m");
2074
2075         freeaddrinfo (ai);
2076         if(serve->servename) {
2077                 return 1;
2078         } else {
2079                 return 0;
2080         }
2081 }
2082
/**
 * Open the socket used for new-style ("modern") negotiation.
 *
 * Resolves modern_listen on NBD_DEFAULT_PORT, then creates, configures,
 * binds and listens on a TCP socket which is stored in modernsock. The
 * process exits if the address cannot be resolved; other failures are
 * reported through err().
 **/
void open_modern(void) {
	struct addrinfo hints;
	struct addrinfo* ai = NULL;
	int e;

	memset(&hints, '\0', sizeof(hints));
	hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_family = AF_UNSPEC;
	hints.ai_protocol = IPPROTO_TCP;
	e = getaddrinfo(modern_listen, NBD_DEFAULT_PORT, &hints, &ai);
	if(e != 0) {
		fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
		exit(EXIT_FAILURE);
	}
	if((modernsock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol))<0) {
		err("socket: %m");
	}

	dosockopts(modernsock);

	if(bind(modernsock, ai->ai_addr, ai->ai_addrlen)) {
		err("bind: %m");
	}
	if(listen(modernsock, 10) <0) {
		err("listen: %m");
	}

	freeaddrinfo(ai);
}
2114
2115 /**
2116  * Connect our servers.
2117  **/
2118 void setup_servers(GArray* servers) {
2119         int i;
2120         struct sigaction sa;
2121         int want_modern=0;
2122
2123         for(i=0;i<servers->len;i++) {
2124                 want_modern |= setup_serve(&(g_array_index(servers, SERVER, i)));
2125         }
2126         if(want_modern) {
2127                 open_modern();
2128         }
2129         children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
2130
2131         sa.sa_handler = sigchld_handler;
2132         sigemptyset(&sa.sa_mask);
2133         sa.sa_flags = SA_RESTART;
2134         if(sigaction(SIGCHLD, &sa, NULL) == -1)
2135                 err("sigaction: %m");
2136         sa.sa_handler = sigterm_handler;
2137         sigemptyset(&sa.sa_mask);
2138         sa.sa_flags = SA_RESTART;
2139         if(sigaction(SIGTERM, &sa, NULL) == -1)
2140                 err("sigaction: %m");
2141 }
2142
2143 /**
2144  * Go daemon (unless we specified at compile time that we didn't want this)
2145  * @param serve the first server of our configuration. If its port is zero,
2146  *      then do not daemonize, because we're doing inetd then. This parameter
2147  *      is only used to create a PID file of the form
2148  *      /var/run/nbd-server.&lt;port&gt;.pid; it's not modified in any way.
2149  **/
2150 #if !defined(NODAEMON)
2151 void daemonize(SERVER* serve) {
2152         FILE*pidf;
2153
2154         if(serve && !(serve->port)) {
2155                 return;
2156         }
2157         if(daemon(0,0)<0) {
2158                 err("daemon");
2159         }
2160         if(!*pidftemplate) {
2161                 if(serve) {
2162                         strncpy(pidftemplate, "/var/run/nbd-server.%d.pid", 255);
2163                 } else {
2164                         strncpy(pidftemplate, "/var/run/nbd-server.pid", 255);
2165                 }
2166         }
2167         snprintf(pidfname, 255, pidftemplate, serve ? serve->port : 0);
2168         pidf=fopen(pidfname, "w");
2169         if(pidf) {
2170                 fprintf(pidf,"%d\n", (int)getpid());
2171                 fclose(pidf);
2172         } else {
2173                 perror("fopen");
2174                 fprintf(stderr, "Not fatal; continuing");
2175         }
2176 }
2177 #else
2178 #define daemonize(serve)
2179 #endif /* !defined(NODAEMON) */
2180
2181 /*
2182  * Everything beyond this point (in the file) is run in non-daemon mode.
2183  * The stuff above daemonize() isn't.
2184  */
2185
2186 void serve_err(SERVER* serve, const char* msg) G_GNUC_NORETURN;
2187
2188 void serve_err(SERVER* serve, const char* msg) {
2189         g_message("Export of %s on port %d failed:", serve->exportname,
2190                         serve->port);
2191         err(msg);
2192 }
2193
2194 /**
2195  * Set up user-ID and/or group-ID
2196  **/
2197 void dousers(void) {
2198         struct passwd *pw;
2199         struct group *gr;
2200         gchar* str;
2201         if(rungroup) {
2202                 gr=getgrnam(rungroup);
2203                 if(!gr) {
2204                         str = g_strdup_printf("Invalid group name: %s", rungroup);
2205                         err(str);
2206                 }
2207                 if(setgid(gr->gr_gid)<0) {
2208                         err("Could not set GID: %m"); 
2209                 }
2210         }
2211         if(runuser) {
2212                 pw=getpwnam(runuser);
2213                 if(!pw) {
2214                         str = g_strdup_printf("Invalid user name: %s", runuser);
2215                         err(str);
2216                 }
2217                 if(setuid(pw->pw_uid)<0) {
2218                         err("Could not set UID: %m");
2219                 }
2220         }
2221 }
2222
2223 #ifndef ISSERVER
2224 void glib_message_syslog_redirect(const gchar *log_domain,
2225                                   GLogLevelFlags log_level,
2226                                   const gchar *message,
2227                                   gpointer user_data)
2228 {
2229     int level=LOG_DEBUG;
2230     
2231     switch( log_level )
2232     {
2233       case G_LOG_FLAG_FATAL:
2234       case G_LOG_LEVEL_CRITICAL:
2235       case G_LOG_LEVEL_ERROR:    
2236         level=LOG_ERR; 
2237         break;
2238       case G_LOG_LEVEL_WARNING:
2239         level=LOG_WARNING;
2240         break;
2241       case G_LOG_LEVEL_MESSAGE:
2242       case G_LOG_LEVEL_INFO:
2243         level=LOG_INFO;
2244         break;
2245       case G_LOG_LEVEL_DEBUG:
2246         level=LOG_DEBUG;
2247       default:
2248         level=LOG_ERR;
2249     }
2250     syslog(level, "%s", message);
2251 }
2252 #endif
2253
2254 /**
2255  * Main entry point...
2256  **/
2257 int main(int argc, char *argv[]) {
2258         SERVER *serve;
2259         GArray *servers;
2260         GError *err=NULL;
2261
2262         if (sizeof( struct nbd_request )!=28) {
2263                 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
2264                 exit(EXIT_FAILURE) ;
2265         }
2266
2267         memset(pidftemplate, '\0', 256);
2268
2269         logging();
2270         config_file_pos = g_strdup(CFILE);
2271         serve=cmdline(argc, argv);
2272         servers = parse_cfile(config_file_pos, &err);
2273         
2274         if(serve) {
2275                 serve->socket_family = AF_UNSPEC;
2276
2277                 append_serve(serve, servers);
2278      
2279                 if (!(serve->port)) {
2280                         CLIENT *client;
2281 #ifndef ISSERVER
2282                         /* You really should define ISSERVER if you're going to use
2283                          * inetd mode, but if you don't, closing stdout and stderr
2284                          * (which inetd had connected to the client socket) will let it
2285                          * work. */
2286                         close(1);
2287                         close(2);
2288                         open("/dev/null", O_WRONLY);
2289                         open("/dev/null", O_WRONLY);
2290                         g_log_set_default_handler( glib_message_syslog_redirect, NULL );
2291 #endif
2292                         client=g_malloc(sizeof(CLIENT));
2293                         client->server=serve;
2294                         client->net=0;
2295                         client->exportsize=OFFT_MAX;
2296                         set_peername(0,client);
2297                         serveconnection(client);
2298                         return 0;
2299                 }
2300         }
2301     
2302         if(!servers || !servers->len) {
2303                 if(err && !(err->domain == g_quark_from_string("parse_cfile")
2304                                 && err->code == CFILE_NOTFOUND)) {
2305                         g_warning("Could not parse config file: %s", 
2306                                         err ? err->message : "Unknown error");
2307                 }
2308         }
2309         if(serve) {
2310                 g_warning("Specifying an export on the command line is deprecated.");
2311                 g_warning("Please use a configuration file instead.");
2312         }
2313
2314         if((!serve) && (!servers||!servers->len)) {
2315                 g_message("No configured exports; quitting.");
2316                 exit(EXIT_FAILURE);
2317         }
2318         if (!dontfork)
2319                 daemonize(serve);
2320         setup_servers(servers);
2321         dousers();
2322         serveloop(servers);
2323         return 0 ;
2324 }