Fix handling of oversize writes
[nbd.git] / nbd-server.c
1 /*
2  * Network Block Device - server
3  *
4  * Copyright 1996-1998 Pavel Machek, distribute under GPL
5  *  <pavel@atrey.karlin.mff.cuni.cz>
6  * Copyright 2001-2004 Wouter Verhelst <wouter@debian.org>
7  * Copyright 2002 Anton Altaparmakov <aia21@cam.ac.uk>
8  *
9  * Version 1.0 - hopefully 64-bit-clean
10  * Version 1.1 - merging enhancements from Josh Parsons, <josh@coombs.anu.edu.au>
11  * Version 1.2 - autodetect size of block devices, thanx to Peter T. Breuer" <ptb@it.uc3m.es>
12  * Version 1.5 - can compile on Unix systems that don't have 64 bit integer
13  *      type, or don't have 64 bit file offsets by defining FS_32BIT
14  *      in compile options for nbd-server *only*. This can be done
15  *      with make FSCHOICE=-DFS_32BIT nbd-server. (I don't have the
16  *      original autoconf input file, or I would make it a configure
17  *      option.) Ken Yap <ken@nlc.net.au>.
18  * Version 1.6 - fix autodetection of block device size and really make 64 bit
19  *      clean on 32 bit machines. Anton Altaparmakov <aia21@cam.ac.uk>
20  * Version 2.0 - Version synchronised with client
21  * Version 2.1 - Reap zombie client processes when they exit. Removed
22  *      (uncommented) the _IO magic, it's no longer necessary. Wouter
23  *      Verhelst <wouter@debian.org>
24  * Version 2.2 - Auto switch to read-only mode (useful for floppies).
25  * Version 2.3 - Fixed code so that Large File Support works. This
26  *      removes the FS_32BIT compile-time directive; define
27  *      _FILE_OFFSET_BITS=64 and _LARGEFILE_SOURCE if you used to be
28  *      using FS_32BIT. This will allow you to use files >2GB instead of
29  *      having to use the -m option. Wouter Verhelst <wouter@debian.org>
30  * Version 2.4 - Added code to keep track of children, so that we can
31  *      properly kill them from initscripts. Add a call to daemon(),
32  *      so that processes don't think they have to wait for us, which is
33  *      interesting for initscripts as well. Wouter Verhelst
34  *      <wouter@debian.org>
35  * Version 2.5 - Bugfix release: forgot to reset child_arraysize to
36  *      zero after fork()ing, resulting in nbd-server going berserk
37  *      when it receives a signal with at least one child open. Wouter
38  *      Verhelst <wouter@debian.org>
39  * 10/10/2003 - Added socket option SO_KEEPALIVE (sf.net bug 819235);
40  *      rectified type of mainloop::size_host (sf.net bugs 814435 and
41  *      817385); close the PID file after writing to it, so that the
42  *      daemon can actually be found. Wouter Verhelst
43  *      <wouter@debian.org>
44  * 10/10/2003 - Size of the data "size_host" was wrong and so was not
45  *      correctly put in network endianness. Many types were corrected
46  *      (size_t and off_t instead of int).  <vspaceg@sourceforge.net>
47  * Version 2.6 - Some code cleanup.
48  * Version 2.7 - Better build system.
49  * 11/02/2004 - Doxygenified the source, modularized it a bit. Needs a 
50  *      lot more work, but this is a start. Wouter Verhelst
51  *      <wouter@debian.org>
52  * 16/03/2010 - Add IPv6 support.
53  *      Kitt Tientanopajai <kitt@kitty.in.th>
54  *      Neutron Soutmun <neo.neutron@gmail.com>
55  *      Suriya Soutmun <darksolar@gmail.com>
56  */
57
58 /* Includes LFS defines, which defines behaviours of some of the following
59  * headers, so must come before those */
60 #include "lfs.h"
61
62 #include <sys/types.h>
63 #include <sys/socket.h>
64 #include <sys/stat.h>
65 #include <sys/select.h>         /* select */
66 #include <sys/wait.h>           /* wait */
67 #ifdef HAVE_SYS_IOCTL_H
68 #include <sys/ioctl.h>
69 #endif
70 #include <sys/param.h>
71 #ifdef HAVE_SYS_MOUNT_H
72 #include <sys/mount.h>          /* For BLKGETSIZE */
73 #endif
74 #include <signal.h>             /* sigaction */
75 #include <errno.h>
76 #include <netinet/tcp.h>
77 #include <netinet/in.h>
78 #include <netdb.h>
79 #include <syslog.h>
80 #include <unistd.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 #include <fcntl.h>
85 #include <arpa/inet.h>
86 #include <strings.h>
87 #include <dirent.h>
88 #include <unistd.h>
89 #include <getopt.h>
90 #include <pwd.h>
91 #include <grp.h>
92
93 #include <glib.h>
94
95 /* used in cliserv.h, so must come first */
96 #define MY_NAME "nbd_server"
97 #include "cliserv.h"
98
99 #ifdef WITH_SDP
100 #include <sdp_inet.h>
101 #endif
102
103 /** Default position of the config file */
104 #ifndef SYSCONFDIR
105 #define SYSCONFDIR "/etc"
106 #endif
107 #define CFILE SYSCONFDIR "/nbd-server/config"
108
109 /** Where our config file actually is */
110 gchar* config_file_pos;
111
112 /** What user we're running as */
113 gchar* runuser=NULL;
114 /** What group we're running as */
115 gchar* rungroup=NULL;
116 /** whether to export using the old negotiation protocol (port-based) */
117 gboolean do_oldstyle=FALSE;
118
119 /* Whether we should avoid forking */
120 int dontfork = 0;
121
/**
 * Logging macros. The first argument is a syslog priority. It is only
 * honoured when ISSERVER is defined; otherwise the message is handed to
 * g_message() and the priority is dropped.
 **/
#if defined(ISSERVER)
#define msg2(prio, fmt) syslog(prio, fmt)
#define msg3(prio, fmt, arg1) syslog(prio, fmt, arg1)
#define msg4(prio, fmt, arg1, arg2) syslog(prio, fmt, arg1, arg2)
#else
#define msg2(prio, fmt) g_message(fmt)
#define msg3(prio, fmt, arg1) g_message(fmt, arg1)
#define msg4(prio, fmt, arg1, arg2) g_message(fmt, arg1, arg2)
#endif

/*
 * Debugging macro: DEBUG() forwards to printf() when DODBG is defined and
 * expands to nothing otherwise. Enable it by uncommenting the line below.
 */
/* #define DODBG */
#if defined(DODBG)
#define DEBUG(...) printf(__VA_ARGS__)
#else
#define DEBUG(...)
#endif

/* Fall back to an empty version string when the build does not provide one */
#if !defined(PACKAGE_VERSION)
#define PACKAGE_VERSION ""
#endif
/**
 * The highest value a variable of type off_t can reach. off_t is a signed
 * integer, so this is every bit set except the leftmost one.
 *
 * The value is built as (2^(N-2) - 1) * 2 + 1 so that no intermediate result
 * shifts into or overflows the sign bit (which would be undefined behaviour),
 * and the whole expansion is parenthesized so it is safe inside any
 * expression.
 **/
#define OFFT_MAX ((off_t)((((off_t)1 << (sizeof(off_t)*8-2)) - 1) * 2 + 1))
#define LINELEN 256       /**< Size of static buffer used to read the
                               authorization file (yuck) */
#define BUFSIZE ((1024*1024)+sizeof(struct nbd_reply)) /**< Size of buffer that can hold requests */
#define DIFFPAGESIZE 4096 /**< diff file uses those chunks */
#define F_READONLY 1      /**< flag to tell us a file is readonly */
#define F_MULTIFILE 2     /**< flag to tell us a file is exported using -m */
#define F_COPYONWRITE 4   /**< flag to tell us a file is exported using
                               copyonwrite */
#define F_AUTOREADONLY 8  /**< flag to tell us a file is set to autoreadonly */
#define F_SPARSE 16       /**< flag to tell us copyonwrite should use a sparse file */
#define F_SDP 32          /**< flag to tell us the export should be done using the Socket Direct Protocol for RDMA */
#define F_SYNC 64         /**< Whether to fsync() after a write */
#define F_FLUSH 128       /**< Whether server wants FLUSH to be sent by the client */
#define F_FUA 256         /**< Whether server wants FUA to be sent by the client */
#define F_ROTATIONAL 512  /**< Whether server wants the client to implement the elevator algorithm */
163 GHashTable *children;
164 char pidfname[256]; /**< name of our PID file */
165 char pidftemplate[256]; /**< template to be used for the filename of the PID file */
166 char default_authname[] = SYSCONFDIR "/nbd-server/allow"; /**< default name of allow file */
167
168 int modernsock=0;         /**< Socket for the modern handler. Not used
169                                if a client was only specified on the
170                                command line; only port used if
171                                oldstyle is set to false (and then the
172                                command-line client isn't used, gna gna) */
173 char* modern_listen;      /**< listenaddr value for modernsock */
174
/**
 * Types of virtualization
 **/
typedef enum {
	/** No virtualization */
	VIRT_NONE = 0,
	/** Literal IP address as part of the filename */
	VIRT_IPLIT,
	/** Replacing all dots in an ip address by a / before doing the same
	 *  as in IPLIT */
	VIRT_IPHASH,
	/** Every subnet in its own directory */
	VIRT_CIDR,
} VIRT_STYLE;
185
186 /**
187  * Variables associated with a server.
188  **/
189 typedef struct {
190         gchar* exportname;    /**< (unprocessed) filename of the file we're exporting */
191         off_t expected_size; /**< size of the exported file as it was told to
192                                us through configuration */
193         gchar* listenaddr;   /**< The IP address we're listening on */
194         unsigned int port;   /**< port we're exporting this file at */
195         char* authname;      /**< filename of the authorization file */
196         int flags;           /**< flags associated with this exported file */
197         int socket;          /**< The socket of this server. */
198         int socket_family;   /**< family of the socket */
199         VIRT_STYLE virtstyle;/**< The style of virtualization, if any */
200         uint8_t cidrlen;     /**< The length of the mask when we use
201                                   CIDR-style virtualization */
202         gchar* prerun;       /**< command to be ran after connecting a client,
203                                   but before starting to serve */
204         gchar* postrun;      /**< command that will be ran after the client
205                                   disconnects */
206         gchar* servename;    /**< name of the export as selected by nbd-client */
207         int max_connections; /**< maximum number of opened connections */
208         gchar* transactionlog;/**< filename for transaction log */
209 } SERVER;
210
/**
 * One exported file: its descriptor and the offset at which it starts.
 **/
typedef struct {
	/** file descriptor */
	int fhandle;
	/** starting offset of this file */
	off_t startoff;
} FILE_INFO;
218
219 typedef struct {
220         off_t exportsize;    /**< size of the file we're exporting */
221         char *clientname;    /**< peer */
222         char *exportname;    /**< (processed) filename of the file we're exporting */
223         GArray *export;    /**< array of FILE_INFO of exported files;
224                                array size is always 1 unless we're
225                                doing the multiple file option */
226         int net;             /**< The actual client socket */
227         SERVER *server;      /**< The server this client is getting data from */
228         char* difffilename;  /**< filename of the copy-on-write file, if any */
229         int difffile;        /**< filedescriptor of copyonwrite file. @todo
230                                shouldn't this be an array too? (cfr export) Or
231                                make -m and -c mutually exclusive */
232         u32 difffilelen;     /**< number of pages in difffile */
233         u32 *difmap;         /**< see comment on the global difmap for this one */
234         gboolean modern;     /**< client was negotiated using modern negotiation protocol */
235         int transactionlogfd;/**< fd for transaction log */
236 } CLIENT;
237
/**
 * Type of configuration file values
 **/
typedef enum {
	/** This parameter is an integer */
	PARAM_INT,
	/** This parameter is a string */
	PARAM_STRING,
	/** This parameter is a boolean */
	PARAM_BOOL,
} PARAM_TYPE;
246
247 /**
248  * Configuration file values
249  **/
250 typedef struct {
251         gchar *paramname;       /**< Name of the parameter, as it appears in
252                                   the config file */
253         gboolean required;      /**< Whether this is a required (as opposed to
254                                   optional) parameter */
255         PARAM_TYPE ptype;       /**< Type of the parameter. */
256         gpointer target;        /**< Pointer to where the data of this
257                                   parameter should be written. If ptype is
258                                   PARAM_BOOL, the data is or'ed rather than
259                                   overwritten. */
260         gint flagval;           /**< Flag mask for this parameter in case ptype
261                                   is PARAM_BOOL. */
262 } PARAM;
263
264 /**
265  * Translate a command name into human readable form
266  *
267  * @param command The command number (after applying NBD_CMD_MASK_COMMAND)
268  * @return pointer to the command name
269  **/
270 static inline const char * getcommandname(uint64_t command) {
271         switch (command) {
272         case NBD_CMD_READ:
273                 return "NBD_CMD_READ";
274         case NBD_CMD_WRITE:
275                 return "NBD_CMD_WRITE";
276         case NBD_CMD_DISC:
277                 return "NBD_CMD_DISC";
278         case NBD_CMD_FLUSH:
279                 return "NBD_CMD_FLUSH";
280         default:
281                 break;
282         }
283         return "UNKNOWN";
284 }
285
286 /**
287  * Check whether a client is allowed to connect. Works with an authorization
288  * file which contains one line per machine, no wildcards.
289  *
290  * @param opts The client who's trying to connect.
291  * @return 0 - authorization refused, 1 - OK
292  **/
293 int authorized_client(CLIENT *opts) {
294         const char *ERRMSG="Invalid entry '%s' in authfile '%s', so, refusing all connections.";
295         FILE *f ;
296         char line[LINELEN]; 
297         char *tmp;
298         struct in_addr addr;
299         struct in_addr client;
300         struct in_addr cltemp;
301         int len;
302
303         if ((f=fopen(opts->server->authname,"r"))==NULL) {
304                 msg4(LOG_INFO,"Can't open authorization file %s (%s).",
305                      opts->server->authname,strerror(errno)) ;
306                 return 1 ; 
307         }
308   
309         inet_aton(opts->clientname, &client);
310         while (fgets(line,LINELEN,f)!=NULL) {
311                 if((tmp=index(line, '/'))) {
312                         if(strlen(line)<=tmp-line) {
313                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
314                                 return 0;
315                         }
316                         *(tmp++)=0;
317                         if(!inet_aton(line,&addr)) {
318                                 msg4(LOG_CRIT, ERRMSG, line, opts->server->authname);
319                                 return 0;
320                         }
321                         len=strtol(tmp, NULL, 0);
322                         addr.s_addr>>=32-len;
323                         addr.s_addr<<=32-len;
324                         memcpy(&cltemp,&client,sizeof(client));
325                         cltemp.s_addr>>=32-len;
326                         cltemp.s_addr<<=32-len;
327                         if(addr.s_addr == cltemp.s_addr) {
328                                 return 1;
329                         }
330                 }
331                 if (strncmp(line,opts->clientname,strlen(opts->clientname))==0) {
332                         fclose(f);
333                         return 1;
334                 }
335         }
336         fclose(f);
337         return 0;
338 }
339
/**
 * Read exactly len bytes from a file descriptor into a buffer.
 *
 * Short reads are continued until the buffer is full. A read that fails
 * with EAGAIN or EINTR is retried; any other error, or end of file before
 * len bytes have arrived, is reported through err().
 *
 * @param f a file descriptor
 * @param buf a buffer of at least len bytes
 * @param len the number of bytes to be read
 **/
static inline void readit(int f, void *buf, size_t len) {
	char *dst = buf;
	ssize_t res;

	while (len > 0) {
		DEBUG("*");
		res = read(f, dst, len);
		if (res > 0) {
			len -= res;
			dst += res;
		} else if (res == 0) {
			/* End of file: read() does not set errno here, so it must
			 * not be consulted (a stale EAGAIN would retry forever). */
			err("Read failed: End of file");
		} else if (errno != EAGAIN && errno != EINTR) {
			err("Read failed: %m");
		}
	}
}
361
/**
 * Read and throw away data from an FD that we don't want, in pieces no
 * larger than the scratch buffer.
 *
 * @param f a file descriptor
 * @param buf a scratch buffer; its contents are overwritten
 * @param len the number of bytes to consume
 * @param bufsiz the size of the buffer
 **/
static inline void consume(int f, void * buf, size_t len, size_t bufsiz) {
	size_t remaining;

	for (remaining = len; remaining > 0; ) {
		size_t chunk = remaining;

		if (chunk > bufsiz) {
			chunk = bufsiz;
		}
		readit(f, buf, chunk);
		remaining -= chunk;
	}
}
378
379
/**
 * Write a whole buffer to a file descriptor, continuing after short writes.
 * err() is called when write() fails or writes nothing.
 *
 * @param f a file descriptor
 * @param buf a buffer containing data
 * @param len the number of bytes to be written
 **/
static inline void writeit(int f, void *buf, size_t len) {
	char *cursor = buf;
	size_t remaining = len;

	while (remaining != 0) {
		ssize_t written;

		DEBUG("+");
		written = write(f, cursor, remaining);
		if (written <= 0) {
			err("Send failed: %m");
		}
		remaining -= written;
		cursor += written;
	}
}
397
398 /**
399  * Print out a message about how to use nbd-server. Split out to a separate
400  * function so that we can call it from multiple places
401  */
402 void usage() {
403         printf("This is nbd-server version " VERSION "\n");
404         printf("Usage: [ip:|ip6@]port file_to_export [size][kKmM] [-l authorize_file] [-r] [-m] [-c] [-C configuration file] [-p PID file name] [-o section name] [-M max connections]\n"
405                "\t-r|--read-only\t\tread only\n"
406                "\t-m|--multi-file\t\tmultiple file\n"
407                "\t-c|--copy-on-write\tcopy on write\n"
408                "\t-C|--config-file\tspecify an alternate configuration file\n"
409                "\t-l|--authorize-file\tfile with list of hosts that are allowed to\n\t\t\t\tconnect.\n"
410                "\t-p|--pid-file\t\tspecify a filename to write our PID to\n"
411                "\t-o|--output-config\toutput a config file section for what you\n\t\t\t\tspecified on the command line, with the\n\t\t\t\tspecified section name\n"
412                "\t-M|--max-connections\tspecify the maximum number of opened connections\n\n"
413                "\tif port is set to 0, stdin is used (for running from inetd)\n"
414                "\tif file_to_export contains '%%s', it is substituted with the IP\n"
415                "\t\taddress of the machine trying to connect\n" 
416                "\tif ip is set, it contains the local IP address on which we're listening.\n\tif not, the server will listen on all local IP addresses\n");
417         printf("Using configuration file %s\n", CFILE);
418 }
419
420 /* Dumps a config file section of the given SERVER*, and exits. */
421 void dump_section(SERVER* serve, gchar* section_header) {
422         printf("[%s]\n", section_header);
423         printf("\texportname = %s\n", serve->exportname);
424         printf("\tlistenaddr = %s\n", serve->listenaddr);
425         printf("\tport = %d\n", serve->port);
426         if(serve->flags & F_READONLY) {
427                 printf("\treadonly = true\n");
428         }
429         if(serve->flags & F_MULTIFILE) {
430                 printf("\tmultifile = true\n");
431         }
432         if(serve->flags & F_COPYONWRITE) {
433                 printf("\tcopyonwrite = true\n");
434         }
435         if(serve->expected_size) {
436                 printf("\tfilesize = %lld\n", (long long int)serve->expected_size);
437         }
438         if(serve->authname) {
439                 printf("\tauthfile = %s\n", serve->authname);
440         }
441         exit(EXIT_SUCCESS);
442 }
443
444 /**
445  * Parse the command line.
446  *
447  * @param argc the argc argument to main()
448  * @param argv the argv argument to main()
449  **/
450 SERVER* cmdline(int argc, char *argv[]) {
451         int i=0;
452         int nonspecial=0;
453         int c;
454         struct option long_options[] = {
455                 {"read-only", no_argument, NULL, 'r'},
456                 {"multi-file", no_argument, NULL, 'm'},
457                 {"copy-on-write", no_argument, NULL, 'c'},
458                 {"dont-fork", no_argument, NULL, 'd'},
459                 {"authorize-file", required_argument, NULL, 'l'},
460                 {"config-file", required_argument, NULL, 'C'},
461                 {"pid-file", required_argument, NULL, 'p'},
462                 {"output-config", required_argument, NULL, 'o'},
463                 {"max-connection", required_argument, NULL, 'M'},
464                 {0,0,0,0}
465         };
466         SERVER *serve;
467         off_t es;
468         size_t last;
469         char suffix;
470         gboolean do_output=FALSE;
471         gchar* section_header="";
472         gchar** addr_port;
473
474         if(argc==1) {
475                 return NULL;
476         }
477         serve=g_new0(SERVER, 1);
478         serve->authname = g_strdup(default_authname);
479         serve->virtstyle=VIRT_IPLIT;
480         while((c=getopt_long(argc, argv, "-C:cdl:mo:rp:M:", long_options, &i))>=0) {
481                 switch (c) {
482                 case 1:
483                         /* non-option argument */
484                         switch(nonspecial++) {
485                         case 0:
486                                 if(strchr(optarg, ':') == strrchr(optarg, ':')) {
487                                         addr_port=g_strsplit(optarg, ":", 2);
488
489                                         /* Check for "@" - maybe user using this separator
490                                                  for IPv4 address */
491                                         if(!addr_port[1]) {
492                                                 g_strfreev(addr_port);
493                                                 addr_port=g_strsplit(optarg, "@", 2);
494                                         }
495                                 } else {
496                                         addr_port=g_strsplit(optarg, "@", 2);
497                                 }
498
499                                 if(addr_port[1]) {
500                                         serve->port=strtol(addr_port[1], NULL, 0);
501                                         serve->listenaddr=g_strdup(addr_port[0]);
502                                 } else {
503                                         serve->listenaddr=NULL;
504                                         serve->port=strtol(addr_port[0], NULL, 0);
505                                 }
506                                 g_strfreev(addr_port);
507                                 break;
508                         case 1:
509                                 serve->exportname = g_strdup(optarg);
510                                 if(serve->exportname[0] != '/') {
511                                         fprintf(stderr, "E: The to be exported file needs to be an absolute filename!\n");
512                                         exit(EXIT_FAILURE);
513                                 }
514                                 break;
515                         case 2:
516                                 last=strlen(optarg)-1;
517                                 suffix=optarg[last];
518                                 if (suffix == 'k' || suffix == 'K' ||
519                                     suffix == 'm' || suffix == 'M')
520                                         optarg[last] = '\0';
521                                 es = (off_t)atoll(optarg);
522                                 switch (suffix) {
523                                         case 'm':
524                                         case 'M':  es <<= 10;
525                                         case 'k':
526                                         case 'K':  es <<= 10;
527                                         default :  break;
528                                 }
529                                 serve->expected_size = es;
530                                 break;
531                         }
532                         break;
533                 case 'r':
534                         serve->flags |= F_READONLY;
535                         break;
536                 case 'm':
537                         serve->flags |= F_MULTIFILE;
538                         break;
539                 case 'o':
540                         do_output = TRUE;
541                         section_header = g_strdup(optarg);
542                         break;
543                 case 'p':
544                         strncpy(pidftemplate, optarg, 256);
545                         break;
546                 case 'c': 
547                         serve->flags |=F_COPYONWRITE;
548                         break;
549                 case 'd': 
550                         dontfork = 1;
551                         break;
552                 case 'C':
553                         g_free(config_file_pos);
554                         config_file_pos=g_strdup(optarg);
555                         break;
556                 case 'l':
557                         g_free(serve->authname);
558                         serve->authname=g_strdup(optarg);
559                         break;
560                 case 'M':
561                         serve->max_connections = strtol(optarg, NULL, 0);
562                         break;
563                 default:
564                         usage();
565                         exit(EXIT_FAILURE);
566                         break;
567                 }
568         }
569         /* What's left: the port to export, the name of the to be exported
570          * file, and, optionally, the size of the file, in that order. */
571         if(nonspecial<2) {
572                 g_free(serve);
573                 serve=NULL;
574         } else {
575                 do_oldstyle = TRUE;
576         }
577         if(do_output) {
578                 if(!serve) {
579                         g_critical("Need a complete configuration on the command line to output a config file section!");
580                         exit(EXIT_FAILURE);
581                 }
582                 dump_section(serve, section_header);
583         }
584         return serve;
585 }
586
/**
 * Error codes for config file parsing
 **/
typedef enum {
	/** The configuration file is not found */
	CFILE_NOTFOUND,
	/** The (required) group "generic" is missing */
	CFILE_MISSING_GENERIC,
	/** A (required) key is missing */
	CFILE_KEY_MISSING,
	/** A value is syntactically invalid */
	CFILE_VALUE_INVALID,
	/** A value is not supported in this build */
	CFILE_VALUE_UNSUPPORTED,
	/** Programmer error */
	CFILE_PROGERR,
	/** A config file was specified that does not define any exports */
	CFILE_NO_EXPORTS,
	/** The reserved port was specified for an old-style export. */
	CFILE_INCORRECT_PORT,
} CFILE_ERRORS;
602
603 /**
604  * Remove a SERVER from memory. Used from the hash table
605  **/
606 void remove_server(gpointer s) {
607         SERVER *server;
608
609         server=(SERVER*)s;
610         g_free(server->exportname);
611         if(server->authname)
612                 g_free(server->authname);
613         if(server->listenaddr)
614                 g_free(server->listenaddr);
615         if(server->prerun)
616                 g_free(server->prerun);
617         if(server->postrun)
618                 g_free(server->postrun);
619         if(server->transactionlog)
620                 g_free(server->transactionlog);
621         g_free(server);
622 }
623
624 /**
625  * duplicate server
626  * @param s the old server we want to duplicate
627  * @return new duplicated server
628  **/
629 SERVER* dup_serve(SERVER *s) {
630         SERVER *serve = NULL;
631
632         serve=g_new0(SERVER, 1);
633         if(serve == NULL)
634                 return NULL;
635
636         if(s->exportname)
637                 serve->exportname = g_strdup(s->exportname);
638
639         serve->expected_size = s->expected_size;
640
641         if(s->listenaddr)
642                 serve->listenaddr = g_strdup(s->listenaddr);
643
644         serve->port = s->port;
645
646         if(s->authname)
647                 serve->authname = strdup(s->authname);
648
649         serve->flags = s->flags;
650         serve->socket = s->socket;
651         serve->socket_family = s->socket_family;
652         serve->virtstyle = s->virtstyle;
653         serve->cidrlen = s->cidrlen;
654
655         if(s->prerun)
656                 serve->prerun = g_strdup(s->prerun);
657
658         if(s->postrun)
659                 serve->postrun = g_strdup(s->postrun);
660
661         if(s->transactionlog)
662                 serve->transactionlog = g_strdup(s->transactionlog);
663         
664         if(s->servename)
665                 serve->servename = g_strdup(s->servename);
666
667         serve->max_connections = s->max_connections;
668
669         return serve;
670 }
671
672 /**
673  * append new server to array
674  * @param s server
675  * @param a server array
676  * @return 0 success, -1 error
677  */
678 int append_serve(SERVER *s, GArray *a) {
679         SERVER *ns = NULL;
680         struct addrinfo hints;
681         struct addrinfo *ai = NULL;
682         struct addrinfo *rp = NULL;
683         char   host[NI_MAXHOST];
684         gchar  *port = NULL;
685         int e;
686         int ret;
687
688         if(!s) {
689                 err("Invalid parsing server");
690                 return -1;
691         }
692
693         port = g_strdup_printf("%d", s->port);
694
695         memset(&hints,'\0',sizeof(hints));
696         hints.ai_family = AF_UNSPEC;
697         hints.ai_socktype = SOCK_STREAM;
698         hints.ai_flags = AI_ADDRCONFIG | AI_PASSIVE;
699         hints.ai_protocol = IPPROTO_TCP;
700
701         e = getaddrinfo(s->listenaddr, port, &hints, &ai);
702
703         if (port)
704                 g_free(port);
705
706         if(e == 0) {
707                 for (rp = ai; rp != NULL; rp = rp->ai_next) {
708                         e = getnameinfo(rp->ai_addr, rp->ai_addrlen, host, sizeof(host), NULL, 0, NI_NUMERICHOST);
709
710                         if (e != 0) { // error
711                                 fprintf(stderr, "getnameinfo: %s\n", gai_strerror(e));
712                                 continue;
713                         }
714
715                         // duplicate server and set listenaddr to resolved IP address
716                         ns = dup_serve (s);
717                         if (ns) {
718                                 ns->listenaddr = g_strdup(host);
719                                 ns->socket_family = rp->ai_family;
720                                 g_array_append_val(a, *ns);
721                                 free(ns);
722                                 ns = NULL;
723                         }
724                 }
725
726                 ret = 0;
727         } else {
728                 fprintf(stderr, "getaddrinfo failed on listen host/address: %s (%s)\n", s->listenaddr ? s->listenaddr : "any", gai_strerror(e));
729                 ret = -1;
730         }
731
732         if (ai)
733                 freeaddrinfo(ai);
734
735         return ret;
736 }
737
738 /**
739  * Parse the config file.
740  *
741  * @param f the name of the config file
742  * @param e a GError. @see CFILE_ERRORS for what error values this function can
743  *      return.
744  * @return an array of SERVER* pointers. If the config file is empty or does not
745  *      exist, returns an empty GArray; if the config file contains an
746  *      error, returns NULL, and e is set appropriately
747  **/
748 GArray* parse_cfile(gchar* f, GError** e) {
749         const char* DEFAULT_ERROR = "Could not parse %s in group %s: %s";
750         const char* MISSING_REQUIRED_ERROR = "Could not find required value %s in group %s: %s";
751         SERVER s;
752         gchar *virtstyle=NULL;
753         PARAM lp[] = {
754                 { "exportname", TRUE,   PARAM_STRING,   &(s.exportname),        0 },
755                 { "port",       TRUE,   PARAM_INT,      &(s.port),              0 },
756                 { "authfile",   FALSE,  PARAM_STRING,   &(s.authname),          0 },
757                 { "filesize",   FALSE,  PARAM_INT,      &(s.expected_size),     0 },
758                 { "virtstyle",  FALSE,  PARAM_STRING,   &(virtstyle),           0 },
759                 { "prerun",     FALSE,  PARAM_STRING,   &(s.prerun),            0 },
760                 { "postrun",    FALSE,  PARAM_STRING,   &(s.postrun),           0 },
761                 { "transactionlog", FALSE, PARAM_STRING, &(s.transactionlog),   0 },
762                 { "readonly",   FALSE,  PARAM_BOOL,     &(s.flags),             F_READONLY },
763                 { "multifile",  FALSE,  PARAM_BOOL,     &(s.flags),             F_MULTIFILE },
764                 { "copyonwrite", FALSE, PARAM_BOOL,     &(s.flags),             F_COPYONWRITE },
765                 { "sparse_cow", FALSE,  PARAM_BOOL,     &(s.flags),             F_SPARSE },
766                 { "sdp",        FALSE,  PARAM_BOOL,     &(s.flags),             F_SDP },
767                 { "sync",       FALSE,  PARAM_BOOL,     &(s.flags),             F_SYNC },
768                 { "flush",      FALSE,  PARAM_BOOL,     &(s.flags),             F_FLUSH },
769                 { "fua",        FALSE,  PARAM_BOOL,     &(s.flags),             F_FUA },
770                 { "rotational", FALSE,  PARAM_BOOL,     &(s.flags),             F_ROTATIONAL },
771                 { "listenaddr", FALSE,  PARAM_STRING,   &(s.listenaddr),        0 },
772                 { "maxconnections", FALSE, PARAM_INT,   &(s.max_connections),   0 },
773         };
774         const int lp_size=sizeof(lp)/sizeof(PARAM);
775         PARAM gp[] = {
776                 { "user",       FALSE, PARAM_STRING,    &runuser,       0 },
777                 { "group",      FALSE, PARAM_STRING,    &rungroup,      0 },
778                 { "oldstyle",   FALSE, PARAM_BOOL,      &do_oldstyle,   1 },
779                 { "listenaddr", FALSE, PARAM_STRING,    &modern_listen, 0 },
780         };
781         PARAM* p=gp;
782         int p_size=sizeof(gp)/sizeof(PARAM);
783         GKeyFile *cfile;
784         GError *err = NULL;
785         const char *err_msg=NULL;
786         GQuark errdomain;
787         GArray *retval=NULL;
788         gchar **groups;
789         gboolean value;
790         gchar* startgroup;
791         gint i;
792         gint j;
793
794         errdomain = g_quark_from_string("parse_cfile");
795         cfile = g_key_file_new();
796         retval = g_array_new(FALSE, TRUE, sizeof(SERVER));
797         if(!g_key_file_load_from_file(cfile, f, G_KEY_FILE_KEEP_COMMENTS |
798                         G_KEY_FILE_KEEP_TRANSLATIONS, &err)) {
799                 g_set_error(e, errdomain, CFILE_NOTFOUND, "Could not open config file %s.", f);
800                 g_key_file_free(cfile);
801                 return retval;
802         }
803         startgroup = g_key_file_get_start_group(cfile);
804         if(!startgroup || strcmp(startgroup, "generic")) {
805                 g_set_error(e, errdomain, CFILE_MISSING_GENERIC, "Config file does not contain the [generic] group!");
806                 g_key_file_free(cfile);
807                 return NULL;
808         }
809         groups = g_key_file_get_groups(cfile, NULL);
810         for(i=0;groups[i];i++) {
811                 memset(&s, '\0', sizeof(SERVER));
812
813                 /* After the [generic] group, start parsing exports */
814                 if(i==1) {
815                         p=lp;
816                         p_size=lp_size;
817                 } 
818                 for(j=0;j<p_size;j++) {
819                         g_assert(p[j].target != NULL);
820                         g_assert(p[j].ptype==PARAM_INT||p[j].ptype==PARAM_STRING||p[j].ptype==PARAM_BOOL);
821                         switch(p[j].ptype) {
822                                 case PARAM_INT:
823                                         *((gint*)p[j].target) =
824                                                 g_key_file_get_integer(cfile,
825                                                                 groups[i],
826                                                                 p[j].paramname,
827                                                                 &err);
828                                         break;
829                                 case PARAM_STRING:
830                                         *((gchar**)p[j].target) =
831                                                 g_key_file_get_string(cfile,
832                                                                 groups[i],
833                                                                 p[j].paramname,
834                                                                 &err);
835                                         break;
836                                 case PARAM_BOOL:
837                                         value = g_key_file_get_boolean(cfile,
838                                                         groups[i],
839                                                         p[j].paramname, &err);
840                                         if(!err) {
841                                                 if(value) {
842                                                         *((gint*)p[j].target) |= p[j].flagval;
843                                                 } else {
844                                                         *((gint*)p[j].target) &= ~(p[j].flagval);
845                                                 }
846                                         }
847                                         break;
848                         }
849                         if(!strcmp(p[j].paramname, "port") && !strcmp(p[j].target, NBD_DEFAULT_PORT)) {
850                                 g_set_error(e, errdomain, CFILE_INCORRECT_PORT, "Config file specifies default port for oldstyle export");
851                                 g_key_file_free(cfile);
852                                 return NULL;
853                         }
854                         if(err) {
855                                 if(err->code == G_KEY_FILE_ERROR_KEY_NOT_FOUND) {
856                                         if(!p[j].required) {
857                                                 /* Ignore not-found error for optional values */
858                                                 g_clear_error(&err);
859                                                 continue;
860                                         } else {
861                                                 err_msg = MISSING_REQUIRED_ERROR;
862                                         }
863                                 } else {
864                                         err_msg = DEFAULT_ERROR;
865                                 }
866                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, err_msg, p[j].paramname, groups[i], err->message);
867                                 g_array_free(retval, TRUE);
868                                 g_error_free(err);
869                                 g_key_file_free(cfile);
870                                 return NULL;
871                         }
872                 }
873                 if(virtstyle) {
874                         if(!strncmp(virtstyle, "none", 4)) {
875                                 s.virtstyle=VIRT_NONE;
876                         } else if(!strncmp(virtstyle, "ipliteral", 9)) {
877                                 s.virtstyle=VIRT_IPLIT;
878                         } else if(!strncmp(virtstyle, "iphash", 6)) {
879                                 s.virtstyle=VIRT_IPHASH;
880                         } else if(!strncmp(virtstyle, "cidrhash", 8)) {
881                                 s.virtstyle=VIRT_CIDR;
882                                 if(strlen(virtstyle)<10) {
883                                         g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s: missing length", virtstyle, groups[i]);
884                                         g_array_free(retval, TRUE);
885                                         g_key_file_free(cfile);
886                                         return NULL;
887                                 }
888                                 s.cidrlen=strtol(virtstyle+8, NULL, 0);
889                         } else {
890                                 g_set_error(e, errdomain, CFILE_VALUE_INVALID, "Invalid value %s for parameter virtstyle in group %s", virtstyle, groups[i]);
891                                 g_array_free(retval, TRUE);
892                                 g_key_file_free(cfile);
893                                 return NULL;
894                         }
895                         if(s.port && !do_oldstyle) {
896                                 g_warning("A port was specified, but oldstyle exports were not requested. This may not do what you expect.");
897                                 g_warning("Please read 'man 5 nbd-server' and search for oldstyle for more info");
898                         }
899                 } else {
900                         s.virtstyle=VIRT_IPLIT;
901                 }
902                 /* Don't need to free this, it's not our string */
903                 virtstyle=NULL;
904                 /* Don't append values for the [generic] group */
905                 if(i>0) {
906                         s.socket_family = AF_UNSPEC;
907                         s.servename = groups[i];
908
909                         append_serve(&s, retval);
910                 } else {
911                         if(!do_oldstyle) {
912                                 lp[1].required = 0;
913                         }
914                 }
915 #ifndef WITH_SDP
916                 if(s.flags & F_SDP) {
917                         g_set_error(e, errdomain, CFILE_VALUE_UNSUPPORTED, "This nbd-server was built without support for SDP, yet group %s uses it", groups[i]);
918                         g_array_free(retval, TRUE);
919                         g_key_file_free(cfile);
920                         return NULL;
921                 }
922 #endif
923         }
924         if(i==1) {
925                 g_set_error(e, errdomain, CFILE_NO_EXPORTS, "The config file does not specify any exports");
926         }
927         g_key_file_free(cfile);
928         return retval;
929 }
930
931 /**
932  * Signal handler for SIGCHLD
933  * @param s the signal we're handling (must be SIGCHLD, or something
934  * is severely wrong)
935  **/
936 void sigchld_handler(int s) {
937         int status;
938         int* i;
939         pid_t pid;
940
941         while((pid=waitpid(-1, &status, WNOHANG)) > 0) {
942                 if(WIFEXITED(status)) {
943                         msg3(LOG_INFO, "Child exited with %d", WEXITSTATUS(status));
944                 }
945                 i=g_hash_table_lookup(children, &pid);
946                 if(!i) {
947                         msg3(LOG_INFO, "SIGCHLD received for an unknown child with PID %ld", (long)pid);
948                 } else {
949                         DEBUG("Removing %d from the list of children", pid);
950                         g_hash_table_remove(children, &pid);
951                 }
952         }
953 }
954
955 /**
956  * Kill a child. Called from sigterm_handler::g_hash_table_foreach.
957  *
958  * @param key the key
959  * @param value the value corresponding to the above key
960  * @param user_data a pointer which we always set to 1, so that we know what
961  * will happen next.
962  **/
963 void killchild(gpointer key, gpointer value, gpointer user_data) {
964         pid_t *pid=value;
965         int *parent=user_data;
966
967         kill(*pid, SIGTERM);
968         *parent=1;
969 }
970
971 /**
972  * Handle SIGTERM and dispatch it to our children
973  * @param s the signal we're handling (must be SIGTERM, or something
974  * is severely wrong).
975  **/
976 void sigterm_handler(int s) {
977         int parent=0;
978
979         g_hash_table_foreach(children, killchild, &parent);
980
981         if(parent) {
982                 unlink(pidfname);
983         }
984
985         exit(EXIT_SUCCESS);
986 }
987
988 /**
989  * Detect the size of a file.
990  *
991  * @param fhandle An open filedescriptor
992  * @return the size of the file, or OFFT_MAX if detection was
993  * impossible.
994  **/
995 off_t size_autodetect(int fhandle) {
996         off_t es;
997         u64 bytes;
998         struct stat stat_buf;
999         int error;
1000
1001 #ifdef HAVE_SYS_MOUNT_H
1002 #ifdef HAVE_SYS_IOCTL_H
1003 #ifdef BLKGETSIZE64
1004         DEBUG("looking for export size with ioctl BLKGETSIZE64\n");
1005         if (!ioctl(fhandle, BLKGETSIZE64, &bytes) && bytes) {
1006                 return (off_t)bytes;
1007         }
1008 #endif /* BLKGETSIZE64 */
1009 #endif /* HAVE_SYS_IOCTL_H */
1010 #endif /* HAVE_SYS_MOUNT_H */
1011
1012         DEBUG("looking for fhandle size with fstat\n");
1013         stat_buf.st_size = 0;
1014         error = fstat(fhandle, &stat_buf);
1015         if (!error) {
1016                 if(stat_buf.st_size > 0)
1017                         return (off_t)stat_buf.st_size;
1018         } else {
1019                 err("fstat failed: %m");
1020         }
1021
1022         DEBUG("looking for fhandle size with lseek SEEK_END\n");
1023         es = lseek(fhandle, (off_t)0, SEEK_END);
1024         if (es > ((off_t)0)) {
1025                 return es;
1026         } else {
1027                 DEBUG("lseek failed: %d", errno==EBADF?1:(errno==ESPIPE?2:(errno==EINVAL?3:4)));
1028         }
1029
1030         err("Could not find size of exported block device: %m");
1031         return OFFT_MAX;
1032 }
1033
1034 /**
1035  * Get the file handle and offset, given an export offset.
1036  *
1037  * @param export An array of export files
1038  * @param a The offset to get corresponding file/offset for
1039  * @param fhandle [out] File descriptor
1040  * @param foffset [out] Offset into fhandle
1041  * @param maxbytes [out] Tells how many bytes can be read/written
1042  * from fhandle starting at foffset (0 if there is no limit)
1043  * @return 0 on success, -1 on failure
1044  **/
1045 int get_filepos(GArray* export, off_t a, int* fhandle, off_t* foffset, size_t* maxbytes ) {
1046         /* Negative offset not allowed */
1047         if(a < 0)
1048                 return -1;
1049
1050         /* Binary search for last file with starting offset <= a */
1051         FILE_INFO fi;
1052         int start = 0;
1053         int end = export->len - 1;
1054         while( start <= end ) {
1055                 int mid = (start + end) / 2;
1056                 fi = g_array_index(export, FILE_INFO, mid);
1057                 if( fi.startoff < a ) {
1058                         start = mid + 1;
1059                 } else if( fi.startoff > a ) {
1060                         end = mid - 1;
1061                 } else {
1062                         start = end = mid;
1063                         break;
1064                 }
1065         }
1066
1067         /* end should never go negative, since first startoff is 0 and a >= 0 */
1068         g_assert(end >= 0);
1069
1070         fi = g_array_index(export, FILE_INFO, end);
1071         *fhandle = fi.fhandle;
1072         *foffset = a - fi.startoff;
1073         *maxbytes = 0;
1074         if( end+1 < export->len ) {
1075                 FILE_INFO fi_next = g_array_index(export, FILE_INFO, end+1);
1076                 *maxbytes = fi_next.startoff - a;
1077         }
1078
1079         return 0;
1080 }
1081
/**
 * Seek to an absolute position in a file, reporting failure through err().
 * @param handle a filedescriptor
 * @param a position to seek to
 * @todo get rid of this; lastpoint is a global variable right now, but it
 * shouldn't be. If we pass it on as a parameter, that makes things a *lot*
 * easier.
 **/
void myseek(int handle,off_t a) {
	off_t where = lseek(handle, a, SEEK_SET);

	if (where < 0) {
		err("Can not seek locally!\n");
	}
}
1095
1096 /**
1097  * Write an amount of bytes at a given offset to the right file. This
1098  * abstracts the write-side of the multiple file option.
1099  *
1100  * @param a The offset where the write should start
1101  * @param buf The buffer to write from
1102  * @param len The length of buf
1103  * @param client The client we're serving for
1104  * @param fua Flag to indicate 'Force Unit Access'
1105  * @return The number of bytes actually written, or -1 in case of an error
1106  **/
1107 ssize_t rawexpwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1108         int fhandle;
1109         off_t foffset;
1110         size_t maxbytes;
1111         ssize_t retval;
1112
1113         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1114                 return -1;
1115         if(maxbytes && len > maxbytes)
1116                 len = maxbytes;
1117
1118         DEBUG("(WRITE to fd %d offset %llu len %u fua %d), ", fhandle, (long long unsigned)foffset, (unsigned int)len, fua);
1119
1120         myseek(fhandle, foffset);
1121         retval = write(fhandle, buf, len);
1122         if(client->server->flags & F_SYNC) {
1123                 fsync(fhandle);
1124         } else if (fua) {
1125
1126           /* This is where we would do the following
1127            *   #ifdef USE_SYNC_FILE_RANGE
1128            * However, we don't, for the reasons set out below
1129            * by Christoph Hellwig <hch@infradead.org>
1130            *
1131            * [BEGINS] 
1132            * fdatasync is equivalent to fsync except that it does not flush
1133            * non-essential metadata (basically just timestamps in practice), but it
1134            * does flush metadata requried to find the data again, e.g. allocation
1135            * information and extent maps.  sync_file_range does nothing but flush
1136            * out pagecache content - it means you basically won't get your data
1137            * back in case of a crash if you either:
1138            * 
1139            *  a) have a volatile write cache in your disk (e.g. any normal SATA disk)
1140            *  b) are using a sparse file on a filesystem
1141            *  c) are using a fallocate-preallocated file on a filesystem
1142            *  d) use any file on a COW filesystem like btrfs
1143            * 
1144            * e.g. it only does anything useful for you if you do not have a volatile
1145            * write cache, and either use a raw block device node, or just overwrite
1146            * an already fully allocated (and not preallocated) file on a non-COW
1147            * filesystem.
1148            * [ENDS]
1149            *
1150            * What we should do is open a second FD with O_DSYNC set, then write to
1151            * that when appropriate. However, with a Linux client, every REQ_FUA
1152            * immediately follows a REQ_FLUSH, so fdatasync does not cause performance
1153            * problems.
1154            *
1155            */
1156 #if 0
1157                 sync_file_range(fhandle, foffset, len,
1158                                 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
1159                                 SYNC_FILE_RANGE_WAIT_AFTER);
1160 #else
1161                 fdatasync(fhandle);
1162 #endif
1163         }
1164         return retval;
1165 }
1166
1167 /**
1168  * Call rawexpwrite repeatedly until all data has been written.
1169  *
1170  * @param a The offset where the write should start
1171  * @param buf The buffer to write from
1172  * @param len The length of buf
1173  * @param client The client we're serving for
1174  * @param fua Flag to indicate 'Force Unit Access'
1175  * @return 0 on success, nonzero on failure
1176  **/
1177 int rawexpwrite_fully(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1178         ssize_t ret=0;
1179
1180         while(len > 0 && (ret=rawexpwrite(a, buf, len, client, fua)) > 0 ) {
1181                 a += ret;
1182                 buf += ret;
1183                 len -= ret;
1184         }
1185         return (ret < 0 || len != 0);
1186 }
1187
1188 /**
1189  * Read an amount of bytes at a given offset from the right file. This
1190  * abstracts the read-side of the multiple files option.
1191  *
1192  * @param a The offset where the read should start
1193  * @param buf A buffer to read into
1194  * @param len The size of buf
1195  * @param client The client we're serving for
1196  * @return The number of bytes actually read, or -1 in case of an
1197  * error.
1198  **/
1199 ssize_t rawexpread(off_t a, char *buf, size_t len, CLIENT *client) {
1200         int fhandle;
1201         off_t foffset;
1202         size_t maxbytes;
1203
1204         if(get_filepos(client->export, a, &fhandle, &foffset, &maxbytes))
1205                 return -1;
1206         if(maxbytes && len > maxbytes)
1207                 len = maxbytes;
1208
1209         DEBUG("(READ from fd %d offset %llu len %u), ", fhandle, (long long unsigned int)foffset, (unsigned int)len);
1210
1211         myseek(fhandle, foffset);
1212         return read(fhandle, buf, len);
1213 }
1214
1215 /**
1216  * Call rawexpread repeatedly until all data has been read.
1217  * @return 0 on success, nonzero on failure
1218  **/
1219 int rawexpread_fully(off_t a, char *buf, size_t len, CLIENT *client) {
1220         ssize_t ret=0;
1221
1222         while(len > 0 && (ret=rawexpread(a, buf, len, client)) > 0 ) {
1223                 a += ret;
1224                 buf += ret;
1225                 len -= ret;
1226         }
1227         return (ret < 0 || len != 0);
1228 }
1229
1230 /**
1231  * Read an amount of bytes at a given offset from the right file. This
1232  * abstracts the read-side of the copyonwrite stuff, and calls
1233  * rawexpread() with the right parameters to do the actual work.
1234  * @param a The offset where the read should start
1235  * @param buf A buffer to read into
1236  * @param len The size of buf
1237  * @param client The client we're going to read for
1238  * @return 0 on success, nonzero on failure
1239  **/
1240 int expread(off_t a, char *buf, size_t len, CLIENT *client) {
1241         off_t rdlen, offset;
1242         off_t mapcnt, mapl, maph, pagestart;
1243
1244         if (!(client->server->flags & F_COPYONWRITE))
1245                 return(rawexpread_fully(a, buf, len, client));
1246         DEBUG("Asked to read %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1247
1248         mapl=a/DIFFPAGESIZE; maph=(a+len-1)/DIFFPAGESIZE;
1249
1250         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1251                 pagestart=mapcnt*DIFFPAGESIZE;
1252                 offset=a-pagestart;
1253                 rdlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1254                         len : (size_t)DIFFPAGESIZE-offset;
1255                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1256                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1257                                (unsigned long)(client->difmap[mapcnt]));
1258                         myseek(client->difffile, client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1259                         if (read(client->difffile, buf, rdlen) != rdlen) return -1;
1260                 } else { /* the block is not there */
1261                         DEBUG("Page %llu is not here, we read the original one\n",
1262                                (unsigned long long)mapcnt);
1263                         if(rawexpread_fully(a, buf, rdlen, client)) return -1;
1264                 }
1265                 len-=rdlen; a+=rdlen; buf+=rdlen;
1266         }
1267         return 0;
1268 }
1269
1270 /**
1271  * Write an amount of bytes at a given offset to the right file. This
1272  * abstracts the write-side of the copyonwrite option, and calls
1273  * rawexpwrite() with the right parameters to do the actual work.
1274  *
1275  * @param a The offset where the write should start
1276  * @param buf The buffer to write from
1277  * @param len The length of buf
1278  * @param client The client we're going to write for.
1279  * @param fua Flag to indicate 'Force Unit Access'
1280  * @return 0 on success, nonzero on failure
1281  **/
1282 int expwrite(off_t a, char *buf, size_t len, CLIENT *client, int fua) {
1283         char pagebuf[DIFFPAGESIZE];
1284         off_t mapcnt,mapl,maph;
1285         off_t wrlen,rdlen; 
1286         off_t pagestart;
1287         off_t offset;
1288
1289         if (!(client->server->flags & F_COPYONWRITE))
1290                 return(rawexpwrite_fully(a, buf, len, client, fua)); 
1291         DEBUG("Asked to write %u bytes at %llu.\n", (unsigned int)len, (unsigned long long)a);
1292
1293         mapl=a/DIFFPAGESIZE ; maph=(a+len-1)/DIFFPAGESIZE ;
1294
1295         for (mapcnt=mapl;mapcnt<=maph;mapcnt++) {
1296                 pagestart=mapcnt*DIFFPAGESIZE ;
1297                 offset=a-pagestart ;
1298                 wrlen=(0<DIFFPAGESIZE-offset && len<(size_t)(DIFFPAGESIZE-offset)) ?
1299                         len : (size_t)DIFFPAGESIZE-offset;
1300
1301                 if (client->difmap[mapcnt]!=(u32)(-1)) { /* the block is already there */
1302                         DEBUG("Page %llu is at %lu\n", (unsigned long long)mapcnt,
1303                                (unsigned long)(client->difmap[mapcnt])) ;
1304                         myseek(client->difffile,
1305                                         client->difmap[mapcnt]*DIFFPAGESIZE+offset);
1306                         if (write(client->difffile, buf, wrlen) != wrlen) return -1 ;
1307                 } else { /* the block is not there */
1308                         myseek(client->difffile,client->difffilelen*DIFFPAGESIZE) ;
1309                         client->difmap[mapcnt]=(client->server->flags&F_SPARSE)?mapcnt:client->difffilelen++;
1310                         DEBUG("Page %llu is not here, we put it at %lu\n",
1311                                (unsigned long long)mapcnt,
1312                                (unsigned long)(client->difmap[mapcnt]));
1313                         rdlen=DIFFPAGESIZE ;
1314                         if (rawexpread_fully(pagestart, pagebuf, rdlen, client))
1315                                 return -1;
1316                         memcpy(pagebuf+offset,buf,wrlen) ;
1317                         if (write(client->difffile, pagebuf, DIFFPAGESIZE) !=
1318                                         DIFFPAGESIZE)
1319                                 return -1;
1320                 }                                                   
1321                 len-=wrlen ; a+=wrlen ; buf+=wrlen ;
1322         }
1323         if (client->server->flags & F_SYNC) {
1324                 fsync(client->difffile);
1325         } else if (fua) {
1326                 /* open question: would it be cheaper to do multiple sync_file_ranges?
1327                    as we iterate through the above?
1328                  */
1329                 fdatasync(client->difffile);
1330         }
1331         return 0;
1332 }
1333
1334 /**
1335  * Flush data to a client
1336  *
1337  * @param client The client we're going to write for.
1338  * @return 0 on success, nonzero on failure
1339  **/
1340 int expflush(CLIENT *client) {
1341         gint i;
1342
1343         if (client->server->flags & F_COPYONWRITE) {
1344                 return fsync(client->difffile);
1345         }
1346         
1347         for (i = 0; i < client->export->len; i++) {
1348                 FILE_INFO fi = g_array_index(client->export, FILE_INFO, i);
1349                 if (fsync(fi.fhandle) < 0)
1350                         return -1;
1351         }
1352         
1353         return 0;
1354 }
1355
1356 /**
1357  * Do the initial negotiation.
1358  *
1359  * @param client The client we're negotiating with.
1360  **/
1361 CLIENT* negotiate(int net, CLIENT *client, GArray* servers) {
1362         char zeros[128];
1363         uint64_t size_host;
1364         uint32_t flags = NBD_FLAG_HAS_FLAGS;
1365         uint16_t smallflags = 0;
1366         uint64_t magic;
1367
1368         memset(zeros, '\0', sizeof(zeros));
1369         if(!client || !client->modern) {
1370                 /* common */
1371                 if (write(net, INIT_PASSWD, 8) < 0) {
1372                         err_nonfatal("Negotiation failed: %m");
1373                         if(client)
1374                                 exit(EXIT_FAILURE);
1375                 }
1376                 if(!client || client->modern) {
1377                         /* modern */
1378                         magic = htonll(opts_magic);
1379                 } else {
1380                         /* oldstyle */
1381                         magic = htonll(cliserv_magic);
1382                 }
1383                 if (write(net, &magic, sizeof(magic)) < 0) {
1384                         err_nonfatal("Negotiation failed: %m");
1385                         if(client)
1386                                 exit(EXIT_FAILURE);
1387                 }
1388         }
1389         if(!client) {
1390                 /* modern */
1391                 uint32_t reserved;
1392                 uint32_t opt;
1393                 uint32_t namelen;
1394                 char* name;
1395                 int i;
1396
1397                 if(!servers)
1398                         err("programmer error");
1399                 if (write(net, &smallflags, sizeof(uint16_t)) < 0)
1400                         err("Negotiation failed: %m");
1401                 if (read(net, &reserved, sizeof(reserved)) < 0)
1402                         err("Negotiation failed: %m");
1403                 if (read(net, &magic, sizeof(magic)) < 0)
1404                         err("Negotiation failed: %m");
1405                 magic = ntohll(magic);
1406                 if(magic != opts_magic) {
1407                         close(net);
1408                         return NULL;
1409                 }
1410                 if (read(net, &opt, sizeof(opt)) < 0)
1411                         err("Negotiation failed: %m");
1412                 opt = ntohl(opt);
1413                 if(opt != NBD_OPT_EXPORT_NAME) {
1414                         close(net);
1415                         return NULL;
1416                 }
1417                 if (read(net, &namelen, sizeof(namelen)) < 0)
1418                         err("Negotiation failed: %m");
1419                 namelen = ntohl(namelen);
1420                 name = malloc(namelen+1);
1421                 name[namelen]=0;
1422                 if (read(net, name, namelen) < 0)
1423                         err("Negotiation failed: %m");
1424                 for(i=0; i<servers->len; i++) {
1425                         SERVER* serve = &(g_array_index(servers, SERVER, i));
1426                         if(!strcmp(serve->servename, name)) {
1427                                 CLIENT* client = g_new0(CLIENT, 1);
1428                                 client->server = serve;
1429                                 client->exportsize = OFFT_MAX;
1430                                 client->net = net;
1431                                 client->modern = TRUE;
1432                                 client->transactionlogfd = -1;
1433                                 free(name);
1434                                 return client;
1435                         }
1436                 }
1437                 free(name);
1438                 return NULL;
1439         }
1440         /* common */
1441         size_host = htonll((u64)(client->exportsize));
1442         if (write(net, &size_host, 8) < 0)
1443                 err("Negotiation failed: %m");
1444         if (client->server->flags & F_READONLY)
1445                 flags |= NBD_FLAG_READ_ONLY;
1446         if (client->server->flags & F_FLUSH)
1447                 flags |= NBD_FLAG_SEND_FLUSH;
1448         if (client->server->flags & F_FUA)
1449                 flags |= NBD_FLAG_SEND_FUA;
1450         if (client->server->flags & F_ROTATIONAL)
1451                 flags |= NBD_FLAG_ROTATIONAL;
1452         if (!client->modern) {
1453                 /* oldstyle */
1454                 flags = htonl(flags);
1455                 if (write(client->net, &flags, 4) < 0)
1456                         err("Negotiation failed: %m");
1457         } else {
1458                 /* modern */
1459                 smallflags = (uint16_t)(flags & ~((uint16_t)0));
1460                 smallflags = htons(smallflags);
1461                 if (write(client->net, &smallflags, sizeof(smallflags)) < 0) {
1462                         err("Negotiation failed: %m");
1463                 }
1464         }
1465         /* common */
1466         if (write(client->net, zeros, 124) < 0)
1467                 err("Negotiation failed: %m");
1468         return NULL;
1469 }
1470
/**
 * Sending macro: write `reply` to `net` and, when a transaction log is open,
 * write the same bytes to that log as well.
 *
 * Expects a variable named `client` to be in scope where it is expanded.
 * Wrapped in do/while(0) so it behaves as one statement, including in an
 * unbraced if/else.
 */
#define SEND(net,reply) do { \
	writeit((net), &(reply), sizeof(reply)); \
	if (client->transactionlogfd != -1) \
		writeit(client->transactionlogfd, &(reply), sizeof(reply)); \
} while (0)
/**
 * Error macro: send `reply` with its error field set to `errcode` (converted
 * with htonl()), then reset the error field to 0 so `reply` can be reused.
 */
#define ERROR(client,reply,errcode) do { \
	(reply).error = htonl(errcode); \
	SEND((client)->net, reply); \
	(reply).error = 0; \
} while (0)
1477 /**
1478  * Serve a file to a single client.
1479  *
1480  * @todo This beast needs to be split up in many tiny little manageable
1481  * pieces. Preferably with a chainsaw.
1482  *
1483  * @param client The client we're going to serve to.
1484  * @return when the client disconnects
1485  **/
1486 int mainloop(CLIENT *client) {
1487         struct nbd_request request;
1488         struct nbd_reply reply;
1489         gboolean go_on=TRUE;
1490 #ifdef DODBG
1491         int i = 0;
1492 #endif
1493         negotiate(client->net, client, NULL);
1494         DEBUG("Entering request loop!\n");
1495         reply.magic = htonl(NBD_REPLY_MAGIC);
1496         reply.error = 0;
1497         while (go_on) {
1498                 char buf[BUFSIZE];
1499                 char* p;
1500                 size_t len;
1501                 size_t currlen;
1502                 size_t writelen;
1503                 uint16_t command;
1504 #ifdef DODBG
1505                 i++;
1506                 printf("%d: ", i);
1507 #endif
1508                 readit(client->net, &request, sizeof(request));
1509                 if (client->transactionlogfd != -1)
1510                         writeit(client->transactionlogfd, &request, sizeof(request));
1511
1512                 request.from = ntohll(request.from);
1513                 request.type = ntohl(request.type);
1514                 command = request.type & NBD_CMD_MASK_COMMAND;
1515                 len = ntohl(request.len);
1516
1517                 DEBUG("%s from %llu (%llu) len %d, ", getcommandname(command),
1518                                 (unsigned long long)request.from,
1519                                 (unsigned long long)request.from / 512, (unsigned int)len);
1520
1521                 if (request.magic != htonl(NBD_REQUEST_MAGIC))
1522                         err("Not enough magic.");
1523
1524                 memcpy(reply.handle, request.handle, sizeof(reply.handle));
1525
1526                 if ((command==NBD_CMD_WRITE) || (command==NBD_CMD_READ)) {
1527                         if ((request.from + len) > (OFFT_MAX)) {
1528                                 DEBUG("[Number too large!]");
1529                                 ERROR(client, reply, EINVAL);
1530                                 continue;
1531                         }
1532
1533                         if (((ssize_t)((off_t)request.from + len) > client->exportsize)) {
1534                                 DEBUG("[RANGE!]");
1535                                 ERROR(client, reply, EINVAL);
1536                                 continue;
1537                         }
1538
1539                         currlen = len;
1540                         if (currlen > BUFSIZE - sizeof(struct nbd_reply)) {
1541                                 currlen = BUFSIZE - sizeof(struct nbd_reply);
1542                                 msg2(LOG_INFO, "oversized request (this is not a problem)");
1543                         }
1544                 }
1545
1546                 switch (command) {
1547
1548                 case NBD_CMD_DISC:
1549                         msg2(LOG_INFO, "Disconnect request received.");
1550                         if (client->server->flags & F_COPYONWRITE) { 
1551                                 if (client->difmap) g_free(client->difmap) ;
1552                                 close(client->difffile);
1553                                 unlink(client->difffilename);
1554                                 free(client->difffilename);
1555                         }
1556                         go_on=FALSE;
1557                         continue;
1558
1559                 case NBD_CMD_WRITE:
1560                         DEBUG("wr: net->buf, ");
1561                         while(len > 0) {
1562                                 readit(client->net, buf, currlen);
1563                                 DEBUG("buf->exp, ");
1564                                 if ((client->server->flags & F_READONLY) ||
1565                                     (client->server->flags & F_AUTOREADONLY)) {
1566                                         DEBUG("[WRITE to READONLY!]");
1567                                         ERROR(client, reply, EPERM);
1568                                         consume(client->net, buf, len-currlen, BUFSIZE);
1569                                         continue;
1570                                 }
1571                                 if (expwrite(request.from, buf, currlen, client,
1572                                              request.type & NBD_CMD_FLAG_FUA)) {
1573                                         DEBUG("Write failed: %m" );
1574                                         ERROR(client, reply, errno);
1575                                         consume(client->net, buf, len-currlen, BUFSIZE);
1576                                         continue;
1577                                 }
1578                                 len -= currlen;
1579                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1580                         }
1581                         SEND(client->net, reply);
1582                         DEBUG("OK!\n");
1583                         continue;
1584
1585                 case NBD_CMD_FLUSH:
1586                         DEBUG("fl: ");
1587                         if (expflush(client)) {
1588                                 DEBUG("Flush failed: %m");
1589                                 ERROR(client, reply, errno);
1590                                 continue;
1591                         }
1592                         SEND(client->net, reply);
1593                         DEBUG("OK!\n");
1594                         continue;
1595
1596                 case NBD_CMD_READ:
1597                         DEBUG("exp->buf, ");
1598                         memcpy(buf, &reply, sizeof(struct nbd_reply));
1599                         if (client->transactionlogfd != -1)
1600                                 writeit(client->transactionlogfd, &reply, sizeof(reply));
1601                         p = buf + sizeof(struct nbd_reply);
1602                         writelen = currlen + sizeof(struct nbd_reply);
1603                         while(len > 0) {
1604                                 if (expread(request.from, p, currlen, client)) {
1605                                         DEBUG("Read failed: %m");
1606                                         ERROR(client, reply, errno);
1607                                         continue;
1608                                 }
1609                                 
1610                                 DEBUG("buf->net, ");
1611                                 writeit(client->net, buf, writelen);
1612                                 len -= currlen;
1613                                 request.from += currlen;
1614                                 currlen = (len < BUFSIZE) ? len : BUFSIZE;
1615                                 p = buf;
1616                                 writelen = currlen;
1617                         }
1618                         DEBUG("OK!\n");
1619                         continue;
1620
1621                 default:
1622                         DEBUG ("Ignoring unknown command\n");
1623                         continue;
1624                 }
1625         }
1626         return 0;
1627 }
1628
1629 /**
1630  * Set up client export array, which is an array of FILE_INFO.
1631  * Also, split a single exportfile into multiple ones, if that was asked.
1632  * @param client information on the client which we want to setup export for
1633  **/
1634 void setupexport(CLIENT* client) {
1635         int i;
1636         off_t laststartoff = 0, lastsize = 0;
1637         int multifile = (client->server->flags & F_MULTIFILE);
1638
1639         client->export = g_array_new(TRUE, TRUE, sizeof(FILE_INFO));
1640
1641         /* If multi-file, open as many files as we can.
1642          * If not, open exactly one file.
1643          * Calculate file sizes as we go to get total size. */
1644         for(i=0; ; i++) {
1645                 FILE_INFO fi;
1646                 gchar *tmpname;
1647                 gchar* error_string;
1648                 mode_t mode = (client->server->flags & F_READONLY) ? O_RDONLY : O_RDWR;
1649
1650                 if(multifile) {
1651                         tmpname=g_strdup_printf("%s.%d", client->exportname, i);
1652                 } else {
1653                         tmpname=g_strdup(client->exportname);
1654                 }
1655                 DEBUG( "Opening %s\n", tmpname );
1656                 fi.fhandle = open(tmpname, mode);
1657                 if(fi.fhandle == -1 && mode == O_RDWR) {
1658                         /* Try again because maybe media was read-only */
1659                         fi.fhandle = open(tmpname, O_RDONLY);
1660                         if(fi.fhandle != -1) {
1661                                 /* Opening the base file in copyonwrite mode is
1662                                  * okay */
1663                                 if(!(client->server->flags & F_COPYONWRITE)) {
1664                                         client->server->flags |= F_AUTOREADONLY;
1665                                         client->server->flags |= F_READONLY;
1666                                 }
1667                         }
1668                 }
1669                 if(fi.fhandle == -1) {
1670                         if(multifile && i>0)
1671                                 break;
1672                         error_string=g_strdup_printf(
1673                                 "Could not open exported file %s: %%m",
1674                                 tmpname);
1675                         err(error_string);
1676                 }
1677                 fi.startoff = laststartoff + lastsize;
1678                 g_array_append_val(client->export, fi);
1679                 g_free(tmpname);
1680
1681                 /* Starting offset and size of this file will be used to
1682                  * calculate starting offset of next file */
1683                 laststartoff = fi.startoff;
1684                 lastsize = size_autodetect(fi.fhandle);
1685
1686                 if(!multifile)
1687                         break;
1688         }
1689
1690         /* Set export size to total calculated size */
1691         client->exportsize = laststartoff + lastsize;
1692
1693         /* Export size may be overridden */
1694         if(client->server->expected_size) {
1695                 /* desired size must be <= total calculated size */
1696                 if(client->server->expected_size > client->exportsize) {
1697                         err("Size of exported file is too big\n");
1698                 }
1699
1700                 client->exportsize = client->server->expected_size;
1701         }
1702
1703         msg3(LOG_INFO, "Size of exported file/device is %llu", (unsigned long long)client->exportsize);
1704         if(multifile) {
1705                 msg3(LOG_INFO, "Total number of files: %d", i);
1706         }
1707 }
1708
1709 int copyonwrite_prepare(CLIENT* client) {
1710         off_t i;
1711         if ((client->difffilename = malloc(1024))==NULL)
1712                 err("Failed to allocate string for diff file name");
1713         snprintf(client->difffilename, 1024, "%s-%s-%d.diff",client->exportname,client->clientname,
1714                 (int)getpid()) ;
1715         client->difffilename[1023]='\0';
1716         msg3(LOG_INFO,"About to create map and diff file %s",client->difffilename) ;
1717         client->difffile=open(client->difffilename,O_RDWR | O_CREAT | O_TRUNC,0600) ;
1718         if (client->difffile<0) err("Could not create diff file (%m)") ;
1719         if ((client->difmap=calloc(client->exportsize/DIFFPAGESIZE,sizeof(u32)))==NULL)
1720                 err("Could not allocate memory") ;
1721         for (i=0;i<client->exportsize/DIFFPAGESIZE;i++) client->difmap[i]=(u32)-1 ;
1722
1723         return 0;
1724 }
1725
1726 /**
1727  * Run a command. This is used for the ``prerun'' and ``postrun'' config file
1728  * options
1729  *
1730  * @param command the command to be ran. Read from the config file
1731  * @param file the file name we're about to export
1732  **/
1733 int do_run(gchar* command, gchar* file) {
1734         gchar* cmd;
1735         int retval=0;
1736
1737         if(command && *command) {
1738                 cmd = g_strdup_printf(command, file);
1739                 retval=system(cmd);
1740                 g_free(cmd);
1741         }
1742         return retval;
1743 }
1744
1745 /**
1746  * Serve a connection. 
1747  *
1748  * @todo allow for multithreading, perhaps use libevent. Not just yet, though;
1749  * follow the road map.
1750  *
1751  * @param client a connected client
1752  **/
1753 void serveconnection(CLIENT *client) {
1754         if (client->server->transactionlog && (client->transactionlogfd == -1))
1755         {
1756                 if (-1 == (client->transactionlogfd = open(client->server->transactionlog,
1757                                                            O_WRONLY | O_CREAT,
1758                                                            S_IRUSR | S_IWUSR)))
1759                         g_warning("Could not open transaction log %s",
1760                                   client->server->transactionlog);
1761         }
1762
1763         if(do_run(client->server->prerun, client->exportname)) {
1764                 exit(EXIT_FAILURE);
1765         }
1766         setupexport(client);
1767
1768         if (client->server->flags & F_COPYONWRITE) {
1769                 copyonwrite_prepare(client);
1770         }
1771
1772         setmysockopt(client->net);
1773
1774         mainloop(client);
1775         do_run(client->server->postrun, client->exportname);
1776
1777         if (-1 != client->transactionlogfd)
1778         {
1779                 close(client->transactionlogfd);
1780                 client->transactionlogfd = -1;
1781         }
1782 }
1783
1784 /**
1785  * Find the name of the file we have to serve. This will use g_strdup_printf
1786  * to put the IP address of the client inside a filename containing
1787  * "%s" (in the form as specified by the "virtstyle" option). That name
1788  * is then written to client->exportname.
1789  *
1790  * @param net A socket connected to an nbd client
1791  * @param client information about the client. The IP address in human-readable
1792  * format will be written to a new char* buffer, the address of which will be
1793  * stored in client->clientname.
1794  **/
1795 void set_peername(int net, CLIENT *client) {
1796         struct sockaddr_storage addrin;
1797         struct sockaddr_storage netaddr;
1798         struct sockaddr_in  *netaddr4 = NULL;
1799         struct sockaddr_in6 *netaddr6 = NULL;
1800         size_t addrinlen = sizeof( addrin );
1801         struct addrinfo hints;
1802         struct addrinfo *ai = NULL;
1803         char peername[NI_MAXHOST];
1804         char netname[NI_MAXHOST];
1805         char *tmp = NULL;
1806         int i;
1807         int e;
1808         int shift;
1809
1810         if (getpeername(net, (struct sockaddr *) &addrin, (socklen_t *)&addrinlen) < 0)
1811                 err("getsockname failed: %m");
1812
1813         getnameinfo((struct sockaddr *)&addrin, (socklen_t)addrinlen,
1814                 peername, sizeof (peername), NULL, 0, NI_NUMERICHOST);
1815
1816         memset(&hints, '\0', sizeof (hints));
1817         hints.ai_flags = AI_ADDRCONFIG;
1818         e = getaddrinfo(peername, NULL, &hints, &ai);
1819
1820         if(e != 0) {
1821                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
1822                 freeaddrinfo(ai);
1823                 return;
1824         }
1825
1826         switch(client->server->virtstyle) {
1827                 case VIRT_NONE:
1828                         client->exportname=g_strdup(client->server->exportname);
1829                         break;
1830                 case VIRT_IPHASH:
1831                         for(i=0;i<strlen(peername);i++) {
1832                                 if(peername[i]=='.') {
1833                                         peername[i]='/';
1834                                 }
1835                         }
1836                 case VIRT_IPLIT:
1837                         client->exportname=g_strdup_printf(client->server->exportname, peername);
1838                         break;
1839                 case VIRT_CIDR:
1840                         memcpy(&netaddr, &addrin, addrinlen);
1841                         if(ai->ai_family == AF_INET) {
1842                                 netaddr4 = (struct sockaddr_in *)&netaddr;
1843                                 (netaddr4->sin_addr).s_addr>>=32-(client->server->cidrlen);
1844                                 (netaddr4->sin_addr).s_addr<<=32-(client->server->cidrlen);
1845
1846                                 getnameinfo((struct sockaddr *) netaddr4, (socklen_t) addrinlen,
1847                                                         netname, sizeof (netname), NULL, 0, NI_NUMERICHOST);
1848                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1849                         }else if(ai->ai_family == AF_INET6) {
1850                                 netaddr6 = (struct sockaddr_in6 *)&netaddr;
1851
1852                                 shift = 128-(client->server->cidrlen);
1853                                 i = 3;
1854                                 while(shift >= 32) {
1855                                         ((netaddr6->sin6_addr).s6_addr32[i])=0;
1856                                         shift-=32;
1857                                         i--;
1858                                 }
1859                                 (netaddr6->sin6_addr).s6_addr32[i]>>=shift;
1860                                 (netaddr6->sin6_addr).s6_addr32[i]<<=shift;
1861
1862                                 getnameinfo((struct sockaddr *)netaddr6, (socklen_t)addrinlen,
1863                                             netname, sizeof(netname), NULL, 0, NI_NUMERICHOST);
1864                                 tmp=g_strdup_printf("%s/%s", netname, peername);
1865                         }
1866
1867                         if(tmp != NULL)
1868                           client->exportname=g_strdup_printf(client->server->exportname, tmp);
1869
1870                         break;
1871         }
1872
1873         freeaddrinfo(ai);
1874         msg4(LOG_INFO, "connect from %s, assigned file is %s", 
1875              peername, client->exportname);
1876         client->clientname=g_strdup(peername);
1877 }
1878
1879 /**
1880  * Destroy a pid_t*
1881  * @param data a pointer to pid_t which should be freed
1882  **/
1883 void destroy_pid_t(gpointer data) {
1884         g_free(data);
1885 }
1886
1887 /**
1888  * Loop through the available servers, and serve them. Never returns.
1889  **/
1890 int serveloop(GArray* servers) {
1891         struct sockaddr_storage addrin;
1892         socklen_t addrinlen=sizeof(addrin);
1893         int i;
1894         int max;
1895         int sock;
1896         fd_set mset;
1897         fd_set rset;
1898
1899         /* 
1900          * Set up the master fd_set. The set of descriptors we need
1901          * to select() for never changes anyway and it buys us a *lot*
1902          * of time to only build this once. However, if we ever choose
1903          * to not fork() for clients anymore, we may have to revisit
1904          * this.
1905          */
1906         max=0;
1907         FD_ZERO(&mset);
1908         for(i=0;i<servers->len;i++) {
1909                 if((sock=(g_array_index(servers, SERVER, i)).socket)) {
1910                         FD_SET(sock, &mset);
1911                         max=sock>max?sock:max;
1912                 }
1913         }
1914         if(modernsock) {
1915                 FD_SET(modernsock, &mset);
1916                 max=modernsock>max?modernsock:max;
1917         }
1918         for(;;) {
1919                 CLIENT *client = NULL;
1920                 pid_t *pid;
1921
1922                 memcpy(&rset, &mset, sizeof(fd_set));
1923                 if(select(max+1, &rset, NULL, NULL, NULL)>0) {
1924                         int net = 0;
1925                         SERVER* serve=NULL;
1926
1927                         DEBUG("accept, ");
1928                         if(FD_ISSET(modernsock, &rset)) {
1929                                 if((net=accept(modernsock, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1930                                         err("accept: %m");
1931                                 client = negotiate(net, NULL, servers);
1932                                 if(!client) {
1933                                         err_nonfatal("negotiation failed");
1934                                         close(net);
1935                                         net=0;
1936                                         continue;
1937                                 }
1938                                 serve = client->server;
1939                         }
1940                         for(i=0;i<servers->len && !net;i++) {
1941                                 serve=&(g_array_index(servers, SERVER, i));
1942                                 if(FD_ISSET(serve->socket, &rset)) {
1943                                         if ((net=accept(serve->socket, (struct sockaddr *) &addrin, &addrinlen)) < 0)
1944                                                 err("accept: %m");
1945                                 }
1946                         }
1947                         if(net) {
1948                                 int sock_flags;
1949
1950                                 if(serve->max_connections > 0 &&
1951                                    g_hash_table_size(children) >= serve->max_connections) {
1952                                         msg2(LOG_INFO, "Max connections reached");
1953                                         close(net);
1954                                         continue;
1955                                 }
1956                                 if((sock_flags = fcntl(net, F_GETFL, 0))==-1) {
1957                                         err("fcntl F_GETFL");
1958                                 }
1959                                 if(fcntl(net, F_SETFL, sock_flags &~O_NONBLOCK)==-1) {
1960                                         err("fcntl F_SETFL ~O_NONBLOCK");
1961                                 }
1962                                 if(!client) {
1963                                         client = g_new0(CLIENT, 1);
1964                                         client->server=serve;
1965                                         client->exportsize=OFFT_MAX;
1966                                         client->net=net;
1967                                         client->transactionlogfd = -1;
1968                                 }
1969                                 set_peername(net, client);
1970                                 if (!authorized_client(client)) {
1971                                         msg2(LOG_INFO,"Unauthorized client") ;
1972                                         close(net);
1973                                         continue;
1974                                 }
1975                                 msg2(LOG_INFO,"Authorized client") ;
1976                                 pid=g_malloc(sizeof(pid_t));
1977
1978                                 if (!dontfork) {
1979                                         if ((*pid=fork())<0) {
1980                                                 msg3(LOG_INFO,"Could not fork (%s)",strerror(errno)) ;
1981                                                 close(net);
1982                                                 continue;
1983                                         }
1984                                         if (*pid>0) { /* parent */
1985                                                 close(net);
1986                                                 g_hash_table_insert(children, pid, pid);
1987                                                 continue;
1988                                         }
1989                                         /* child */
1990                                         g_hash_table_destroy(children);
1991                                         for(i=0;i<servers->len;i++) {
1992                                                 serve=&g_array_index(servers, SERVER, i);
1993                                                 close(serve->socket);
1994                                         }
1995                                         /* FALSE does not free the
1996                                            actual data. This is required,
1997                                            because the client has a
1998                                            direct reference into that
1999                                            data, and otherwise we get a
2000                                            segfault... */
2001                                         g_array_free(servers, FALSE);
2002                                 }
2003
2004                                 msg2(LOG_INFO,"Starting to serve");
2005                                 serveconnection(client);
2006                                 exit(EXIT_SUCCESS);
2007                         }
2008                 }
2009         }
2010 }
2011
/**
 * Apply our standard options to a listening socket: SO_REUSEADDR,
 * SO_KEEPALIVE, and non-blocking mode. Any failure ends in err().
 *
 * @param socket the socket to configure
 **/
void dosockopts(int socket) {
#ifdef sun
	/* NOTE(review): the option length passed below stays sizeof(int)
	 * even though this variable is a char - confirm on that platform. */
	char enable='1';
#else
	int enable=1;
#endif /* sun */
	int cur_flags;

	/* lose the pesky "Address already in use" error message */
	if (setsockopt(socket, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int)) == -1)
		err("setsockopt SO_REUSEADDR");

	if (setsockopt(socket, SOL_SOCKET, SO_KEEPALIVE, &enable, sizeof(int)) == -1)
		err("setsockopt SO_KEEPALIVE");

	/* make the listening socket non-blocking */
	cur_flags = fcntl(socket, F_GETFL, 0);
	if (cur_flags == -1)
		err("fcntl F_GETFL");

	if (fcntl(socket, F_SETFL, cur_flags | O_NONBLOCK) == -1)
		err("fcntl F_SETFL O_NONBLOCK");
}
2036
2037 /**
2038  * Connect a server's socket.
2039  *
2040  * @param serve the server we want to connect.
2041  **/
2042 int setup_serve(SERVER *serve) {
2043         struct addrinfo hints;
2044         struct addrinfo *ai = NULL;
2045         gchar *port = NULL;
2046         int e;
2047
2048         if(!do_oldstyle) {
2049                 return serve->servename ? 1 : 0;
2050         }
2051         memset(&hints,'\0',sizeof(hints));
2052         hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG | AI_NUMERICSERV;
2053         hints.ai_socktype = SOCK_STREAM;
2054         hints.ai_family = serve->socket_family;
2055
2056         port = g_strdup_printf ("%d", serve->port);
2057         if (port == NULL)
2058                 return 0;
2059
2060         e = getaddrinfo(serve->listenaddr,port,&hints,&ai);
2061
2062         g_free(port);
2063
2064         if(e != 0) {
2065                 fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
2066                 serve->socket = -1;
2067                 freeaddrinfo(ai);
2068                 exit(EXIT_FAILURE);
2069         }
2070
2071         if(serve->socket_family == AF_UNSPEC)
2072                 serve->socket_family = ai->ai_family;
2073
2074 #ifdef WITH_SDP
2075         if ((serve->flags) && F_SDP) {
2076                 if (ai->ai_family == AF_INET)
2077                         ai->ai_family = AF_INET_SDP;
2078                 else (ai->ai_family == AF_INET6)
2079                         ai->ai_family = AF_INET6_SDP;
2080         }
2081 #endif
2082         if ((serve->socket = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol)) < 0)
2083                 err("socket: %m");
2084
2085         dosockopts(serve->socket);
2086
2087         DEBUG("Waiting for connections... bind, ");
2088         e = bind(serve->socket, ai->ai_addr, ai->ai_addrlen);
2089         if (e != 0 && errno != EADDRINUSE)
2090                 err("bind: %m");
2091         DEBUG("listen, ");
2092         if (listen(serve->socket, 1) < 0)
2093                 err("listen: %m");
2094
2095         freeaddrinfo (ai);
2096         if(serve->servename) {
2097                 return 1;
2098         } else {
2099                 return 0;
2100         }
2101 }
2102
/**
 * Open the listening socket for modern-style connections.
 *
 * Resolves modern_listen on NBD_DEFAULT_PORT, creates a TCP socket for the
 * first result, applies dosockopts(), then binds and listens. The socket is
 * stored in modernsock. A resolver failure exits the program; every other
 * failure ends in err().
 **/
void open_modern(void) {
	struct addrinfo hints;
	struct addrinfo* ai = NULL;
	int e;

	memset(&hints, '\0', sizeof(hints));
	hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG;
	hints.ai_socktype = SOCK_STREAM;
	hints.ai_family = AF_UNSPEC;
	hints.ai_protocol = IPPROTO_TCP;
	e = getaddrinfo(modern_listen, NBD_DEFAULT_PORT, &hints, &ai);
	if(e != 0) {
		fprintf(stderr, "getaddrinfo failed: %s\n", gai_strerror(e));
		exit(EXIT_FAILURE);
	}
	if((modernsock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol))<0) {
		err("socket: %m");
	}

	dosockopts(modernsock);

	if(bind(modernsock, ai->ai_addr, ai->ai_addrlen)) {
		err("bind: %m");
	}
	if(listen(modernsock, 10) <0) {
		err("listen: %m");
	}

	freeaddrinfo(ai);
}
2134
2135 /**
2136  * Connect our servers.
2137  **/
2138 void setup_servers(GArray* servers) {
2139         int i;
2140         struct sigaction sa;
2141         int want_modern=0;
2142
2143         for(i=0;i<servers->len;i++) {
2144                 want_modern |= setup_serve(&(g_array_index(servers, SERVER, i)));
2145         }
2146         if(want_modern) {
2147                 open_modern();
2148         }
2149         children=g_hash_table_new_full(g_int_hash, g_int_equal, NULL, destroy_pid_t);
2150
2151         sa.sa_handler = sigchld_handler;
2152         sigemptyset(&sa.sa_mask);
2153         sa.sa_flags = SA_RESTART;
2154         if(sigaction(SIGCHLD, &sa, NULL) == -1)
2155                 err("sigaction: %m");
2156         sa.sa_handler = sigterm_handler;
2157         sigemptyset(&sa.sa_mask);
2158         sa.sa_flags = SA_RESTART;
2159         if(sigaction(SIGTERM, &sa, NULL) == -1)
2160                 err("sigaction: %m");
2161 }
2162
2163 /**
2164  * Go daemon (unless we specified at compile time that we didn't want this)
2165  * @param serve the first server of our configuration. If its port is zero,
2166  *      then do not daemonize, because we're doing inetd then. This parameter
2167  *      is only used to create a PID file of the form
2168  *      /var/run/nbd-server.&lt;port&gt;.pid; it's not modified in any way.
2169  **/
2170 #if !defined(NODAEMON)
2171 void daemonize(SERVER* serve) {
2172         FILE*pidf;
2173
2174         if(serve && !(serve->port)) {
2175                 return;
2176         }
2177         if(daemon(0,0)<0) {
2178                 err("daemon");
2179         }
2180         if(!*pidftemplate) {
2181                 if(serve) {
2182                         strncpy(pidftemplate, "/var/run/nbd-server.%d.pid", 255);
2183                 } else {
2184                         strncpy(pidftemplate, "/var/run/nbd-server.pid", 255);
2185                 }
2186         }
2187         snprintf(pidfname, 255, pidftemplate, serve ? serve->port : 0);
2188         pidf=fopen(pidfname, "w");
2189         if(pidf) {
2190                 fprintf(pidf,"%d\n", (int)getpid());
2191                 fclose(pidf);
2192         } else {
2193                 perror("fopen");
2194                 fprintf(stderr, "Not fatal; continuing");
2195         }
2196 }
2197 #else
2198 #define daemonize(serve)
2199 #endif /* !defined(NODAEMON) */
2200
2201 /*
2202  * Everything beyond this point (in the file) is run in non-daemon mode.
2203  * The stuff above daemonize() isn't.
2204  */
2205
2206 void serve_err(SERVER* serve, const char* msg) G_GNUC_NORETURN;
2207
2208 void serve_err(SERVER* serve, const char* msg) {
2209         g_message("Export of %s on port %d failed:", serve->exportname,
2210                         serve->port);
2211         err(msg);
2212 }
2213
2214 /**
2215  * Set up user-ID and/or group-ID
2216  **/
2217 void dousers(void) {
2218         struct passwd *pw;
2219         struct group *gr;
2220         gchar* str;
2221         if(rungroup) {
2222                 gr=getgrnam(rungroup);
2223                 if(!gr) {
2224                         str = g_strdup_printf("Invalid group name: %s", rungroup);
2225                         err(str);
2226                 }
2227                 if(setgid(gr->gr_gid)<0) {
2228                         err("Could not set GID: %m"); 
2229                 }
2230         }
2231         if(runuser) {
2232                 pw=getpwnam(runuser);
2233                 if(!pw) {
2234                         str = g_strdup_printf("Invalid user name: %s", runuser);
2235                         err(str);
2236                 }
2237                 if(setuid(pw->pw_uid)<0) {
2238                         err("Could not set UID: %m");
2239                 }
2240         }
2241 }
2242
2243 #ifndef ISSERVER
2244 void glib_message_syslog_redirect(const gchar *log_domain,
2245                                   GLogLevelFlags log_level,
2246                                   const gchar *message,
2247                                   gpointer user_data)
2248 {
2249     int level=LOG_DEBUG;
2250     
2251     switch( log_level )
2252     {
2253       case G_LOG_FLAG_FATAL:
2254       case G_LOG_LEVEL_CRITICAL:
2255       case G_LOG_LEVEL_ERROR:    
2256         level=LOG_ERR; 
2257         break;
2258       case G_LOG_LEVEL_WARNING:
2259         level=LOG_WARNING;
2260         break;
2261       case G_LOG_LEVEL_MESSAGE:
2262       case G_LOG_LEVEL_INFO:
2263         level=LOG_INFO;
2264         break;
2265       case G_LOG_LEVEL_DEBUG:
2266         level=LOG_DEBUG;
2267       default:
2268         level=LOG_ERR;
2269     }
2270     syslog(level, "%s", message);
2271 }
2272 #endif
2273
2274 /**
2275  * Main entry point...
2276  **/
2277 int main(int argc, char *argv[]) {
2278         SERVER *serve;
2279         GArray *servers;
2280         GError *err=NULL;
2281
2282         if (sizeof( struct nbd_request )!=28) {
2283                 fprintf(stderr,"Bad size of structure. Alignment problems?\n");
2284                 exit(EXIT_FAILURE) ;
2285         }
2286
2287         memset(pidftemplate, '\0', 256);
2288
2289         logging();
2290         config_file_pos = g_strdup(CFILE);
2291         serve=cmdline(argc, argv);
2292         servers = parse_cfile(config_file_pos, &err);
2293         
2294         if(serve) {
2295                 serve->socket_family = AF_UNSPEC;
2296
2297                 append_serve(serve, servers);
2298      
2299                 if (!(serve->port)) {
2300                         CLIENT *client;
2301 #ifndef ISSERVER
2302                         /* You really should define ISSERVER if you're going to use
2303                          * inetd mode, but if you don't, closing stdout and stderr
2304                          * (which inetd had connected to the client socket) will let it
2305                          * work. */
2306                         close(1);
2307                         close(2);
2308                         open("/dev/null", O_WRONLY);
2309                         open("/dev/null", O_WRONLY);
2310                         g_log_set_default_handler( glib_message_syslog_redirect, NULL );
2311 #endif
2312                         client=g_malloc(sizeof(CLIENT));
2313                         client->server=serve;
2314                         client->net=0;
2315                         client->exportsize=OFFT_MAX;
2316                         set_peername(0,client);
2317                         serveconnection(client);
2318                         return 0;
2319                 }
2320         }
2321     
2322         if(!servers || !servers->len) {
2323                 if(err && !(err->domain == g_quark_from_string("parse_cfile")
2324                                 && err->code == CFILE_NOTFOUND)) {
2325                         g_warning("Could not parse config file: %s", 
2326                                         err ? err->message : "Unknown error");
2327                 }
2328         }
2329         if(serve) {
2330                 g_warning("Specifying an export on the command line is deprecated.");
2331                 g_warning("Please use a configuration file instead.");
2332         }
2333
2334         if((!serve) && (!servers||!servers->len)) {
2335                 g_message("No configured exports; quitting.");
2336                 exit(EXIT_FAILURE);
2337         }
2338         if (!dontfork)
2339                 daemonize(serve);
2340         setup_servers(servers);
2341         dousers();
2342         serveloop(servers);
2343         return 0 ;
2344 }