Statistics
| Branch: | Revision:

root / drivers / block-log.c @ abdb293f

History | View | Annotate | Download (16.2 kB)

1
/* 
2
 * Copyright (c) 2008, XenSource Inc.
3
 * All rights reserved.
4
 *
5
 * Redistribution and use in source and binary forms, with or without
6
 * modification, are permitted provided that the following conditions are met:
7
 *     * Redistributions of source code must retain the above copyright
8
 *       notice, this list of conditions and the following disclaimer.
9
 *     * Redistributions in binary form must reproduce the above copyright
10
 *       notice, this list of conditions and the following disclaimer in the
11
 *       documentation and/or other materials provided with the distribution.
12
 *     * Neither the name of XenSource Inc. nor the names of its contributors
13
 *       may be used to endorse or promote products derived from this software
14
 *       without specific prior written permission.
15
 *
16
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
 */
28

    
29
/* Driver to sit on top of another disk and log writes, in order
30
 * to synchronize two distinct disks
31
 *
32
 * On receipt of a control request it can export a list of dirty
33
 * sectors in the following format:
34
 * struct writerange {
35
 *   u64 sector;
36
 *   u32 count;
37
 * }
38
 * terminated by { 0, 0 }
39
 */
40

    
41
#ifdef HAVE_CONFIG_H
42
#include "config.h"
43
#endif
44

    
45
#include <errno.h>
46
#include <stdio.h>
47
#include <fcntl.h>
48
#include <unistd.h>
49
#include <stdlib.h>
50
#include <sys/mman.h>
51
#include <sys/socket.h>
52
#include <sys/un.h>
53

    
54
#include "log.h"
55
#include "tapdisk.h"
56
#include "tapdisk-server.h"
57
#include "tapdisk-driver.h"
58
#include "tapdisk-interface.h"
59

    
60
#define MAX_CONNECTIONS 1
61

    
62
typedef struct poll_fd {
63
  int          fd;
64
  event_id_t   id;
65
} poll_fd_t;
66

    
67
struct tdlog_state {
68
  uint64_t     size;
69

    
70
  void*        writelog;
71

    
72
  char*        ctlpath;
73
  poll_fd_t    ctl;
74

    
75
  int          connected;
76
  poll_fd_t    connections[MAX_CONNECTIONS];
77

    
78
  char*        shmpath;
79
  void*        shm;
80

    
81
  log_sring_t* sring;
82
  log_back_ring_t bring;
83
};
84

    
85
/* Debug-level syslog message, prefixed with "log: " and terminated
 * with a newline. */
#define BDPRINTF(_fmt, ...) \
  syslog(LOG_DEBUG, "log: " _fmt "\n", ## __VA_ARGS__)

/* Same as BDPRINTF, but at warning level. */
#define BWPRINTF(_fmt, ...) \
  syslog(LOG_WARNING, "log: " _fmt "\n", ## __VA_ARGS__)
88

    
89
static void ctl_accept(event_id_t, char, void *);
90
static void ctl_request(event_id_t, char, void *);
91

    
92
/* -- write log -- */
93

    
94
/* large flat bitmaps don't scale particularly well either in size or scan
95
 * time, but they'll do for now */
96
#define BITS_PER_LONG (sizeof(unsigned long) * 8)
#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)

/* word of the bitmap holding bit _nr (usable as an lvalue) */
#define BITMAP_ENTRY(_nr, _bmap) \
  (((unsigned long*)(_bmap))[(_nr)/BITS_PER_LONG])
/* position of bit _nr inside its word */
#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)

/* The bit index is 64 bits wide: callers pass sector numbers, which do
 * not fit in an int on large disks. */

/* Return 1 if bit nr of bmap is set, 0 otherwise. */
static inline int test_bit(uint64_t nr, void* bmap)
{
  return (BITMAP_ENTRY(nr, bmap) >> BITMAP_SHIFT(nr)) & 1;
}

/* Clear bit nr of bmap. */
static inline void clear_bit(uint64_t nr, void* bmap)
{
  BITMAP_ENTRY(nr, bmap) &= ~(1UL << BITMAP_SHIFT(nr));
}

/* Set bit nr of bmap. */
static inline void set_bit(uint64_t nr, void* bmap)
{
  BITMAP_ENTRY(nr, bmap) |= (1UL << BITMAP_SHIFT(nr));
}
116

    
117
/* Number of bytes needed for a bitmap of sz bits.
 *
 * The result is rounded up to a whole number of unsigned long words,
 * because the bit helpers access the bitmap one word at a time, and it
 * is returned as a 64-bit value so it cannot be truncated for very
 * large disks. */
static inline uint64_t bitmap_size(uint64_t sz)
{
  const uint64_t word_bits = sizeof(unsigned long) * 8;
  /* divide first so that sz close to UINT64_MAX cannot overflow */
  uint64_t words = sz / word_bits + ((sz % word_bits) ? 1 : 0);

  return words * sizeof(unsigned long);
}
121

    
122
static int writelog_create(struct tdlog_state *s)
123
{
124
  uint64_t bmsize;
125

    
126
  bmsize = bitmap_size(s->size);
127

    
128
  BDPRINTF("allocating %"PRIu64" bytes for dirty bitmap", bmsize);
129

    
130
  if (!(s->writelog = calloc(bmsize, 1))) {
131
    BWPRINTF("could not allocate dirty bitmap of size %"PRIu64, bmsize);
132
    return -1;
133
  }
134

    
135
  return 0;
136
}
137

    
138
static int writelog_free(struct tdlog_state *s)
139
{
140
  if (s->writelog)
141
    free(s->writelog);
142

    
143
  return 0;
144
}
145

    
146
static int writelog_set(struct tdlog_state* s, uint64_t sector, int count)
147
{
148
  int i;
149

    
150
  for (i = 0; i < count; i++) 
151
    set_bit(sector + i, s->writelog);
152

    
153
  return 0;
154
}
155

    
156
/* if end is 0, clear to end of disk */
157
int writelog_clear(struct tdlog_state* s, uint64_t start, uint64_t end)
158
{
159
  if (!end)
160
    end = s->size;
161

    
162
  /* clear to word boundaries */
163
  while (BITMAP_SHIFT(start))
164
    clear_bit(start++, s->writelog);
165
  while (BITMAP_SHIFT(end))
166
    clear_bit(end--, s->writelog);
167

    
168
  memset(s->writelog + start / BITS_PER_LONG, 0, (end - start) >> 3);
169

    
170
  return 0;
171
}
172

    
173
/* returns last block exported (may not be end of disk if shm region
174
 * overflows) */
175
static uint64_t writelog_export(struct tdlog_state* s)
176
{
177
  struct disk_range* range = s->shm;
178
  uint64_t i = 0;
179

    
180
  BDPRINTF("sector count: %"PRIu64, s->size);
181

    
182
  for (i = 0; i < s->size; i++) {
183
    if (test_bit(i, s->writelog)) {
184
      /* range start */
185
      range->sector = i;
186
      range->count = 1;
187
      /* find end */
188
      for (i++; i < s->size && test_bit(i, s->writelog); i++)
189
        range->count++;
190

    
191
      BDPRINTF("export: dirty extent %"PRIu64":%u",
192
               range->sector, range->count);
193
      range++;
194

    
195
      /* out of space in shared memory region */
196
      if ((void*)range >= bmend(s->shm)) {
197
        BDPRINTF("out of space in shm region at sector %"PRIu64, i);
198
        return i;
199
      }
200

    
201
      /* undo forloop increment */
202
      i--;
203
    }
204
  }
205

    
206
  /* NULL-terminate range list */
207
  range->sector = 0;
208
  range->count = 0;
209

    
210
  return i;
211
}
212

    
213
/* -- communication channel -- */
214

    
215
/* Replace every ':' and '/' with '_' in path, looking at no more than
 * len bytes and stopping early at the terminating NUL. */
static inline void path_escape(char* path, size_t len) {
  size_t pos;

  for (pos = 0; pos < len; pos++) {
    char c = path[pos];

    if (c == '\0')
      break;
    if (c == ':' || c == '/')
      path[pos] = '_';
  }
}
223

    
224
static char* ctl_makepath(const char* name, const char* ext)
225
{
226
  char* res;
227
  char *file;
228

    
229
  file = strrchr(name, '/');
230
  if (!file) {
231
    BWPRINTF("invalid name %s\n", name);
232
    return NULL;
233
  }
234

    
235
  if (asprintf(&res, BLKTAP_CTRL_DIR "/log_%s.%s", file, ext) < 0) {
236
    BWPRINTF("could not allocate path");
237
    return NULL;
238
  }
239

    
240
  path_escape(res + strlen(BLKTAP_CTRL_DIR) + 5, strlen(file));
241

    
242
  return res;
243
}
244

    
245
static int shmem_open(struct tdlog_state* s, const char* name)
246
{
247
  int i, l, fd;
248

    
249
  /* device name -> path */
250
  if (asprintf(&s->shmpath, "/log_%s.wlog", name) < 0) {
251
    BWPRINTF("could not allocate shm path");
252
    return -1;
253
  }
254

    
255
  path_escape(s->shmpath + 5, strlen(name));
256

    
257
  if ((fd = shm_open(s->shmpath, O_CREAT|O_RDWR, 0750)) < 0) {
258
    BWPRINTF("could not open shared memory file %s: %s", s->shmpath,
259
             strerror(errno));
260
    goto err;
261
  }
262
  if (ftruncate(fd, SHMSIZE) < 0) {
263
    BWPRINTF("error truncating shmem to size %u", SHMSIZE);
264
    close(fd);
265
    goto err;
266
  }
267

    
268
  s->shm = mmap(NULL, SHMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
269
  close(fd);
270
  if (s->shm == MAP_FAILED) {
271
    BWPRINTF("could not mmap write log shm: %s", strerror(errno));
272
    goto err;
273
  }
274
  return 0;
275

    
276
  err:
277
  s->shm = NULL;
278
  free(s->shmpath);
279
  s->shmpath = NULL;
280
  return -1;
281
}
282

    
283
static int shmem_close(struct tdlog_state* s)
284
{
285
  if (s->shm) {
286
    munmap(s->shm, SHMSIZE);
287
    s->shm = NULL;
288
  }
289

    
290
  if (s->shmpath) {
291
    shm_unlink(s->shmpath);
292
    s->shmpath = NULL;
293
  }
294

    
295
  return 0;
296
}
297

    
298
/* control socket */
299

    
300
static int ctl_open(struct tdlog_state* s, const char* name)
301
{
302
  struct sockaddr_un saddr;
303

    
304
  if (!(s->ctlpath = ctl_makepath(name, "ctl")))
305
    return -1;
306

    
307
  if ((s->ctl.fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
308
    BWPRINTF("error opening control socket: %s", strerror(errno));
309
    goto err;
310
  }
311

    
312
  memset(&saddr, 0, sizeof(saddr));
313
  saddr.sun_family = AF_UNIX;
314
  memcpy(saddr.sun_path, s->ctlpath, strlen(s->ctlpath));
315
  if (unlink(s->ctlpath) && errno != ENOENT) {
316
    BWPRINTF("error unlinking old socket path %s: %s", s->ctlpath,
317
             strerror(errno));
318
    goto err_sock;
319
  }
320
    
321
  if (bind(s->ctl.fd, &saddr, sizeof(saddr)) < 0) {
322
    BWPRINTF("error binding control socket to %s: %s", s->ctlpath,
323
             strerror(errno));
324
    goto err_sock;
325
  }
326

    
327
  if (listen(s->ctl.fd, 1) < 0) {
328
    BWPRINTF("error listening on control socket: %s", strerror(errno));
329
    goto err_sock;
330
  }
331

    
332
  s->ctl.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
333
                                            s->ctl.fd, 0, ctl_accept, s);
334
  if (s->ctl.id < 0) {
335
    BWPRINTF("error register event handler: %s", strerror(s->ctl.id));
336
    goto err_sock;
337
  }
338

    
339
  return 0;
340

    
341
  err_sock:
342
  close(s->ctl.fd);
343
  s->ctl.fd = -1;
344
  err:
345
  free(s->ctlpath);
346
  s->ctlpath = NULL;
347

    
348
  return -1;
349
}
350

    
351
static int ctl_close(struct tdlog_state* s)
352
{
353
  while (s->connected) {
354
    tapdisk_server_unregister_event(s->connections[s->connected].id);
355
    close(s->connections[s->connected].fd);
356
    s->connections[s->connected].fd = -1;
357
    s->connections[s->connected].id = 0;
358
    s->connected--;
359
  }
360

    
361
  if (s->ctl.fd >= 0) {
362
    tapdisk_server_unregister_event(s->ctl.id);
363
    close(s->ctl.fd);
364
    s->ctl.fd = -1;
365
    s->ctl.id = 0;
366
  }
367

    
368
  if (s->ctlpath) {
369
    unlink(s->ctlpath);
370
    free(s->ctlpath);
371
    s->ctlpath = NULL;
372
  }
373

    
374
  /* XXX this must be fixed once requests are actually in flight */
375
  /* could just drain the existing ring here first */
376
  if (s->sring) {
377
    SHARED_RING_INIT(s->sring);
378
    BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
379
  }
380

    
381
  return 0;
382
}
383

    
384
/* walk list of open sockets, close matching fd */
385
static int ctl_close_sock(struct tdlog_state* s, int fd)
386
{
387
  int i;
388

    
389
  for (i = 0; i <= s->connected; i++) {
390
    if (s->connections[i].fd == fd) {
391
      tapdisk_server_unregister_event(s->connections[i].id);
392
      close(s->connections[i].fd);
393
      s->connections[i].fd = -1;
394
      s->connections[i].id = 0;
395
      s->connected--;
396
      return 0;
397
    }
398
  }
399

    
400
  BWPRINTF("requested to close unknown socket %d", fd);
401
  return -1;
402
}
403

    
404
/* Scheduler callback for the listening control socket: accept one
 * connection and register ctl_request as its read handler.
 *
 * private is the struct tdlog_state passed at registration.  When all
 * MAX_CONNECTIONS slots are in use the new connection is closed
 * immediately. */
static void ctl_accept(event_id_t id, char mode, void *private)
{
  struct tdlog_state* s = (struct tdlog_state *)private;
  int fd;
  event_id_t cid;

  if ((fd = accept(s->ctl.fd, NULL, NULL)) < 0) {
    BWPRINTF("error accepting control connection: %s", strerror(errno));
    return;
  }

  /* tie the limit to the size of connections[] rather than to "any" */
  if (s->connected >= MAX_CONNECTIONS) {
    BWPRINTF("control session in progress, closing new connection");
    close(fd);
    return;
  }

  cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
                                      fd, 0, ctl_request, s);
  if (cid < 0) {
    /* NOTE(review): assumes a negative return is a negated errno value;
     * confirm against tapdisk_server_register_event */
    BWPRINTF("error registering connection event handler: %s",
             strerror(-cid));
    close(fd);
    return;
  }

  s->connections[s->connected].fd = fd;
  s->connections[s->connected].id = cid;
  s->connected++;
}
433

    
434
/* response format: 4 bytes shmsize, 0-terminated path */
435
static int ctl_get_shmpath(struct tdlog_state* s, int fd)
436
{
437
  char msg[CTLRSPLEN_SHMP + 1];
438
  uint32_t sz;
439
  int rc;
440

    
441
  BDPRINTF("ctl: sending shared memory parameters (size: %u, path: %s)",
442
           SHMSIZE, s->shmpath);
443

    
444
  /* TMP: sanity-check shm */
445
  sz = 0xdeadbeef;
446
  memcpy(s->shm, &sz, sizeof(sz));
447

    
448
  sz = SHMSIZE;
449
  memcpy(msg, &sz, sizeof(sz));
450
  snprintf(msg + sizeof(sz), sizeof(msg) - sizeof(sz), "%s", s->shmpath);
451
  if ((rc = write(fd, msg, CTLRSPLEN_SHMP)) < 0) {
452
    BWPRINTF("error writing shmpath: %s", strerror(errno));
453
    return -1;
454
  }
455

    
456
  return 0;
457
}
458

    
459
static int ctl_peek_writes(struct tdlog_state* s, int fd)
460
{
461
  int rc;
462

    
463
  BDPRINTF("ctl: peeking bitmap");
464

    
465
  writelog_export(s);
466

    
467
  if ((rc = write(fd, "done", CTLRSPLEN_PEEK)) < 0) {
468
    BWPRINTF("error writing peek ack: %s", strerror(errno));
469
    return -1;
470
  }
471

    
472
  return 0;
473
}
474

    
475
static int ctl_clear_writes(struct tdlog_state* s, int fd)
476
{
477
  int rc;
478

    
479
  BDPRINTF("ctl: clearing bitmap");
480

    
481
  writelog_clear(s, 0, 0);
482

    
483
  if ((rc = write(fd, "done", CTLRSPLEN_CLEAR)) < 0) {
484
    BWPRINTF("error writing clear ack: %s", strerror(errno));
485
    return -1;
486
  }
487

    
488
  return 0;
489
}
490

    
491
/* get dirty bitmap and clear it atomically */
492
static int ctl_get_writes(struct tdlog_state* s, int fd)
493
{
494
  int rc;
495

    
496
  BDPRINTF("ctl: getting bitmap");
497

    
498
  writelog_export(s);
499
  writelog_clear(s, 0, 0);
500

    
501
  if ((rc = write(fd, "done", CTLRSPLEN_GET)) < 0) {
502
    BWPRINTF("error writing get ack: %s", strerror(errno));
503
    return -1;
504
  }
505

    
506
  return 0;
507
}
508

    
509
/* get requests from ring */
510
static int ctl_kick(struct tdlog_state* s, int fd)
511
{
512
  RING_IDX reqstart, reqend;
513
  log_request_t req;
514

    
515
  /* XXX testing */
516
  RING_IDX rspstart, rspend;
517
  log_response_t rsp;
518
  struct log_ctlmsg msg;
519
  int rc;
520

    
521
  reqstart = s->bring.req_cons;
522
  reqend = s->sring->req_prod;
523

    
524
  BDPRINTF("ctl: ring kicked (start = %u, end = %u)", reqstart, reqend);
525

    
526
  while (reqstart != reqend) {
527
    /* XXX actually submit these! */
528
    memcpy(&req, RING_GET_REQUEST(&s->bring, reqstart), sizeof(req));
529
    BDPRINTF("ctl: read request %"PRIu64":%u", req.sector, req.count);
530
    s->bring.req_cons = ++reqstart;
531

    
532
    rsp.sector = req.sector;
533
    rsp.count = req.count;
534
    memcpy(RING_GET_RESPONSE(&s->bring, s->bring.rsp_prod_pvt), &rsp,
535
           sizeof(rsp));
536
    s->bring.rsp_prod_pvt++;
537
  }
538

    
539
  RING_PUSH_RESPONSES(&s->bring);
540
  memset(&msg, 0, sizeof(msg));
541
  memcpy(msg.msg, LOGCMD_KICK, 4);
542
  if ((rc = write(fd, &msg, sizeof(msg))) < 0) {
543
    BWPRINTF("error sending notify: %s", strerror(errno));
544
    return -1;
545
  } else if (rc < sizeof(msg)) {
546
    BWPRINTF("short notify write (%d/%zd)", rc, sizeof(msg));
547
    return -1;
548
  }
549

    
550
  return 0;
551
}
552

    
553
static int ctl_do_request(struct tdlog_state* s, int fd, struct log_ctlmsg* msg)
554
{
555
  if (!strncmp(msg->msg, LOGCMD_SHMP, 4)) {
556
    return ctl_get_shmpath(s, fd);
557
  } else if (!strncmp(msg->msg, LOGCMD_PEEK, 4)) {
558
    return ctl_peek_writes(s, fd);
559
  } else if (!strncmp(msg->msg, LOGCMD_CLEAR, 4)) {
560
    return ctl_clear_writes(s, fd);
561
  } else if (!strncmp(msg->msg, LOGCMD_GET, 4)) {
562
    return ctl_get_writes(s, fd);
563
  } else if (!strncmp(msg->msg, LOGCMD_KICK, 4)) {
564
    return ctl_kick(s, fd);
565
  }
566

    
567
  BWPRINTF("unknown control request %.4s", msg->msg);
568
  return -1;
569
}
570

    
571
/* Look up the accepted connection registered under event id.  Returns
 * its file descriptor, or -1 (after logging a warning) if no live
 * connection has that id. */
static inline int ctl_find_connection(struct tdlog_state *s, event_id_t id)
{
  int slot = 0;

  while (slot < s->connected) {
    if (s->connections[slot].id == id)
      return s->connections[slot].fd;
    slot++;
  }

  BWPRINTF("unrecognized event callback id %d", id);
  return -1;
}
582

    
583
/* Scheduler callback for an accepted control connection: read one
 * struct log_ctlmsg and dispatch it.
 *
 * private is the struct tdlog_state passed at registration.  The
 * connection is closed on read error or EOF; a short message is logged
 * and ignored. */
static void ctl_request(event_id_t id, char mode, void *private)
{
  struct tdlog_state* s = (struct tdlog_state*)private;
  struct log_ctlmsg msg;
  int rc, fd;

  fd = ctl_find_connection(s, id);
  if (fd == -1)
    return;

  if ((rc = read(fd, &msg, sizeof(msg))) < 0) {
    BWPRINTF("error reading from ctl socket %d, closing: %s", fd,
             strerror(errno));
    ctl_close_sock(s, fd);
    return;
  } else if (rc == 0) {
    BDPRINTF("ctl_request: EOF, closing socket");
    ctl_close_sock(s, fd);
    return;
  } else if ((size_t)rc < sizeof(msg)) {
    /* rc is positive here, so the cast is safe */
    BWPRINTF("short request received (%d/%zu bytes), ignoring", rc,
             sizeof(msg));
    return;
  }

  ctl_do_request(s, fd, &msg);
}
610

    
611
/* -- interface -- */
612

    
613
static int tdlog_close(td_driver_t*);
614

    
615
/* Driver open hook: reset the private state, allocate the dirty bitmap,
 * create the shared memory region and the control socket, and
 * initialise the shared ring inside the region.
 *
 * Returns 0 on success; on failure everything set up so far is torn
 * down with tdlog_close and the failing step's result is returned. */
static int tdlog_open(td_driver_t* driver, const char* name, td_flag_t flags)
{
  struct tdlog_state* s = (struct tdlog_state*)driver->data;
  int rc, i;

  memset(s, 0, sizeof(*s));

  /* 0 is a valid descriptor: mark every fd as unused so that the
   * tdlog_close calls below never close or unregister something this
   * driver did not open */
  s->ctl.fd = -1;
  for (i = 0; i < MAX_CONNECTIONS; i++)
    s->connections[i].fd = -1;

  s->size = driver->info.size;

  if ((rc = writelog_create(s))) {
    tdlog_close(driver);
    return rc;
  }
  if ((rc = shmem_open(s, name))) {
    tdlog_close(driver);
    return rc;
  }
  if ((rc = ctl_open(s, name))) {
    tdlog_close(driver);
    return rc;
  }

  s->sring = (log_sring_t*)sringstart(s->shm);
  SHARED_RING_INIT(s->sring);
  BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);

  BDPRINTF("opened ctl socket");

  return 0;
}
645

    
646
/* Driver close hook: shut down the control channel, then the shared
 * memory region, then release the dirty bitmap.  Always returns 0. */
static int tdlog_close(td_driver_t* driver)
{
  struct tdlog_state* state = (struct tdlog_state*)driver->data;

  ctl_close(state);      /* control socket and connections */
  shmem_close(state);    /* shared memory mapping and object */
  writelog_free(state);  /* dirty bitmap */

  return 0;
}
656

    
657
/* Read hook: reads are not logged, the request is forwarded unchanged. */
static void tdlog_queue_read(td_driver_t* driver, td_request_t treq)
{
  (void)driver;  /* no per-driver state is needed for reads */

  td_forward_request(treq);
}
661

    
662
/* Write hook: mark the sectors touched by the request (treq.secs sectors
 * starting at treq.sec) as dirty, then forward the request unchanged. */
static void tdlog_queue_write(td_driver_t* driver, td_request_t treq)
{
  struct tdlog_state* s = (struct tdlog_state*)driver->data;

  writelog_set(s, treq.sec, treq.secs);
  td_forward_request(treq);
}
670

    
671
/* Parent lookup hook: always fails with -EINVAL and leaves id untouched. */
static int tdlog_get_parent_id(td_driver_t* driver, td_disk_id_t* id)
{
  (void)driver;
  (void)id;

  return -EINVAL;
}
675

    
676
/* Parent validation hook: accepts any parent unconditionally (returns 0). */
static int tdlog_validate_parent(td_driver_t *driver,
                                 td_driver_t *parent, td_flag_t flags)
{
  (void)driver;
  (void)parent;
  (void)flags;

  return 0;
}
681

    
682
struct tap_disk tapdisk_log = {
683
  .disk_type          = "tapdisk_log",
684
  .private_data_size  = sizeof(struct tdlog_state),
685
  .flags              = 0,
686
  .td_open            = tdlog_open,
687
  .td_close           = tdlog_close,
688
  .td_queue_read      = tdlog_queue_read,
689
  .td_queue_write     = tdlog_queue_write,
690
  .td_get_parent_id   = tdlog_get_parent_id,
691
  .td_validate_parent = tdlog_validate_parent,
692
};