Home | History | Annotate | Line # | Download | only in dns
journal.c revision 1.1.1.5
      1 /*	$NetBSD: journal.c,v 1.1.1.5 2021/02/19 16:37:12 christos Exp $	*/
      2 
      3 /*
      4  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
      5  *
      6  * This Source Code Form is subject to the terms of the Mozilla Public
      7  * License, v. 2.0. If a copy of the MPL was not distributed with this
      8  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
      9  *
     10  * See the COPYRIGHT file distributed with this work for additional
     11  * information regarding copyright ownership.
     12  */
     13 
     14 #include <errno.h>
     15 #include <inttypes.h>
     16 #include <stdbool.h>
     17 #include <stdlib.h>
     18 #include <unistd.h>
     19 
     20 #include <isc/file.h>
     21 #include <isc/mem.h>
     22 #include <isc/print.h>
     23 #include <isc/stdio.h>
     24 #include <isc/string.h>
     25 #include <isc/util.h>
     26 
     27 #include <dns/compress.h>
     28 #include <dns/db.h>
     29 #include <dns/dbiterator.h>
     30 #include <dns/diff.h>
     31 #include <dns/fixedname.h>
     32 #include <dns/journal.h>
     33 #include <dns/log.h>
     34 #include <dns/rdataset.h>
     35 #include <dns/rdatasetiter.h>
     36 #include <dns/result.h>
     37 #include <dns/soa.h>
     38 
     39 /*! \file
     40  * \brief Journaling.
     41  *
     42  * A journal file consists of
     43  *
     44  *   \li A fixed-size header of type journal_rawheader_t.
     45  *
     46  *   \li The index.  This is an unordered array of index entries
     47  *     of type journal_rawpos_t giving the locations
     48  *     of some arbitrary subset of the journal's addressable
     49  *     transactions.  The index entries are used as hints to
     50  *     speed up the process of locating a transaction with a given
     51  *     serial number.  Unused index entries have an "offset"
     52  *     field of zero.  The size of the index can vary between
     53  *     journal files, but does not change during the lifetime
     54  *     of a file.  The size can be zero.
     55  *
     56  *   \li The journal data.  This  consists of one or more transactions.
     57  *     Each transaction begins with a transaction header of type
     58  *     journal_rawxhdr_t.  The transaction header is followed by a
     59  *     sequence of RRs, similar in structure to an IXFR difference
     60  *     sequence (RFC1995).  That is, the pre-transaction SOA,
     61  *     zero or more other deleted RRs, the post-transaction SOA,
     62  *     and zero or more other added RRs.  Unlike in IXFR, each RR
     63  *     is prefixed with a 32-bit length.
     64  *
     65  *     The journal data part grows as new transactions are
     66  *     appended to the file.  Only those transactions
     67  *     whose serial number is current-(2^31-1) to current
     68  *     are considered "addressable" and may be pointed
     69  *     to from the header or index.  They may be preceded
     70  *     by old transactions that are no longer addressable,
     71  *     and they may be followed by transactions that were
     72  *     appended to the journal but never committed by updating
     73  *     the "end" position in the header.  The latter will
     74  *     be overwritten when new transactions are added.
     75  */
     76 
     77 /**************************************************************************/
     78 /*
     79  * Miscellaneous utilities.
     80  */
     81 
     82 #define JOURNAL_COMMON_LOGARGS \
     83 	dns_lctx, DNS_LOGCATEGORY_GENERAL, DNS_LOGMODULE_JOURNAL
     84 
     85 #define JOURNAL_DEBUG_LOGARGS(n) JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(n)
     86 
     87 /*%
     88  * It would be non-sensical (or at least obtuse) to use FAIL() with an
     89  * ISC_R_SUCCESS code, but the test is there to keep the Solaris compiler
     90  * from complaining about "end-of-loop code not reached".
     91  */
     92 #define FAIL(code)                           \
     93 	do {                                 \
     94 		result = (code);             \
     95 		if (result != ISC_R_SUCCESS) \
     96 			goto failure;        \
     97 	} while (0)
     98 
     99 #define CHECK(op)                            \
    100 	do {                                 \
    101 		result = (op);               \
    102 		if (result != ISC_R_SUCCESS) \
    103 			goto failure;        \
    104 	} while (0)
    105 
    106 #define JOURNAL_SERIALSET 0x01U
    107 
    108 static isc_result_t
    109 index_to_disk(dns_journal_t *);
    110 
/*
 * Decode the 32-bit big-endian value stored in the four bytes at 'p'.
 *
 * Each byte is widened to uint32_t before it is shifted.  An
 * 'unsigned char' operand is promoted to (signed) int, so shifting a
 * byte >= 0x80 left by 24 bits would overflow int, which is undefined
 * behavior.
 */
static inline uint32_t
decode_uint32(unsigned char *p) {
	return (((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
		((uint32_t)p[2] << 8) | (uint32_t)p[3]);
}
    115 
/*
 * Store 'val' into the four bytes at 'p' in big-endian order
 * (most significant byte first).
 */
static inline void
encode_uint32(uint32_t val, unsigned char *p) {
	unsigned int n;

	for (n = 0; n < 4; n++) {
		p[n] = (uint8_t)(val >> (8 * (3 - n)));
	}
}
    123 
    124 isc_result_t
    125 dns_db_createsoatuple(dns_db_t *db, dns_dbversion_t *ver, isc_mem_t *mctx,
    126 		      dns_diffop_t op, dns_difftuple_t **tp) {
    127 	isc_result_t result;
    128 	dns_dbnode_t *node;
    129 	dns_rdataset_t rdataset;
    130 	dns_rdata_t rdata = DNS_RDATA_INIT;
    131 	dns_fixedname_t fixed;
    132 	dns_name_t *zonename;
    133 
    134 	zonename = dns_fixedname_initname(&fixed);
    135 	dns_name_copynf(dns_db_origin(db), zonename);
    136 
    137 	node = NULL;
    138 	result = dns_db_findnode(db, zonename, false, &node);
    139 	if (result != ISC_R_SUCCESS) {
    140 		goto nonode;
    141 	}
    142 
    143 	dns_rdataset_init(&rdataset);
    144 	result = dns_db_findrdataset(db, node, ver, dns_rdatatype_soa, 0,
    145 				     (isc_stdtime_t)0, &rdataset, NULL);
    146 	if (result != ISC_R_SUCCESS) {
    147 		goto freenode;
    148 	}
    149 
    150 	result = dns_rdataset_first(&rdataset);
    151 	if (result != ISC_R_SUCCESS) {
    152 		goto freenode;
    153 	}
    154 
    155 	dns_rdataset_current(&rdataset, &rdata);
    156 	dns_rdataset_getownercase(&rdataset, zonename);
    157 
    158 	result = dns_difftuple_create(mctx, op, zonename, rdataset.ttl, &rdata,
    159 				      tp);
    160 
    161 	dns_rdataset_disassociate(&rdataset);
    162 	dns_db_detachnode(db, &node);
    163 	return (result);
    164 
    165 freenode:
    166 	dns_db_detachnode(db, &node);
    167 nonode:
    168 	UNEXPECTED_ERROR(__FILE__, __LINE__, "missing SOA");
    169 	return (result);
    170 }
    171 
    172 /* Journaling */
    173 
/*%
 * On-disk form of a "pointer" to a journal entry.  The journal header
 * uses these to mark the beginning and end of the journal, and the
 * journal index uses them to locate other transactions.
 */
typedef struct {
	/*% SOA serial before update. */
	unsigned char serial[4];
	/*%
	 * Offset from beginning of file.
	 *
	 * XXXRTH  Should offset be 8 bytes?
	 * XXXDCL ... probably, since isc_offset_t is 8 bytes on many OSs.
	 * XXXAG  ... but we will not be able to seek >2G anyway on many
	 *            platforms as long as we are using fseek() rather
	 *            than lseek().
	 */
	unsigned char offset[4];
} journal_rawpos_t;
    191 
    192 /*%
    193  * The header is of a fixed size, with some spare room for future
    194  * extensions.
    195  */
    196 #define JOURNAL_HEADER_SIZE 64 /* Bytes. */
    197 
    198 /*%
    199  * The on-disk representation of the journal header.
    200  * All numbers are stored in big-endian order.
    201  */
    202 typedef union {
    203 	struct {
    204 		/*% File format version ID. */
    205 		unsigned char format[16];
    206 		/*% Position of the first addressable transaction */
    207 		journal_rawpos_t begin;
    208 		/*% Position of the next (yet nonexistent) transaction. */
    209 		journal_rawpos_t end;
    210 		/*% Number of index entries following the header. */
    211 		unsigned char index_size[4];
    212 		/*% Source serial number. */
    213 		unsigned char sourceserial[4];
    214 		unsigned char flags;
    215 	} h;
    216 	/* Pad the header to a fixed size. */
    217 	unsigned char pad[JOURNAL_HEADER_SIZE];
    218 } journal_rawheader_t;
    219 
/*%
 * On-disk form of a transaction header; one precedes every
 * transaction in the journal data.
 */
typedef struct {
	/*% Transaction size in bytes, excluding this header. */
	unsigned char size[4];
	/*% Number of records in transaction. */
	unsigned char count[4];
	/*% SOA serial before update. */
	unsigned char serial0[4];
	/*% SOA serial after update. */
	unsigned char serial1[4];
} journal_rawxhdr_t;
    230 
/*%
 * On-disk form of an RR header; one precedes every RR in a
 * transaction.
 */
typedef struct {
	/*% RR size in bytes, excluding this header. */
	unsigned char size[4];
} journal_rawrrhdr_t;
    238 
    239 /*%
    240  * The in-core representation of the journal header.
    241  */
    242 typedef struct {
    243 	uint32_t serial;
    244 	isc_offset_t offset;
    245 } journal_pos_t;
    246 
    247 #define POS_VALID(pos)	    ((pos).offset != 0)
    248 #define POS_INVALIDATE(pos) ((pos).offset = 0, (pos).serial = 0)
    249 
    250 typedef struct {
    251 	unsigned char format[16];
    252 	journal_pos_t begin;
    253 	journal_pos_t end;
    254 	uint32_t index_size;
    255 	uint32_t sourceserial;
    256 	bool serialset;
    257 } journal_header_t;
    258 
/*%
 * In-core form of a transaction header: the decoded counterpart of
 * journal_rawxhdr_t.
 */
typedef struct {
	uint32_t size;	  /*%< Decoded 'size' field. */
	uint32_t count;	  /*%< Decoded 'count' field. */
	uint32_t serial0; /*%< Decoded 'serial0' field. */
	uint32_t serial1; /*%< Decoded 'serial1' field. */
} journal_xhdr_t;
    268 
/*%
 * In-core form of an RR header: the decoded counterpart of
 * journal_rawrrhdr_t.
 */
typedef struct {
	uint32_t size; /*%< Decoded 'size' field. */
} journal_rrhdr_t;
    275 
    276 /*%
    277  * Initial contents to store in the header of a newly created
    278  * journal file.
    279  *
    280  * The header starts with the magic string ";BIND LOG V9\n"
    281  * to identify the file as a BIND 9 journal file.  An ASCII
    282  * identification string is used rather than a binary magic
    283  * number to be consistent with BIND 8 (BIND 8 journal files
    284  * are ASCII text files).
    285  */
    286 
    287 static journal_header_t initial_journal_header = {
    288 	";BIND LOG V9\n", { 0, 0 }, { 0, 0 }, 0, 0, 0
    289 };
    290 
    291 #define JOURNAL_EMPTY(h) ((h)->begin.offset == (h)->end.offset)
    292 
/*%
 * State of an open journal.  journal_open() starts a journal in
 * JOURNAL_STATE_INVALID and, on success, moves it to
 * JOURNAL_STATE_WRITE or JOURNAL_STATE_READ depending on whether it
 * was opened writable.  The remaining states are not assigned by the
 * functions in this part of the file.
 */
typedef enum {
	JOURNAL_STATE_INVALID = 0,
	JOURNAL_STATE_READ = 1,
	JOURNAL_STATE_WRITE = 2,
	JOURNAL_STATE_TRANSACTION = 3,
	JOURNAL_STATE_INLINE = 4
} journal_state_t;
    300 
    301 struct dns_journal {
    302 	unsigned int magic; /*%< JOUR */
    303 	isc_mem_t *mctx;    /*%< Memory context */
    304 	journal_state_t state;
    305 	char *filename;		 /*%< Journal file name */
    306 	FILE *fp;		 /*%< File handle */
    307 	isc_offset_t offset;	 /*%< Current file offset */
    308 	journal_header_t header; /*%< In-core journal header */
    309 	unsigned char *rawindex; /*%< In-core buffer for journal index
    310 				  * in on-disk format */
    311 	journal_pos_t *index;	 /*%< In-core journal index */
    312 
    313 	/*% Current transaction state (when writing). */
    314 	struct {
    315 		unsigned int n_soa;   /*%< Number of SOAs seen */
    316 		unsigned int n_rr;    /*%< Number of RRs to write */
    317 		journal_pos_t pos[2]; /*%< Begin/end position */
    318 	} x;
    319 
    320 	/*% Iteration state (when reading). */
    321 	struct {
    322 		/* These define the part of the journal we iterate over. */
    323 		journal_pos_t bpos; /*%< Position before first, */
    324 		journal_pos_t epos; /*%< and after last transaction */
    325 		/* The rest is iterator state. */
    326 		uint32_t current_serial; /*%< Current SOA serial
    327 					  * */
    328 		isc_buffer_t source;	 /*%< Data from disk */
    329 		isc_buffer_t target;	 /*%< Data from _fromwire check */
    330 		dns_decompress_t dctx;	 /*%< Dummy decompression ctx */
    331 		dns_name_t name;	 /*%< Current domain name */
    332 		dns_rdata_t rdata;	 /*%< Current rdata */
    333 		uint32_t ttl;		 /*%< Current TTL */
    334 		unsigned int xsize;	 /*%< Size of transaction data */
    335 		unsigned int xpos;	 /*%< Current position in it */
    336 		isc_result_t result;	 /*%< Result of last call */
    337 	} it;
    338 };
    339 
    340 #define DNS_JOURNAL_MAGIC    ISC_MAGIC('J', 'O', 'U', 'R')
    341 #define DNS_JOURNAL_VALID(t) ISC_MAGIC_VALID(t, DNS_JOURNAL_MAGIC)
    342 
    343 static void
    344 journal_pos_decode(journal_rawpos_t *raw, journal_pos_t *cooked) {
    345 	cooked->serial = decode_uint32(raw->serial);
    346 	cooked->offset = decode_uint32(raw->offset);
    347 }
    348 
    349 static void
    350 journal_pos_encode(journal_rawpos_t *raw, journal_pos_t *cooked) {
    351 	encode_uint32(cooked->serial, raw->serial);
    352 	encode_uint32(cooked->offset, raw->offset);
    353 }
    354 
    355 static void
    356 journal_header_decode(journal_rawheader_t *raw, journal_header_t *cooked) {
    357 	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
    358 	memmove(cooked->format, raw->h.format, sizeof(cooked->format));
    359 	journal_pos_decode(&raw->h.begin, &cooked->begin);
    360 	journal_pos_decode(&raw->h.end, &cooked->end);
    361 	cooked->index_size = decode_uint32(raw->h.index_size);
    362 	cooked->sourceserial = decode_uint32(raw->h.sourceserial);
    363 	cooked->serialset = ((raw->h.flags & JOURNAL_SERIALSET) != 0);
    364 }
    365 
    366 static void
    367 journal_header_encode(journal_header_t *cooked, journal_rawheader_t *raw) {
    368 	unsigned char flags = 0;
    369 
    370 	INSIST(sizeof(cooked->format) == sizeof(raw->h.format));
    371 	memset(raw->pad, 0, sizeof(raw->pad));
    372 	memmove(raw->h.format, cooked->format, sizeof(raw->h.format));
    373 	journal_pos_encode(&raw->h.begin, &cooked->begin);
    374 	journal_pos_encode(&raw->h.end, &cooked->end);
    375 	encode_uint32(cooked->index_size, raw->h.index_size);
    376 	encode_uint32(cooked->sourceserial, raw->h.sourceserial);
    377 	if (cooked->serialset) {
    378 		flags |= JOURNAL_SERIALSET;
    379 	}
    380 	raw->h.flags = flags;
    381 }
    382 
    383 /*
    384  * Journal file I/O subroutines, with error checking and reporting.
    385  */
    386 static isc_result_t
    387 journal_seek(dns_journal_t *j, uint32_t offset) {
    388 	isc_result_t result;
    389 
    390 	result = isc_stdio_seek(j->fp, (off_t)offset, SEEK_SET);
    391 	if (result != ISC_R_SUCCESS) {
    392 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
    393 			      "%s: seek: %s", j->filename,
    394 			      isc_result_totext(result));
    395 		return (ISC_R_UNEXPECTED);
    396 	}
    397 	j->offset = offset;
    398 	return (ISC_R_SUCCESS);
    399 }
    400 
    401 static isc_result_t
    402 journal_read(dns_journal_t *j, void *mem, size_t nbytes) {
    403 	isc_result_t result;
    404 
    405 	result = isc_stdio_read(mem, 1, nbytes, j->fp, NULL);
    406 	if (result != ISC_R_SUCCESS) {
    407 		if (result == ISC_R_EOF) {
    408 			return (ISC_R_NOMORE);
    409 		}
    410 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
    411 			      "%s: read: %s", j->filename,
    412 			      isc_result_totext(result));
    413 		return (ISC_R_UNEXPECTED);
    414 	}
    415 	j->offset += (isc_offset_t)nbytes;
    416 	return (ISC_R_SUCCESS);
    417 }
    418 
    419 static isc_result_t
    420 journal_write(dns_journal_t *j, void *mem, size_t nbytes) {
    421 	isc_result_t result;
    422 
    423 	result = isc_stdio_write(mem, 1, nbytes, j->fp, NULL);
    424 	if (result != ISC_R_SUCCESS) {
    425 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
    426 			      "%s: write: %s", j->filename,
    427 			      isc_result_totext(result));
    428 		return (ISC_R_UNEXPECTED);
    429 	}
    430 	j->offset += (isc_offset_t)nbytes;
    431 	return (ISC_R_SUCCESS);
    432 }
    433 
    434 static isc_result_t
    435 journal_fsync(dns_journal_t *j) {
    436 	isc_result_t result;
    437 	result = isc_stdio_flush(j->fp);
    438 	if (result != ISC_R_SUCCESS) {
    439 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
    440 			      "%s: flush: %s", j->filename,
    441 			      isc_result_totext(result));
    442 		return (ISC_R_UNEXPECTED);
    443 	}
    444 	result = isc_stdio_sync(j->fp);
    445 	if (result != ISC_R_SUCCESS) {
    446 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
    447 			      "%s: fsync: %s", j->filename,
    448 			      isc_result_totext(result));
    449 		return (ISC_R_UNEXPECTED);
    450 	}
    451 	return (ISC_R_SUCCESS);
    452 }
    453 
    454 /*
    455  * Read/write a transaction header at the current file position.
    456  */
    457 
    458 static isc_result_t
    459 journal_read_xhdr(dns_journal_t *j, journal_xhdr_t *xhdr) {
    460 	journal_rawxhdr_t raw;
    461 	isc_result_t result;
    462 	result = journal_read(j, &raw, sizeof(raw));
    463 	if (result != ISC_R_SUCCESS) {
    464 		return (result);
    465 	}
    466 	xhdr->size = decode_uint32(raw.size);
    467 	xhdr->count = decode_uint32(raw.count);
    468 	xhdr->serial0 = decode_uint32(raw.serial0);
    469 	xhdr->serial1 = decode_uint32(raw.serial1);
    470 	return (ISC_R_SUCCESS);
    471 }
    472 
    473 static isc_result_t
    474 journal_write_xhdr(dns_journal_t *j, uint32_t size, uint32_t count,
    475 		   uint32_t serial0, uint32_t serial1) {
    476 	journal_rawxhdr_t raw;
    477 	encode_uint32(size, raw.size);
    478 	encode_uint32(count, raw.count);
    479 	encode_uint32(serial0, raw.serial0);
    480 	encode_uint32(serial1, raw.serial1);
    481 	return (journal_write(j, &raw, sizeof(raw)));
    482 }
    483 
    484 /*
    485  * Read an RR header at the current file position.
    486  */
    487 
    488 static isc_result_t
    489 journal_read_rrhdr(dns_journal_t *j, journal_rrhdr_t *rrhdr) {
    490 	journal_rawrrhdr_t raw;
    491 	isc_result_t result;
    492 	result = journal_read(j, &raw, sizeof(raw));
    493 	if (result != ISC_R_SUCCESS) {
    494 		return (result);
    495 	}
    496 	rrhdr->size = decode_uint32(raw.size);
    497 	return (ISC_R_SUCCESS);
    498 }
    499 
    500 static isc_result_t
    501 journal_file_create(isc_mem_t *mctx, const char *filename) {
    502 	FILE *fp = NULL;
    503 	isc_result_t result;
    504 	journal_header_t header;
    505 	journal_rawheader_t rawheader;
    506 	int index_size = 56; /* XXX configurable */
    507 	int size;
    508 	void *mem; /* Memory for temporary index image. */
    509 
    510 	INSIST(sizeof(journal_rawheader_t) == JOURNAL_HEADER_SIZE);
    511 
    512 	result = isc_stdio_open(filename, "wb", &fp);
    513 	if (result != ISC_R_SUCCESS) {
    514 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
    515 			      "%s: create: %s", filename,
    516 			      isc_result_totext(result));
    517 		return (ISC_R_UNEXPECTED);
    518 	}
    519 
    520 	header = initial_journal_header;
    521 	header.index_size = index_size;
    522 	journal_header_encode(&header, &rawheader);
    523 
    524 	size = sizeof(journal_rawheader_t) +
    525 	       index_size * sizeof(journal_rawpos_t);
    526 
    527 	mem = isc_mem_get(mctx, size);
    528 	memset(mem, 0, size);
    529 	memmove(mem, &rawheader, sizeof(rawheader));
    530 
    531 	result = isc_stdio_write(mem, 1, (size_t)size, fp, NULL);
    532 	if (result != ISC_R_SUCCESS) {
    533 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
    534 			      "%s: write: %s", filename,
    535 			      isc_result_totext(result));
    536 		(void)isc_stdio_close(fp);
    537 		(void)isc_file_remove(filename);
    538 		isc_mem_put(mctx, mem, size);
    539 		return (ISC_R_UNEXPECTED);
    540 	}
    541 	isc_mem_put(mctx, mem, size);
    542 
    543 	result = isc_stdio_close(fp);
    544 	if (result != ISC_R_SUCCESS) {
    545 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
    546 			      "%s: close: %s", filename,
    547 			      isc_result_totext(result));
    548 		(void)isc_file_remove(filename);
    549 		return (ISC_R_UNEXPECTED);
    550 	}
    551 
    552 	return (ISC_R_SUCCESS);
    553 }
    554 
    555 static isc_result_t
    556 journal_open(isc_mem_t *mctx, const char *filename, bool writable, bool create,
    557 	     dns_journal_t **journalp) {
    558 	FILE *fp = NULL;
    559 	isc_result_t result;
    560 	journal_rawheader_t rawheader;
    561 	dns_journal_t *j;
    562 
    563 	INSIST(journalp != NULL && *journalp == NULL);
    564 	j = isc_mem_get(mctx, sizeof(*j));
    565 
    566 	j->mctx = NULL;
    567 	isc_mem_attach(mctx, &j->mctx);
    568 	j->state = JOURNAL_STATE_INVALID;
    569 	j->fp = NULL;
    570 	j->filename = isc_mem_strdup(mctx, filename);
    571 	j->index = NULL;
    572 	j->rawindex = NULL;
    573 
    574 	if (j->filename == NULL) {
    575 		FAIL(ISC_R_NOMEMORY);
    576 	}
    577 
    578 	result = isc_stdio_open(j->filename, writable ? "rb+" : "rb", &fp);
    579 
    580 	if (result == ISC_R_FILENOTFOUND) {
    581 		if (create) {
    582 			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_DEBUG(1),
    583 				      "journal file %s does not exist, "
    584 				      "creating it",
    585 				      j->filename);
    586 			CHECK(journal_file_create(mctx, filename));
    587 			/*
    588 			 * Retry.
    589 			 */
    590 			result = isc_stdio_open(j->filename, "rb+", &fp);
    591 		} else {
    592 			FAIL(ISC_R_NOTFOUND);
    593 		}
    594 	}
    595 	if (result != ISC_R_SUCCESS) {
    596 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
    597 			      "%s: open: %s", j->filename,
    598 			      isc_result_totext(result));
    599 		FAIL(ISC_R_UNEXPECTED);
    600 	}
    601 
    602 	j->fp = fp;
    603 
    604 	/*
    605 	 * Set magic early so that seek/read can succeed.
    606 	 */
    607 	j->magic = DNS_JOURNAL_MAGIC;
    608 
    609 	CHECK(journal_seek(j, 0));
    610 	CHECK(journal_read(j, &rawheader, sizeof(rawheader)));
    611 
    612 	if (memcmp(rawheader.h.format, initial_journal_header.format,
    613 		   sizeof(initial_journal_header.format)) != 0)
    614 	{
    615 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
    616 			      "%s: journal format not recognized", j->filename);
    617 		FAIL(ISC_R_UNEXPECTED);
    618 	}
    619 	journal_header_decode(&rawheader, &j->header);
    620 
    621 	/*
    622 	 * If there is an index, read the raw index into a dynamically
    623 	 * allocated buffer and then convert it into a cooked index.
    624 	 */
    625 	if (j->header.index_size != 0) {
    626 		unsigned int i;
    627 		unsigned int rawbytes;
    628 		unsigned char *p;
    629 
    630 		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
    631 		j->rawindex = isc_mem_get(mctx, rawbytes);
    632 
    633 		CHECK(journal_read(j, j->rawindex, rawbytes));
    634 
    635 		j->index = isc_mem_get(mctx, j->header.index_size *
    636 						     sizeof(journal_pos_t));
    637 
    638 		p = j->rawindex;
    639 		for (i = 0; i < j->header.index_size; i++) {
    640 			j->index[i].serial = decode_uint32(p);
    641 			p += 4;
    642 			j->index[i].offset = decode_uint32(p);
    643 			p += 4;
    644 		}
    645 		INSIST(p == j->rawindex + rawbytes);
    646 	}
    647 	j->offset = -1; /* Invalid, must seek explicitly. */
    648 
    649 	/*
    650 	 * Initialize the iterator.
    651 	 */
    652 	dns_name_init(&j->it.name, NULL);
    653 	dns_rdata_init(&j->it.rdata);
    654 
    655 	/*
    656 	 * Set up empty initial buffers for unchecked and checked
    657 	 * wire format RR data.  They will be reallocated
    658 	 * later.
    659 	 */
    660 	isc_buffer_init(&j->it.source, NULL, 0);
    661 	isc_buffer_init(&j->it.target, NULL, 0);
    662 	dns_decompress_init(&j->it.dctx, -1, DNS_DECOMPRESS_NONE);
    663 
    664 	j->state = writable ? JOURNAL_STATE_WRITE : JOURNAL_STATE_READ;
    665 
    666 	*journalp = j;
    667 	return (ISC_R_SUCCESS);
    668 
    669 failure:
    670 	j->magic = 0;
    671 	if (j->rawindex != NULL) {
    672 		isc_mem_put(j->mctx, j->rawindex,
    673 			    j->header.index_size * sizeof(journal_rawpos_t));
    674 	}
    675 	if (j->index != NULL) {
    676 		isc_mem_put(j->mctx, j->index,
    677 			    j->header.index_size * sizeof(journal_pos_t));
    678 	}
    679 	if (j->filename != NULL) {
    680 		isc_mem_free(j->mctx, j->filename);
    681 	}
    682 	if (j->fp != NULL) {
    683 		(void)isc_stdio_close(j->fp);
    684 	}
    685 	isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
    686 	return (result);
    687 }
    688 
    689 isc_result_t
    690 dns_journal_open(isc_mem_t *mctx, const char *filename, unsigned int mode,
    691 		 dns_journal_t **journalp) {
    692 	isc_result_t result;
    693 	size_t namelen;
    694 	char backup[1024];
    695 	bool writable, create;
    696 
    697 	create = ((mode & DNS_JOURNAL_CREATE) != 0);
    698 	writable = ((mode & (DNS_JOURNAL_WRITE | DNS_JOURNAL_CREATE)) != 0);
    699 
    700 	result = journal_open(mctx, filename, writable, create, journalp);
    701 	if (result == ISC_R_NOTFOUND) {
    702 		namelen = strlen(filename);
    703 		if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0)
    704 		{
    705 			namelen -= 4;
    706 		}
    707 
    708 		result = snprintf(backup, sizeof(backup), "%.*s.jbk",
    709 				  (int)namelen, filename);
    710 		if (result >= sizeof(backup)) {
    711 			return (ISC_R_NOSPACE);
    712 		}
    713 		result = journal_open(mctx, backup, writable, writable,
    714 				      journalp);
    715 	}
    716 	return (result);
    717 }
    718 
    719 /*
    720  * A comparison function defining the sorting order for
    721  * entries in the IXFR-style journal file.
    722  *
    723  * The IXFR format requires that deletions are sorted before
    724  * additions, and within either one, SOA records are sorted
    725  * before others.
    726  *
    727  * Also sort the non-SOA records by type as a courtesy to the
    728  * server receiving the IXFR - it may help reduce the amount of
    729  * rdataset merging it has to do.
    730  */
    731 static int
    732 ixfr_order(const void *av, const void *bv) {
    733 	dns_difftuple_t const *const *ap = av;
    734 	dns_difftuple_t const *const *bp = bv;
    735 	dns_difftuple_t const *a = *ap;
    736 	dns_difftuple_t const *b = *bp;
    737 	int r;
    738 	int bop = 0, aop = 0;
    739 
    740 	switch (a->op) {
    741 	case DNS_DIFFOP_DEL:
    742 	case DNS_DIFFOP_DELRESIGN:
    743 		aop = 1;
    744 		break;
    745 	case DNS_DIFFOP_ADD:
    746 	case DNS_DIFFOP_ADDRESIGN:
    747 		aop = 0;
    748 		break;
    749 	default:
    750 		INSIST(0);
    751 		ISC_UNREACHABLE();
    752 	}
    753 
    754 	switch (b->op) {
    755 	case DNS_DIFFOP_DEL:
    756 	case DNS_DIFFOP_DELRESIGN:
    757 		bop = 1;
    758 		break;
    759 	case DNS_DIFFOP_ADD:
    760 	case DNS_DIFFOP_ADDRESIGN:
    761 		bop = 0;
    762 		break;
    763 	default:
    764 		INSIST(0);
    765 		ISC_UNREACHABLE();
    766 	}
    767 
    768 	r = bop - aop;
    769 	if (r != 0) {
    770 		return (r);
    771 	}
    772 
    773 	r = (b->rdata.type == dns_rdatatype_soa) -
    774 	    (a->rdata.type == dns_rdatatype_soa);
    775 	if (r != 0) {
    776 		return (r);
    777 	}
    778 
    779 	r = (a->rdata.type - b->rdata.type);
    780 	return (r);
    781 }
    782 
    783 /*
    784  * Advance '*pos' to the next journal transaction.
    785  *
    786  * Requires:
    787  *	*pos refers to a valid journal transaction.
    788  *
    789  * Ensures:
    790  *	When ISC_R_SUCCESS is returned,
    791  *	*pos refers to the next journal transaction.
    792  *
    793  * Returns one of:
    794  *
    795  *    ISC_R_SUCCESS
    796  *    ISC_R_NOMORE 	*pos pointed at the last transaction
    797  *    Other results due to file errors are possible.
    798  */
    799 static isc_result_t
    800 journal_next(dns_journal_t *j, journal_pos_t *pos) {
    801 	isc_result_t result;
    802 	journal_xhdr_t xhdr;
    803 	REQUIRE(DNS_JOURNAL_VALID(j));
    804 
    805 	result = journal_seek(j, pos->offset);
    806 	if (result != ISC_R_SUCCESS) {
    807 		return (result);
    808 	}
    809 
    810 	if (pos->serial == j->header.end.serial) {
    811 		return (ISC_R_NOMORE);
    812 	}
    813 	/*
    814 	 * Read the header of the current transaction.
    815 	 * This will return ISC_R_NOMORE if we are at EOF.
    816 	 */
    817 	result = journal_read_xhdr(j, &xhdr);
    818 	if (result != ISC_R_SUCCESS) {
    819 		return (result);
    820 	}
    821 
    822 	/*
    823 	 * Check serial number consistency.
    824 	 */
    825 	if (xhdr.serial0 != pos->serial) {
    826 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
    827 			      "%s: journal file corrupt: "
    828 			      "expected serial %u, got %u",
    829 			      j->filename, pos->serial, xhdr.serial0);
    830 		return (ISC_R_UNEXPECTED);
    831 	}
    832 
    833 	/*
    834 	 * Check for offset wraparound.
    835 	 */
    836 	if ((isc_offset_t)(pos->offset + sizeof(journal_rawxhdr_t) +
    837 			   xhdr.size) < pos->offset)
    838 	{
    839 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
    840 			      "%s: offset too large", j->filename);
    841 		return (ISC_R_UNEXPECTED);
    842 	}
    843 
    844 	pos->offset += sizeof(journal_rawxhdr_t) + xhdr.size;
    845 	pos->serial = xhdr.serial1;
    846 	return (ISC_R_SUCCESS);
    847 }
    848 
    849 /*
    850  * If the index of the journal 'j' contains an entry "better"
    851  * than '*best_guess', replace '*best_guess' with it.
    852  *
    853  * "Better" means having a serial number closer to 'serial'
    854  * but not greater than 'serial'.
    855  */
    856 static void
    857 index_find(dns_journal_t *j, uint32_t serial, journal_pos_t *best_guess) {
    858 	unsigned int i;
    859 	if (j->index == NULL) {
    860 		return;
    861 	}
    862 	for (i = 0; i < j->header.index_size; i++) {
    863 		if (POS_VALID(j->index[i]) &&
    864 		    DNS_SERIAL_GE(serial, j->index[i].serial) &&
    865 		    DNS_SERIAL_GT(j->index[i].serial, best_guess->serial))
    866 		{
    867 			*best_guess = j->index[i];
    868 		}
    869 	}
    870 }
    871 
    872 /*
    873  * Add a new index entry.  If there is no room, make room by removing
    874  * the odd-numbered entries and compacting the others into the first
    875  * half of the index.  This decimates old index entries exponentially
    876  * over time, so that the index always contains a much larger fraction
    877  * of recent serial numbers than of old ones.  This is deliberate -
    878  * most index searches are for outgoing IXFR, and IXFR tends to request
    879  * recent versions more often than old ones.
    880  */
    881 static void
    882 index_add(dns_journal_t *j, journal_pos_t *pos) {
    883 	unsigned int i;
    884 	if (j->index == NULL) {
    885 		return;
    886 	}
    887 	/*
    888 	 * Search for a vacant position.
    889 	 */
    890 	for (i = 0; i < j->header.index_size; i++) {
    891 		if (!POS_VALID(j->index[i])) {
    892 			break;
    893 		}
    894 	}
    895 	if (i == j->header.index_size) {
    896 		unsigned int k = 0;
    897 		/*
    898 		 * Found no vacant position.  Make some room.
    899 		 */
    900 		for (i = 0; i < j->header.index_size; i += 2) {
    901 			j->index[k++] = j->index[i];
    902 		}
    903 		i = k; /* 'i' identifies the first vacant position. */
    904 		while (k < j->header.index_size) {
    905 			POS_INVALIDATE(j->index[k]);
    906 			k++;
    907 		}
    908 	}
    909 	INSIST(i < j->header.index_size);
    910 	INSIST(!POS_VALID(j->index[i]));
    911 
    912 	/*
    913 	 * Store the new index entry.
    914 	 */
    915 	j->index[i] = *pos;
    916 }
    917 
    918 /*
    919  * Invalidate any existing index entries that could become
    920  * ambiguous when a new transaction with number 'serial' is added.
    921  */
    922 static void
    923 index_invalidate(dns_journal_t *j, uint32_t serial) {
    924 	unsigned int i;
    925 	if (j->index == NULL) {
    926 		return;
    927 	}
    928 	for (i = 0; i < j->header.index_size; i++) {
    929 		if (!DNS_SERIAL_GT(serial, j->index[i].serial)) {
    930 			POS_INVALIDATE(j->index[i]);
    931 		}
    932 	}
    933 }
    934 
    935 /*
    936  * Try to find a transaction with initial serial number 'serial'
    937  * in the journal 'j'.
    938  *
    939  * If found, store its position at '*pos' and return ISC_R_SUCCESS.
    940  *
    941  * If 'serial' is current (= the ending serial number of the
    942  * last transaction in the journal), set '*pos' to
    943  * the position immediately following the last transaction and
    944  * return ISC_R_SUCCESS.
    945  *
    946  * If 'serial' is within the range of addressable serial numbers
    947  * covered by the journal but that particular serial number is missing
    948  * (from the journal, not just from the index), return ISC_R_NOTFOUND.
    949  *
    950  * If 'serial' is outside the range of addressable serial numbers
    951  * covered by the journal, return ISC_R_RANGE.
    952  *
    953  */
    954 static isc_result_t
    955 journal_find(dns_journal_t *j, uint32_t serial, journal_pos_t *pos) {
    956 	isc_result_t result;
    957 	journal_pos_t current_pos;
    958 	REQUIRE(DNS_JOURNAL_VALID(j));
    959 
    960 	if (DNS_SERIAL_GT(j->header.begin.serial, serial)) {
    961 		return (ISC_R_RANGE);
    962 	}
    963 	if (DNS_SERIAL_GT(serial, j->header.end.serial)) {
    964 		return (ISC_R_RANGE);
    965 	}
    966 	if (serial == j->header.end.serial) {
    967 		*pos = j->header.end;
    968 		return (ISC_R_SUCCESS);
    969 	}
    970 
    971 	current_pos = j->header.begin;
    972 	index_find(j, serial, &current_pos);
    973 
    974 	while (current_pos.serial != serial) {
    975 		if (DNS_SERIAL_GT(current_pos.serial, serial)) {
    976 			return (ISC_R_NOTFOUND);
    977 		}
    978 		result = journal_next(j, &current_pos);
    979 		if (result != ISC_R_SUCCESS) {
    980 			return (result);
    981 		}
    982 	}
    983 	*pos = current_pos;
    984 	return (ISC_R_SUCCESS);
    985 }
    986 
    987 isc_result_t
    988 dns_journal_begin_transaction(dns_journal_t *j) {
    989 	uint32_t offset;
    990 	isc_result_t result;
    991 	journal_rawxhdr_t hdr;
    992 
    993 	REQUIRE(DNS_JOURNAL_VALID(j));
    994 	REQUIRE(j->state == JOURNAL_STATE_WRITE ||
    995 		j->state == JOURNAL_STATE_INLINE);
    996 
    997 	/*
    998 	 * Find the file offset where the new transaction should
    999 	 * be written, and seek there.
   1000 	 */
   1001 	if (JOURNAL_EMPTY(&j->header)) {
   1002 		offset = sizeof(journal_rawheader_t) +
   1003 			 j->header.index_size * sizeof(journal_rawpos_t);
   1004 	} else {
   1005 		offset = j->header.end.offset;
   1006 	}
   1007 	j->x.pos[0].offset = offset;
   1008 	j->x.pos[1].offset = offset; /* Initial value, will be incremented. */
   1009 	j->x.n_soa = 0;
   1010 
   1011 	CHECK(journal_seek(j, offset));
   1012 
   1013 	/*
   1014 	 * Write a dummy transaction header of all zeroes to reserve
   1015 	 * space.  It will be filled in when the transaction is
   1016 	 * finished.
   1017 	 */
   1018 	memset(&hdr, 0, sizeof(hdr));
   1019 	CHECK(journal_write(j, &hdr, sizeof(hdr)));
   1020 	j->x.pos[1].offset = j->offset;
   1021 
   1022 	j->state = JOURNAL_STATE_TRANSACTION;
   1023 	result = ISC_R_SUCCESS;
   1024 failure:
   1025 	return (result);
   1026 }
   1027 
   1028 isc_result_t
   1029 dns_journal_writediff(dns_journal_t *j, dns_diff_t *diff) {
   1030 	dns_difftuple_t *t;
   1031 	isc_buffer_t buffer;
   1032 	void *mem = NULL;
   1033 	uint64_t size = 0;
   1034 	uint32_t rrcount = 0;
   1035 	isc_result_t result;
   1036 	isc_region_t used;
   1037 
   1038 	REQUIRE(DNS_DIFF_VALID(diff));
   1039 	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION);
   1040 
   1041 	isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "writing to journal");
   1042 	(void)dns_diff_print(diff, NULL);
   1043 
   1044 	/*
   1045 	 * Pass 1: determine the buffer size needed, and
   1046 	 * keep track of SOA serial numbers.
   1047 	 */
   1048 	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
   1049 	     t = ISC_LIST_NEXT(t, link)) {
   1050 		if (t->rdata.type == dns_rdatatype_soa) {
   1051 			if (j->x.n_soa < 2) {
   1052 				j->x.pos[j->x.n_soa].serial =
   1053 					dns_soa_getserial(&t->rdata);
   1054 			}
   1055 			j->x.n_soa++;
   1056 		}
   1057 		size += sizeof(journal_rawrrhdr_t);
   1058 		size += t->name.length; /* XXX should have access macro? */
   1059 		size += 10;
   1060 		size += t->rdata.length;
   1061 	}
   1062 
   1063 	if (size >= DNS_JOURNAL_SIZE_MAX) {
   1064 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
   1065 			      "dns_journal_writediff: %s: journal entry "
   1066 			      "too big to be stored: %" PRIu64 " bytes",
   1067 			      j->filename, size);
   1068 		return (ISC_R_NOSPACE);
   1069 	}
   1070 
   1071 	mem = isc_mem_get(j->mctx, size);
   1072 
   1073 	isc_buffer_init(&buffer, mem, size);
   1074 
   1075 	/*
   1076 	 * Pass 2.  Write RRs to buffer.
   1077 	 */
   1078 	for (t = ISC_LIST_HEAD(diff->tuples); t != NULL;
   1079 	     t = ISC_LIST_NEXT(t, link)) {
   1080 		/*
   1081 		 * Write the RR header.
   1082 		 */
   1083 		isc_buffer_putuint32(&buffer,
   1084 				     t->name.length + 10 + t->rdata.length);
   1085 		/*
   1086 		 * Write the owner name, RR header, and RR data.
   1087 		 */
   1088 		isc_buffer_putmem(&buffer, t->name.ndata, t->name.length);
   1089 		isc_buffer_putuint16(&buffer, t->rdata.type);
   1090 		isc_buffer_putuint16(&buffer, t->rdata.rdclass);
   1091 		isc_buffer_putuint32(&buffer, t->ttl);
   1092 		INSIST(t->rdata.length < 65536);
   1093 		isc_buffer_putuint16(&buffer, (uint16_t)t->rdata.length);
   1094 		INSIST(isc_buffer_availablelength(&buffer) >= t->rdata.length);
   1095 		isc_buffer_putmem(&buffer, t->rdata.data, t->rdata.length);
   1096 
   1097 		rrcount++;
   1098 	}
   1099 
   1100 	isc_buffer_usedregion(&buffer, &used);
   1101 	INSIST(used.length == size);
   1102 
   1103 	j->x.pos[1].offset += used.length;
   1104 	j->x.n_rr = rrcount;
   1105 
   1106 	/*
   1107 	 * Write the buffer contents to the journal file.
   1108 	 */
   1109 	CHECK(journal_write(j, used.base, used.length));
   1110 
   1111 	result = ISC_R_SUCCESS;
   1112 
   1113 failure:
   1114 	if (mem != NULL) {
   1115 		isc_mem_put(j->mctx, mem, size);
   1116 	}
   1117 	return (result);
   1118 }
   1119 
/*
 * Finish the pending update on journal 'j' and make it durable.
 *
 * In the INLINE state only the journal header is rewritten.  In the
 * TRANSACTION state the open transaction is checked for consistency,
 * its header is filled in, and the journal header and index are
 * updated on disk.  On success the journal is left in the WRITE state.
 *
 * Returns:
 *	ISC_R_SUCCESS		the update is committed.
 *	ISC_R_UNEXPECTED	the transaction is malformed or too big.
 *	other			an error from a seek, write, fsync,
 *				journal_next() or index_to_disk() call.
 */
isc_result_t
dns_journal_commit(dns_journal_t *j) {
	isc_result_t result;
	journal_rawheader_t rawheader;
	uint64_t total;

	REQUIRE(DNS_JOURNAL_VALID(j));
	REQUIRE(j->state == JOURNAL_STATE_TRANSACTION ||
		j->state == JOURNAL_STATE_INLINE);

	/*
	 * Just write out an updated header.
	 */
	if (j->state == JOURNAL_STATE_INLINE) {
		CHECK(journal_fsync(j));
		journal_header_encode(&j->header, &rawheader);
		CHECK(journal_seek(j, 0));
		CHECK(journal_write(j, &rawheader, sizeof(rawheader)));
		CHECK(journal_fsync(j));
		j->state = JOURNAL_STATE_WRITE;
		return (ISC_R_SUCCESS);
	}

	/*
	 * Perform some basic consistency checks: the transaction must
	 * contain exactly two SOAs, the serial must increase across
	 * it, and it must start at the serial where a non-empty
	 * journal currently ends.
	 */
	if (j->x.n_soa != 2) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: malformed transaction: %d SOAs", j->filename,
			      j->x.n_soa);
		return (ISC_R_UNEXPECTED);
	}
	if (!DNS_SERIAL_GT(j->x.pos[1].serial, j->x.pos[0].serial)) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: malformed transaction: serial number "
			      "did not increase",
			      j->filename);
		return (ISC_R_UNEXPECTED);
	}
	if (!JOURNAL_EMPTY(&j->header)) {
		if (j->x.pos[0].serial != j->header.end.serial) {
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
				      "malformed transaction: "
				      "%s last serial %u != "
				      "transaction first serial %u",
				      j->filename, j->header.end.serial,
				      j->x.pos[0].serial);
			return (ISC_R_UNEXPECTED);
		}
	}

	/*
	 * We currently don't support huge journal entries.
	 * 'total' is the on-disk span of the transaction, from its
	 * starting offset to its ending offset.
	 */
	total = j->x.pos[1].offset - j->x.pos[0].offset;
	if (total >= DNS_JOURNAL_SIZE_MAX) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "transaction too big to be stored in journal: "
			      "%" PRIu64 "b (max is %" PRIu64 "b)",
			      total, (uint64_t)DNS_JOURNAL_SIZE_MAX);
		return (ISC_R_UNEXPECTED);
	}

	/*
	 * Some old journal entries may become non-addressable
	 * when we increment the current serial number.  Purge them
	 * by stepping header.begin forward to the first addressable
	 * transaction.  Also purge them from the index.
	 */
	if (!JOURNAL_EMPTY(&j->header)) {
		while (!DNS_SERIAL_GT(j->x.pos[1].serial,
				      j->header.begin.serial)) {
			CHECK(journal_next(j, &j->header.begin));
		}
		index_invalidate(j, j->x.pos[1].serial);
	}
#ifdef notyet
	if (DNS_SERIAL_GT(last_dumped_serial, j->x.pos[1].serial)) {
		force_dump(...);
	}
#endif /* ifdef notyet */

	/*
	 * Commit the transaction data to stable storage.
	 */
	CHECK(journal_fsync(j));

	/*
	 * NOTE(review): the INLINE state returned above, so this
	 * condition looks always true at this point.
	 */
	if (j->state == JOURNAL_STATE_TRANSACTION) {
		isc_offset_t offset;
		/*
		 * Despite its name, 'offset' holds the size of the
		 * transaction data, i.e. its span on disk minus the
		 * transaction header itself.
		 */
		offset = (j->x.pos[1].offset - j->x.pos[0].offset) -
			 sizeof(journal_rawxhdr_t);
		/*
		 * Update the transaction header.
		 */
		CHECK(journal_seek(j, j->x.pos[0].offset));
		CHECK(journal_write_xhdr(j, offset, j->x.n_rr,
					 j->x.pos[0].serial,
					 j->x.pos[1].serial));
	}

	/*
	 * Update the journal header.  A previously empty journal now
	 * begins at this transaction.
	 */
	if (JOURNAL_EMPTY(&j->header)) {
		j->header.begin = j->x.pos[0];
	}
	j->header.end = j->x.pos[1];
	journal_header_encode(&j->header, &rawheader);
	CHECK(journal_seek(j, 0));
	CHECK(journal_write(j, &rawheader, sizeof(rawheader)));

	/*
	 * Update the index.
	 */
	index_add(j, &j->x.pos[0]);

	/*
	 * Convert the index into on-disk format and write
	 * it to disk.
	 */
	CHECK(index_to_disk(j));

	/*
	 * Commit the header to stable storage.
	 */
	CHECK(journal_fsync(j));

	/*
	 * We no longer have a transaction open.
	 */
	j->state = JOURNAL_STATE_WRITE;

	result = ISC_R_SUCCESS;

failure:
	return (result);
}
   1257 
   1258 isc_result_t
   1259 dns_journal_write_transaction(dns_journal_t *j, dns_diff_t *diff) {
   1260 	isc_result_t result;
   1261 	CHECK(dns_diff_sort(diff, ixfr_order));
   1262 	CHECK(dns_journal_begin_transaction(j));
   1263 	CHECK(dns_journal_writediff(j, diff));
   1264 	CHECK(dns_journal_commit(j));
   1265 	result = ISC_R_SUCCESS;
   1266 failure:
   1267 	return (result);
   1268 }
   1269 
   1270 void
   1271 dns_journal_destroy(dns_journal_t **journalp) {
   1272 	dns_journal_t *j = *journalp;
   1273 	*journalp = NULL;
   1274 	REQUIRE(DNS_JOURNAL_VALID(j));
   1275 
   1276 	j->it.result = ISC_R_FAILURE;
   1277 	dns_name_invalidate(&j->it.name);
   1278 	dns_decompress_invalidate(&j->it.dctx);
   1279 	if (j->rawindex != NULL) {
   1280 		isc_mem_put(j->mctx, j->rawindex,
   1281 			    j->header.index_size * sizeof(journal_rawpos_t));
   1282 	}
   1283 	if (j->index != NULL) {
   1284 		isc_mem_put(j->mctx, j->index,
   1285 			    j->header.index_size * sizeof(journal_pos_t));
   1286 	}
   1287 	if (j->it.target.base != NULL) {
   1288 		isc_mem_put(j->mctx, j->it.target.base, j->it.target.length);
   1289 	}
   1290 	if (j->it.source.base != NULL) {
   1291 		isc_mem_put(j->mctx, j->it.source.base, j->it.source.length);
   1292 	}
   1293 	if (j->filename != NULL) {
   1294 		isc_mem_free(j->mctx, j->filename);
   1295 	}
   1296 	if (j->fp != NULL) {
   1297 		(void)isc_stdio_close(j->fp);
   1298 	}
   1299 	j->magic = 0;
   1300 	isc_mem_putanddetach(&j->mctx, j, sizeof(*j));
   1301 }
   1302 
   1303 /*
   1304  * Roll the open journal 'j' into the database 'db'.
   1305  * A new database version will be created.
   1306  */
   1307 
   1308 /* XXX Share code with incoming IXFR? */
   1309 
   1310 static isc_result_t
   1311 roll_forward(dns_journal_t *j, dns_db_t *db, unsigned int options) {
   1312 	isc_buffer_t source; /* Transaction data from disk */
   1313 	isc_buffer_t target; /* Ditto after _fromwire check */
   1314 	uint32_t db_serial;  /* Database SOA serial */
   1315 	uint32_t end_serial; /* Last journal SOA serial */
   1316 	isc_result_t result;
   1317 	dns_dbversion_t *ver = NULL;
   1318 	journal_pos_t pos;
   1319 	dns_diff_t diff;
   1320 	unsigned int n_soa = 0;
   1321 	unsigned int n_put = 0;
   1322 	dns_diffop_t op;
   1323 
   1324 	REQUIRE(DNS_JOURNAL_VALID(j));
   1325 	REQUIRE(DNS_DB_VALID(db));
   1326 
   1327 	dns_diff_init(j->mctx, &diff);
   1328 
   1329 	/*
   1330 	 * Set up empty initial buffers for unchecked and checked
   1331 	 * wire format transaction data.  They will be reallocated
   1332 	 * later.
   1333 	 */
   1334 	isc_buffer_init(&source, NULL, 0);
   1335 	isc_buffer_init(&target, NULL, 0);
   1336 
   1337 	/*
   1338 	 * Create the new database version.
   1339 	 */
   1340 	CHECK(dns_db_newversion(db, &ver));
   1341 
   1342 	/*
   1343 	 * Get the current database SOA serial number.
   1344 	 */
   1345 	CHECK(dns_db_getsoaserial(db, ver, &db_serial));
   1346 
   1347 	/*
   1348 	 * Locate a journal entry for the current database serial.
   1349 	 */
   1350 	CHECK(journal_find(j, db_serial, &pos));
   1351 	/*
   1352 	 * XXX do more drastic things, like marking zone stale,
   1353 	 * if this fails?
   1354 	 */
   1355 	/*
   1356 	 * XXXRTH  The zone code should probably mark the zone as bad and
   1357 	 *         scream loudly into the log if this is a dynamic update
   1358 	 *	   log reply that failed.
   1359 	 */
   1360 
   1361 	end_serial = dns_journal_last_serial(j);
   1362 	if (db_serial == end_serial) {
   1363 		CHECK(DNS_R_UPTODATE);
   1364 	}
   1365 
   1366 	CHECK(dns_journal_iter_init(j, db_serial, end_serial, NULL));
   1367 
   1368 	for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
   1369 	     result = dns_journal_next_rr(j))
   1370 	{
   1371 		dns_name_t *name;
   1372 		uint32_t ttl;
   1373 		dns_rdata_t *rdata;
   1374 		dns_difftuple_t *tuple = NULL;
   1375 
   1376 		name = NULL;
   1377 		rdata = NULL;
   1378 		dns_journal_current_rr(j, &name, &ttl, &rdata);
   1379 
   1380 		if (rdata->type == dns_rdatatype_soa) {
   1381 			n_soa++;
   1382 			if (n_soa == 2) {
   1383 				db_serial = j->it.current_serial;
   1384 			}
   1385 		}
   1386 
   1387 		if (n_soa == 3) {
   1388 			n_soa = 1;
   1389 		}
   1390 		if (n_soa == 0) {
   1391 			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
   1392 				      "%s: journal file corrupt: missing "
   1393 				      "initial SOA",
   1394 				      j->filename);
   1395 			FAIL(ISC_R_UNEXPECTED);
   1396 		}
   1397 		if ((options & DNS_JOURNALOPT_RESIGN) != 0) {
   1398 			op = (n_soa == 1) ? DNS_DIFFOP_DELRESIGN
   1399 					  : DNS_DIFFOP_ADDRESIGN;
   1400 		} else {
   1401 			op = (n_soa == 1) ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD;
   1402 		}
   1403 
   1404 		CHECK(dns_difftuple_create(diff.mctx, op, name, ttl, rdata,
   1405 					   &tuple));
   1406 		dns_diff_append(&diff, &tuple);
   1407 
   1408 		if (++n_put > 100) {
   1409 			isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
   1410 				      "%s: applying diff to database (%u)",
   1411 				      j->filename, db_serial);
   1412 			(void)dns_diff_print(&diff, NULL);
   1413 			CHECK(dns_diff_apply(&diff, db, ver));
   1414 			dns_diff_clear(&diff);
   1415 			n_put = 0;
   1416 		}
   1417 	}
   1418 	if (result == ISC_R_NOMORE) {
   1419 		result = ISC_R_SUCCESS;
   1420 	}
   1421 	CHECK(result);
   1422 
   1423 	if (n_put != 0) {
   1424 		isc_log_write(JOURNAL_DEBUG_LOGARGS(3),
   1425 			      "%s: applying final diff to database (%u)",
   1426 			      j->filename, db_serial);
   1427 		(void)dns_diff_print(&diff, NULL);
   1428 		CHECK(dns_diff_apply(&diff, db, ver));
   1429 		dns_diff_clear(&diff);
   1430 	}
   1431 
   1432 failure:
   1433 	if (ver != NULL) {
   1434 		dns_db_closeversion(db, &ver,
   1435 				    result == ISC_R_SUCCESS ? true : false);
   1436 	}
   1437 
   1438 	if (source.base != NULL) {
   1439 		isc_mem_put(j->mctx, source.base, source.length);
   1440 	}
   1441 	if (target.base != NULL) {
   1442 		isc_mem_put(j->mctx, target.base, target.length);
   1443 	}
   1444 
   1445 	dns_diff_clear(&diff);
   1446 
   1447 	INSIST(ver == NULL);
   1448 
   1449 	return (result);
   1450 }
   1451 
   1452 isc_result_t
   1453 dns_journal_rollforward(isc_mem_t *mctx, dns_db_t *db, unsigned int options,
   1454 			const char *filename) {
   1455 	dns_journal_t *j;
   1456 	isc_result_t result;
   1457 
   1458 	REQUIRE(DNS_DB_VALID(db));
   1459 	REQUIRE(filename != NULL);
   1460 
   1461 	j = NULL;
   1462 	result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
   1463 	if (result == ISC_R_NOTFOUND) {
   1464 		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file, but "
   1465 							"that's OK");
   1466 		return (DNS_R_NOJOURNAL);
   1467 	}
   1468 	if (result != ISC_R_SUCCESS) {
   1469 		return (result);
   1470 	}
   1471 	if (JOURNAL_EMPTY(&j->header)) {
   1472 		result = DNS_R_UPTODATE;
   1473 	} else {
   1474 		result = roll_forward(j, db, options);
   1475 	}
   1476 
   1477 	dns_journal_destroy(&j);
   1478 
   1479 	return (result);
   1480 }
   1481 
   1482 isc_result_t
   1483 dns_journal_print(isc_mem_t *mctx, const char *filename, FILE *file) {
   1484 	dns_journal_t *j;
   1485 	isc_buffer_t source;   /* Transaction data from disk */
   1486 	isc_buffer_t target;   /* Ditto after _fromwire check */
   1487 	uint32_t start_serial; /* Database SOA serial */
   1488 	uint32_t end_serial;   /* Last journal SOA serial */
   1489 	isc_result_t result;
   1490 	dns_diff_t diff;
   1491 	unsigned int n_soa = 0;
   1492 	unsigned int n_put = 0;
   1493 
   1494 	REQUIRE(filename != NULL);
   1495 
   1496 	j = NULL;
   1497 	result = dns_journal_open(mctx, filename, DNS_JOURNAL_READ, &j);
   1498 	if (result == ISC_R_NOTFOUND) {
   1499 		isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no journal file");
   1500 		return (DNS_R_NOJOURNAL);
   1501 	}
   1502 
   1503 	if (result != ISC_R_SUCCESS) {
   1504 		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
   1505 			      "journal open failure: %s: %s",
   1506 			      isc_result_totext(result), filename);
   1507 		return (result);
   1508 	}
   1509 
   1510 	if (j->header.serialset) {
   1511 		fprintf(file, "Source serial = %u\n", j->header.sourceserial);
   1512 	}
   1513 	dns_diff_init(j->mctx, &diff);
   1514 
   1515 	/*
   1516 	 * Set up empty initial buffers for unchecked and checked
   1517 	 * wire format transaction data.  They will be reallocated
   1518 	 * later.
   1519 	 */
   1520 	isc_buffer_init(&source, NULL, 0);
   1521 	isc_buffer_init(&target, NULL, 0);
   1522 
   1523 	start_serial = dns_journal_first_serial(j);
   1524 	end_serial = dns_journal_last_serial(j);
   1525 
   1526 	CHECK(dns_journal_iter_init(j, start_serial, end_serial, NULL));
   1527 
   1528 	for (result = dns_journal_first_rr(j); result == ISC_R_SUCCESS;
   1529 	     result = dns_journal_next_rr(j))
   1530 	{
   1531 		dns_name_t *name;
   1532 		uint32_t ttl;
   1533 		dns_rdata_t *rdata;
   1534 		dns_difftuple_t *tuple = NULL;
   1535 
   1536 		name = NULL;
   1537 		rdata = NULL;
   1538 		dns_journal_current_rr(j, &name, &ttl, &rdata);
   1539 
   1540 		if (rdata->type == dns_rdatatype_soa) {
   1541 			n_soa++;
   1542 		}
   1543 
   1544 		if (n_soa == 3) {
   1545 			n_soa = 1;
   1546 		}
   1547 		if (n_soa == 0) {
   1548 			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
   1549 				      "%s: journal file corrupt: missing "
   1550 				      "initial SOA",
   1551 				      j->filename);
   1552 			FAIL(ISC_R_UNEXPECTED);
   1553 		}
   1554 		CHECK(dns_difftuple_create(
   1555 			diff.mctx, n_soa == 1 ? DNS_DIFFOP_DEL : DNS_DIFFOP_ADD,
   1556 			name, ttl, rdata, &tuple));
   1557 		dns_diff_append(&diff, &tuple);
   1558 
   1559 		if (++n_put > 100) {
   1560 			result = dns_diff_print(&diff, file);
   1561 			dns_diff_clear(&diff);
   1562 			n_put = 0;
   1563 			if (result != ISC_R_SUCCESS) {
   1564 				break;
   1565 			}
   1566 		}
   1567 	}
   1568 	if (result == ISC_R_NOMORE) {
   1569 		result = ISC_R_SUCCESS;
   1570 	}
   1571 	CHECK(result);
   1572 
   1573 	if (n_put != 0) {
   1574 		result = dns_diff_print(&diff, file);
   1575 		dns_diff_clear(&diff);
   1576 	}
   1577 	goto cleanup;
   1578 
   1579 failure:
   1580 	isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
   1581 		      "%s: cannot print: journal file corrupt", j->filename);
   1582 
   1583 cleanup:
   1584 	if (source.base != NULL) {
   1585 		isc_mem_put(j->mctx, source.base, source.length);
   1586 	}
   1587 	if (target.base != NULL) {
   1588 		isc_mem_put(j->mctx, target.base, target.length);
   1589 	}
   1590 
   1591 	dns_diff_clear(&diff);
   1592 	dns_journal_destroy(&j);
   1593 
   1594 	return (result);
   1595 }
   1596 
   1597 /**************************************************************************/
   1598 /*
   1599  * Miscellaneous accessors.
   1600  */
   1601 bool
   1602 dns_journal_empty(dns_journal_t *j) {
   1603 	return (JOURNAL_EMPTY(&j->header));
   1604 }
   1605 
   1606 uint32_t
   1607 dns_journal_first_serial(dns_journal_t *j) {
   1608 	return (j->header.begin.serial);
   1609 }
   1610 
   1611 uint32_t
   1612 dns_journal_last_serial(dns_journal_t *j) {
   1613 	return (j->header.end.serial);
   1614 }
   1615 
   1616 void
   1617 dns_journal_set_sourceserial(dns_journal_t *j, uint32_t sourceserial) {
   1618 	REQUIRE(j->state == JOURNAL_STATE_WRITE ||
   1619 		j->state == JOURNAL_STATE_INLINE ||
   1620 		j->state == JOURNAL_STATE_TRANSACTION);
   1621 
   1622 	j->header.sourceserial = sourceserial;
   1623 	j->header.serialset = true;
   1624 	if (j->state == JOURNAL_STATE_WRITE) {
   1625 		j->state = JOURNAL_STATE_INLINE;
   1626 	}
   1627 }
   1628 
   1629 bool
   1630 dns_journal_get_sourceserial(dns_journal_t *j, uint32_t *sourceserial) {
   1631 	REQUIRE(sourceserial != NULL);
   1632 
   1633 	if (!j->header.serialset) {
   1634 		return (false);
   1635 	}
   1636 	*sourceserial = j->header.sourceserial;
   1637 	return (true);
   1638 }
   1639 
   1640 /**************************************************************************/
   1641 /*
   1642  * Iteration support.
   1643  *
   1644  * When serving an outgoing IXFR, we transmit a part of the journal starting
   1645  * at the serial number in the IXFR request and ending at the serial
   1646  * number that is current when the IXFR request arrives.  The ending
   1647  * serial number is not necessarily at the end of the journal:
   1648  * the journal may grow while the IXFR is in progress, but we stop
   1649  * when we reach the serial number that was current when the IXFR started.
   1650  */
   1651 
   1652 static isc_result_t
   1653 read_one_rr(dns_journal_t *j);
   1654 
   1655 /*
   1656  * Make sure the buffer 'b' is has at least 'size' bytes
   1657  * allocated, and clear it.
   1658  *
   1659  * Requires:
   1660  *	Either b->base is NULL, or it points to b->length bytes of memory
   1661  *	previously allocated by isc_mem_get().
   1662  */
   1663 
   1664 static isc_result_t
   1665 size_buffer(isc_mem_t *mctx, isc_buffer_t *b, unsigned size) {
   1666 	if (b->length < size) {
   1667 		void *mem = isc_mem_get(mctx, size);
   1668 		if (mem == NULL) {
   1669 			return (ISC_R_NOMEMORY);
   1670 		}
   1671 		if (b->base != NULL) {
   1672 			isc_mem_put(mctx, b->base, b->length);
   1673 		}
   1674 		b->base = mem;
   1675 		b->length = size;
   1676 	}
   1677 	isc_buffer_clear(b);
   1678 	return (ISC_R_SUCCESS);
   1679 }
   1680 
   1681 isc_result_t
   1682 dns_journal_iter_init(dns_journal_t *j, uint32_t begin_serial,
   1683 		      uint32_t end_serial, size_t *xfrsizep) {
   1684 	isc_result_t result;
   1685 
   1686 	CHECK(journal_find(j, begin_serial, &j->it.bpos));
   1687 	INSIST(j->it.bpos.serial == begin_serial);
   1688 
   1689 	CHECK(journal_find(j, end_serial, &j->it.epos));
   1690 	INSIST(j->it.epos.serial == end_serial);
   1691 
   1692 	if (xfrsizep != NULL) {
   1693 		journal_pos_t pos = j->it.bpos;
   1694 		journal_xhdr_t xhdr;
   1695 		uint64_t size = 0;
   1696 		uint32_t count = 0;
   1697 
   1698 		/*
   1699 		 * We already know the beginning and ending serial
   1700 		 * numbers are in the journal. Scan through them,
   1701 		 * adding up sizes and RR counts so we can calculate
   1702 		 * the IXFR size.
   1703 		 */
   1704 		CHECK(journal_seek(j, pos.offset));
   1705 		do {
   1706 			CHECK(journal_read_xhdr(j, &xhdr));
   1707 
   1708 			size += xhdr.size;
   1709 			count += xhdr.count;
   1710 
   1711 			result = journal_next(j, &pos);
   1712 			if (result == ISC_R_NOMORE) {
   1713 				result = ISC_R_SUCCESS;
   1714 			}
   1715 			CHECK(result);
   1716 		} while (pos.serial != end_serial);
   1717 
   1718 		/*
   1719 		 * For each RR, subtract the length of the RR header,
   1720 		 * as this would not be present in IXFR messages.
   1721 		 * (We don't need to worry about the transaction header
   1722 		 * because that was already excluded from xdr.size.)
   1723 		 */
   1724 		*xfrsizep = size - (count * sizeof(journal_rawrrhdr_t));
   1725 	}
   1726 
   1727 	result = ISC_R_SUCCESS;
   1728 failure:
   1729 	j->it.result = result;
   1730 	return (j->it.result);
   1731 }
   1732 
   1733 isc_result_t
   1734 dns_journal_first_rr(dns_journal_t *j) {
   1735 	isc_result_t result;
   1736 
   1737 	/*
   1738 	 * Seek to the beginning of the first transaction we are
   1739 	 * interested in.
   1740 	 */
   1741 	CHECK(journal_seek(j, j->it.bpos.offset));
   1742 	j->it.current_serial = j->it.bpos.serial;
   1743 
   1744 	j->it.xsize = 0; /* We have no transaction data yet... */
   1745 	j->it.xpos = 0;	 /* ...and haven't used any of it. */
   1746 
   1747 	return (read_one_rr(j));
   1748 
   1749 failure:
   1750 	return (result);
   1751 }
   1752 
/*
 * Read the next RR from journal 'j' at the current file position into
 * the iterator state (j->it.name, j->it.rdata, j->it.ttl).
 *
 * At a transaction boundary the next transaction header is read and
 * checked first.  When the RR is an SOA, j->it.current_serial is
 * updated to its serial.
 *
 * Returns:
 *	ISC_R_SUCCESS		an RR was read.
 *	ISC_R_NOMORE		the end position of the iteration has
 *				been reached.
 *	ISC_R_UNEXPECTED	the journal is corrupt.
 *	DNS_R_FORMERR		the RR is malformed.
 *	other			an error from a read or a wire-format
 *				parsing call.
 *
 * Note: the two early returns below do not update j->it.result; every
 * other exit goes through the 'failure' label, which does.
 */
static isc_result_t
read_one_rr(dns_journal_t *j) {
	isc_result_t result;

	dns_rdatatype_t rdtype;
	dns_rdataclass_t rdclass;
	unsigned int rdlen;
	uint32_t ttl;
	journal_xhdr_t xhdr;
	journal_rrhdr_t rrhdr;

	/*
	 * The file position must never pass the end of the range
	 * being iterated; reaching it exactly means we are done.
	 */
	if (j->offset > j->it.epos.offset) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: journal corrupt: possible integer overflow",
			      j->filename);
		return (ISC_R_UNEXPECTED);
	}
	if (j->offset == j->it.epos.offset) {
		return (ISC_R_NOMORE);
	}
	if (j->it.xpos == j->it.xsize) {
		/*
		 * We are at a transaction boundary.
		 * Read another transaction header.
		 */
		CHECK(journal_read_xhdr(j, &xhdr));
		if (xhdr.size == 0) {
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
				      "%s: journal corrupt: empty transaction",
				      j->filename);
			FAIL(ISC_R_UNEXPECTED);
		}
		/*
		 * Each transaction must start at the serial where the
		 * previous one ended.
		 */
		if (xhdr.serial0 != j->it.current_serial) {
			isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
				      "%s: journal file corrupt: "
				      "expected serial %u, got %u",
				      j->filename, j->it.current_serial,
				      xhdr.serial0);
			FAIL(ISC_R_UNEXPECTED);
		}
		j->it.xsize = xhdr.size;
		j->it.xpos = 0;
	}
	/*
	 * Read an RR.
	 */
	CHECK(journal_read_rrhdr(j, &rrhdr));
	/*
	 * Perform a sanity check on the journal RR size.
	 * The smallest possible RR has a 1-byte owner name
	 * and a 10-byte header.  The largest possible
	 * RR has 65535 bytes of data, a header, and a maximum-
	 * size owner name, well below 70 k total.
	 */
	if (rrhdr.size < 1 + 10 || rrhdr.size > 70000) {
		isc_log_write(JOURNAL_COMMON_LOGARGS, ISC_LOG_ERROR,
			      "%s: journal corrupt: impossible RR size "
			      "(%d bytes)",
			      j->filename, rrhdr.size);
		FAIL(ISC_R_UNEXPECTED);
	}

	/*
	 * Pull the whole RR into the source buffer.
	 */
	CHECK(size_buffer(j->mctx, &j->it.source, rrhdr.size));
	CHECK(journal_read(j, j->it.source.base, rrhdr.size));
	isc_buffer_add(&j->it.source, rrhdr.size);

	/*
	 * The target buffer is made the same size
	 * as the source buffer, with the assumption that when
	 * no compression is present, the output of dns_*_fromwire()
	 * is no larger than the input.
	 */
	CHECK(size_buffer(j->mctx, &j->it.target, rrhdr.size));

	/*
	 * Parse the owner name.  We don't know where it
	 * ends yet, so we make the entire "remaining"
	 * part of the buffer "active".
	 */
	isc_buffer_setactive(&j->it.source,
			     j->it.source.used - j->it.source.current);
	CHECK(dns_name_fromwire(&j->it.name, &j->it.source, &j->it.dctx, 0,
				&j->it.target));

	/*
	 * Check that the RR header is there, and parse it.
	 * It is 10 bytes: type (2), class (2), TTL (4) and
	 * rdata length (2).
	 */
	if (isc_buffer_remaininglength(&j->it.source) < 10) {
		FAIL(DNS_R_FORMERR);
	}

	rdtype = isc_buffer_getuint16(&j->it.source);
	rdclass = isc_buffer_getuint16(&j->it.source);
	ttl = isc_buffer_getuint32(&j->it.source);
	rdlen = isc_buffer_getuint16(&j->it.source);

	/*
	 * Parse the rdata.  It must account for exactly the rest
	 * of the RR.
	 */
	if (isc_buffer_remaininglength(&j->it.source) != rdlen) {
		FAIL(DNS_R_FORMERR);
	}
	isc_buffer_setactive(&j->it.source, rdlen);
	dns_rdata_reset(&j->it.rdata);
	CHECK(dns_rdata_fromwire(&j->it.rdata, rdclass, rdtype, &j->it.source,
				 &j->it.dctx, 0, &j->it.target));
	j->it.ttl = ttl;

	/*
	 * Account for this RR, including its journal RR header,
	 * within the current transaction.
	 */
	j->it.xpos += sizeof(journal_rawrrhdr_t) + rrhdr.size;
	if (rdtype == dns_rdatatype_soa) {
		/* XXX could do additional consistency checks here */
		j->it.current_serial = dns_soa_getserial(&j->it.rdata);
	}

	result = ISC_R_SUCCESS;

failure:
	j->it.result = result;
	return (result);
}
   1873 
   1874 isc_result_t
   1875 dns_journal_next_rr(dns_journal_t *j) {
   1876 	j->it.result = read_one_rr(j);
   1877 	return (j->it.result);
   1878 }
   1879 
   1880 void
   1881 dns_journal_current_rr(dns_journal_t *j, dns_name_t **name, uint32_t *ttl,
   1882 		       dns_rdata_t **rdata) {
   1883 	REQUIRE(j->it.result == ISC_R_SUCCESS);
   1884 	*name = &j->it.name;
   1885 	*ttl = j->it.ttl;
   1886 	*rdata = &j->it.rdata;
   1887 }
   1888 
   1889 /**************************************************************************/
   1890 /*
   1891  * Generating diffs from databases
   1892  */
   1893 
   1894 /*
   1895  * Construct a diff containing all the RRs at the current name of the
   1896  * database iterator 'dbit' in database 'db', version 'ver'.
   1897  * Set '*name' to the current name, and append the diff to 'diff'.
   1898  * All new tuples will have the operation 'op'.
   1899  *
   1900  * Requires: 'name' must have buffer large enough to hold the name.
   1901  * Typically, a dns_fixedname_t would be used.
   1902  */
   1903 static isc_result_t
   1904 get_name_diff(dns_db_t *db, dns_dbversion_t *ver, isc_stdtime_t now,
   1905 	      dns_dbiterator_t *dbit, dns_name_t *name, dns_diffop_t op,
   1906 	      dns_diff_t *diff) {
   1907 	isc_result_t result;
   1908 	dns_dbnode_t *node = NULL;
   1909 	dns_rdatasetiter_t *rdsiter = NULL;
   1910 	dns_difftuple_t *tuple = NULL;
   1911 
   1912 	result = dns_dbiterator_current(dbit, &node, name);
   1913 	if (result != ISC_R_SUCCESS) {
   1914 		return (result);
   1915 	}
   1916 
   1917 	result = dns_db_allrdatasets(db, node, ver, now, &rdsiter);
   1918 	if (result != ISC_R_SUCCESS) {
   1919 		goto cleanup_node;
   1920 	}
   1921 
   1922 	for (result = dns_rdatasetiter_first(rdsiter); result == ISC_R_SUCCESS;
   1923 	     result = dns_rdatasetiter_next(rdsiter))
   1924 	{
   1925 		dns_rdataset_t rdataset;
   1926 
   1927 		dns_rdataset_init(&rdataset);
   1928 		dns_rdatasetiter_current(rdsiter, &rdataset);
   1929 
   1930 		for (result = dns_rdataset_first(&rdataset);
   1931 		     result == ISC_R_SUCCESS;
   1932 		     result = dns_rdataset_next(&rdataset))
   1933 		{
   1934 			dns_rdata_t rdata = DNS_RDATA_INIT;
   1935 			dns_rdataset_current(&rdataset, &rdata);
   1936 			result = dns_difftuple_create(diff->mctx, op, name,
   1937 						      rdataset.ttl, &rdata,
   1938 						      &tuple);
   1939 			if (result != ISC_R_SUCCESS) {
   1940 				dns_rdataset_disassociate(&rdataset);
   1941 				goto cleanup_iterator;
   1942 			}
   1943 			dns_diff_append(diff, &tuple);
   1944 		}
   1945 		dns_rdataset_disassociate(&rdataset);
   1946 		if (result != ISC_R_NOMORE) {
   1947 			goto cleanup_iterator;
   1948 		}
   1949 	}
   1950 	if (result != ISC_R_NOMORE) {
   1951 		goto cleanup_iterator;
   1952 	}
   1953 
   1954 	result = ISC_R_SUCCESS;
   1955 
   1956 cleanup_iterator:
   1957 	dns_rdatasetiter_destroy(&rdsiter);
   1958 
   1959 cleanup_node:
   1960 	dns_db_detachnode(db, &node);
   1961 
   1962 	return (result);
   1963 }
   1964 
   1965 /*
   1966  * Comparison function for use by dns_diff_subtract when sorting
   1967  * the diffs to be subtracted.  The sort keys are the rdata type
   1968  * and the rdata itself.  The owner name is ignored, because
   1969  * it is known to be the same for all tuples.
   1970  */
   1971 static int
   1972 rdata_order(const void *av, const void *bv) {
   1973 	dns_difftuple_t const *const *ap = av;
   1974 	dns_difftuple_t const *const *bp = bv;
   1975 	dns_difftuple_t const *a = *ap;
   1976 	dns_difftuple_t const *b = *bp;
   1977 	int r;
   1978 	r = (b->rdata.type - a->rdata.type);
   1979 	if (r != 0) {
   1980 		return (r);
   1981 	}
   1982 	r = dns_rdata_compare(&a->rdata, &b->rdata);
   1983 	return (r);
   1984 }
   1985 
   1986 static isc_result_t
   1987 dns_diff_subtract(dns_diff_t diff[2], dns_diff_t *r) {
   1988 	isc_result_t result;
   1989 	dns_difftuple_t *p[2];
   1990 	int i, t;
   1991 	bool append;
   1992 
   1993 	CHECK(dns_diff_sort(&diff[0], rdata_order));
   1994 	CHECK(dns_diff_sort(&diff[1], rdata_order));
   1995 
   1996 	for (;;) {
   1997 		p[0] = ISC_LIST_HEAD(diff[0].tuples);
   1998 		p[1] = ISC_LIST_HEAD(diff[1].tuples);
   1999 		if (p[0] == NULL && p[1] == NULL) {
   2000 			break;
   2001 		}
   2002 
   2003 		for (i = 0; i < 2; i++) {
   2004 			if (p[!i] == NULL) {
   2005 				{
   2006 					ISC_LIST_UNLINK(diff[i].tuples, p[i],
   2007 							link);
   2008 					ISC_LIST_APPEND(r->tuples, p[i], link);
   2009 					goto next;
   2010 				}
   2011 			}
   2012 		}
   2013 		t = rdata_order(&p[0], &p[1]);
   2014 		if (t < 0) {
   2015 			ISC_LIST_UNLINK(diff[0].tuples, p[0], link);
   2016 			ISC_LIST_APPEND(r->tuples, p[0], link);
   2017 			goto next;
   2018 		}
   2019 		if (t > 0) {
   2020 			ISC_LIST_UNLINK(diff[1].tuples, p[1], link);
   2021 			ISC_LIST_APPEND(r->tuples, p[1], link);
   2022 			goto next;
   2023 		}
   2024 		INSIST(t == 0);
   2025 		/*
   2026 		 * Identical RRs in both databases; skip them both
   2027 		 * if the ttl differs.
   2028 		 */
   2029 		append = (p[0]->ttl != p[1]->ttl);
   2030 		for (i = 0; i < 2; i++) {
   2031 			ISC_LIST_UNLINK(diff[i].tuples, p[i], link);
   2032 			if (append) {
   2033 				ISC_LIST_APPEND(r->tuples, p[i], link);
   2034 			} else {
   2035 				dns_difftuple_free(&p[i]);
   2036 			}
   2037 		}
   2038 	next:;
   2039 	}
   2040 	result = ISC_R_SUCCESS;
   2041 failure:
   2042 	return (result);
   2043 }
   2044 
   2045 static isc_result_t
   2046 diff_namespace(dns_db_t *dba, dns_dbversion_t *dbvera, dns_db_t *dbb,
   2047 	       dns_dbversion_t *dbverb, unsigned int options,
   2048 	       dns_diff_t *resultdiff) {
   2049 	dns_db_t *db[2];
   2050 	dns_dbversion_t *ver[2];
   2051 	dns_dbiterator_t *dbit[2] = { NULL, NULL };
   2052 	bool have[2] = { false, false };
   2053 	dns_fixedname_t fixname[2];
   2054 	isc_result_t result, itresult[2];
   2055 	dns_diff_t diff[2];
   2056 	int i, t;
   2057 
   2058 	db[0] = dba, db[1] = dbb;
   2059 	ver[0] = dbvera, ver[1] = dbverb;
   2060 
   2061 	dns_diff_init(resultdiff->mctx, &diff[0]);
   2062 	dns_diff_init(resultdiff->mctx, &diff[1]);
   2063 
   2064 	dns_fixedname_init(&fixname[0]);
   2065 	dns_fixedname_init(&fixname[1]);
   2066 
   2067 	result = dns_db_createiterator(db[0], options, &dbit[0]);
   2068 	if (result != ISC_R_SUCCESS) {
   2069 		return (result);
   2070 	}
   2071 	result = dns_db_createiterator(db[1], options, &dbit[1]);
   2072 	if (result != ISC_R_SUCCESS) {
   2073 		goto cleanup_iterator;
   2074 	}
   2075 
   2076 	itresult[0] = dns_dbiterator_first(dbit[0]);
   2077 	itresult[1] = dns_dbiterator_first(dbit[1]);
   2078 
   2079 	for (;;) {
   2080 		for (i = 0; i < 2; i++) {
   2081 			if (!have[i] && itresult[i] == ISC_R_SUCCESS) {
   2082 				CHECK(get_name_diff(
   2083 					db[i], ver[i], 0, dbit[i],
   2084 					dns_fixedname_name(&fixname[i]),
   2085 					i == 0 ? DNS_DIFFOP_ADD
   2086 					       : DNS_DIFFOP_DEL,
   2087 					&diff[i]));
   2088 				itresult[i] = dns_dbiterator_next(dbit[i]);
   2089 				have[i] = true;
   2090 			}
   2091 		}
   2092 
   2093 		if (!have[0] && !have[1]) {
   2094 			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
   2095 			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
   2096 			break;
   2097 		}
   2098 
   2099 		for (i = 0; i < 2; i++) {
   2100 			if (!have[!i]) {
   2101 				ISC_LIST_APPENDLIST(resultdiff->tuples,
   2102 						    diff[i].tuples, link);
   2103 				INSIST(ISC_LIST_EMPTY(diff[i].tuples));
   2104 				have[i] = false;
   2105 				goto next;
   2106 			}
   2107 		}
   2108 
   2109 		t = dns_name_compare(dns_fixedname_name(&fixname[0]),
   2110 				     dns_fixedname_name(&fixname[1]));
   2111 		if (t < 0) {
   2112 			ISC_LIST_APPENDLIST(resultdiff->tuples, diff[0].tuples,
   2113 					    link);
   2114 			INSIST(ISC_LIST_EMPTY(diff[0].tuples));
   2115 			have[0] = false;
   2116 			continue;
   2117 		}
   2118 		if (t > 0) {
   2119 			ISC_LIST_APPENDLIST(resultdiff->tuples, diff[1].tuples,
   2120 					    link);
   2121 			INSIST(ISC_LIST_EMPTY(diff[1].tuples));
   2122 			have[1] = false;
   2123 			continue;
   2124 		}
   2125 		INSIST(t == 0);
   2126 		CHECK(dns_diff_subtract(diff, resultdiff));
   2127 		INSIST(ISC_LIST_EMPTY(diff[0].tuples));
   2128 		INSIST(ISC_LIST_EMPTY(diff[1].tuples));
   2129 		have[0] = have[1] = false;
   2130 	next:;
   2131 	}
   2132 	if (itresult[0] != ISC_R_NOMORE) {
   2133 		FAIL(itresult[0]);
   2134 	}
   2135 	if (itresult[1] != ISC_R_NOMORE) {
   2136 		FAIL(itresult[1]);
   2137 	}
   2138 
   2139 	INSIST(ISC_LIST_EMPTY(diff[0].tuples));
   2140 	INSIST(ISC_LIST_EMPTY(diff[1].tuples));
   2141 
   2142 failure:
   2143 	dns_dbiterator_destroy(&dbit[1]);
   2144 
   2145 cleanup_iterator:
   2146 	dns_dbiterator_destroy(&dbit[0]);
   2147 	dns_diff_clear(&diff[0]);
   2148 	dns_diff_clear(&diff[1]);
   2149 	return (result);
   2150 }
   2151 
   2152 /*
   2153  * Compare the databases 'dba' and 'dbb' and generate a journal
   2154  * entry containing the changes to make 'dba' from 'dbb' (note
   2155  * the order).  This journal entry will consist of a single,
   2156  * possibly very large transaction.
   2157  */
   2158 isc_result_t
   2159 dns_db_diff(isc_mem_t *mctx, dns_db_t *dba, dns_dbversion_t *dbvera,
   2160 	    dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename) {
   2161 	isc_result_t result;
   2162 	dns_diff_t diff;
   2163 
   2164 	dns_diff_init(mctx, &diff);
   2165 
   2166 	result = dns_db_diffx(&diff, dba, dbvera, dbb, dbverb, filename);
   2167 
   2168 	dns_diff_clear(&diff);
   2169 
   2170 	return (result);
   2171 }
   2172 
   2173 isc_result_t
   2174 dns_db_diffx(dns_diff_t *diff, dns_db_t *dba, dns_dbversion_t *dbvera,
   2175 	     dns_db_t *dbb, dns_dbversion_t *dbverb, const char *filename) {
   2176 	isc_result_t result;
   2177 	dns_journal_t *journal = NULL;
   2178 
   2179 	if (filename != NULL) {
   2180 		result = dns_journal_open(diff->mctx, filename,
   2181 					  DNS_JOURNAL_CREATE, &journal);
   2182 		if (result != ISC_R_SUCCESS) {
   2183 			return (result);
   2184 		}
   2185 	}
   2186 
   2187 	CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NONSEC3, diff));
   2188 	CHECK(diff_namespace(dba, dbvera, dbb, dbverb, DNS_DB_NSEC3ONLY, diff));
   2189 
   2190 	if (journal != NULL) {
   2191 		if (ISC_LIST_EMPTY(diff->tuples)) {
   2192 			isc_log_write(JOURNAL_DEBUG_LOGARGS(3), "no changes");
   2193 		} else {
   2194 			CHECK(dns_journal_write_transaction(journal, diff));
   2195 		}
   2196 	}
   2197 
   2198 failure:
   2199 	if (journal != NULL) {
   2200 		dns_journal_destroy(&journal);
   2201 	}
   2202 	return (result);
   2203 }
   2204 
   2205 isc_result_t
   2206 dns_journal_compact(isc_mem_t *mctx, char *filename, uint32_t serial,
   2207 		    uint32_t target_size) {
   2208 	unsigned int i;
   2209 	journal_pos_t best_guess;
   2210 	journal_pos_t current_pos;
   2211 	dns_journal_t *j1 = NULL;
   2212 	dns_journal_t *j2 = NULL;
   2213 	journal_rawheader_t rawheader;
   2214 	unsigned int copy_length;
   2215 	size_t namelen;
   2216 	char *buf = NULL;
   2217 	unsigned int size = 0;
   2218 	isc_result_t result;
   2219 	unsigned int indexend;
   2220 	char newname[PATH_MAX];
   2221 	char backup[PATH_MAX];
   2222 	bool is_backup = false;
   2223 
   2224 	REQUIRE(filename != NULL);
   2225 
   2226 	namelen = strlen(filename);
   2227 	if (namelen > 4U && strcmp(filename + namelen - 4, ".jnl") == 0) {
   2228 		namelen -= 4;
   2229 	}
   2230 
   2231 	result = snprintf(newname, sizeof(newname), "%.*s.jnw", (int)namelen,
   2232 			  filename);
   2233 	RUNTIME_CHECK(result < sizeof(newname));
   2234 
   2235 	result = snprintf(backup, sizeof(backup), "%.*s.jbk", (int)namelen,
   2236 			  filename);
   2237 	RUNTIME_CHECK(result < sizeof(backup));
   2238 
   2239 	result = journal_open(mctx, filename, false, false, &j1);
   2240 	if (result == ISC_R_NOTFOUND) {
   2241 		is_backup = true;
   2242 		result = journal_open(mctx, backup, false, false, &j1);
   2243 	}
   2244 	if (result != ISC_R_SUCCESS) {
   2245 		return (result);
   2246 	}
   2247 
   2248 	if (JOURNAL_EMPTY(&j1->header)) {
   2249 		dns_journal_destroy(&j1);
   2250 		return (ISC_R_SUCCESS);
   2251 	}
   2252 
   2253 	if (DNS_SERIAL_GT(j1->header.begin.serial, serial) ||
   2254 	    DNS_SERIAL_GT(serial, j1->header.end.serial))
   2255 	{
   2256 		dns_journal_destroy(&j1);
   2257 		return (ISC_R_RANGE);
   2258 	}
   2259 
   2260 	/*
   2261 	 * Cope with very small target sizes.
   2262 	 */
   2263 	indexend = sizeof(journal_rawheader_t) +
   2264 		   j1->header.index_size * sizeof(journal_rawpos_t);
   2265 	if (target_size < DNS_JOURNAL_SIZE_MIN) {
   2266 		target_size = DNS_JOURNAL_SIZE_MIN;
   2267 	}
   2268 	if (target_size < indexend * 2) {
   2269 		target_size = target_size / 2 + indexend;
   2270 	}
   2271 
   2272 	/*
   2273 	 * See if there is any work to do.
   2274 	 */
   2275 	if ((uint32_t)j1->header.end.offset < target_size) {
   2276 		dns_journal_destroy(&j1);
   2277 		return (ISC_R_SUCCESS);
   2278 	}
   2279 
   2280 	CHECK(journal_open(mctx, newname, true, true, &j2));
   2281 
   2282 	/*
   2283 	 * Remove overhead so space test below can succeed.
   2284 	 */
   2285 	if (target_size >= indexend) {
   2286 		target_size -= indexend;
   2287 	}
   2288 
   2289 	/*
   2290 	 * Find if we can create enough free space.
   2291 	 */
   2292 	best_guess = j1->header.begin;
   2293 	for (i = 0; i < j1->header.index_size; i++) {
   2294 		if (POS_VALID(j1->index[i]) &&
   2295 		    DNS_SERIAL_GE(serial, j1->index[i].serial) &&
   2296 		    ((uint32_t)(j1->header.end.offset - j1->index[i].offset) >=
   2297 		     target_size / 2) &&
   2298 		    j1->index[i].offset > best_guess.offset)
   2299 		{
   2300 			best_guess = j1->index[i];
   2301 		}
   2302 	}
   2303 
   2304 	current_pos = best_guess;
   2305 	while (current_pos.serial != serial) {
   2306 		CHECK(journal_next(j1, &current_pos));
   2307 		if (current_pos.serial == j1->header.end.serial) {
   2308 			break;
   2309 		}
   2310 
   2311 		if (DNS_SERIAL_GE(serial, current_pos.serial) &&
   2312 		    ((uint32_t)(j1->header.end.offset - current_pos.offset) >=
   2313 		     (target_size / 2)) &&
   2314 		    current_pos.offset > best_guess.offset)
   2315 		{
   2316 			best_guess = current_pos;
   2317 		} else {
   2318 			break;
   2319 		}
   2320 	}
   2321 
   2322 	INSIST(best_guess.serial != j1->header.end.serial);
   2323 	if (best_guess.serial != serial) {
   2324 		CHECK(journal_next(j1, &best_guess));
   2325 	}
   2326 
   2327 	/*
   2328 	 * We should now be roughly half target_size provided
   2329 	 * we did not reach 'serial'.  If not we will just copy
   2330 	 * all uncommitted deltas regardless of the size.
   2331 	 */
   2332 	copy_length = j1->header.end.offset - best_guess.offset;
   2333 
   2334 	if (copy_length != 0) {
   2335 		/*
   2336 		 * Copy best_guess to end into space just freed.
   2337 		 */
   2338 		size = 64 * 1024;
   2339 		if (copy_length < size) {
   2340 			size = copy_length;
   2341 		}
   2342 		buf = isc_mem_get(mctx, size);
   2343 
   2344 		CHECK(journal_seek(j1, best_guess.offset));
   2345 		CHECK(journal_seek(j2, indexend));
   2346 		for (i = 0; i < copy_length; i += size) {
   2347 			unsigned int len = (copy_length - i) > size
   2348 						   ? size
   2349 						   : (copy_length - i);
   2350 			CHECK(journal_read(j1, buf, len));
   2351 			CHECK(journal_write(j2, buf, len));
   2352 		}
   2353 
   2354 		CHECK(journal_fsync(j2));
   2355 
   2356 		/*
   2357 		 * Compute new header.
   2358 		 */
   2359 		j2->header.begin.serial = best_guess.serial;
   2360 		j2->header.begin.offset = indexend;
   2361 		j2->header.end.serial = j1->header.end.serial;
   2362 		j2->header.end.offset = indexend + copy_length;
   2363 		j2->header.sourceserial = j1->header.sourceserial;
   2364 		j2->header.serialset = j1->header.serialset;
   2365 
   2366 		/*
   2367 		 * Update the journal header.
   2368 		 */
   2369 		journal_header_encode(&j2->header, &rawheader);
   2370 		CHECK(journal_seek(j2, 0));
   2371 		CHECK(journal_write(j2, &rawheader, sizeof(rawheader)));
   2372 		CHECK(journal_fsync(j2));
   2373 
   2374 		/*
   2375 		 * Build new index.
   2376 		 */
   2377 		current_pos = j2->header.begin;
   2378 		while (current_pos.serial != j2->header.end.serial) {
   2379 			index_add(j2, &current_pos);
   2380 			CHECK(journal_next(j2, &current_pos));
   2381 		}
   2382 
   2383 		/*
   2384 		 * Write index.
   2385 		 */
   2386 		CHECK(index_to_disk(j2));
   2387 		CHECK(journal_fsync(j2));
   2388 
   2389 		indexend = j2->header.end.offset;
   2390 		POST(indexend);
   2391 	}
   2392 
   2393 	/*
   2394 	 * Close both journals before trying to rename files (this is
   2395 	 * necessary on WIN32).
   2396 	 */
   2397 	dns_journal_destroy(&j1);
   2398 	dns_journal_destroy(&j2);
   2399 
   2400 	/*
   2401 	 * With a UFS file system this should just succeed and be atomic.
   2402 	 * Any IXFR outs will just continue and the old journal will be
   2403 	 * removed on final close.
   2404 	 *
   2405 	 * With MSDOS / NTFS we need to do a two stage rename, triggered
   2406 	 * by EEXIST.  (If any IXFR's are running in other threads, however,
   2407 	 * this will fail, and the journal will not be compacted.  But
   2408 	 * if so, hopefully they'll be finished by the next time we
   2409 	 * compact.)
   2410 	 */
   2411 	if (rename(newname, filename) == -1) {
   2412 		if (errno == EEXIST && !is_backup) {
   2413 			result = isc_file_remove(backup);
   2414 			if (result != ISC_R_SUCCESS &&
   2415 			    result != ISC_R_FILENOTFOUND) {
   2416 				goto failure;
   2417 			}
   2418 			if (rename(filename, backup) == -1) {
   2419 				goto maperrno;
   2420 			}
   2421 			if (rename(newname, filename) == -1) {
   2422 				goto maperrno;
   2423 			}
   2424 			(void)isc_file_remove(backup);
   2425 		} else {
   2426 		maperrno:
   2427 			result = ISC_R_FAILURE;
   2428 			goto failure;
   2429 		}
   2430 	}
   2431 
   2432 	result = ISC_R_SUCCESS;
   2433 
   2434 failure:
   2435 	(void)isc_file_remove(newname);
   2436 	if (buf != NULL) {
   2437 		isc_mem_put(mctx, buf, size);
   2438 	}
   2439 	if (j1 != NULL) {
   2440 		dns_journal_destroy(&j1);
   2441 	}
   2442 	if (j2 != NULL) {
   2443 		dns_journal_destroy(&j2);
   2444 	}
   2445 	return (result);
   2446 }
   2447 
   2448 static isc_result_t
   2449 index_to_disk(dns_journal_t *j) {
   2450 	isc_result_t result = ISC_R_SUCCESS;
   2451 
   2452 	if (j->header.index_size != 0) {
   2453 		unsigned int i;
   2454 		unsigned char *p;
   2455 		unsigned int rawbytes;
   2456 
   2457 		rawbytes = j->header.index_size * sizeof(journal_rawpos_t);
   2458 
   2459 		p = j->rawindex;
   2460 		for (i = 0; i < j->header.index_size; i++) {
   2461 			encode_uint32(j->index[i].serial, p);
   2462 			p += 4;
   2463 			encode_uint32(j->index[i].offset, p);
   2464 			p += 4;
   2465 		}
   2466 		INSIST(p == j->rawindex + rawbytes);
   2467 
   2468 		CHECK(journal_seek(j, sizeof(journal_rawheader_t)));
   2469 		CHECK(journal_write(j, j->rawindex, rawbytes));
   2470 	}
   2471 failure:
   2472 	return (result);
   2473 }
   2474