/*
 * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * UNDO ALGORITHM:
 *
 *	The UNDO algorithm is trivial.  The nominal UNDO range in the
 *	FIFO is determined by taking the first/next offset stored in
 *	the volume header.  The next offset may not be correct since
 *	UNDO flushes are not required to flush the volume header, so
 *	the code also scans forward until it finds a discontinuous
 *	sequence number.
 *
 *	The UNDOs are then scanned and executed in reverse order.  These
 *	UNDOs are effectively just data restorations based on HAMMER offsets.
 *
 * REDO ALGORITHM:
 *
 *	REDO records are laid down in the UNDO/REDO FIFO for nominal
 *	writes, truncations, and file extension ops.  On a per-inode
 *	basis two types of REDO records are generated, REDO_WRITE
 *	and REDO_TRUNC.
 *
 *	Essentially the recovery block will contain UNDO records backing
 *	out partial operations and REDO records to regenerate those partial
 *	operations guaranteed by the filesystem during recovery.
 *
 *	REDO generation is optional, and can also be started and then
 *	later stopped due to excessive write()s in between fsyncs, or not
 *	started at all.  Because of this the recovery code must determine
 *	when REDOs are valid and when they are not.  Additional records are
 *	generated to help figure it out.
 *
 *	The REDO_TERM_WRITE and REDO_TERM_TRUNC records are generated
 *	during a flush cycle indicating which records the flush cycle
 *	has synched meta-data for, and HAMMER_REDO_SYNC is generated in
 *	each flush cycle to indicate how far back in the UNDO/REDO FIFO
 *	the recovery code must go to find the earliest applicable REDO
 *	record.  Applicable REDO records can be far outside the nominal
 *	UNDO recovery range, for example if a write() lays down a REDO but
 *	the related file is not flushed for several cycles.
 *
 *	The SYNC reference is to a point prior to the nominal UNDO FIFO
 *	range, creating an extended REDO range which must be scanned.
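 *
 *	For illustration only (the offsets below are hypothetical, not
 *	taken from any real volume), the two ranges relate roughly as
 *	follows:
 *
 *	      redo_offset (from REDO_SYNC)     first_offset    next_offset
 *	           |                                |               |
 *	  .........[====== extended REDO range =====[=== nominal ===].....
 *	                                            [= UNDO range ==]
 *
 *	Stage1 runs the UNDOs in the nominal range backwards; stage2 then
 *	collects TERMs over the extended range and re-executes any
 *	unterminated REDOs forwards across both ranges.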
 *
 *	Any REDO_WRITE/REDO_TRUNC encountered within the extended range
 *	which have no matching REDO_TERM_WRITE/REDO_TERM_TRUNC records
 *	prior to the start of the nominal UNDO range are applicable.
 *	That is, any REDO_TERM_* records in the extended range but not in
 *	the nominal undo range will mask any redo operations for prior REDO
 *	records.  This is necessary because once the TERM is laid down
 *	followup operations may make additional changes to the related
 *	records but not necessarily record them as REDOs (because REDOs are
 *	optional).
 *
 *	REDO_TERM_WRITE/REDO_TERM_TRUNC records in the nominal UNDO range
 *	must be ignored since they represent meta-data flushes which are
 *	undone by the UNDOs in that nominal UNDO range by the recovery
 *	code.  Only REDO_TERM_* records in the extended range but not
 *	in the nominal undo range are applicable.
 *
 *	The REDO_SYNC record itself always exists in the nominal UNDO range
 *	(this is how the extended range is determined).  For recovery
 *	purposes the most recent REDO_SYNC record is always used if several
 *	are found.
 *
 * CRASHES DURING UNDO/REDO
 *
 *	A crash during the UNDO phase requires no additional effort.  The
 *	UNDOs will simply be re-run again.  The state of the UNDO/REDO fifo
 *	remains unchanged and has no re-crash issues.
 *
 *	A crash during the REDO phase is more complex because the REDOs
 *	run normal filesystem ops and generate additional UNDO/REDO records.
 *	REDO is disabled during REDO recovery and any SYNC records generated
 *	by flushes during REDO recovery must continue to reference the
 *	original extended range.
 *
 *	If multiple crashes occur and the UNDO/REDO FIFO wraps, REDO recovery
 *	may become impossible.  This is detected when the start of the
 *	extended range fails to have monotonically increasing sequence
 *	numbers leading into the nominal undo range.
 */

#include "hammer.h"

/*
 * Specify the way we want to handle stage2 errors.
 *
 * The following values are accepted:
 *
 * 0 - Run redo recovery normally and fail to mount if
 *     the operation fails (default).
 * 1 - Run redo recovery, but don't fail to mount if the
 *     operation fails.
 * 2 - Completely skip redo recovery (only for severe error
 *     conditions and/or debugging).
 */
static int hammer_skip_redo = 0;
TUNABLE_INT("vfs.hammer.skip_redo", &hammer_skip_redo);

/*
 * Each rterm entry has a list of fifo offsets indicating termination
 * points.  These are stripped as the scan progresses.
 */
typedef struct hammer_rterm_entry {
	struct hammer_rterm_entry *next;
	hammer_off_t		fifo_offset;
} *hammer_rterm_entry_t;

/*
 * rterm entries sorted in RB tree are indexed by objid, flags, and offset.
 * TRUNC entries ignore the offset.
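 *
 * For example (hypothetical values), a REDO_TERM_WRITE recorded for
 * (objid 0x100, localization 0, offset 4096) only masks REDO_WRITEs at
 * that same offset, while a REDO_TERM_TRUNC recorded for the same inode
 * masks any prior REDO_TRUNC regardless of the offset stored in the
 * record.  hammer_rterm_rb_cmp() below implements this by skipping the
 * offset comparison for TERM_TRUNC keys.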
*/ typedef struct hammer_rterm { RB_ENTRY(hammer_rterm) rb_node; int64_t redo_objid; uint32_t redo_localization; uint32_t redo_flags; hammer_off_t redo_offset; hammer_rterm_entry_t term_list; } *hammer_rterm_t; static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2); struct hammer_rterm_rb_tree; RB_HEAD(hammer_rterm_rb_tree, hammer_rterm); RB_PROTOTYPE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp); static int hammer_check_tail_signature(hammer_mount_t hmp, hammer_fifo_tail_t tail, hammer_off_t end_off); static int hammer_check_head_signature(hammer_mount_t hmp, hammer_fifo_head_t head, hammer_off_t beg_off); static void hammer_recover_copy_undo(hammer_off_t undo_offset, char *src, char *dst, int bytes); static hammer_fifo_any_t hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume, hammer_off_t *scan_offsetp, int *errorp, hammer_buffer_t *bufferp); static hammer_fifo_any_t hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume, hammer_off_t *scan_offsetp, int *errorp, hammer_buffer_t *bufferp); #if 0 static void hammer_recover_debug_dump(int w, char *buf, int bytes); #endif static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume, hammer_fifo_undo_t undo); static int hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root, hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo); static int hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root, hammer_off_t redo_fifo_offset, hammer_fifo_redo_t redo); static void hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo); RB_GENERATE(hammer_rterm_rb_tree, hammer_rterm, rb_node, hammer_rterm_rb_cmp); /* * Recover filesystem meta-data on mount. This procedure figures out the * UNDO FIFO range and runs the UNDOs backwards. The FIFO pointers are not * resynchronized by this procedure. * * This procedure is run near the beginning of the mount sequence, before * any B-Tree or high-level accesses are enabled, and is responsible for * restoring the meta-data to a consistent state. High level HAMMER data * structures (such as the B-Tree) cannot be accessed here. * * NOTE: No information from the root volume has been cached in the * hammer_mount structure yet, so we need to access the root volume's * buffer directly. * * NOTE: */ int hammer_recover_stage1(hammer_mount_t hmp, hammer_volume_t root_volume) { hammer_blockmap_t rootmap; hammer_buffer_t buffer; hammer_off_t scan_offset; hammer_off_t scan_offset_save; hammer_off_t bytes; hammer_fifo_any_t head; hammer_off_t first_offset; hammer_off_t last_offset; uint32_t seqno; int error; int degenerate_case = 0; /* * Examine the UNDO FIFO indices in the volume header. */ rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX]; first_offset = rootmap->first_offset; last_offset = rootmap->next_offset; buffer = NULL; error = 0; hmp->recover_stage2_offset = 0; if (first_offset > rootmap->alloc_offset || last_offset > rootmap->alloc_offset) { hvkprintf(root_volume, "Illegal UNDO FIFO index range " "%016jx, %016jx limit %016jx\n", (intmax_t)first_offset, (intmax_t)last_offset, (intmax_t)rootmap->alloc_offset); error = EIO; goto done; } /* * In HAMMER version 4+ filesystems the volume header does NOT * contain definitive UNDO FIFO state. In particular, the * rootmap->next_offset may not be indexed completely to the * end of the active UNDO FIFO. 
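	 *
	 * As an illustration (hypothetical sequence numbers), the
	 * backwards scan from first_offset might find the last
	 * pre-existing record with hdr_seq 0x2f.  The forward scan then
	 * expects 0x30, 0x31, 0x32, ... and stops at the first record
	 * whose hdr_seq does not match the expected value; that record
	 * marks the true end of the active FIFO, which may lie beyond
	 * rootmap->next_offset.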
*/ if (hmp->version >= HAMMER_VOL_VERSION_FOUR) { /* * To find the definitive range we must first scan backwards * from first_offset to locate the first real record and * extract the sequence number from it. This record is not * part of the active undo space. */ scan_offset = first_offset; seqno = 0; for (;;) { head = hammer_recover_scan_rev(hmp, root_volume, &scan_offset, &error, &buffer); if (error) break; if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) { seqno = head->head.hdr_seq; break; } } if (error) { hvkprintf(root_volume, "recovery failure during seqno backscan\n"); goto done; } /* * Scan forwards from first_offset and (seqno+1) looking * for a sequence space discontinuity. This denotes the * end of the active FIFO area. * * NOTE: For the case where the FIFO is empty the very first * record we find will be discontinuous. * * NOTE: Do not include trailing PADs in the scan range, * and remember the returned scan_offset after a * fwd iteration points to the end of the returned * record. */ hvkprintf(root_volume, "recovery check seqno=%08x\n", seqno); scan_offset = first_offset; scan_offset_save = scan_offset; ++seqno; hmp->recover_stage2_seqno = seqno; for (;;) { head = hammer_recover_scan_fwd(hmp, root_volume, &scan_offset, &error, &buffer); if (error) break; if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) { if (seqno != head->head.hdr_seq) { scan_offset = scan_offset_save; break; } scan_offset_save = scan_offset; ++seqno; } #if 0 /* * If the forward scan is grossly ahead of last_offset * then something is wrong. last_offset is supposed * to be flushed out */ if (last_offset >= scan_offset) { bytes = last_offset - scan_offset; } else { bytes = rootmap->alloc_offset - scan_offset + HAMMER_OFF_LONG_ENCODE(last_offset); } if (bytes > HAMMER_OFF_LONG_ENCODE(rootmap->alloc_offset) * 4 / 5) { hvkprintf(root_volume, "recovery forward scan is " "grossly beyond the last_offset in " "the volume header, this can't be " "right.\n"); error = EIO; break; } #endif } /* * Store the seqno. This will be the next seqno we lay down * when generating new UNDOs. */ hmp->undo_seqno = seqno; if (error) { hvkprintf(root_volume, "recovery failure during seqno fwdscan\n"); goto done; } last_offset = scan_offset; hvkprintf(root_volume, "recovery range %016jx-%016jx\n", (intmax_t)first_offset, (intmax_t)last_offset); hvkprintf(root_volume, "recovery nexto %016jx endseqno=%08x\n", (intmax_t)rootmap->next_offset, seqno); } /* * Calculate the size of the active portion of the FIFO. If the * FIFO is empty the filesystem is clean and no further action is * needed. */ if (last_offset >= first_offset) { bytes = last_offset - first_offset; } else { bytes = rootmap->alloc_offset - first_offset + HAMMER_OFF_LONG_ENCODE(last_offset); } if (bytes == 0) { degenerate_case = 1; error = 0; goto done; } hvkprintf(root_volume, "recovery undo %016jx-%016jx (%jd bytes)%s\n", (intmax_t)first_offset, (intmax_t)last_offset, (intmax_t)bytes, (hmp->ronly ? " (RO)" : "(RW)")); if (bytes > HAMMER_OFF_LONG_ENCODE(rootmap->alloc_offset)) { hkprintf("Undo size is absurd, unable to mount\n"); error = EIO; goto done; } /* * Scan the UNDOs backwards. 
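	 *
	 * The 'bytes' value computed above already accounts for FIFO
	 * wrap-around.  As a hypothetical example (ignoring the zone
	 * encoding of the offsets), with a 64MB UNDO area, first_offset
	 * at +60MB and last_offset at +4MB the active range wraps and
	 *
	 *	bytes = (64MB - 60MB) + 4MB = 8MB
	 *
	 * The loop below consumes hdr_size bytes per record until the
	 * count reaches zero, which must coincide with first_offset.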
*/ scan_offset = last_offset; while ((int64_t)bytes > 0) { KKASSERT(scan_offset != first_offset); head = hammer_recover_scan_rev(hmp, root_volume, &scan_offset, &error, &buffer); if (error) break; /* * Normal UNDO */ error = hammer_recover_undo(hmp, root_volume, &head->undo); if (error) { hvkprintf(root_volume, "UNDO record at %016jx failed\n", (intmax_t)scan_offset - head->head.hdr_size); break; } /* * The first REDO_SYNC record encountered (scanning backwards) * enables REDO processing. */ if (head->head.hdr_type == HAMMER_HEAD_TYPE_REDO && head->redo.redo_flags == HAMMER_REDO_SYNC) { if (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) { hvkprintf(root_volume, "Ignoring extra REDO_SYNC " "records in UNDO/REDO FIFO.\n"); } else { hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_REQ; hmp->recover_stage2_offset = head->redo.redo_offset; hvkprintf(root_volume, "Found REDO_SYNC %016jx\n", (intmax_t)head->redo.redo_offset); } } bytes -= head->head.hdr_size; /* * If too many dirty buffers have built up we have to flush'm * out. As long as we do not flush out the volume header * a crash here should not cause any problems. * * buffer must be released so the flush can assert that * all buffers are idle. */ if (hammer_flusher_meta_limit(hmp)) { if (buffer) { hammer_rel_buffer(buffer, 0); buffer = NULL; } if (hmp->ronly == 0) { hammer_recover_flush_buffers(hmp, root_volume, 0); hvkprintf(root_volume, "Continuing recovery\n"); } else { hvkprintf(root_volume, "Recovery failure: " "Insufficient buffer cache to hold " "dirty buffers on read-only mount!\n"); error = EIO; break; } } } KKASSERT(error || bytes == 0); done: if (buffer) { hammer_rel_buffer(buffer, 0); buffer = NULL; } /* * After completely flushing all the recovered buffers the volume * header will also be flushed. */ if (root_volume->io.recovered == 0) { hammer_ref_volume(root_volume); root_volume->io.recovered = 1; } /* * Finish up flushing (or discarding) recovered buffers. FIFO * indices in the volume header are updated to the actual undo * range but will not be collapsed until stage 2. */ if (error == 0) { hammer_modify_volume_noundo(NULL, root_volume); rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX]; rootmap->first_offset = first_offset; rootmap->next_offset = last_offset; hammer_modify_volume_done(root_volume); if (hmp->ronly == 0) hammer_recover_flush_buffers(hmp, root_volume, 1); } else { hammer_recover_flush_buffers(hmp, root_volume, -1); } if (degenerate_case == 0) { hvkprintf(root_volume, "recovery complete\n"); } else { hvkprintf(root_volume, "mounted clean, no recovery needed\n"); } return (error); } /* * Execute redo operations * * This procedure is run at the end of the mount sequence, after the hammer * mount structure has been completely initialized but before the filesystem * goes live. It can access standard cursors, the B-Tree, flush the * filesystem, and so forth. * * This code may only be called for read-write mounts or when a mount * switches from read-only to read-write. vnodes may or may not be present. * * The stage1 code will have already calculated the correct FIFO range * for the nominal UNDO FIFO and stored it in the rootmap. The extended * range for REDO is stored in hmp->recover_stage2_offset. 
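 *
 * In outline (a simplified sketch of the code below, not a separate
 * entry point), stage2 makes two passes over the FIFO:
 *
 *	1. Scan backwards from first_offset to the REDO_SYNC target,
 *	   recording REDO_TERM_WRITE/REDO_TERM_TRUNC records in an
 *	   in-memory RB tree (hammer_recover_redo_rec()).
 *	2. Scan forwards from that target through the nominal range,
 *	   re-executing any REDO_WRITE/REDO_TRUNC which has no recorded
 *	   TERM (hammer_recover_redo_run()).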
*/ int hammer_recover_stage2(hammer_mount_t hmp, hammer_volume_t root_volume) { hammer_blockmap_t rootmap; hammer_buffer_t buffer; hammer_off_t scan_offset; hammer_off_t oscan_offset; hammer_off_t bytes; hammer_off_t ext_bytes; hammer_fifo_any_t head; hammer_off_t first_offset; hammer_off_t last_offset; hammer_off_t ext_offset; struct hammer_rterm_rb_tree rterm_root; uint32_t seqno; int error; int verbose = 0; int dorscan; /* * Stage 2 can only be run on a RW mount, or when the mount is * switched from RO to RW. */ KKASSERT(hmp->ronly == 0); RB_INIT(&rterm_root); if (hammer_skip_redo == 1) hvkprintf(root_volume, "recovery redo marked as optional\n"); if (hammer_skip_redo == 2) { hvkprintf(root_volume, "recovery redo skipped.\n"); return (0); } /* * Examine the UNDO FIFO. If it is empty the filesystem is clean * and no action need be taken. */ rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX]; first_offset = rootmap->first_offset; last_offset = rootmap->next_offset; if (first_offset == last_offset) { KKASSERT((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0); return(0); } /* * Stage2 must only be run once, and will not be run at all * if Stage1 did not find a REDO_SYNC record. */ error = 0; buffer = NULL; if ((hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_REQ) == 0) goto done; hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_REQ; hmp->flags |= HAMMER_MOUNT_REDO_RECOVERY_RUN; ext_offset = hmp->recover_stage2_offset; if (ext_offset == 0) { hvkprintf(root_volume, "REDO stage specified but no REDO_SYNC " "offset, ignoring\n"); goto done; } /* * Calculate nominal UNDO range (this is not yet the extended * range). */ if (last_offset >= first_offset) { bytes = last_offset - first_offset; } else { bytes = rootmap->alloc_offset - first_offset + HAMMER_OFF_LONG_ENCODE(last_offset); } hvkprintf(root_volume, "recovery redo %016jx-%016jx (%jd bytes)%s\n", (intmax_t)first_offset, (intmax_t)last_offset, (intmax_t)bytes, (hmp->ronly ? " (RO)" : "(RW)")); verbose = 1; if (bytes > HAMMER_OFF_LONG_ENCODE(rootmap->alloc_offset)) { hkprintf("Undo size is absurd, unable to mount\n"); error = EIO; goto fatal; } /* * Scan the REDOs backwards collecting REDO_TERM_* information. * This information is only collected for the extended range, * non-inclusive of any TERMs in the nominal UNDO range. * * If the stage2 extended range is inside the nominal undo range * we have nothing to scan. * * This must fit in memory! */ if (first_offset < last_offset) { /* * [ first_offset........last_offset ] */ if (ext_offset < first_offset) { dorscan = 1; ext_bytes = first_offset - ext_offset; } else if (ext_offset > last_offset) { dorscan = 1; ext_bytes = (rootmap->alloc_offset - ext_offset) + HAMMER_OFF_LONG_ENCODE(first_offset); } else { ext_bytes = -(ext_offset - first_offset); dorscan = 0; } } else { /* * [......last_offset first_offset.....] 
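		 *
		 * A brief sketch of the wrapped case handled below: if
		 * ext_offset already falls inside the nominal range
		 * (either above first_offset or, having wrapped, below
		 * last_offset) there is nothing extra to scan and
		 * ext_bytes records a negative "inside by this much"
		 * value.  Only when ext_offset lands in the free area
		 * between last_offset and first_offset does the extended
		 * range [ext_offset...first_offset) require the reverse
		 * scan (dorscan).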
*/ if (ext_offset < last_offset) { ext_bytes = -((rootmap->alloc_offset - first_offset) + HAMMER_OFF_LONG_ENCODE(ext_offset)); dorscan = 0; } else if (ext_offset > first_offset) { ext_bytes = -(ext_offset - first_offset); dorscan = 0; } else { ext_bytes = first_offset - ext_offset; dorscan = 1; } } if (dorscan) { scan_offset = first_offset; hvkprintf(root_volume, "Find extended redo %016jx, %jd extbytes\n", (intmax_t)ext_offset, (intmax_t)ext_bytes); seqno = hmp->recover_stage2_seqno - 1; for (;;) { head = hammer_recover_scan_rev(hmp, root_volume, &scan_offset, &error, &buffer); if (error) break; if (head->head.hdr_type != HAMMER_HEAD_TYPE_PAD) { if (head->head.hdr_seq != seqno) { error = ERANGE; break; } error = hammer_recover_redo_rec( hmp, &rterm_root, scan_offset, &head->redo); --seqno; } if (scan_offset == ext_offset) break; } if (error) { hvkprintf(root_volume, "Find extended redo failed %d, " "unable to run REDO\n", error); goto done; } } else { hvkprintf(root_volume, "Embedded extended redo %016jx, %jd extbytes\n", (intmax_t)ext_offset, (intmax_t)ext_bytes); } /* * Scan the REDO forwards through the entire extended range. * Anything with a previously recorded matching TERM is discarded. */ scan_offset = ext_offset; bytes += ext_bytes; /* * NOTE: when doing a forward scan the returned scan_offset is * for the record following the returned record, so we * have to play a bit. */ while ((int64_t)bytes > 0) { KKASSERT(scan_offset != last_offset); oscan_offset = scan_offset; head = hammer_recover_scan_fwd(hmp, root_volume, &scan_offset, &error, &buffer); if (error) break; error = hammer_recover_redo_run(hmp, &rterm_root, oscan_offset, &head->redo); if (error) { hvkprintf(root_volume, "UNDO record at %016jx failed\n", (intmax_t)scan_offset - head->head.hdr_size); break; } bytes -= head->head.hdr_size; } KKASSERT(error || bytes == 0); done: if (buffer) { hammer_rel_buffer(buffer, 0); buffer = NULL; } /* * Cleanup rterm tree */ { hammer_rterm_t rterm; hammer_rterm_entry_t rte; while ((rterm = RB_ROOT(&rterm_root)) != NULL) { RB_REMOVE(hammer_rterm_rb_tree, &rterm_root, rterm); while ((rte = rterm->term_list) != NULL) { rterm->term_list = rte->next; kfree(rte, hmp->m_misc); } kfree(rterm, hmp->m_misc); } } /* * Finish up flushing (or discarding) recovered buffers by executing * a normal flush cycle. Setting HMNT_UNDO_DIRTY bypasses degenerate * case tests and forces the flush in order to update the FIFO indices. * * If a crash occurs during the flush the entire undo/redo will be * re-run during recovery on the next mount. */ if (error == 0) { if (rootmap->first_offset != rootmap->next_offset) hmp->hflags |= HMNT_UNDO_DIRTY; hammer_flusher_sync(hmp); } fatal: hmp->flags &= ~HAMMER_MOUNT_REDO_RECOVERY_RUN; if (verbose) { hvkprintf(root_volume, "End redo recovery\n"); } if (error && hammer_skip_redo == 1) hvkprintf(root_volume, "recovery redo error %d, skipping.\n", error); return (hammer_skip_redo ? 0 : error); } /* * Scan backwards from *scan_offsetp, return the FIFO record prior to the * record at *scan_offsetp or NULL if an error occured. * * On return *scan_offsetp will be the offset of the returned record. 
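 *
 * A rough sketch of the on-media layout this relies on (sizes are
 * illustrative only): each FIFO record is laid out as
 *
 *	[hammer_fifo_head][payload][hammer_fifo_tail]
 *
 * so the reverse scan reads the tail just below *scan_offsetp and backs
 * up tail_size bytes from the record end to find the head, while the
 * forward scan simply advances by head.hdr_size.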
 */
hammer_fifo_any_t
hammer_recover_scan_rev(hammer_mount_t hmp, hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, hammer_buffer_t *bufferp)
{
	hammer_off_t scan_offset;
	hammer_blockmap_t rootmap;
	hammer_fifo_any_t head;
	hammer_fifo_tail_t tail;

	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	scan_offset = *scan_offsetp;

	if (hammer_debug_general & 0x0080)
		hdkprintf("rev scan_offset %016jx\n", (intmax_t)scan_offset);
	if (scan_offset == HAMMER_ENCODE_UNDO(0))
		scan_offset = rootmap->alloc_offset;
	if (scan_offset - sizeof(*tail) < HAMMER_ENCODE_UNDO(0)) {
		hvkprintf(root_volume,
			"UNDO record at %016jx FIFO underflow\n",
			(intmax_t)scan_offset);
		*errorp = EIO;
		return (NULL);
	}
	tail = hammer_bread(hmp, scan_offset - sizeof(*tail),
			    errorp, bufferp);
	if (*errorp) {
		hvkprintf(root_volume,
			"Unable to read UNDO TAIL at %016jx\n",
			(intmax_t)scan_offset - sizeof(*tail));
		return (NULL);
	}

	if (hammer_check_tail_signature(hmp, tail, scan_offset) != 0) {
		hvkprintf(root_volume,
			"Illegal UNDO TAIL signature at %016jx\n",
			(intmax_t)scan_offset - sizeof(*tail));
		*errorp = EIO;
		return (NULL);
	}
	head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size);
	*scan_offsetp = scan_offset - head->head.hdr_size;

	return (head);
}

/*
 * Scan forwards from *scan_offsetp, return the FIFO record or NULL if
 * an error occurred.
 *
 * On return *scan_offsetp will be the offset of the record following
 * the returned record.
 */
hammer_fifo_any_t
hammer_recover_scan_fwd(hammer_mount_t hmp, hammer_volume_t root_volume,
			hammer_off_t *scan_offsetp,
			int *errorp, hammer_buffer_t *bufferp)
{
	hammer_off_t scan_offset;
	hammer_blockmap_t rootmap;
	hammer_fifo_any_t head;

	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	scan_offset = *scan_offsetp;

	if (hammer_debug_general & 0x0080)
		hdkprintf("fwd scan_offset %016jx\n", (intmax_t)scan_offset);
	if (scan_offset == rootmap->alloc_offset)
		scan_offset = HAMMER_ENCODE_UNDO(0);

	head = hammer_bread(hmp, scan_offset, errorp, bufferp);
	if (*errorp) {
		hvkprintf(root_volume,
			"Unable to read UNDO HEAD at %016jx\n",
			(intmax_t)scan_offset);
		return (NULL);
	}

	if (hammer_check_head_signature(hmp, &head->head, scan_offset) != 0) {
		hvkprintf(root_volume,
			"Illegal UNDO HEAD signature at %016jx\n",
			(intmax_t)scan_offset);
		*errorp = EIO;
		return (NULL);
	}
	scan_offset += head->head.hdr_size;
	if (scan_offset == rootmap->alloc_offset)
		scan_offset = HAMMER_ENCODE_UNDO(0);
	*scan_offsetp = scan_offset;

	return (head);
}

/*
 * Helper function for hammer_check_{head,tail}_signature().  Check stuff
 * once the head and tail have been established.
 *
 * This function validates the entire FIFO record wrapper.
 */
static __inline
int
_hammer_check_signature(hammer_mount_t hmp, hammer_fifo_head_t head,
			hammer_fifo_tail_t tail, hammer_off_t beg_off)
{
	hammer_off_t end_off;
	int bytes;

	/*
	 * Check signatures.  The tail signature is allowed to be the
	 * head signature only for 8-byte PADs.
*/ if (head->hdr_signature != HAMMER_HEAD_SIGNATURE) { hkprintf("FIFO record bad head signature %04x at %016jx\n", head->hdr_signature, (intmax_t)beg_off); return(2); } if (head->hdr_size < HAMMER_HEAD_ALIGN || (head->hdr_size & HAMMER_HEAD_ALIGN_MASK)) { hkprintf("FIFO record unaligned or bad size %04x at %016jx\n", head->hdr_size, (intmax_t)beg_off); return(2); } end_off = beg_off + head->hdr_size; if (head->hdr_type != HAMMER_HEAD_TYPE_PAD || (size_t)(end_off - beg_off) != sizeof(*tail)) { if (head->hdr_type != tail->tail_type) { hkprintf("FIFO record head/tail type mismatch " "%04x %04x at %016jx\n", head->hdr_type, tail->tail_type, (intmax_t)beg_off); return(2); } if (head->hdr_size != tail->tail_size) { hkprintf("FIFO record head/tail size mismatch " "%04x %04x at %016jx\n", head->hdr_size, tail->tail_size, (intmax_t)beg_off); return(2); } if (tail->tail_signature != HAMMER_TAIL_SIGNATURE) { hkprintf("FIFO record bad tail signature " "%04x at %016jx\n", tail->tail_signature, (intmax_t)beg_off); return(3); } } /* * Non-PAD records must have a CRC and must be sized at * least large enough to fit the head and tail. */ if (head->hdr_type != HAMMER_HEAD_TYPE_PAD) { if (hammer_crc_test_fifo_head(hmp->version, head, head->hdr_size) == 0) { hkprintf("FIFO record CRC failed %08x at %016jx\n", head->hdr_crc, (intmax_t)beg_off); return(EIO); } if (head->hdr_size < sizeof(*head) + sizeof(*tail)) { hkprintf("FIFO record too small %04x at %016jx\n", head->hdr_size, (intmax_t)beg_off); return(EIO); } } /* * Check the tail */ bytes = head->hdr_size; tail = (void *)((char *)head + bytes - sizeof(*tail)); if (tail->tail_size != head->hdr_size) { hkprintf("Bad tail size %04x vs %04x at %016jx\n", tail->tail_size, head->hdr_size, (intmax_t)beg_off); return(EIO); } if (tail->tail_type != head->hdr_type) { hkprintf("Bad tail type %04x vs %04x at %016jx\n", tail->tail_type, head->hdr_type, (intmax_t)beg_off); return(EIO); } return(0); } /* * Check that the FIFO record is in-bounds given the head and the * hammer offset. * * Also checks that the head and tail structures agree with each other, * but does not check beyond the signature, type, and size. */ static int hammer_check_head_signature(hammer_mount_t hmp, hammer_fifo_head_t head, hammer_off_t beg_off) { hammer_fifo_tail_t tail; hammer_off_t end_off; /* * head overlaps buffer boundary. This could be a PAD so only * check the minimum PAD size here. */ if (((beg_off + sizeof(*tail) - 1) ^ (beg_off)) & ~HAMMER_BUFMASK64) return(1); /* * Calculate the ending offset and make sure the record does * not cross a buffer boundary. */ end_off = beg_off + head->hdr_size; if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64) return(1); tail = (void *)((char *)head + head->hdr_size - sizeof(*tail)); return (_hammer_check_signature(hmp, head, tail, beg_off)); } /* * Check that the FIFO record is in-bounds given the tail and the * hammer offset. The offset is pointing at the ending boundary of the * record. * * Also checks that the head and tail structures agree with each other, * but does not check beyond the signature, type, and size. */ static int hammer_check_tail_signature(hammer_mount_t hmp, hammer_fifo_tail_t tail, hammer_off_t end_off) { hammer_fifo_head_t head; hammer_off_t beg_off; /* * tail overlaps buffer boundary */ if (((end_off - sizeof(*tail)) ^ (end_off - 1)) & ~HAMMER_BUFMASK64) return(1); /* * Calculate the begining offset and make sure the record does * not cross a buffer boundary. 
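	 *
	 * The XOR test below is a compact way of asking whether two
	 * offsets fall within the same HAMMER_BUFSIZE buffer.  As a
	 * hypothetical example with 16KB buffers: beg_off 0x4100 and
	 * end_off - 1 == 0x7fff differ only in bits below the buffer
	 * mask, so (beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64 is zero
	 * and the record is accepted; beg_off 0x4100 with end_off - 1 ==
	 * 0x8010 differs above the mask and the record is rejected as
	 * crossing a boundary.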
*/ beg_off = end_off - tail->tail_size; if ((beg_off ^ (end_off - 1)) & ~HAMMER_BUFMASK64) return(1); head = (void *)((char *)tail + sizeof(*tail) - tail->tail_size); return (_hammer_check_signature(hmp, head, tail, beg_off)); } static int hammer_recover_undo(hammer_mount_t hmp, hammer_volume_t root_volume, hammer_fifo_undo_t undo) { hammer_volume_t volume; hammer_buffer_t buffer; hammer_off_t buf_offset; int zone; int error; int vol_no; int bytes; uint32_t offset; /* * Only process UNDO records. Flag if we find other records to * optimize stage2 recovery. */ if (undo->head.hdr_type != HAMMER_HEAD_TYPE_UNDO) return(0); /* * Validate the UNDO record. */ bytes = undo->head.hdr_size - sizeof(*undo) - sizeof(struct hammer_fifo_tail); if (bytes < 0 || undo->undo_data_bytes < 0 || undo->undo_data_bytes > bytes) { hkprintf("Corrupt UNDO record, undo_data_bytes %d/%d\n", undo->undo_data_bytes, bytes); return(EIO); } bytes = undo->undo_data_bytes; /* * The undo offset may only be a zone-1 or zone-2 offset. * * Currently we only support a zone-1 offset representing the * volume header. */ zone = HAMMER_ZONE_DECODE(undo->undo_offset); offset = undo->undo_offset & HAMMER_BUFMASK; if (offset + bytes > HAMMER_BUFSIZE) { hkprintf("Corrupt UNDO record, bad offset\n"); return (EIO); } switch(zone) { case HAMMER_ZONE_RAW_VOLUME_INDEX: vol_no = HAMMER_VOL_DECODE(undo->undo_offset); volume = hammer_get_volume(hmp, vol_no, &error); if (volume == NULL) { hkprintf("UNDO record, cannot access volume %d\n", vol_no); break; } hammer_modify_volume_noundo(NULL, volume); hammer_recover_copy_undo(undo->undo_offset, (char *)(undo + 1), (char *)volume->ondisk + offset, bytes); hammer_modify_volume_done(volume); /* * Multiple modifications may be made to the same buffer. * Also, the volume header cannot be written out until * everything else has been flushed. This also * covers the read-only case by preventing the kernel from * flushing the buffer. */ if (volume->io.recovered == 0) volume->io.recovered = 1; else hammer_rel_volume(volume, 0); break; case HAMMER_ZONE_RAW_BUFFER_INDEX: buf_offset = undo->undo_offset & ~HAMMER_BUFMASK64; buffer = hammer_get_buffer(hmp, buf_offset, HAMMER_BUFSIZE, 0, &error); if (buffer == NULL) { hkprintf("UNDO record, cannot access buffer %016jx\n", (intmax_t)undo->undo_offset); break; } hammer_modify_buffer_noundo(NULL, buffer); hammer_recover_copy_undo(undo->undo_offset, (char *)(undo + 1), (char *)buffer->ondisk + offset, bytes); hammer_modify_buffer_done(buffer); /* * Multiple modifications may be made to the same buffer, * improve performance by delaying the flush. This also * covers the read-only case by preventing the kernel from * flushing the buffer. */ if (buffer->io.recovered == 0) buffer->io.recovered = 1; else hammer_rel_buffer(buffer, 0); break; default: hkprintf("Corrupt UNDO record\n"); error = EIO; } return (error); } static void hammer_recover_copy_undo(hammer_off_t undo_offset, char *src, char *dst, int bytes) { if (hammer_debug_general & 0x0080) { hdkprintf("UNDO %016jx: %d\n", (intmax_t)undo_offset, bytes); } #if 0 hkprintf("UNDO %016jx:", (intmax_t)undo_offset); hammer_recover_debug_dump(22, dst, bytes); kprintf("%22s", "to:"); hammer_recover_debug_dump(22, src, bytes); #endif bcopy(src, dst, bytes); } /* * Record HAMMER_REDO_TERM_WRITE and HAMMER_REDO_TERM_TRUNC operations * during the backwards scan of the extended UNDO/REDO FIFO. This scan * does not include the nominal UNDO range, just the extended range. 
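 *
 * Because this scan runs backwards and each entry is prepended, the
 * per-rterm list of fifo offsets comes out in forward (ascending FIFO)
 * order.  A small hypothetical example: TERMs seen at offsets 0x9000,
 * then 0x5000, then 0x1000 end up listed as 0x1000 -> 0x5000 -> 0x9000,
 * matching the order in which the later forward scan strips them.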
*/ int hammer_recover_redo_rec(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root, hammer_off_t scan_offset, hammer_fifo_redo_t redo) { hammer_rterm_t rterm; hammer_rterm_t nrterm; hammer_rterm_entry_t rte; if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO) return(0); if (redo->redo_flags != HAMMER_REDO_TERM_WRITE && redo->redo_flags != HAMMER_REDO_TERM_TRUNC) { return(0); } nrterm = kmalloc(sizeof(*nrterm), hmp->m_misc, M_WAITOK|M_ZERO); nrterm->redo_objid = redo->redo_objid; nrterm->redo_localization = redo->redo_localization; nrterm->redo_flags = redo->redo_flags; nrterm->redo_offset = redo->redo_offset; rterm = RB_INSERT(hammer_rterm_rb_tree, root, nrterm); if (rterm) kfree(nrterm, hmp->m_misc); else rterm = nrterm; if (bootverbose) { hkprintf("record record %016jx objid %016jx " "offset %016jx flags %08x\n", (intmax_t)scan_offset, (intmax_t)redo->redo_objid, (intmax_t)redo->redo_offset, (int)redo->redo_flags); } /* * Scan in reverse order, rte prepended, so the rte list will be * in forward order. */ rte = kmalloc(sizeof(*rte), hmp->m_misc, M_WAITOK|M_ZERO); rte->fifo_offset = scan_offset; rte->next = rterm->term_list; rterm->term_list = rte; return(0); } /* * Execute HAMMER_REDO_WRITE and HAMMER_REDO_TRUNC operations during * the forwards scan of the entire extended UNDO/REDO FIFO range. * * Records matching previously recorded TERMs have already been committed * and are ignored. */ int hammer_recover_redo_run(hammer_mount_t hmp, struct hammer_rterm_rb_tree *root, hammer_off_t scan_offset, hammer_fifo_redo_t redo) { struct hammer_rterm rtval; hammer_rterm_t rterm; hammer_rterm_entry_t rte; if (redo->head.hdr_type != HAMMER_HEAD_TYPE_REDO) return(0); switch(redo->redo_flags) { case HAMMER_REDO_WRITE: case HAMMER_REDO_TRUNC: /* * We hit a REDO request. The REDO request is only executed * if there is no matching TERM. */ bzero(&rtval, sizeof(rtval)); rtval.redo_objid = redo->redo_objid; rtval.redo_localization = redo->redo_localization; rtval.redo_offset = redo->redo_offset; rtval.redo_flags = (redo->redo_flags == HAMMER_REDO_WRITE) ? HAMMER_REDO_TERM_WRITE : HAMMER_REDO_TERM_TRUNC; rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval); if (rterm) { if (bootverbose) { hkprintf("ignore record %016jx objid %016jx " "offset %016jx flags %08x\n", (intmax_t)scan_offset, (intmax_t)redo->redo_objid, (intmax_t)redo->redo_offset, (int)redo->redo_flags); } break; } if (bootverbose) { hkprintf("run record %016jx objid %016jx " "offset %016jx flags %08x\n", (intmax_t)scan_offset, (intmax_t)redo->redo_objid, (intmax_t)redo->redo_offset, (int)redo->redo_flags); } /* * Redo stage2 can access a live filesystem, acquire the * vnode. */ hammer_recover_redo_exec(hmp, redo); break; case HAMMER_REDO_TERM_WRITE: case HAMMER_REDO_TERM_TRUNC: /* * As we encounter TERMs in the forward scan we remove * them. Once the forward scan hits the nominal undo range * there will be no more recorded TERMs. 
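		 *
		 * Hypothetical timeline (oldest record first), all in the
		 * extended range:
		 *
		 *	REDO_WRITE(A)  REDO_TERM_WRITE(A)  REDO_WRITE(B)
		 *
		 * The TERM recorded for A during the reverse pass masks
		 * A's REDO in the forward pass and is stripped here when
		 * the forward scan reaches it; only the unterminated
		 * REDO for B is re-executed.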
*/ bzero(&rtval, sizeof(rtval)); rtval.redo_objid = redo->redo_objid; rtval.redo_localization = redo->redo_localization; rtval.redo_flags = redo->redo_flags; rtval.redo_offset = redo->redo_offset; rterm = RB_FIND(hammer_rterm_rb_tree, root, &rtval); if (rterm) { if ((rte = rterm->term_list) != NULL) { KKASSERT(rte->fifo_offset == scan_offset); rterm->term_list = rte->next; kfree(rte, hmp->m_misc); } } break; } return(0); } static void hammer_recover_redo_exec(hammer_mount_t hmp, hammer_fifo_redo_t redo) { struct hammer_transaction trans; struct vattr va; hammer_inode_t ip; struct vnode *vp = NULL; int error; hammer_start_transaction(&trans, hmp); ip = hammer_get_inode(&trans, NULL, redo->redo_objid, HAMMER_MAX_TID, redo->redo_localization, 0, &error); if (ip == NULL) { hkprintf("unable to find objid %016jx:%08x\n", (intmax_t)redo->redo_objid, redo->redo_localization); goto done2; } error = hammer_get_vnode(ip, &vp); if (error) { hkprintf("unable to acquire vnode for %016jx:%08x\n", (intmax_t)redo->redo_objid, redo->redo_localization); goto done1; } switch(redo->redo_flags) { case HAMMER_REDO_WRITE: error = VOP_OPEN(vp, FREAD|FWRITE, proc0.p_ucred, NULL); if (error) { hkprintf("vn_rdwr open %016jx:%08x returned %d\n", (intmax_t)redo->redo_objid, redo->redo_localization, error); break; } vn_unlock(vp); error = vn_rdwr(UIO_WRITE, vp, (void *)(redo + 1), redo->redo_data_bytes, redo->redo_offset, UIO_SYSSPACE, 0, proc0.p_ucred, NULL); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error) { hkprintf("write %016jx:%08x returned %d\n", (intmax_t)redo->redo_objid, redo->redo_localization, error); } VOP_CLOSE(vp, FREAD|FWRITE, NULL); break; case HAMMER_REDO_TRUNC: VATTR_NULL(&va); va.va_size = redo->redo_offset; error = VOP_SETATTR(vp, &va, proc0.p_ucred); if (error) { hkprintf("setattr offset %016jx error %d\n", (intmax_t)redo->redo_offset, error); } break; } vput(vp); done1: hammer_rel_inode(ip, 0); done2: hammer_done_transaction(&trans); } /* * RB tree compare function. Note that REDO_TERM_TRUNC ops ignore * the offset. * * WRITE@0 TERM@0 WRITE@0 .... (no TERM@0) etc. */ static int hammer_rterm_rb_cmp(hammer_rterm_t rt1, hammer_rterm_t rt2) { if (rt1->redo_objid < rt2->redo_objid) return(-1); if (rt1->redo_objid > rt2->redo_objid) return(1); if (rt1->redo_localization < rt2->redo_localization) return(-1); if (rt1->redo_localization > rt2->redo_localization) return(1); if (rt1->redo_flags < rt2->redo_flags) return(-1); if (rt1->redo_flags > rt2->redo_flags) return(1); if (rt1->redo_flags != HAMMER_REDO_TERM_TRUNC) { if (rt1->redo_offset < rt2->redo_offset) return(-1); if (rt1->redo_offset > rt2->redo_offset) return(1); } return(0); } #if 0 static void hammer_recover_debug_dump(int w, char *buf, int bytes) { int i; for (i = 0; i < bytes; ++i) { if (i && (i & 15) == 0) kprintf("\n%*.*s", w, w, ""); kprintf(" %02x", (unsigned char)buf[i]); } kprintf("\n"); } #endif /* * Flush recovered buffers from recovery operations. The call to this * routine may be delayed if a read-only mount was made and then later * upgraded to read-write. This routine is also called when unmounting * a read-only mount to clean out recovered (dirty) buffers which we * couldn't flush (because the mount is read-only). * * The volume header is always written last. The UNDO FIFO will be forced * to zero-length by setting next_offset to first_offset. This leaves the * (now stale) UNDO information used to recover the disk available for * forensic analysis. * * final is typically 0 or 1. The volume header is only written if final * is 1. 
If final is -1 the recovered buffers are discarded instead of * written and root_volume can also be passed as NULL in that case. */ static int hammer_recover_flush_volume_callback(hammer_volume_t, void *); static int hammer_recover_flush_buffer_callback(hammer_buffer_t, void *); void hammer_recover_flush_buffers(hammer_mount_t hmp, hammer_volume_t root_volume, int final) { /* * Flush the buffers out asynchronously, wait for all the I/O to * complete, then do it again to destroy the buffer cache buffer * so it doesn't alias something later on. */ RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL, hammer_recover_flush_buffer_callback, &final); hammer_io_wait_all(hmp, "hmrrcw", 1); RB_SCAN(hammer_buf_rb_tree, &hmp->rb_bufs_root, NULL, hammer_recover_flush_buffer_callback, &final); /* * Flush all volume headers except the root volume. If final < 0 * we discard all volume headers including the root volume. */ if (final >= 0) { RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL, hammer_recover_flush_volume_callback, root_volume); } else { RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL, hammer_recover_flush_volume_callback, NULL); } /* * Finalize the root volume header. * * No interlock is needed, volume buffers are not * messed with by bioops. */ if (root_volume && root_volume->io.recovered && final > 0) { hammer_io_wait_all(hmp, "hmrflx", 1); root_volume->io.recovered = 0; hammer_io_flush(&root_volume->io, 0); hammer_rel_volume(root_volume, 0); hammer_io_wait_all(hmp, "hmrfly", 1); } } /* * Callback to flush volume headers. If discarding data will be NULL and * all volume headers (including the root volume) will be discarded. * Otherwise data is the root_volume and we flush all volume headers * EXCEPT the root_volume. * * Clear any I/O error or modified condition when discarding buffers to * clean up the reference count, otherwise the buffer may have extra refs * on it. */ static int hammer_recover_flush_volume_callback(hammer_volume_t volume, void *data) { hammer_volume_t root_volume = data; if (volume->io.recovered && volume != root_volume) { volume->io.recovered = 0; if (root_volume != NULL) { /* * No interlock is needed, volume buffers are not * messed with by bioops. */ hammer_io_flush(&volume->io, 0); } else { hammer_io_clear_error(&volume->io); hammer_io_clear_modify(&volume->io, 1); } hammer_rel_volume(volume, 0); } return(0); } /* * Flush or discard recovered I/O buffers. * * Clear any I/O error or modified condition when discarding buffers to * clean up the reference count, otherwise the buffer may have extra refs * on it. */ static int hammer_recover_flush_buffer_callback(hammer_buffer_t buffer, void *data) { int final = *(int *)data; int flush; if (buffer->io.recovered) { buffer->io.recovered = 0; buffer->io.reclaim = 1; if (final < 0) { hammer_io_clear_error(&buffer->io); hammer_io_clear_modify(&buffer->io, 1); } else { hammer_io_write_interlock(&buffer->io); hammer_io_flush(&buffer->io, 0); hammer_io_done_interlock(&buffer->io); } hammer_rel_buffer(buffer, 0); } else { flush = hammer_ref_interlock(&buffer->io.lock); if (flush) atomic_add_int(&hammer_count_refedbufs, 1); if (final < 0) { hammer_io_clear_error(&buffer->io); hammer_io_clear_modify(&buffer->io, 1); } KKASSERT(hammer_oneref(&buffer->io.lock)); buffer->io.reclaim = 1; hammer_rel_buffer(buffer, flush); } return(0); }
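
#if 0
/*
 * Illustrative helper, not wired into the build or called from anywhere:
 * a minimal sketch showing how the scan primitives above can be combined
 * to dump the nominal UNDO range (type, sequence number and size of each
 * FIFO record).  Kept under #if 0, like hammer_recover_debug_dump(),
 * purely as a usage example.
 */
static void
hammer_recover_debug_walk(hammer_mount_t hmp, hammer_volume_t root_volume)
{
	hammer_blockmap_t rootmap;
	hammer_buffer_t buffer = NULL;
	hammer_fifo_any_t head;
	hammer_off_t scan_offset;
	int error = 0;

	rootmap = &root_volume->ondisk->vol0_blockmap[HAMMER_ZONE_UNDO_INDEX];
	scan_offset = rootmap->first_offset;

	/*
	 * Walk forward record by record until the stored end of the
	 * nominal range is reached or a scan error occurs.
	 */
	while (scan_offset != rootmap->next_offset) {
		head = hammer_recover_scan_fwd(hmp, root_volume,
					       &scan_offset, &error, &buffer);
		if (error)
			break;
		kprintf("FIFO record type %04x seq %08x size %04x\n",
			head->head.hdr_type,
			head->head.hdr_seq,
			head->head.hdr_size);
	}
	if (buffer)
		hammer_rel_buffer(buffer, 0);
}
#endif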