1/*-
2 * Copyright (c) 1986, 1988, 1991, 1993
3 *  The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *  @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94
35 * $FreeBSD: src/sys/kern/kern_shutdown.c,v 1.72.2.12 2002/02/21 19:15:10 dillon Exp $
36 */
37
38#include "opt_ddb.h"
39#include "opt_ddb_trace.h"
40#include "opt_panic.h"
41#include "use_gpio.h"
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/eventhandler.h>
46#include <sys/buf.h>
47#include <sys/disk.h>
48#include <sys/diskslice.h>
49#include <sys/reboot.h>
50#include <sys/proc.h>
51#include <sys/priv.h>
52#include <sys/fcntl.h>      /* FREAD    */
53#include <sys/stat.h>       /* S_IFCHR  */
54#include <sys/vnode.h>
55#include <sys/kernel.h>
56#include <sys/kerneldump.h>
57#include <sys/kthread.h>
58#include <sys/malloc.h>
59#include <sys/mount.h>
60#include <sys/queue.h>
61#include <sys/sysctl.h>
62#include <sys/vkernel.h>
63#include <sys/conf.h>
64#include <sys/sysproto.h>
65#include <sys/device.h>
66#include <sys/cons.h>
67#include <sys/kbio.h>
68#include <sys/shm.h>
69#include <sys/kern_syscall.h>
70#include <vm/vm_map.h>
71#include <vm/pmap.h>
72
73#include <sys/thread2.h>
74#include <sys/buf2.h>
75#include <sys/mplock2.h>
76
77#include <machine/cpu.h>
78#include <machine/clock.h>
79#include <machine/md_var.h>
80#include <machine/smp.h>        /* smp_active_mask, cpuid */
81#include <machine/vmparam.h>
82#include <machine/thread.h>
83
84#include <sys/signalvar.h>
85
86#include <sys/wdog.h>
87#include <dev/acpica/acpi_pvpanic/panic_notifier.h>
88#include <dev/misc/gpio/gpio.h>
89
/*
 * Number of seconds shutdown_panic() counts down, polling the console for
 * a keypress, before letting the reboot proceed.  0 reboots immediately;
 * -1 skips the countdown and waits for a keypress.  An earlier definition
 * takes precedence over this default.
 */
#if !defined(PANIC_REBOOT_WAIT_TIME)
#define PANIC_REBOOT_WAIT_TIME 15
#endif
93
/*
 * Note that stdarg.h and the ANSI style va_start macro are used for both
 * ANSI and traditional C compilers.  We use the machine version to stay
 * within the confines of the kernel header files.
 */
99#include <machine/stdarg.h>
100
#if defined(DDB)
#include <ddb/ddb.h>

/*
 * debug.debugger_on_panic: when non-zero, panic() calls Debugger().
 * Defaults to off when the kernel is built with DDB_UNATTENDED.
 */
#if defined(DDB_UNATTENDED)
int debugger_on_panic = 0;
#else
int debugger_on_panic = 1;
#endif
SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW,
    &debugger_on_panic, 0, "Run debugger on kernel panic");

/*
 * debug.trace_on_panic: when non-zero, the first panic() prints a
 * backtrace.  Defaults to on when the kernel is built with DDB_TRACE.
 */
#if defined(DDB_TRACE)
int trace_on_panic = 1;
#else
int trace_on_panic = 0;
#endif
SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RW,
    &trace_on_panic, 0, "Print stack trace on kernel panic");
#endif /* DDB */
119
120static int sync_on_panic = 0;
121SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RW,
122    &sync_on_panic, 0, "Do a sync before rebooting from a panic");
123
124SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment");
125
126/*
127 * Variable panicstr contains argument to first call to panic; used as flag
128 * to indicate that the kernel has already called panic.
129 */
130const char *panicstr;
131
132int dumping;                /* system is dumping */
133static struct dumperinfo dumper;    /* selected dumper */
134
135globaldata_t panic_cpu_gd;      /* which cpu took the panic */
136struct lwkt_tokref panic_tokens[LWKT_MAXTOKENS];
137int panic_tokens_count;
138
139int bootverbose = 0;            /* note: assignment to force non-bss */
140SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW,
141       &bootverbose, 0, "Verbose kernel messages");
142
143int cold = 1;               /* note: assignment to force non-bss */
144int dumplo;             /* OBSOLETE - savecore compat */
145u_int64_t dumplo64;
146
147static void boot (int) __dead2;
148static int setdumpdev (cdev_t dev);
149static void poweroff_wait (void *, int);
150static void print_uptime (void);
151static void shutdown_halt (void *junk, int howto);
152static void shutdown_panic (void *junk, int howto);
153static void shutdown_reset (void *junk, int howto);
154static int shutdown_busycount1(struct buf *bp, void *info);
155static int shutdown_busycount2(struct buf *bp, void *info);
156static void shutdown_cleanup_proc(struct proc *p);
157
158/* register various local shutdown events */
159static void
160shutdown_conf(void *unused)
161{
162    EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL, SHUTDOWN_PRI_FIRST);
163    EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL, SHUTDOWN_PRI_LAST + 100);
164    EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL, SHUTDOWN_PRI_LAST + 100);
165    EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL, SHUTDOWN_PRI_LAST + 200);
166}
167
168SYSINIT(shutdown_conf, SI_BOOT2_MACHDEP, SI_ORDER_ANY, shutdown_conf, NULL);
169
170/* ARGSUSED */
171
172/*
173 * The system call that results in a reboot
174 *
175 * MPALMOSTSAFE
176 */
177int
178sys_reboot(struct reboot_args *uap)
179{
180    struct thread *td = curthread;
181    int error;
182
183    if ((error = priv_check(td, PRIV_REBOOT)))
184        return (error);
185
186    get_mplock();
187    boot(uap->opt);
188    rel_mplock();
189    return (0);
190}
191
192/*
193 * Called by events that want to shut down.. e.g  <CTL><ALT><DEL> on a PC
194 */
195static int shutdown_howto = 0;
196
197void
198shutdown_nice(int howto)
199{
200    shutdown_howto = howto;
201
202    /* Send a signal to init(8) and have it shutdown the world */
203    if (initproc != NULL) {
204        ksignal(initproc, SIGINT);
205    } else {
206        /* No init(8) running, so simply reboot */
207        boot(RB_NOSYNC);
208    }
209    return;
210}
211static int  waittime = -1;
212struct pcb  dumppcb;
213struct thread   *dumpthread;
214
215static void
216print_uptime(void)
217{
218    int f;
219    struct timespec ts;
220
221    getnanouptime(&ts);
222    kprintf("Uptime: ");
223    f = 0;
224    if (ts.tv_sec >= 86400) {
225        kprintf("%ldd", ts.tv_sec / 86400);
226        ts.tv_sec %= 86400;
227        f = 1;
228    }
229    if (f || ts.tv_sec >= 3600) {
230        kprintf("%ldh", ts.tv_sec / 3600);
231        ts.tv_sec %= 3600;
232        f = 1;
233    }
234    if (f || ts.tv_sec >= 60) {
235        kprintf("%ldm", ts.tv_sec / 60);
236        ts.tv_sec %= 60;
237        f = 1;
238    }
239    kprintf("%lds\n", ts.tv_sec);
240}
241
242/*
243 *  Go through the rigmarole of shutting down..
244 * this used to be in machdep.c but I'll be dammned if I could see
245 * anything machine dependant in it.
246 */
247static void
248boot(int howto)
249{
250    /*
251     * Get rid of any user scheduler baggage and then give
252     * us a high priority.
253     */
254    if (curthread->td_release)
255        curthread->td_release(curthread);
256    lwkt_setpri_self(TDPRI_MAX);
257
258    /* collect extra flags that shutdown_nice might have set */
259    howto |= shutdown_howto;
260
261    /*
262     * We really want to shutdown on the BSP.  Subsystems such as ACPI
263     * can't power-down the box otherwise.
264     */
265    if (!CPUMASK_ISUP(smp_active_mask)) {
266        kprintf("boot() called on cpu#%d\n", mycpu->gd_cpuid);
267    }
268    if (panicstr == NULL && mycpu->gd_cpuid != 0) {
269        kprintf("Switching to cpu #0 for shutdown\n");
270        lwkt_setcpu_self(globaldata_find(0));
271    }
272    /*
273     * Do any callouts that should be done BEFORE syncing the filesystems.
274     */
275    EVENTHANDLER_INVOKE(shutdown_pre_sync, howto);
276
277    /*
278     * Try to get rid of any remaining FS references.  The calling
279     * process, proc0, and init may still hold references.  The
280     * VFS cache subsystem may still hold a root reference to root.
281     *
282     * XXX this needs work.  We really need to SIGSTOP all remaining
283     * processes in order to avoid blowups due to proc0's filesystem
284     * references going away.  For now just make sure that the init
285     * process is stopped.
286     */
287    if (panicstr == NULL) {
288        shutdown_cleanup_proc(curproc);
289        shutdown_cleanup_proc(&proc0);
290        if (initproc) {
291            if (initproc != curproc) {
292                ksignal(initproc, SIGSTOP);
293                tsleep(boot, 0, "shutdn", hz / 20);
294            }
295            shutdown_cleanup_proc(initproc);
296        }
297        vfs_cache_setroot(NULL, NULL);
298    }
299
300    /*
301     * Now sync filesystems
302     */
303    if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) {
304        int iter, nbusy, pbusy;
305        int zcount;
306
307        waittime = 0;
308        zcount = 0;
309        kprintf("\nsyncing disks... ");
310
311        sys_sync(NULL);
312
313        /*
314         * With soft updates, some buffers that are written will be
315         * remarked as dirty until other buffers are written.
316         *
317         * sys_sync() usually runs asynchronously, to give us a
318         * better chance of syncing the rest of the filesystems when
319         * one or more of them are stuck.
320         */
321        for (iter = pbusy = 0; iter < 20 + zcount; iter++) {
322            if (iter <= 10)
323                nbusy = scan_all_buffers(shutdown_busycount1,
324                             &iter);
325            else
326                nbusy = scan_all_buffers(shutdown_busycount2,
327                             &iter);
328            kprintf("%d ", nbusy);
329            if (nbusy == 0) {
330                if (++zcount == 3)
331                    break;
332            } else {
333                zcount = 0;
334            }
335
336            /*
337             * There could be a lot to sync, only allow iter to
338             * proceed while there is progress.
339             */
340            if (nbusy < pbusy) {
341                if (iter > 10)
342                    iter = 10;
343                else
344                    iter = 0;
345            }
346            pbusy = nbusy;
347
348            /*
349             * XXX:
350             * Process soft update work queue if buffers don't sync
351             * after 6 iterations by permitting the syncer to run.
352             */
353            if (iter > 5)
354                bio_ops_sync(NULL);
355
356            sys_sync(NULL);
357            tsleep(boot, 0, "shutdn", hz * iter / 20 + 1);
358        }
359        kprintf("\n");
360
361        if (zcount < 3) {
362            /*
363             * Failed to sync all blocks. Indicate this and don't
364             * unmount filesystems (thus forcing an fsck on reboot).
365             */
366            kprintf("giving up on %d buffers\n", nbusy);
367#ifdef DDB
368            if (debugger_on_panic)
369                Debugger("busy buffer problem");
370#endif /* DDB */
371            tsleep(boot, 0, "shutdn", hz * 5 + 1);
372        } else {
373            kprintf("done\n");
374
375            /*
376             * Unmount filesystems
377             */
378            if (panicstr == NULL)
379                vfs_unmountall(1);
380        }
381        tsleep(boot, 0, "shutdn", hz / 10 + 1);
382    }
383
384    print_uptime();
385
386    /*
387     * Dump before doing post_sync shutdown ops
388     */
389    crit_enter();
390    if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold) {
391        dumpsys();
392    }
393
394    /*
395     * Ok, now do things that assume all filesystem activity has
396     * been completed.  This will also call the device shutdown
397     * methods.
398     */
399    EVENTHANDLER_INVOKE(shutdown_post_sync, howto);
400
401    /* Now that we're going to really halt the system... */
402    EVENTHANDLER_INVOKE(shutdown_final, howto);
403
404    for(;;) ;   /* safety against shutdown_reset not working */
405    /* NOTREACHED */
406}
407
408/*
409 * Pass 1 - Figure out if there are any busy or dirty buffers still present.
410 *
411 *  We ignore TMPFS mounts in this pass.
412 */
413static int
414shutdown_busycount1(struct buf *bp, void *info __unused)
415{
416    struct vnode *vp;
417
418    if ((vp = bp->b_vp) != NULL && vp->v_tag == VT_TMPFS)
419        return (0);
420    if ((bp->b_flags & B_INVAL) == 0 && BUF_LOCKINUSE(bp))
421        return(1);
422    if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI)
423        return (1);
424    return (0);
425}
426
427/*
428 * Pass 2 - only run after pass 1 has completed or has given up
429 *
430 *  We ignore TMPFS, NFS, MFS, and SMBFS mounts in this pass.
431 */
432static int
433shutdown_busycount2(struct buf *bp, void *info)
434{
435    struct vnode *vp;
436    int *iterp = info;
437    const char *mpath;
438
439    /*
440     * Ignore tmpfs and nfs mounts
441     */
442    if ((vp = bp->b_vp) != NULL) {
443        if (vp->v_tag == VT_TMPFS)
444            return (0);
445        if (vp->v_tag == VT_NFS)
446            return (0);
447        if (vp->v_tag == VT_MFS)
448            return (0);
449        if (vp->v_tag == VT_SMBFS)
450            return (0);
451    }
452
453    /*
454     * Only count buffers stuck on I/O, ignore everything else
455     */
456    if (((bp->b_flags & B_INVAL) == 0 && BUF_LOCKINUSE(bp)) ||
457        ((bp->b_flags & (B_DELWRI|B_INVAL)) == B_DELWRI)) {
458        /*
459         * Only count buffers undergoing write I/O
460         * on the related vnode.
461         */
462        if (bp->b_vp == NULL ||
463            bio_track_active(&bp->b_vp->v_track_write) == 0) {
464            return (0);
465        }
466        if (*iterp > 15) {
467            mpath = "?";
468            if (bp->b_vp->v_mount)
469                mpath = bp->b_vp->v_mount->mnt_stat.f_mntonname;
470
471            kprintf("%p on %s, flags:%08x, loffset:%jd, "
472                "doffset:%jd\n",
473                bp,
474                mpath,
475                bp->b_flags,
476                (intmax_t)bp->b_loffset,
477                (intmax_t)bp->b_bio2.bio_offset);
478        }
479        return(1);
480    }
481    return(0);
482}
483
484/*
485 * If the shutdown was a clean halt, behave accordingly.
486 */
487static void
488shutdown_halt(void *junk, int howto)
489{
490    if (howto & RB_HALT) {
491        kprintf("\n");
492        kprintf("The operating system has halted.\n");
493#ifdef _KERNEL_VIRTUAL
494        cpu_halt();
495#else
496        kprintf("Please press any key to reboot.\n\n");
497        switch (cngetc()) {
498        case -1:        /* No console, just die */
499            cpu_halt();
500            /* NOTREACHED */
501        default:
502            howto &= ~RB_HALT;
503            break;
504        }
505#endif
506    }
507}
508
509/*
510 * Check to see if the system paniced, pause and then reboot
511 * according to the specified delay.
512 */
513static void
514shutdown_panic(void *junk, int howto)
515{
516    int loop;
517    int c;
518
519    if (howto & RB_DUMP) {
520        if (PANIC_REBOOT_WAIT_TIME != 0) {
521            if (PANIC_REBOOT_WAIT_TIME != -1) {
522                kprintf("Automatic reboot in %d seconds - "
523                       "press a key on the console to abort\n",
524                    PANIC_REBOOT_WAIT_TIME);
525                for (loop = PANIC_REBOOT_WAIT_TIME * 10;
526                     loop > 0; --loop) {
527                    DELAY(1000 * 100); /* 1/10th second */
528                    /* Did user type a key? */
529                    c = cncheckc();
530                    if (c != -1 && c != NOKEY)
531                        break;
532                }
533                if (!loop)
534                    return;
535            }
536        } else { /* zero time specified - reboot NOW */
537            return;
538        }
539        kprintf("--> Press a key on the console to reboot,\n");
540        kprintf("--> or switch off the system now.\n");
541        cngetc();
542    }
543}
544
/*
 * Last shutdown_final handler: announce the reboot, pause, and reset
 * the cpu.  Neither argument is used.
 */
static void
shutdown_reset(void *junk, int howto)
{
    kprintf("Rebooting...\n");
    /* wait 1 sec for kprintf's to complete and be read */
    DELAY(1000000);
    /* cpu_boot(howto); */ /* doesn't do anything at the moment */
    cpu_reset();
    /* NOTREACHED */ /* assuming reset worked */
}
557
558/*
559 * Try to remove FS references in the specified process.  This function
560 * is used during shutdown
561 */
562static
563void
564shutdown_cleanup_proc(struct proc *p)
565{
566    struct filedesc *fdp;
567    struct vmspace *vm;
568
569    if (p == NULL)
570        return;
571    if ((fdp = p->p_fd) != NULL) {
572        kern_closefrom(0);
573        if (fdp->fd_cdir) {
574            cache_drop(&fdp->fd_ncdir);
575            vrele(fdp->fd_cdir);
576            fdp->fd_cdir = NULL;
577        }
578        if (fdp->fd_rdir) {
579            cache_drop(&fdp->fd_nrdir);
580            vrele(fdp->fd_rdir);
581            fdp->fd_rdir = NULL;
582        }
583        if (fdp->fd_jdir) {
584            cache_drop(&fdp->fd_njdir);
585            vrele(fdp->fd_jdir);
586            fdp->fd_jdir = NULL;
587        }
588    }
589    if (p->p_vkernel)
590        vkernel_exit(p);
591    if (p->p_textvp) {
592        vrele(p->p_textvp);
593        p->p_textvp = NULL;
594    }
595    vm = p->p_vmspace;
596    if (vm != NULL) {
597        pmap_remove_pages(vmspace_pmap(vm),
598                  VM_MIN_USER_ADDRESS,
599                  VM_MAX_USER_ADDRESS);
600        vm_map_remove(&vm->vm_map,
601                  VM_MIN_USER_ADDRESS,
602                  VM_MAX_USER_ADDRESS);
603    }
604}
605
606/*
607 * Magic number for savecore
608 *
609 * exported (symorder) and used at least by savecore(8)
610 *
611 * Mark it as used so that gcc doesn't optimize it away.
612 */
613__attribute__((__used__))
614    static u_long const dumpmag = 0x8fca0101UL;
615
616__attribute__((__used__))
617    static int  dumpsize = 0;       /* also for savecore */
618
619static int  dodump = 1;
620
621SYSCTL_INT(_machdep, OID_AUTO, do_dump, CTLFLAG_RW, &dodump, 0,
622    "Try to perform coredump on kernel panic");
623
624void
625mkdumpheader(struct kerneldumpheader *kdh, char *magic, uint32_t archver,
626    uint64_t dumplen, uint32_t blksz)
627{
628    bzero(kdh, sizeof(*kdh));
629    strncpy(kdh->magic, magic, sizeof(kdh->magic));
630    strncpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture));
631    kdh->version = htod32(KERNELDUMPVERSION);
632    kdh->architectureversion = htod32(archver);
633    kdh->dumplength = htod64(dumplen);
634    kdh->dumptime = htod64(time_second);
635    kdh->blocksize = htod32(blksz);
636    strncpy(kdh->hostname, hostname, sizeof(kdh->hostname));
637    strncpy(kdh->versionstring, version, sizeof(kdh->versionstring));
638    if (panicstr != NULL)
639        strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring));
640    kdh->parity = kerneldump_parity(kdh);
641}
642
643static int
644setdumpdev(cdev_t dev)
645{
646    int error;
647    int doopen;
648
649    if (dev == NULL) {
650        disk_dumpconf(NULL, 0/*off*/);
651        dumpdev = NULL;
652        return (0);
653    }
654
655    /*
656     * We have to open the device before we can perform ioctls on it,
657     * or the slice/label data may not be present.  Device opens are
658     * usually tracked by specfs, but the dump device can be set in
659     * early boot and may not be open so this is somewhat of a hack.
660     */
661    doopen = (dev->si_sysref.refcnt == 1);
662    if (doopen) {
663        error = dev_dopen(dev, FREAD, S_IFCHR, proc0.p_ucred, NULL);
664        if (error)
665            return (error);
666    }
667    error = disk_dumpconf(dev, 1/*on*/);
668    if (error == 0)
669        dumpdev = dev;
670
671    return error;
672}
673
674/* ARGSUSED */
675static void dump_conf (void *dummy);
676static void
677dump_conf(void *dummy)
678{
679    char *path;
680    cdev_t dev;
681    int _dummy;
682
683    path = kmalloc(MNAMELEN, M_TEMP, M_WAITOK);
684    if (TUNABLE_STR_FETCH("dumpdev", path, MNAMELEN) != 0) {
685        /*
686         * Make sure all disk devices created so far have also been
687         * probed, and also make sure that the newly created device
688         * nodes for probed disks are ready, too.
689         *
690         * XXX - Delay an additional 2 seconds to help drivers which
691         *   pickup devices asynchronously and are not caught by
692         *   CAM's initial probe.
693         */
694        sync_devs();
695        tsleep(&_dummy, 0, "syncer", hz*2);
696
697        dev = kgetdiskbyname(path);
698        if (dev != NULL)
699            dumpdev = dev;
700    }
701    kfree(path, M_TEMP);
702    if (setdumpdev(dumpdev) != 0)
703        dumpdev = NULL;
704}
705
706SYSINIT(dump_conf, SI_SUB_DUMP_CONF, SI_ORDER_FIRST, dump_conf, NULL);
707
708static int
709sysctl_kern_dumpdev(SYSCTL_HANDLER_ARGS)
710{
711    int error;
712    udev_t ndumpdev;
713
714    ndumpdev = dev2udev(dumpdev);
715    error = sysctl_handle_opaque(oidp, &ndumpdev, sizeof ndumpdev, req);
716    if (error == 0 && req->newptr != NULL)
717        error = setdumpdev(udev2dev(ndumpdev, 0));
718    return (error);
719}
720
721SYSCTL_PROC(_kern, KERN_DUMPDEV, dumpdev, CTLTYPE_OPAQUE|CTLFLAG_RW,
722    0, sizeof dumpdev, sysctl_kern_dumpdev, "T,udev_t", "");
723
/* Notifier called by panic(); NULL when none is registered. */
static struct panicerinfo *panic_notifier;

/*
 * Register or clear the panic notifier.
 *
 * A NULL info clears the current notifier.  A non-NULL info is installed
 * only when no notifier is registered yet.  Returns 0 on success and 1
 * when a notifier is already registered.
 */
int
set_panic_notifier(struct panicerinfo *info)
{
    /* refuse to replace an existing registration */
    if (info != NULL && panic_notifier != NULL)
        return 1;

    panic_notifier = info;
    return 0;
}
738
739/*
740 * Panic is called on unresolvable fatal errors.  It prints "panic: mesg",
741 * and then reboots.  If we are called twice, then we avoid trying to sync
742 * the disks as this often leads to recursive panics.
743 */
744void
745panic(const char *fmt, ...)
746{
747    int bootopt, newpanic;
748    globaldata_t gd = mycpu;
749    thread_t td = gd->gd_curthread;
750    __va_list ap;
751    static char buf[256];
752
753    /*
754     * If a panic occurs on multiple cpus before the first is able to
755     * halt the other cpus, only one cpu is allowed to take the panic.
756     * Attempt to be verbose about this situation but if the kprintf()
757     * itself panics don't let us overrun the kernel stack.
758     *
759     * Be very nasty about descheduling our thread at the lowest
760     * level possible in an attempt to freeze the thread without
761     * inducing further panics.
762     *
763     * Bumping gd_trap_nesting_level will also bypass assertions in
764     * lwkt_switch() and allow us to switch away even if we are a
765     * FAST interrupt or IPI.
766     *
767     * The setting of panic_cpu_gd also determines how kprintf()
768     * spin-locks itself.  DDB can set panic_cpu_gd as well.
769     */
770    for (;;) {
771        globaldata_t xgd = panic_cpu_gd;
772
773        /*
774         * Someone else got the panic cpu
775         */
776        if (xgd && xgd != gd) {
777            crit_enter();
778            ++mycpu->gd_trap_nesting_level;
779            if (mycpu->gd_trap_nesting_level < 25) {
780                kprintf("SECONDARY PANIC ON CPU %d THREAD %p\n",
781                    mycpu->gd_cpuid, td);
782            }
783            td->td_release = NULL;  /* be a grinch */
784            for (;;) {
785                lwkt_deschedule_self(td);
786                lwkt_switch();
787            }
788            /* NOT REACHED */
789            /* --mycpu->gd_trap_nesting_level */
790            /* crit_exit() */
791        }
792
793        /*
794         * Reentrant panic
795         */
796        if (xgd && xgd == gd)
797            break;
798
799        /*
800         * We got it
801         */
802        if (atomic_cmpset_ptr(&panic_cpu_gd, NULL, gd))
803            break;
804    }
805    /*
806     * Try to get the system into a working state.  Save information
807     * we are about to destroy.
808     */
809    kvcreinitspin();
810    if (panicstr == NULL) {
811        bcopy(td->td_toks_array, panic_tokens, sizeof(panic_tokens));
812        panic_tokens_count = td->td_toks_stop - &td->td_toks_base;
813    }
814    lwkt_relalltokens(td);
815    td->td_toks_stop = &td->td_toks_base;
816    if (gd->gd_spinlocks)
817        kprintf("panic with %d spinlocks held\n", gd->gd_spinlocks);
818    gd->gd_spinlocks = 0;
819
820    /*
821     * Setup
822     */
823    bootopt = RB_AUTOBOOT | RB_DUMP;
824    if (sync_on_panic == 0)
825        bootopt |= RB_NOSYNC;
826    newpanic = 0;
827    if (panicstr) {
828        bootopt |= RB_NOSYNC;
829    } else {
830        panicstr = fmt;
831        newpanic = 1;
832    }
833
834    /*
835     * Format the panic string.
836     */
837    __va_start(ap, fmt);
838    kvsnprintf(buf, sizeof(buf), fmt, ap);
839    if (panicstr == fmt)
840        panicstr = buf;
841    __va_end(ap);
842    if (panic_notifier != NULL)
843        panic_notifier->notifier(panic_notifier->arg);
844    kprintf("panic: %s\n", buf);
845    /* two separate prints in case of an unmapped page and trap */
846    kprintf("cpuid = %d\n", mycpu->gd_cpuid);
847
848#if (NGPIO > 0) && defined(ERROR_LED_ON_PANIC)
849    led_switch("error", 1);
850#endif
851
852#if defined(WDOG_DISABLE_ON_PANIC)
853    wdog_disable();
854#endif
855
856    /*
857     * Make sure kgdb knows who we are, there won't be a stoppcbs[]
858     * entry since our cpu wasn't stopped.
859     */
860    savectx(&dumppcb);
861    dumpthread = curthread;
862
863    /*
864     * Enter the debugger or fall through & dump.  Entering the
865     * debugger will stop cpus.  If not entering the debugger stop
866     * cpus here.
867     *
868     * Limit the trace history to leave more panic data on a
869     * potentially row-limited console.
870     */
871
872#if defined(DDB)
873    if (newpanic && trace_on_panic)
874        print_backtrace(6);
875    if (debugger_on_panic)
876        Debugger("panic");
877    else
878#endif
879    if (newpanic)
880        stop_cpus(mycpu->gd_other_cpus);
881    boot(bootopt);
882}
883
884/*
885 * Support for poweroff delay.
886 */
887#ifndef POWEROFF_DELAY
888# define POWEROFF_DELAY 5000
889#endif
890static int poweroff_delay = POWEROFF_DELAY;
891
892SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW,
893    &poweroff_delay, 0, "");
894
895static void
896poweroff_wait(void *junk, int howto)
897{
898    if(!(howto & RB_POWEROFF) || poweroff_delay <= 0)
899        return;
900    DELAY(poweroff_delay * 1000);
901}
902
903/*
904 * Some system processes (e.g. syncer) need to be stopped at appropriate
905 * points in their main loops prior to a system shutdown, so that they
906 * won't interfere with the shutdown process (e.g. by holding a disk buf
907 * to cause sync to fail).  For each of these system processes, register
908 * shutdown_kproc() as a handler for one of shutdown events.
909 */
910static int kproc_shutdown_wait = 60;
911SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW,
912    &kproc_shutdown_wait, 0, "");
913
914void
915shutdown_kproc(void *arg, int howto)
916{
917    struct thread *td;
918    struct proc *p;
919    int error;
920
921    if (panicstr)
922        return;
923
924    td = (struct thread *)arg;
925    if ((p = td->td_proc) != NULL) {
926        kprintf("Waiting (max %d seconds) for system process `%s' to stop...",
927        kproc_shutdown_wait, p->p_comm);
928    } else {
929        kprintf("Waiting (max %d seconds) for system thread %s to stop...",
930        kproc_shutdown_wait, td->td_comm);
931    }
932    error = suspend_kproc(td, kproc_shutdown_wait * hz);
933
934    if (error == EWOULDBLOCK)
935        kprintf("timed out\n");
936    else
937        kprintf("stopped\n");
938}
939
940/* Registration of dumpers */
941int
942set_dumper(struct dumperinfo *di)
943{
944    if (di == NULL) {
945        bzero(&dumper, sizeof(dumper));
946        return 0;
947    }
948
949    if (dumper.dumper != NULL)
950        return (EBUSY);
951
952    dumper = *di;
953    return 0;
954}
955
956void
957dumpsys(void)
958{
959#if defined (_KERNEL_VIRTUAL)
960    /* vkernels don't support dumps */
961    kprintf("vkernels don't support dumps\n");
962    return;
963#endif
964    /*
965     * If there is a dumper registered and we aren't dumping already, call
966     * the machine dependent dumpsys (md_dumpsys) to do the hard work.
967     *
968     * XXX: while right now the md_dumpsys() of x86 and x86_64 could be
969     *      factored out completely into here, I rather keep them machine
970     *      dependent in case we ever add a platform which does not share
971     *      the same dumpsys() code, such as arm.
972     */
973    if (dumper.dumper != NULL && !dumping) {
974        dumping++;
975        md_dumpsys(&dumper);
976    }
977}
978
979int dump_stop_usertds = 0;
980
981static
982void
983need_user_resched_remote(void *dummy)
984{
985    need_user_resched();
986}
987
988void
989dump_reactivate_cpus(void)
990{
991    globaldata_t gd;
992    int cpu, seq;
993
994    dump_stop_usertds = 1;
995
996    need_user_resched();
997
998    for (cpu = 0; cpu < ncpus; cpu++) {
999        gd = globaldata_find(cpu);
1000        seq = lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
1001        lwkt_wait_ipiq(gd, seq);
1002    }
1003
1004    restart_cpus(stopped_cpus);
1005}
1006