Sysupgrade causes kernel panic

Itus Network Shield, not officially supported, Octeon3 SoC, built from master r13676-9858a8c582. I connect via USB console roll-over cable.

Working through the sysupgrade system, when i attempt to use it, it gets to the point it needs to switch to the ramdisk and then just dumps...

root@Shield:/tmp# sysupgrade -v openwrt-octeon-itus-itusrouter-ext4-sysupgrade.tar.gz 
Image metadata not found
sysupgrade-itus/
sysupgrade-itus/CONTROL
sysupgrade-itus/kernel
sysupgrade-itus/root
Saving config files...
etc/config/dhcp
etc/config/dhcp-opkg
etc/config/dropbear
etc/config/firewall
etc/config/luci
etc/config/network
etc/config/rpcd
etc/config/system
etc/config/ucitrack
etc/config/uhttpd
etc/config/uhttpd-opkg
etc/dropbear/dropbear_rsa_host_key
etc/group
etc/hosts
etc/inittab
etc/luci-uploads/.placeholder
etc/opkg/keys/0b26f36ae0f4106d
etc/opkg/keys/1035ac73cc4e59e3
etc/opkg/keys/5151f69420c3f508
etc/opkg/keys/72a57f2191b211e0
etc/opkg/keys/792d9d9b39f180dc
etc/opkg/keys/9ef4694208102c43
etc/opkg/keys/b2d571e0880ff617
etc/opkg/keys/b5043e70f9a75cde
etc/opkg/keys/c10b9afab19ee428
etc/opkg/keys/caafb567d784e639
etc/opkg/keys/dace9d4df16896bf
etc/opkg/keys/dd6de0d06bbd3d85
etc/opkg/keys/f94b9dd6febac963
etc/passwd
etc/profile
etc/rc.local
etc/shadow
etc/shells
etc/shinit
etc/sysctl.conf
etc/uhttpd.crt
etc/uhttpd.key
etc/uhttpd.key
etc/uhttpd.crt
Commencing upgrade. Closing all shell sessions.

*** NMI Watchdog interrupt on Core 0x00 ***
        $0      0x0000000000000000      at      0x0000000014109ce1
        v0      0xffffffff81990000      v1      0x00000000001cdf04
        a0      0x00000000001cdf02      a1      0xffffffff8181cd78
        a2      0x0000000014109ce0      a3      0xffffffffffff00fe
        a4      0x116d7caf85800000      a5      0x0000000000000000
        a6      0x000000000d20fe00      a7      0x0000000000000000
        t0      0x0000000000000000      t1      0x0000000000009c00
        t2      0xffffffff81265458      t3      0x0000000000000000
        s0      0xffffffff8180d118      s1      0x0000000000000001
        s2      0xffffffff81810000      s3      0xffffffff8177c720
        s4      0xffffffff81820000      s5      0x0000000000000001
        s6      0x000000004f001160      s7      0xffffffff81691f00
        t8      0x0000000000000000      t9      0xffffffff81697308
        k0      0x0000000000000000      k1      0xffffffff81990000
        gp      0xffffffff817f8000      sp      0xffffffff817fbda0
        s8      0x0000000001100000      ra      0xffffffff8116d5b0
        err_epc 0xffffffff8111b3c0      epc     0xffffffff8111b3c0
        status  0x0000000014589ce5      cause   0x0000000000800000
        sum0    0x0080000000000000      en0     0x00a0000500008000
*** Chip soft reset soon ***

*** NMI Watchdog interrupt on Core 0x01 ***
        $0      0x0000000000000000      at      0x0000000014109ce1
        v0      0x0000000000000000      v1      0x000000002c45bcdb
        a0      0x000000fff7b545a0      a1      0x0000000121b5d77b
        a2      0x0000000000008000      a3      0x0000000121b64040
        a4      0x000000fff7b547a0      a5      0x0000000000000000
        a6      0x0000000000000000      a7      0x0000000000000000
        t0      0x0000000000000000      t1      0xffffffff84080018
        t2      0xffffffff81265528      t3      0x0000000000000000
        s0      0x0000000000000001      s1      0x000000012002e454
        s2      0x000000012002e304      s3      0x0000000000000000
        s4      0x000000012002c494      s5      0x0000000000000000
        s6      0x000000fffbcbb5f0      s7      0x000000012002f040
        t8      0x0000000000000002      t9      0x000000012002c50c
        k0      0x0000000000000008      k1      0xffffffff81990008
        gp      0x0000000120097020      sp      0x000000fffbcbb520
        s8      0x0000000000000018      ra      0x000000012002f328
        err_epc 0x000000012002c540      epc     0x000000012002e384
        status  0x0000000004589cf5      cause   0x0000000000800000
        sum0    0x0080000000000000      en0     0x0000000100000000
*** Chip soft reset soon ***

OCTEON eMMC stage 1 bootloader

This is my altered /lib/upgrade/platform.sh

root@Shield:/# cat /lib/upgrade/platform.sh 
#
# Copyright (C) 2014 OpenWrt.org
#

platform_get_rootfs() {
        local rootfsdev

        if read cmdline < /proc/cmdline; then
                case "$cmdline" in
                        *block2mtd=*)
                                rootfsdev="${cmdline##*block2mtd=}"
                                rootfsdev="${rootfsdev%%,*}"
                        ;;
                        *root=*)
                                rootfsdev="${cmdline##*root=}"
                                rootfsdev="${rootfsdev%% *}"
                        ;;
                esac

                echo "${rootfsdev}"
        fi
}

platform_copy_config() {
        case "$(board_name)" in
        erlite)
                mount -t vfat /dev/sda1 /mnt
                cp -af "$UPGRADE_BACKUP" "/mnt/$BACKUP_FILE"
                umount /mnt
                ;;
        itus*)
                mount -t vfat /dev/mmcblk1p1 /mnt
                cp -af "$UPGRADE_BACKUP" "/mnt/$BACKUP_FILE"
                umount /mnt
                ;;
        esac
}

platform_do_flash() {
        local tar_file=$1
        local board=$2
        local kernel=$3
        local rootfs=$4

        mkdir -p /boot
        mount -t vfat /dev/$kernel /boot

        [ -f /boot/vmlinux.64 -a ! -L /boot/vmlinux.64 ] && {
                mv /boot/vmlinux.64 /boot/vmlinux.64.previous
                mv /boot/vmlinux.64.md5 /boot/vmlinux.64.md5.previous

                echo "flashing kernel to /dev/$kernel"
                md5sum /boot/vmlinux.64 | cut -f1 -d " " > /boot/vmlinux.64.md5
                echo "flashing rootfs to ${rootfs}"
        }

        case "$board" in
        er | erlite)
           tar xf $tar_file sysupgrade-$board/kernel -O > /boot/vmlinux.64
           tar xf $tar_file sysupgrade-$board/root -O | dd of="${rootfs}" bs=4096
                ;;
        itus*)
           tar xvzf $tar_file -C /tmp
           echo "Moving kernel image..."
           cp -v /tmp/sysupgrade-itus${kernel}/kernel /boot/Itus${kernel}Image
           echo "Flashing rootfs..."
           dd if=/tmp/sysupgrade-itus${kernel}/root of=/dev/${rootfs} bs=512
                ;;
        esac

        sync
        umount /boot
}

platform_do_upgrade() {
        local tar_file="$1"
        local board=$(board_name)
        local rootfs="$(platform_get_rootfs)"
        local kernel=

        [ -b "${rootfs}" ] || return 1
        case "$board" in
        er)
                kernel=mmcblk0p1
                ;;
        erlite)
                kernel=sda1
                ;;
        itusrouter)
                kernel=router
                ;;
        itusbridge)
                kernel=bridge
                ;;
        *)
                return 1
        esac

        platform_do_flash $tar_file $board $kernel $rootfs

        return 0
}

platform_check_image() {
        local board=$(board_name)

        case "$board" in
        er | erlite)
                local tar_file="$1"
                local kernel_length=$(tar xf $tar_file sysupgrade-$board/kernel -O | wc -c 2> /dev/null)
                local rootfs_length=$(tar xf $tar_file sysupgrade-$board/root -O | wc -c 2> /dev/null)
                [ "$kernel_length" = 0 -o "$rootfs_length" = 0 ] && {
                        echo "The upgrade image is corrupt."
                        return 1
                }
                return 0
        ;;
        itus*)
                local tar_file="$1"
                # Remove any existing, just in case...
                tar xvfz $tar_file -C /tmp
                if [ ! -f /tmp/sysupgrade-itus${kernel}/kernel ] || [ ! -f /tmp/sysupgrade-itus${kernel}/root ]
                then
                        echo "The upgrade image is corrupt."
                        return 1
                fi
                return 0
        ;;
        esac

        echo "Sysupgrade is not yet supported on $board."
        return 1
}

Is there anything I can do to figure out WHY this is happening?

Appreciate any help!

To me it seems as if a process is terminated that is supposed to keep poking the watchdog. Is there any userspace watchdog process?

1 Like
  183 root         0 SW   [watchdogd]

This is the only process I can see. Could it be a configuration issue in my .config?

Process names in brackets usually indicate kernel threads so it is unlikely that it got killed. I now also noticed that I confused NMI watchdogs with the regular watchdog machinery.

Basically the NMI watchdog fires when the kernel sees no device or timer interrupts for 5 seconds or so. Either the sysupgrade process is somehow freezing the kernel (IO issues, kernel oops?) which causes this reset or maybe the reboot at the end of the image write process is not working, sending the CPU into a busy loop which eventually causes the NMI interrupt to occur.

How soon do you see the *** NMI Watchdog interrupt on Core 0x00 *** message after Commencing upgrade. Closing all shell sessions.. Is it happening immediately or a few seconds later?

It is happening after a few seconds.. In a previous build, I've seen it switch to the ramdisk, but I corrupted the source and wiped and started over, which is why I was wondering about a config setting.

Is there a minimum free RAM it's looking for maybe?

I know that dd takes quite a few seconds to flash when I get it working

Well it needs enough memory to copy a minimal userland to the ramdisk (mainly busybox, required libraries and a bunch of shell scripts) and to hold the image to be written in RAM as well.

If Linux is starved on available RAM and there's no swap available to page out, it is quite likely that the kernel is "freezing" trying to reclaim memory which then eventually triggers the NMI watchdog because the system is not progressing for 5s or longer.

Ok.. I'm going to reduce the rootfs size and try again.

The partition is 850MB, and I had set it to 400MB rootfs because I figured it had to explode it in /tmp before the dd, but I'll drop it back to 104MB and try again. The device has a 1GB of RAM, so I figured I'd have space.

You called it @jow .. Is there a way to increase the timer on the watchdog to allow for the longer time needed to flash?

etc/shells
etc/shinit
etc/sysctl.conf
etc/uhttpd.crt
etc/uhttpd.key
etc/uhttpd.key
etc/uhttpd.crt
Commencing upgrade. Closing all shell sessions.
Watchdog handover: fd=3
- watchdog -
killall: telnetd: no process killed
Sending TERM to remaining processes ... uhttpd ntpd udhcpc odhcp6c dnsmasq ubusd urngd logd rpcd netifd odhcpd 
Sending KILL to remaining processes ... 
Switching to ramdisk...
[ 4058.709065] EXT4-fs (mmcblk1p2): re-mounted. Opts: (null)
Performing system upgrade...
Upgrade completed
Rebooting system...
umount: can't unmount /dev: Resource busy
umount: can't unmount /tmp: Resource busy
[ 4062.444114] reboot: Restarting system

OCTEON eMMC stage 1 bootloader

You could try increasing the kernel.watchdog_thresh sysctl.

I don't seem to have a kernel.watchdog_thresh entry, either in sysctl or /proc/sys/kernel

root@Shield:/proc/sys/kernel# sysctl -a | grep "kernel.*"
kernel.auto_msgmni = 0
kernel.cad_pid = 1
kernel.cap_last_cap = 37
kernel.core_pattern = /tmp/%e.%t.%p.%s.core
kernel.core_pipe_limit = 0
kernel.core_uses_pid = 0
kernel.ctrl-alt-del = 0
kernel.dmesg_restrict = 1
kernel.domainname = (none)
kernel.firmware_config.force_sysfs_fallback = 1
kernel.firmware_config.ignore_sysfs_fallback = 0
kernel.hostname = Shield
kernel.hotplug = /sbin/hotplug
kernel.kptr_restrict = 0
kernel.max_lock_depth = 1024
kernel.modprobe = /sbin/modprobe
kernel.modules_disabled = 0
kernel.msgmax = 8192
kernel.msgmnb = 16384
kernel.msgmni = 32000
kernel.ngroups_max = 65536
kernel.osrelease = 4.19.123
kernel.ostype = Linux
kernel.overflowgid = 65534
kernel.overflowuid = 65534
kernel.panic = 3
kernel.panic_on_oops = 1
kernel.panic_on_rcu_stall = 0
kernel.panic_on_warn = 0
kernel.pid_max = 32768
kernel.poweroff_cmd = /sbin/poweroff
kernel.print-fatal-signals = 0
kernel.printk = 7       4       1       7
kernel.printk_delay = 0
kernel.printk_devkmsg = ratelimit
kernel.printk_ratelimit = 5
kernel.printk_ratelimit_burst = 10
kernel.pty.max = 4096
kernel.pty.nr = 0
kernel.pty.reserve = 1024
kernel.random.boot_id = ffa0cd6d-55a8-4c6f-8490-eb2a77e5db97
kernel.random.entropy_avail = 3413
kernel.random.poolsize = 4096
kernel.random.read_wakeup_threshold = 64
kernel.random.urandom_min_reseed_secs = 60
kernel.random.uuid = 60bb612e-0979-45d6-b34b-e2409012a7df
kernel.random.write_wakeup_threshold = 896
kernel.randomize_va_space = 2
kernel.real-root-dev = 0
kernel.sched_child_runs_first = 0
kernel.sched_rr_timeslice_ms = 100
kernel.sched_rt_period_us = 1000000
kernel.sched_rt_runtime_us = 950000
kernel.seccomp.actions_avail = kill_process kill_thread trap errno trace log allow
kernel.seccomp.actions_logged = kill_process kill_thread trap errno trace log
kernel.sem = 32000      1024000000      500     32000
kernel.shm_rmid_forced = 0
kernel.shmall = 18446744073692774399
kernel.shmmax = 18446744073692774399
kernel.shmmni = 4096
kernel.sysctl_writes_strict = 1
kernel.sysrq = 1
kernel.tainted = 0
kernel.threads-max = 7520
kernel.unprivileged_bpf_disabled = 0
kernel.usermodehelper.bset = 4294967295 63
kernel.usermodehelper.inheritable = 4294967295  63
kernel.version = #0 SMP Thu Jul 2 20:28:06 2020
root@Shield:/proc/sys/kernel# ls /proc/sys/kernel
auto_msgmni                poweroff_cmd
cad_pid                    print-fatal-signals
cap_last_cap               printk
core_pattern               printk_delay
core_pipe_limit            printk_devkmsg
core_uses_pid              printk_ratelimit
ctrl-alt-del               printk_ratelimit_burst
dmesg_restrict             pty
domainname                 random
firmware_config            randomize_va_space
hostname                   real-root-dev
hotplug                    sched_child_runs_first
kptr_restrict              sched_rr_timeslice_ms
max_lock_depth             sched_rt_period_us
modprobe                   sched_rt_runtime_us
modules_disabled           seccomp
msgmax                     sem
msgmnb                     shm_rmid_forced
msgmni                     shmall
ngroups_max                shmmax
osrelease                  shmmni
ostype                     sysctl_writes_strict
overflowgid                sysrq
overflowuid                tainted
panic                      threads-max
panic_on_oops              unprivileged_bpf_disabled
panic_on_rcu_stall         usermodehelper
panic_on_warn              version
pid_max

Then I can only suggest to google for "increase NMI watchdog timeout" or similar. I could imagine that the default timeout is configurable somehow.

1 Like

You've given me a direction to look into.. Many thanks, sir!

might be a good idea to not extract the img twice beforehand... while your there increase bs=512... these are not helping you.

Ok.. I had read that MMC/SD has a 512byte block size, so that is what I set it to. If I leave it off entirely, will it default to the partition sector size?

@anon50098793 does have a point here. You can certainly optimize your flash procedure by piping the tar output directly to dd, this way you don't need to hold the entire uncompressed image contents in RAM.

Something like

echo "Writing kernel image..."
tar Oxvzf $tar_file sysupgrade-itus${kernel}/kernel | dd of=/boot/Itus${kernel}Image
echo "Flashing rootfs..."
tar Oxvzf $tar_file sysupgrade-itus${kernel}/root | dd of=/dev/${rootfs}

This topic was automatically closed 10 days after the last reply. New replies are no longer allowed.