High sys CPU and then no internet

Hi,

Today something weird happened on my main router (x86).

The CPU spiked, mainly sys (~35%) and some usr (~12%). Softirq was minimal (~1%), and it continued like that for a while until the router became unresponsive. No internet after that. I had to reboot it to get it back. (see screenshots).

The network traffic was very low at the time (confirmed by the low softirq CPU). The disk is very far from being full (36% utilization). This has never happened before.

The other thing that jumped was the memory slab utilization (see screenshots).

Unfortunately, the system logs were not being persisted at the time. They are now.

It looks as if something was triggered, perhaps related to the file system?

Thank you.

/dev/root on / type ext4 (rw,noatime)
proc on /proc type proc (rw,nosuid,nodev,noexec,noatime)
sysfs on /sys type sysfs (rw,nosuid,nodev,noexec,noatime)
cgroup2 on /sys/fs/cgroup type cgroup2 (rw,nosuid,nodev,noexec,relatime,nsdelegate)
tmpfs on /tmp type tmpfs (rw,nosuid,nodev,noatime)
/dev/sda1 on /boot type vfat (rw,noatime,fmask=0022,dmask=0022,codepage=437,iocharset=iso8859-1,shortname=mixed,errors=remount-ro)
/dev/sda1 on /boot type vfat (rw,noatime,fmask=0022,dmask=0022,codepage=437,iocharset=iso8859-1,shortname=mixed,errors=remount-ro)
tmpfs on /dev type tmpfs (rw,nosuid,noexec,noatime,size=512k,mode=755)
devpts on /dev/pts type devpts (rw,nosuid,noexec,noatime,mode=600,ptmxmode=000)
debugfs on /sys/kernel/debug type debugfs (rw,noatime)
none on /sys/fs/bpf type bpf (rw,nosuid,nodev,noexec,noatime,mode=700)



I forgot to mention that the overall memory utilization during that period was very low, so it wasn't as if the kernel was working hard to free memory.

Let’s start by looking at your config. Don’t forget to redact sensitive information.

ubus call system board 
cat /etc/config/network
cat /etc/config/firewall
cat /etc/config/dhcp
cat /etc/config/wireless

ubus call system board

{
	"kernel": "5.10.176",
	"hostname": "openwrt",
	"system": "Intel(R) Pentium(R) CPU  J3710  @ 1.60GHz",
	"model": "Protectli FW4C",
	"board_name": "protectli-fw4c",
	"rootfs_type": "ext4",
	"release": {
		"distribution": "OpenWrt",
		"version": "22.03.5",
		"revision": "r20134-5f15225c1e",
		"target": "x86/64",
		"description": "OpenWrt 22.03.5 r20134-5f15225c1e"
	}
}

cat /etc/config/network

config interface 'loopback'
	option device 'lo'
	option proto 'static'
	option ipaddr '127.0.0.1'
	option netmask '255.0.0.0'

config globals 'globals'
	option ula_prefix 'fdbb:0da2:05d7::/48'

config device
	option name 'br-lan'
	option type 'bridge'
	list ports 'eth1'
	list ports 'eth2'
	list ports 'eth3'

config interface 'lan'
	option device 'br-lan'
	option proto 'static'
	option ipaddr '192.168.210.1'
	option netmask '255.255.255.0'
	option ip6assign '60'

config interface 'wan'
	option device 'eth0'
	option proto 'dhcp'

config interface 'wan6'
	option device 'eth0'
	option proto 'dhcpv6'
	option reqaddress 'try'
	option reqprefix 'auto'

cat /etc/config/firewall

config defaults
	option syn_flood	1
	option input		ACCEPT
	option output		ACCEPT
	option forward		REJECT
# Uncomment this line to disable ipv6 rules
#	option disable_ipv6	1

config zone
	option name		lan
	list   network		'lan'
	option input		ACCEPT
	option output		ACCEPT
	option forward		ACCEPT

config zone
	option name		wan
	list   network		'wan'
	list   network		'wan6'
	option input		REJECT
	option output		ACCEPT
	option forward		REJECT
	option masq		1
	option mtu_fix		1

config forwarding
	option src		lan
	option dest		wan

# We need to accept udp packets on port 68,
# see https://dev.openwrt.org/ticket/4108
config rule
	option name		Allow-DHCP-Renew
	option src		wan
	option proto		udp
	option dest_port	68
	option target		ACCEPT
	option family		ipv4

# Allow IPv4 ping
config rule
	option name		Allow-Ping
	option src		wan
	option proto		icmp
	option icmp_type	echo-request
	option family		ipv4
	option target		ACCEPT

config rule
	option name		Allow-IGMP
	option src		wan
	option proto		igmp
	option family		ipv4
	option target		ACCEPT

# Allow DHCPv6 replies
# see https://github.com/openwrt/openwrt/issues/5066
config rule
	option name		Allow-DHCPv6
	option src		wan
	option proto		udp
	option dest_port	546
	option family		ipv6
	option target		ACCEPT

config rule
	option name		Allow-MLD
	option src		wan
	option proto		icmp
	option src_ip		fe80::/10
	list icmp_type		'130/0'
	list icmp_type		'131/0'
	list icmp_type		'132/0'
	list icmp_type		'143/0'
	option family		ipv6
	option target		ACCEPT

# Allow essential incoming IPv6 ICMP traffic
config rule
	option name		Allow-ICMPv6-Input
	option src		wan
	option proto	icmp
	list icmp_type		echo-request
	list icmp_type		echo-reply
	list icmp_type		destination-unreachable
	list icmp_type		packet-too-big
	list icmp_type		time-exceeded
	list icmp_type		bad-header
	list icmp_type		unknown-header-type
	list icmp_type		router-solicitation
	list icmp_type		neighbour-solicitation
	list icmp_type		router-advertisement
	list icmp_type		neighbour-advertisement
	option limit		1000/sec
	option family		ipv6
	option target		ACCEPT

# Allow essential forwarded IPv6 ICMP traffic
config rule
	option name		Allow-ICMPv6-Forward
	option src		wan
	option dest		*
	option proto		icmp
	list icmp_type		echo-request
	list icmp_type		echo-reply
	list icmp_type		destination-unreachable
	list icmp_type		packet-too-big
	list icmp_type		time-exceeded
	list icmp_type		bad-header
	list icmp_type		unknown-header-type
	option limit		1000/sec
	option family		ipv6
	option target		ACCEPT

config rule
	option name		Allow-IPSec-ESP
	option src		wan
	option dest		lan
	option proto		esp
	option target		ACCEPT

config rule
	option name		Allow-ISAKMP
	option src		wan
	option dest		lan
	option dest_port	500
	option proto		udp
	option target		ACCEPT

cat /etc/config/dhcp

config dnsmasq
	option domainneeded '1'
	option localise_queries '1'
	option rebind_protection '1'
	option rebind_localhost '1'
	option local '/lan/'
	option domain 'lan'
	option expandhosts '1'
	option authoritative '1'
	option readethers '1'
	option leasefile '/tmp/dhcp.leases'
	option resolvfile '/tmp/resolv.conf.d/resolv.conf.auto'
	option localservice '1'
	option ednspacket_max '1232'

config dhcp 'lan'
	option interface 'lan'
	option start '100'
	option limit '150'
	option leasetime '12h'
	option dhcpv4 'server'
	option dhcpv6 'server'
	option ra 'server'
	option ra_slaac '1'
	list ra_flags 'managed-config'
	list ra_flags 'other-config'

config dhcp 'wan'
	option interface 'wan'
	option ignore '1'

config odhcpd 'odhcpd'
	option maindhcp '0'
	option leasefile '/tmp/hosts/odhcpd'
	option leasetrigger '/usr/sbin/odhcpd-update'
	option loglevel '4'

cat /etc/config/wireless

I have no wireless file, or device on this router.

Also, my crontab:

cat /etc/crontabs/root

# ping
*/10 * * * * curl -fsS -m 10 --retry 5 -o /dev/null https://hc-ping.com/REDACTED
# temp check
*/01 * * * * /usr/bin/tempcheck.sh
58 23 * * * logrotate /etc/logrotate.conf

All tempcheck.sh does is run sensors, check the temperature, and send a notification when it is too hot:

cat /usr/bin/tempcheck.sh

#!/bin/sh

if ! sensors -u | awk -F: 'BEGIN {max=0} /_input/{if ($2 > max) max=$2} END{if (max < 70) print "ok"}' | grep -q -e 'ok'; then
  curl -fsS -m 10 --retry 5 -o /dev/null -d "Router temp is above **70 C**" -H "Tags: warning" -H "Email: REDACTED" ntfy.sh/REDACTED
fi

Everything here looks normal, except maybe your tempcheck -- what happens if you remove that?

tempcheck has been running every minute for 23 days without issues. This is the first time this CPU spike has happened. I have metrics for the last 15 days, and nothing even remotely similar shows up in them. It has not happened again since I rebooted the router. It is not easy to reproduce.

The sensors command, which is what tempcheck uses, only reads data from /sys/class/hwmon and /sys/class/i2c-adapter. It is possible that, under some circumstances, sensors gets into a tight loop. If that's the case, I'm just going to keep an eye on it to see if it happens again. Now that logs are being persisted, I hope I can catch it. I also added some logger lines to tempcheck to help with that.

So the problem happened again a few weeks later. I then removed tempcheck from the crontab, and the problem has not happened again since. So it looks like, under certain circumstances, the sensors command gets into some sort of tight loop scanning the sysfs files.

This topic was automatically closed 10 days after the last reply. New replies are no longer allowed.