Ramips: mt7620: random generator crash

Routers based on the ramips/mt7620 architecture crash very often:

<1>[  719.445128] CPU 0 Unable to handle kernel paging request at virtual address 70360604, epc == 70360604, ra == 80004294
<4>[  719.456105] Oops[#1]:
<4>[  719.458432] CPU: 0 PID: 18153 Comm: grep Tainted: G           O       6.6.85 #0
<4>[  719.465901] $ 0   : 00000000 00000001 00000002 fffff000
<4>[  719.471265] $ 4   : 7f8d1000 7f8d1000 00000001 85c4482c
<4>[  719.476619] $ 8   : 00000010 80620d0c 00380023 3893b685
<4>[  719.481973] $12   : 80a4b3e0 8478d00c 00000001 ffffff00
<4>[  719.487327] $16   : 7f8d1000 00000001 85c11bd0 85b81400
<4>[  719.492683] $20   : 85b63960 85b7705c 85b495a0 00000001
<4>[  719.498037] $24   : 00000000 ffffffff                  
<4>[  719.503390] $28   : 84784000 84785d20 84785de8 80004294
<4>[  719.508746] Hi    : 554a9555
<4>[  719.511684] Lo    : 7ffaa000
<4>[  719.514620] epc   : 70360604 0x70360604
<4>[  719.518542] ra    : 80004294 arch_align_stack+0x50/0x70
<4>[  719.523897] Status: 1100b403	KERNEL EXL IE 
<4>[  719.528182] Cause : 10800008 (ExcCode 02)
<4>[  719.532273] BadVA : 70360604
<4>[  719.535209] PrId  : 00019650 (MIPS 24KEc)
<4>[  719.539299] Modules linked in: ksmbd rt2800soc(O) rt2800mmio(O) rt2800lib(O) qcserial pppoe ppp_async option cdc_mbim wireguard usb_wwan sierra_net sierra rt2x00soc(O) rt2x00mmio(O) rt2x00lib(O) rndis_host qmi_wwan pptp pppox ppp_mppe ppp_generic nft_fib_inet mt76x2e(O) mt76x2_common(O) mt76x02_lib(O) mt76(O) mac80211(O) libchacha20poly1305 ipt_REJECT huawei_cdc_ncm cfg80211(O) cdc_ncm cdc_ether xt_time xt_tcpudp xt_tcpmss xt_statistic xt_state xt_recent xt_nat xt_multiport xt_mark xt_mac xt_limit xt_length xt_hl xt_helper xt_ecn xt_dscp xt_conntrack xt_connmark xt_connlimit xt_connbytes xt_comment xt_TCPMSS xt_REDIRECT xt_MASQUERADE xt_LOG xt_HL xt_DSCP xt_CLASSIFY usbserial usbnet usblp ums_usbat ums_sddr55 ums_sddr09 ums_karma ums_jumpshot ums_isd200 ums_freecom ums_datafab ums_cypress ums_alauda ts_fsm ts_bm tcp_scalable tcp_bbr slhc sch_cake r8152 poly1305_mips nft_tproxy nft_socket nft_reject_ipv6 nft_reject_ipv4 nft_reject_inet nft_reject_bridge nft_reject nft_redir nft_quota nft_queue nft_numgen nft_nat
<4>[  719.539973]  nft_meta_bridge nft_masq nft_log nft_limit nft_hash nft_fwd_netdev nft_fib_ipv6 nft_fib_ipv4 nft_fib nft_dup_netdev nft_ct nft_compat nft_chain_nat nfnetlink_queue nf_tproxy_ipv6 nf_tproxy_ipv4 nf_tables nf_socket_ipv6 nf_socket_ipv4 nf_reject_ipv4 nf_nat_tftp nf_nat_snmp_basic nf_nat_sip nf_nat_pptp nf_nat_irc nf_nat_h323 nf_nat_ftp nf_nat_amanda nf_log_syslog nf_dup_netdev nf_conntrack_tftp nf_conntrack_snmp nf_conntrack_sip nf_conntrack_sane nf_conntrack_pptp nf_conntrack_netbios_ns nf_conntrack_irc nf_conntrack_h323 nf_conntrack_ftp nf_conntrack_broadcast nf_conntrack_bridge ts_kmp nf_conntrack_amanda nf_conncount macvlan libcurve25519_generic libcrc32c ipvlan iptable_nat iptable_mangle iptable_filter ipt_ECN ipheth ip_tables crc_ccitt compat(O) chacha_mips cdc_wdm br_netfilter natflow(O) natcap(O) nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 fuse ntfs3 sch_tbf sch_ingress sch_htb sch_hfsc em_u32 cls_u32 cls_route cls_matchall cls_fw cls_flow cls_basic act_skbedit act_mirred act_gact xt_set
<4>[  719.631894]  ip_set_list_set ip_set_hash_netportnet ip_set_hash_netport ip_set_hash_netnet ip_set_hash_netiface ip_set_hash_net ip_set_hash_mac ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_hash_ipport ip_set_hash_ipmark ip_set_hash_ipmac ip_set_hash_ip ip_set_bitmap_port ip_set_bitmap_ipmac ip_set_bitmap_ip ip_set nfnetlink ip6table_mangle ip6table_filter ip6_tables ip6t_REJECT x_tables nf_reject_ipv6 nfsv4 nls_ucs2_utils cifs_arc4 asn1_decoder nfs msdos ip6_gre ip_gre gre ip6_udp_tunnel udp_tunnel sit ip6_tunnel oid_registry tunnel6 tunnel4 ip_tunnel tun lockd sunrpc grace autofs4 dns_resolver nls_utf8 nls_iso8859_1 nls_cp437 crypto_user algif_skcipher algif_rng algif_hash algif_aead af_alg sha512_generic sha256_generic libsha256 sha1_generic seqiv sha3_generic jitterentropy_rng drbg md5 kpp hmac geniv rng ecb des_generic libdes cmac arc4 uas usb_storage leds_gpio ohci_platform ohci_hcd fsl_mph_dr_of ehci_platform ehci_fsl sd_mod scsi_mod scsi_common ehci_hcd gpio_button_hotplug(O) vfat fat ext4 mbcache jbd2
<4>[  719.723964]  exfat usbcore nls_base usb_common mii crc32c_generic [last unloaded: ksmbd]
<4>[  719.823857] Process grep (pid: 18153, threadinfo=b72927ea, task=8d9a7b5e, tls=77ed9df4)
<4>[  719.832037] Stack : 00001601 85b63000 00005b6f 80a4b4a0 85b77000 8016f8fc 85b63960 85b63000
<4>[  719.840609]         807a5280 00000001 84785de8 8014adc0 00000000 00000000 00000000 00000000
<4>[  719.849173]         00000000 00000000 00000000 00000000 00000001 85c44500 00000000 85c44500
<4>[  719.857739]         00000001 80030918 85b77000 85c44500 00000000 f0831e6f ffffffff 85b63960
<4>[  719.866307]         85b77054 8011e324 85a0a000 859f5400 00000100 801cd6c0 00000034 e98df62d
<4>[  719.874875]         ...
<4>[  719.877377] Call Trace:
<4>[  719.877382] 
<4>[  719.881722] [<8016f8fc>] setup_arg_pages+0x48/0x2cc
<4>[  719.887211] [<8014adc0>] free_unref_page+0x40/0x118
<4>[  719.892534] [<80030918>] flush_itimer_signals+0x34/0x5c
<4>[  719.898531] [<8011e324>] arch_pick_mmap_layout+0x1a4/0x1c4
<4>[  719.904464] [<801cd6c0>] load_elf_phdrs+0x78/0xcc
<4>[  719.910113] [<801cdb8c>] load_elf_binary+0x2e4/0x1540
<4>[  719.915455] [<80166b90>] __kernel_read+0x164/0x2ac
<4>[  719.921355] [<801705f0>] bprm_execve+0x1ec/0x578
<4>[  719.926081] [<80170bfc>] copy_string_kernel+0x104/0x240
<4>[  719.931757] [<80171410>] do_execveat_common+0x1b4/0x240
<4>[  719.937268] [<8000fa8c>] do_page_fault+0xd4/0x554
<4>[  719.942249] [<801720c8>] sys_execve+0x34/0x48
<4>[  719.946714] [<8000db80>] syscall_common+0x34/0x58
<4>[  719.951710] 
<4>[  719.953230] Code: (Bad address in epc)
<4>[  719.953230] 
<4>[  719.958562] 
<4>[  719.960263] ---[ end trace 0000000000000000 ]---

The problem is related to the get_random_u32_below function, which is called e.g. when executing commands.
You can reproduce it by running, for example, `while true ; do var=$(iw phy1-ap0 station dump | grep "Station") ; done` and waiting anywhere from a few minutes to several dozen minutes. Tests were performed on kernels 6.6 and 6.12. Crashes also occur with CONFIG_CRYPTO_LIB_CHACHA and CONFIG_CRYPTO_CHACHA_MIPS enabled.
Switching to a pseudorandom generator helps.

Discussion on this topic was held at https://github.com/openwrt/openwrt/issues/16396. However, the problem has not been solved.

Tested workaround patches:

--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -709,6 +709,29 @@ unsigned long mips_stack_top(void)
 	return top;
 }
 
+#ifdef CONFIG_NET_RALINK_MT7620
+#include <linux/ktime.h>
+#include <linux/sched/clock.h>
+#include <linux/limits.h>
+static u32 simple_random_u32_below(u32 max) {
+	static u32 seed = 0;
+	if (!seed)
+		seed = ktime_get_ns() ^ sched_clock();
+	seed ^= seed << 13;
+	seed ^= seed >> 17;
+	seed ^= seed << 5;
+	if (max <= 1)
+		return 0;
+	u32 limit = UINT_MAX - (UINT_MAX % max);
+	while (seed >= limit) {
+		seed ^= seed << 13;
+		seed ^= seed >> 17;
+		seed ^= seed << 5;
+	}
+	return seed % max;
+}
+#endif
+
 /*
  * Don't forget that the stack pointer must be aligned on a 8 bytes
  * boundary for 32-bits ABI and 16 bytes for 64-bits ABI.
@@ -716,7 +739,11 @@ unsigned long mips_stack_top(void)
 unsigned long arch_align_stack(unsigned long sp)
 {
 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+#ifdef CONFIG_NET_RALINK_MT7620
+		sp -= simple_random_u32_below(PAGE_SIZE);
+#else
 		sp -= get_random_u32_below(PAGE_SIZE);
+#endif
 
 	return sp & ALMASK;
 }

or

--- a/include/linux/random.h
+++ b/include/linux/random.h
@@ -6,7 +6,8 @@
 #include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
-
+#include <linux/ktime.h>
+#include <linux/sched/clock.h>
 #include <uapi/linux/random.h>
 
 struct notifier_block;
@@ -82,11 +83,61 @@ static inline u32 get_random_u32_below(u
 			u32 mult = ceil * get_random_u16();
 			if (likely(is_power_of_2(ceil) || (u16)mult >= (1U << 16) % ceil))
 				return mult >> 16;
+#ifdef CONFIG_NET_RALINK_MT7620
+		} else {
+			/*
+			 * MT7620 / MIPS32 safe software RNG with improved entropy
+			 *
+			 * Problem:
+			 *   MT7620 has no hardware RNG. get_random_bytes()/get_random_u32()
+			 *   may crash for large ceil values.
+			 *
+			 * Solution:
+			 *   Use xoshiro256++ with dynamic seeding from system state.
+			 *   Single call, no loops, 64-bit scaling.
+			 */
+
+			static u64 xoshiro_state[4];
+			static bool seeded = false;
+
+			if (!seeded) {
+				/* Dynamic seeding for better entropy */
+				xoshiro_state[0] = 0x123456789abcdef0ULL ^ ktime_get_ns();
+				xoshiro_state[1] = 0xfedcba9876543210ULL ^ sched_clock();
+				xoshiro_state[2] = 0xdeadbeefcafebabeULL;
+				xoshiro_state[3] = 0x01a3b5c7d9a8c6e4ULL;
+				seeded = true;
+			}
+
+			u64 rotl(const u64 x, int k) {
+				return (x << k) | (x >> (64 - k));
+			}
+
+			u64 xoshiro256pp_next(void) {
+				u64 *s = xoshiro_state;
+				const u64 result = rotl(s[0] + s[3], 23) + s[0];
+
+				const u64 t = s[1] << 17;
+				s[2] ^= s[0];
+				s[3] ^= s[1];
+				s[1] ^= s[2];
+				s[0] ^= s[3];
+				s[2] ^= t;
+				s[3] = rotl(s[3], 45);
+				return result;
+			}
+
+			u64 val = xoshiro256pp_next();
+			/* Accurate scaling using kernel helper */
+			return mul_u64_u32_shr(val, ceil, 64);
+		}
+#else
 		} else {
 			u64 mult = (u64)ceil * get_random_u32();
 			if (likely(is_power_of_2(ceil) || (u32)mult >= -ceil % ceil))
 				return mult >> 32;
 		}
+#endif
 	}
 }
 

Why not simply use rand32 % ceil?

In short: “rand32 % ceil” yields a biased distribution and is easy to predict.

It should not be crashing on an empty RNG pool, though.

I don't want to find another workaround, I want to find the cause of the problems and fix it.

Understood. It does not reproduce on MT7628/MT76x8 when listing one connected station for 4 hours.

EPC points to an unmapped page that the CPU tried to execute, i.e. something is wrong with the new process's mapping…

The problem has only been observed on mt7620.

Working patch: https://github.com/openwrt/openwrt/pull/20323

This topic was automatically closed 10 days after the last reply. New replies are no longer allowed.