My system:
- Linksys WRT1900ACv1, OpenWrt r19971 built by me, with btrfs in kernel.
- HDD connected to eSATA port, mounted in /mnt
- no-name USB hub connected to USB3 port
- overlay on USB memory sticks
New addition that caused the problems:
- 5x USB2 sticks, 64GB each, btrfs RAID5 with sha256 checksums, mounted in /mnt with -o compress=zstd:15,ssd_spread
The problem:
I tried to copy data from the eSATA HDD to the btrfs RAID5 array. After a few seconds (10-20), the router stopped copying and then suddenly rebooted. The reboots were caused by a combination of the OOM killer and the watchdog. I repeated the same operation again and again until I managed to "save it" just before the crash. This is what I found:
root@GRAPHRT:/proc# cat meminfo
MemTotal: 247724 kB
MemFree: 25848 kB
MemAvailable: 14240 kB
Buffers: 8 kB
Cached: 22472 kB
SwapCached: 0 kB
Active: 11924 kB
Inactive: 13484 kB
Active(anon): 1048 kB
Inactive(anon): 3140 kB
Active(file): 10876 kB
Inactive(file): 10344 kB
Unevictable: 0 kB
Mlocked: 0 kB
SwapTotal: 0 kB
SwapFree: 0 kB
Dirty: 0 kB
Writeback: 0 kB
AnonPages: 2928 kB
Mapped: 3048 kB
Shmem: 1260 kB
KReclaimable: 4712 kB
Slab: 29496 kB
SReclaimable: 4712 kB
SUnreclaim: 24784 kB
KernelStack: 1448 kB
PageTables: 256 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
WritebackTmp: 0 kB
CommitLimit: 123860 kB
Committed_AS: 7012 kB
VmallocTotal: 770048 kB
VmallocUsed: 9952 kB
VmallocChunk: 0 kB
Percpu: 240 kB
root@GRAPHRT:/proc# cat vmstat
nr_free_pages 6462
nr_zone_inactive_anon 786
nr_zone_active_anon 262
nr_zone_inactive_file 2587
nr_zone_active_file 2719
nr_zone_unevictable 0
nr_zone_write_pending 0
nr_mlock 0
nr_page_table_pages 64
nr_bounce 0
nr_zspages 0
nr_free_cma 0
nr_inactive_anon 786
nr_active_anon 262
nr_inactive_file 2587
nr_active_file 2719
nr_unevictable 0
nr_slab_reclaimable 1178
nr_slab_unreclaimable 6179
nr_isolated_anon 0
nr_isolated_file 0
workingset_nodes 2377
workingset_refault_anon 393
workingset_refault_file 699590
workingset_activate_anon 87
workingset_activate_file 87404
workingset_restore_anon 0
workingset_restore_file 42089
workingset_nodereclaim 0
nr_anon_pages 734
nr_mapped 762
nr_file_pages 5620
nr_dirty 0
nr_writeback 0
nr_writeback_temp 0
nr_shmem 315
nr_shmem_hugepages 0
nr_shmem_pmdmapped 0
nr_file_hugepages 0
nr_file_pmdmapped 0
nr_anon_transparent_hugepages 0
nr_vmscan_write 28826
nr_vmscan_immediate_reclaim 65582
nr_dirtied 329138
nr_written 306115
nr_kernel_misc_reclaimable 0
nr_foll_pin_acquired 0
nr_foll_pin_released 0
nr_kernel_stack 1448
nr_dirty_threshold 1124
nr_dirty_background_threshold 561
nr_unstable 0
root@GRAPHRT:/proc# cat vmallocinfo
[...]
0x1bc111ab-0x7448241e 131072 0xc0394ee8 pages=31 vmalloc
0x7448241e-0xb9c30497 131072 0xc038d4e0 pages=31 vmalloc
0x47c1d180-0xec46d4d4 1052672 0xc0565a3c phys=0xf8200000 ioremap
0x3f8861ac-0x03276075 1052672 0xc0565a3c phys=0xf8300000 ioremap
0x497797d6-0x183b3047 1052672 0xc0565a3c phys=0xf8400000 ioremap
0x6f3e7a30-0xd6da58d4 1052672 0xc0565a3c phys=0xf8500000 ioremap
0x4c119a47-0xe18ac24b 2613248 0xc0447b9c pages=637 vmalloc
0xc7763f7c-0x28c05fb1 2097152 0xc0c093e0 ioremap
0x1d0b10c3-0xf91fb7c7 90112 0xc0211e44 vmalloc
0xf91fb7c7-0x74dc1cda 90112 0xc0211e44 vmalloc
0xa902ece0-0xb461dd7b 16384 unpurged vm_area
0xb461dd7b-0xd44196a5 16384 unpurged vm_area
0x479ebb6f-0x4c119a47 2613248 unpurged vm_area
This was after the copy (cp -av ...) was killed.
I think this is a kernel memory leak. I could not reproduce it on any of the x86 machines I have, even with the same USB memory sticks, the same btrfs filesystem, and the same mount commands.
What more can I do to find the cause and report it?
Thanks