Ramips mt7621 mtk-sd: MediaTek MT6575 MSDC Driver time sequential error with kernel 4.9.x


#1

Under a high volume of BitTorrent traffic, the kernel hangs randomly.
When starting executable files stored on the MMC/TF card for the first or even the second time, they always crash with 'Segmentation fault' or 'Illegal instruction'. The MMC card file system type is f2fs or ext4.
With kernel 4.4.74 everything is fine.

zsh#root@Witi ~
cat /etc/banner                                                                                      3:44:57
     _________
    /        /\      _    ___ ___  ___
   /  LE    /  \    | |  | __|   \| __|
  /    DE  /    \   | |__| _|| |) | _|
 /________/  LE  \  |____|___|___/|___|                      lede-project.org
 \        \   DE /
  \    LE  \    /  -----------------------------------------------------------
   \  DE    \  /    Reboot (SNAPSHOT, r4534-e227bad)
    \________\/    -----------------------------------------------------------
zsh#root@Witi ~
uname -a                                                                                             3:45:01
Linux Witi 4.9.34 #0 SMP Thu Jul 6 17:19:13 2017 mips GNU/Linux
zsh#root@Witi ~
gdb                                                                                                  3:45:09
[1]    7671 illegal hardware instruction  gdb
zsh#root@Witi ~
gdb                                                                                                  3:45:16
[1]    7690 illegal hardware instruction  gdb
zsh#root@Witi ~
gdb                                                                                                  3:45:19
[1]    7713 illegal hardware instruction  gdb
zsh#root@Witi ~
gdb                                                                                                  3:45:20
[1]    7728 segmentation fault  gdb
zsh#root@Witi ~
gdb                                                                                                  3:45:21
[1]    7745 segmentation fault  gdb
zsh#root@Witi ~
gdb                                                                                                  3:45:23
GNU gdb (GDB) 7.12.1
Copyright (C) 2017 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "mipsel-openwrt-linux".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word".
(gdb) q
zsh#root@Witi ~
mount                                                                                                3:47:56
/dev/root on /rom type squashfs (ro,relatime)
proc on /proc type proc (rw,nosuid,nodev,noexec,noatime)
sysfs on /sys type sysfs (rw,nosuid,nodev,noexec,noatime)
tmpfs on /tmp type tmpfs (rw,nosuid,nodev,noatime)
/dev/mmcblk0p1 on /overlay type f2fs (rw,lazytime,relatime,background_gc=on,user_xattr,inline_data,inline_dentry,flush_merge,extent_cache,mode=adaptive,active_logs=6)
overlayfs:/overlay on / type overlay (rw,noatime,lowerdir=/,upperdir=/overlay/upper,workdir=/overlay/work)
tmpfs on /dev type tmpfs (rw,nosuid,relatime,size=512k,mode=755)
devpts on /dev/pts type devpts (rw,nosuid,noexec,relatime,mode=600,ptmxmode=000)
debugfs on /sys/kernel/debug type debugfs (rw,noatime)
/dev/mtdblock6 on /overlay-boot type jffs2 (rw,sync,relatime)
zsh#root@Witi ~
cat /etc/config/fstab                                                                                3:52:47
config 'global'
        option  anon_swap       '0'
        option  anon_mount      '0'
        option  auto_swap       '1'
        option  auto_mount      '1'
        option  delay_root      '5'
        option  check_fs        '0'

config 'mount'
        option  target  '/overlay'
        option  uuid    '667e5915-9d28-43c3-b3d6-6985df0e038c'
        option  fstype  'f2fs'
        option  enabled '1'

config 'mount'
        option  target  '/overlay-boot'
        option  device  '/dev/mtdblock6'
        option  fstype  'jffs2'
        option  options 'rw,sync'
        option  enabled '1'
        option  enabled_fsck '0'
zsh#root@Witi ~
dmesg | tail -n 20                                                                                   4:03:04
[   25.280000] br-lan: port 2(wlan1) entered forwarding state
[   27.390000] device br-lan entered promiscuous mode
[   43.060000] do_page_fault(): sending SIGSEGV to zsh for invalid read access from 00000000
[   43.080000] epc = 00000000 in zsh[400000+a1000]
[   43.090000] ra  = 00000000 in zsh[400000+a1000]
[  763.480000] do_page_fault(): sending SIGSEGV to zsh for invalid read access from 08ac0004
[  763.500000] epc = 08ac0005 in libgcc_s.so.1[77287000+22000]
[  763.510000] ra  = 0046890f in zsh[400000+a1000]
[  783.600000] do_page_fault(): sending SIGSEGV to ranger for invalid read access from 7e99fda4
[  783.620000] epc = 7e99fda5 in
[  783.620000] ra  = 76e65f58 in libpython2.7.so.1.0[76dd0000+167000]
[  785.760000] do_page_fault(): sending SIGSEGV to ranger for invalid read access from 7fe7f70f
[  785.780000] epc = 774ae7d4 in libpython2.7.so.1.0[7744c000+167000]
[  785.790000] ra  = 77475a48 in libpython2.7.so.1.0[7744c000+167000]
[ 1200.530000] do_page_fault(): sending SIGSEGV to gdb for invalid write access to 0000002e
[ 1200.550000] epc = 00514885 in gdb[400000+352000]
[ 1200.560000] ra  = 005650ef in gdb[400000+352000]
[ 1201.830000] do_page_fault(): sending SIGSEGV to gdb for invalid read access from 00000063
[ 1201.850000] epc = 77bd3dac in libc.so[77b62000+92000]
[ 1201.860000] ra  = 77bd5734 in libc.so[77b62000+92000]

New kernel versions
#2

LEDE git trunk was recently updated to kernel v4.9.37; the mtk-sd (MediaTek MT6575 MSDC Driver) time sequential error, which causes the page fault problem, is still there. I have to keep my router's kernel version frozen at v4.4.74.
dmesg

[    7.700000] MTK MSDC device init.
[    7.780000] mtk-sd: MediaTek MT6575 MSDC Driver
[    7.790000] kmodloader: done loading kernel modules from /etc/modules-boot.d/*
[    7.810000] init: - preinit -
[    7.870000] mmc0: new high speed SDHC card at address 0001
[    7.880000] mmcblk0: mmc0:0001 00000 7.44 GiB
[    7.890000]  mmcblk0: p1
[    9.510000] mtk_soc_eth 1e100000.ethernet eth0: port 0 link up
[   10.760000] mount_root: loading kmods from internal overlay
[   10.800000] kmodloader: loading kernel modules from //etc/modules-boot.d/*
[   10.820000] kmodloader: done loading kernel modules from //etc/modules-boot.d/*
[   11.210000] jffs2: notice: (464) jffs2_build_xattr_subsystem: complete building xattr subsystem, 0 of xdatum (0 unchecked, 0 orphan) and 0 of xref (0 dead, 0 orphan) found.
[   11.240000] block: attempting to load /tmp/jffs_cfg/upper/etc/config/fstab
[   11.810000] mount_root: switched to extroot
[   11.820000] urandom-seed: Seeding with /etc/urandom.seed

dmesg part2

[   85.350000] do_page_fault(): sending SIGSEGV to zsh for invalid write access to 00008040
[   85.370000] epc = 00442f11 in zsh[400000+a1000]
[   85.370000] ra  = 0042ee67 in zsh[400000+a1000]
[  120.260000] do_page_fault(): sending SIGSEGV to zsh for invalid write access to 0000001a
[  120.280000] epc = 0046815d in zsh[400000+a1000]
[  120.290000] ra  = 0043c047 in zsh[400000+a1000]
[  193.790000] do_page_fault(): sending SIGSEGV to ranger for invalid read access from 7424296c
[  193.810000] epc = 7424296c in libgcc_s.so.1[77a91000+22000]
[  193.820000] ra  = 77b70420 in libpython2.7.so.1.0[77ad4000+167000]
[ 1793.930000] do_page_fault(): sending SIGSEGV to gdb for invalid read access from 00000050
[ 1793.950000] epc = 00540147 in gdb[400000+352000]
[ 1793.960000] ra  = 00540147 in gdb[400000+352000]

Command output when executing programs stored on the SD/TF card:

root@Witi:~# zsh
Segmentation fault
root@Witi:~# zsh
zsh#root@Witi ~
ranger
[1]    3673 illegal hardware instruction  ranger
zsh#root@Witi ~
ranger
[1]    3699 segmentation fault  ranger
zsh#root@Witi ~
ranger
[1]    3714 illegal hardware instruction  ranger
zsh#root@Witi ~
ranger
[1]    3729 illegal hardware instruction  ranger
zsh#root@Witi ~
ranger
zsh#root@Witi ~
gdb
[1]    15405 segmentation fault  gdb
zsh#root@Witi ~
gdb
[1]    15432 bus error  gdb
zsh#root@Witi ~
gdb
GNU gdb (GDB) 7.12.1
Copyright (C) 2017 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "mipsel-openwrt-linux".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word".
(gdb) q
zsh#root@Witi ~
uname -a
Linux Witi 4.9.37 #0 SMP Sat Jul 15 05:02:59 2017 mips GNU/Linux
zsh#root@Witi ~
cat /etc/banner
     _________
    /        /\      _    ___ ___  ___
   /  LE    /  \    | |  | __|   \| __|
  /    DE  /    \   | |__| _|| |) | _|
 /________/  LE  \  |____|___|___/|___|                      lede-project.org
 \        \   DE /
  \    LE  \    /  -----------------------------------------------------------
   \  DE    \  /    Reboot (SNAPSHOT, r4576-9fe9175)
    \________\/    -----------------------------------------------------------

#3

Hi, does the error still exist in the latest snapshot?

According to the kernel panic log, the error seems related to mmcqd, so I did some research in drivers/mmc/card/queue.c and found these:
New commits in kernel 4.9 compared with kernel 4.4:

2016-09-26	mmc: card: do away with indirection pointer	Linus Walleij	1	-1/+3
2016-08-25	mmc: fix use-after-free of struct request	Adrian Hunter	1	-1/+3
2016-08-16	block: Fix secure erase	Adrian Hunter	1	-1/+2
2016-06-09	block: add a separate operation type for secure erase	Christoph Hellwig	1	-1/+1
2016-06-07	drivers: use req op accessor

Someone says in newer kernel the bug has been fixed, so I also looked into the commits in kernel 4.14. But queue.c no longer exists.

I will try to revert these commits and see what will happen.


Support for ZBT-WE1326 (was: New product! 2.4GHZ/5.8GHz Openwrt 802.11AC dual band dual core wifi router with SD card slot)
#4

Just made a trunk build:
OpenWrt SNAPSHOT, r5904-adaf1cb
Linux Witi 4.9.76 #0 SMP Sat Jan 20 13:22:39 2018 mips GNU/Linux
Still same error when using SD/TF card.
dmesg log on exec zsh in TF card as /overlay:
[ 30.791381] do_page_fault(): sending SIGSEGV to zsh for invalid write access to 00000000
[ 30.807573] epc = 00457d81 in zsh[400000+a1000]
[ 30.816735] ra = 00457d79 in zsh[400000+a1000]


#5

I often see "Illegal instruction" errors when using Python and gcc.
It seems to happen because I use the SD card as overlay.


#6

You can use the git head of lede-17.01, which has kernel 4.4.111; SD card overlay works very well there.


#7

Unfortunately, my ykl1 cannot boot with 17.01. See "Youku-yk1 can not start after flashing".


#8

FS#1242 - SATA broken on kernel 4.9 on mt7621

Disabling CONFIG_HIGHMEM may help. I haven't tried it yet. It seems the bug is related to the CPU or memory part of the kernel, possibly the DMA/block layer.

Also, will changing the I/O scheduler to noop help? The default scheduler is deadline.

ramips: re-enable highmem support for MT7621 now that it's fixed

Update 2/3: Kernel 4.9 new requirement in DMA:

造成这一问题的原因在于,基于 Kernel 4.9 的设备驱动在进行 DMA 操作前必须要先对相关的数据结构进行初始化,否则只能分配得到 NULL 指针。

Translation: The reason for this problem is that device drivers based on Kernel 4.9 must initialize the relevant data structures before performing DMA operations. Otherwise, only NULL pointers can be allocated.

See Kernel 4.9 上 dma_alloc_coherent() 函数开辟 DMA 连续内存空间失败 解决办法

I didn't find of_dma_configure in 0046-mmc-MIPS-ralink-add-sdhci-for-mt7620a-SoC.patch, so it's likely that if the dma data structure is initialized properly, the problem will be solved.

Update: 2/4: Look at this line of code:

/* using dma_alloc_coherent*/  /* todo: using 1, for all 4 slots */
host->dma.gpd = dma_alloc_coherent(NULL, MAX_GPD_NUM * sizeof(gpd_t), &host->dma.gpd_addr, GFP_KERNEL); 
host->dma.bd =  dma_alloc_coherent(NULL, MAX_BD_NUM  * sizeof(bd_t),  &host->dma.bd_addr,  GFP_KERNEL); 
BUG_ON((!host->dma.gpd) || (!host->dma.bd));    

If dma_alloc_coherent fails to allocate memory and returns a NULL pointer, the kernel will panic immediately. No idea again.

Update 2/8: Too many changes about DMA and block between 4.4 and 4.9, going to use ftrace to see what's happening when the error is shown. Will report later below.


#9

Here's a quote:

The circumstance where it fails to read is when it is asked to read data into a page which is not directly mapped into the kernel's address space. Such pages only exist when CONFIG_HIGHMEM is selected, so the microSD can be made to work in 4.4.87 by disabling CONFIG_HIGHMEM. In mainline there is also a problem that host->id is set to -1 which causes problems with some array accesses, but that is easily fixed.

The 4.9 issue is a different one. Highmem is only used with 512MB boards as the max address space is 256MB.

I wouldn't think CONFIG_HIGHMEM has any impact for <= 256MB board but no idea.


#10

I am having similar problems on MT7688, which never has CONFIG_HIGHMEM enabled. I never found the root of the problem with the mtk-mmc driver, but I looked into it, and the mainline mtk-sd.c driver is for the same IP core, just that here it is on MIPS instead of ARM. By removing some code that uses the clock-tree stuff we don't have on MIPS, I was able to get it to compile and run. No data corruption to be found.

Should work on MT7620 with no modifications.


#11

A working replacement for mt7621? :slight_smile:


#12

4.14 for ramips is out. Will try it.


#13

tried and still no bugfix...
OpenWrt SNAPSHOT, r6244-7a97588
Linux Witi 4.14.20 #0 SMP Wed Feb 21 19:57:30 2018 mips GNU/Linux


#14

Newifi D1 on kernel 4.14.20, with the two patches below, works for me. Programs run normally. I also tried executing "yes > ./yes.txt" to generate about 100MB of data on the TF card.

root@ASUS-PC:/mnt/mmcblk0p1/opt# echo Hello > ./test.txt
root@ASUS-PC:/mnt/mmcblk0p1/opt# cat ./test.txt
Hello
root@ASUS-PC:/mnt/mmcblk0p1/opt# uname -a
Linux ASUS-PC 4.14.20 #0 SMP Sat Feb 24 22:43:29 2018 mips GNU/Linux
root@ASUS-PC:/mnt/mmcblk0p1/opt#

But in dmesg there is an error log, which does not seem to be related to the issue.

Mon Feb 26 17:06:40 2018 kern.crit kernel: [13603.646005] EXT4-fs error (device mmcblk0p1): ext4_mb_generate_buddy:756: group 33, block bitmap and bg descriptor inconsistent: 25843 vs 25844 free clusters

My patch:

--- a/drivers/mmc/host/mtk-mmc/sd.c
+++ b/drivers/mmc/host/mtk-mmc/sd.c 	
@@ -2832,6 +2832,7 @@ static int msdc_drv_probe(struct platform_device *pdev)
    
     host->dma.used_gpd = 0;
     host->dma.used_bd = 0;
+    mmc_dev(mmc)->dma_mask = NULL;
 
     /* using dma_alloc_coherent*/  /* todo: using 1, for all 4 slots */
     host->dma.gpd = dma_alloc_coherent(NULL, MAX_GPD_NUM * sizeof(gpd_t), &host->dma.gpd_addr, GFP_KERNEL);

Discussion: https://www.mail-archive.com/lede-dev@lists.infradead.org/msg11210.html

I also applied "ramips: fix mt7620 sdhci OF match variable name", which is still in the mailing list.

https://www.mail-archive.com/lede-dev@lists.infradead.org/msg11320.html

Also set noop to default IO scheduler.

-CONFIG_DEFAULT_IOSCHED="deadline"
+CONFIG_DEFAULT_IOSCHED="noop"
 CONFIG_DEFAULT_MMAP_MIN_ADDR=4096
-# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_NOOP=y

#15

With your patch, with the TF card as overlay, I still get the same issue:

zsh#root@Witi ~
dmesg | tail -n 3
[   70.046394] do_page_fault(): sending SIGSEGV to git for invalid read access from 01000000
[   70.062730] epc = 01000001 in libgcc_s.so.1[77e83000+22000]
[   70.073932] ra  = 004c7bcb in git[400000+124000]
zsh#root@Witi ~
gdb
[1]    3949 segmentation fault  gdb
zsh#root@Witi ~
gdb
[1]    3964 illegal hardware instruction  gdb
zsh#root@Witi ~
gdb
[1]    3979 illegal hardware instruction  gdb
zsh#root@Witi ~
gdb
[1]    3994 segmentation fault  gdb
zsh#root@Witi ~
gdb
GNU gdb (GDB) 7.12.1
Copyright (C) 2017 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "mipsel-openwrt-linux".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word".
(gdb) q
zsh#root@Witi ~
dmesg | tail -n 3
[  189.417786] do_page_fault(): sending SIGSEGV to gdb for invalid read access from 003a1e94
[  189.434341] epc = 003a1e95 in gdb[400000+352000]
[  189.443586] ra  = 0046d717 in gdb[400000+352000]
zsh#root@Witi ~
mount
/dev/root on /rom type squashfs (ro,relatime)
proc on /proc type proc (rw,nosuid,nodev,noexec,noatime)
sysfs on /sys type sysfs (rw,nosuid,nodev,noexec,noatime)
tmpfs on /tmp type tmpfs (rw,nosuid,nodev,noatime)
/dev/mmcblk0p1 on /overlay type f2fs (rw,lazytime,relatime,background_gc=on,no_heap,user_xattr,inline_xattr,inline_data,inline_dentry,flush_merge,extent_cache,mode=adaptive,active_logs=6)
overlayfs:/overlay on / type overlay (rw,noatime,lowerdir=/,upperdir=/overlay/upper,workdir=/overlay/work)
tmpfs on /dev type tmpfs (rw,nosuid,relatime,size=512k,mode=755)
devpts on /dev/pts type devpts (rw,nosuid,noexec,relatime,mode=600,ptmxmode=000)
debugfs on /sys/kernel/debug type debugfs (rw,noatime)
/dev/mtdblock6 on /overlay-boot type jffs2 (rw,sync,relatime)
zsh#root@Witi ~
cat /etc/config/fstab
config 'global'
        option  anon_swap       '0'
        option  anon_mount      '0'
        option  auto_swap       '1'
        option  auto_mount      '1'
        option  delay_root      '5'
        option  check_fs        '0'

config 'mount'
        option  target  '/overlay'
        option  uuid    '667e5915-9d28-43c3-b3d6-6985df0e038c'
        option  fstype  'f2fs'
        option  enabled '1'

config 'mount'
        option  target  '/overlay-boot'
        option  device  '/dev/mtdblock6'
        option  fstype  'jffs2'
        option  options 'rw,sync'
        option  enabled '1'
        option  enabled_fsck '0'
zsh#root@Witi ~
blkid
/dev/mtdblock5: TYPE="squashfs"
/dev/mmcblk0: PTUUID="000e3b21" PTTYPE="dos"
/dev/mmcblk0p1: UUID="667e5915-9d28-43c3-b3d6-6985df0e038c" TYPE="f2fs" PARTUUID="000e3b21-01"
zsh#root@Witi ~

#16

If you do not mount the TF card as overlay, will it work? Did you apply this patch?

At least FS#1335 - ZBT-WG3526 (16M) SD card write makes crash and reboot could be hopefully solved.


#17

I applied all the patches; they do not solve the above issue.


#18

The code from my gist works. You have to modify the build system to actually build the module and then load it, as the resulting module will not be the typical mtk-mmc.ko.

You should be able to find the necessary changes here:


#19

Test openwrt-18.06-SNAPSHOT r6914, TF/microSD card read write issue fixed.
kernel version 4.14.37

But the ISP WAN port speed test result is much worse:
kernel v4.14.37: 160Mb/s
kernel v4.4.131: 250Mb/s
This makes me still choose lede-17.01-SNAPSHOT.