Failover on Ping Timeout with mwan3 Not Working

Hi all,

I've been using LEDE 17.01 for a while now on my Ubiquiti EdgeRouter X. Multi-wan capability is crucial for my setup, so I've been running mwan3 for several months. I swear I had it working correctly before but recently I realized that failover due to ping timeout is not working for me. My setup is a failover from my primary wan (cable) to a wwan and then to a 4g modem, no balancing.

The thing that is tripping me up is that the failover does work if one of the interfaces is manually turned off, and I know ifdown is running on a ping timeout because I get the email notifications that I have set up in the user script. Even stranger, fail-up works, i.e. if my primary link is down when the router boots, my internet goes out the secondary wan, and when the primary comes back online it switches up to that one, exactly as it is supposed to. Basically everything works (all the events, LuCI buttons, etc.) except it won't actually switch the active route over to the secondary or tertiary connections... it seems like it just is not switching the route over correctly.

I do have openvpn installed and a snapshot version of mwan3, but I've also run it with just a clean install and the current version, so I don't expect that's the problem.

I'm baffled and I've tried about everything on the internet that I can find as well as every possible combination of mwan options, network options, and firewall options that I can think of and no luck. I'm still a bit new to routers and networking so please pardon any ignorance but I'm definitely willing to run through any diagnosis and try to track the source of this issue down.

Please see below for a trimmed version of the output of the mwan troubleshooting call, I believe that should include everything relevant but please let me know if there is something else I should provide. Thanks!

Software versions : 

OpenWrt - LEDE Reboot 17.01.4 r3560-79f57e422d
LuCI - git-17.290.79498-d3f0685

mwan3 - 2.0.2-1
2.6.8-1
mwan3-luci - 1.4-6
git-17.340.61105-78ebfba-1

Output of "cat /etc/config/mwan3" : 

config interface 'wan'
	option enabled '1'
	list track_ip '8.8.4.4'
	list track_ip '8.8.8.8'
	list track_ip '208.67.220.220'
	option timeout '2'
	option up '7'
	option interval '3'
	option reliability '3'
	option count '1'
	option down '2'
	option flush_conntrack 'always'

config interface 'wan_wifi'
	option enabled '1'
	option timeout '2'
	option interval '3'
	option up '5'
	option down '3'
	list track_ip '8.8.4.4'
	list track_ip '8.8.8.8'
	list track_ip '208.67.220.220'
	option reliability '2'
	option count '1'
	option flush_conntrack 'always'

config interface 'wan_4g'
	option enabled '1'
	option count '1'
	option reliability '1'
	list track_ip '192.168.225.1'
	option timeout '5'
	option interval '10'
	option up '2'
	option down '3'
	option flush_conntrack 'always'

config member 'wan_m1_w3'
	option interface 'wan'
	option metric '1'
	option weight '3'

config member 'wan_m2_w3'
	option interface 'wan'
	option metric '2'
	option weight '3'

config member 'wifi_m1_w2'
	option interface 'wan_wifi'
	option metric '1'
	option weight '2'

config member 'wifi_m2_w2'
	option interface 'wan_wifi'
	option metric '2'
	option weight '2'

config member '4g_m1_w2'
	option interface 'wan_4g'
	option metric '1'
	option weight '2'

config member '4g_m2_w2'
	option interface 'wan_4g'
	option metric '2'
	option weight '2'

config member '4g_m3_w1'
	option interface 'wan_4g'
	option metric '3'
	option weight '1'

config policy '_wan_wifi_4g'
	list use_member 'wan_m1_w3'
	list use_member 'wifi_m2_w2'
	list use_member '4g_m3_w1'

config policy '_wan_only'
	list use_member 'wan_m1_w3'

config policy '_wifi_only'
	list use_member 'wifi_m2_w2'

config policy '_4g_only'
	list use_member '4g_m1_w2'

config policy '_wan_wifi'
	list use_member 'wan_m1_w3'
	list use_member 'wifi_m2_w2'

config policy '_wifi_wan'
	list use_member 'wan_m2_w3'
	list use_member 'wifi_m1_w2'

config rule 'failoverSeq'
	option proto 'all'
	option sticky '0'
	option use_policy '_wan_wifi_4g'
	option dest_ip '0.0.0.0'

config globals 'globals'
	option local_source 'none'
	option mmx_mask '0xff00'

Output of "cat /etc/config/network" : 

config globals 'globals'
	option ula_prefix 'fd57:0200:0fc0::/48'

config interface 'lan'
	option type 'bridge'
	option ifname 'eth0.1'
	option proto 'static'
	option netmask '255.255.255.0'
	option ip6assign '60'
	option ipaddr '192.168.210.1'

config interface 'wan'
	option ifname 'eth0.2'
	option proto 'dhcp'
	option metric '10'

config interface 'self'
	option ifname 'lo'
	option proto 'static'
	option ipaddr '192.168.210.1'
	option netmask '255.255.255.255'

config interface 'wan_wifi'
	option ifname 'eth0.3'
	option _orig_ifname 'eth0.3'
	option _orig_bridge 'false'
	option proto 'static'
	option netmask '255.255.255.0'
	option metric '20'
	option ipaddr '192.168.220.2'
	option gateway '192.168.220.1'

config interface 'wan_4g'
	option proto 'dhcp'
	option ifname 'eth0.4'
	option metric '30'

config device 'wan_dev'
	option name 'eth0.2'
	option macaddr 'xx:xx:xx:xx:xx:xx'
	option metric '50'

config interface 'wan6'
	option ifname 'eth0.2'
	option proto 'dhcpv6'
	option metric '40'

config interface 'vpn0'
	option ifname 'tun0'
	option proto 'none'

config interface 'vpn1'
	option ifname 'tun1'
	option proto 'none'

config switch
	option name 'switch0'
	option reset '1'
	option enable_vlan '1'

config switch_vlan
	option device 'switch0'
	option vlan '1'
	option ports '3 4 6t'
	option vid '1'

config switch_vlan
	option device 'switch0'
	option vlan '2'
	option ports '0 6t'
	option metric '10'
	option vid '2'

config switch_vlan
	option device 'switch0'
	option vlan '3'
	option ports '1 6t'
	option metric '20'
	option vid '3'

config switch_vlan
	option device 'switch0'
	option vlan '4'
	option ports '2 6t'
	option metric '30'
	option vid '4'

Output of "route -n" : 

Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         73.229.168.1    0.0.0.0         UG    10     0        0 eth0.2
0.0.0.0         192.168.220.1   0.0.0.0         UG    20     0        0 eth0.3
0.0.0.0         192.168.225.1   0.0.0.0         UG    30     0        0 eth0.4
10.125.96.0     0.0.0.0         255.255.255.240 U     0      0        0 tun0
10.125.97.0     0.0.0.0         255.255.255.240 U     0      0        0 tun1
73.229.168.0    0.0.0.0         255.255.252.0   U     10     0        0 eth0.2
73.229.168.1    0.0.0.0         255.255.255.255 UH    10     0        0 eth0.2
192.168.210.0   0.0.0.0         255.255.255.0   U     0      0        0 br-lan
192.168.220.0   0.0.0.0         255.255.255.0   U     20     0        0 eth0.3
192.168.225.0   0.0.0.0         255.255.255.0   U     30     0        0 eth0.4
192.168.225.1   0.0.0.0         255.255.255.255 UH    30     0        0 eth0.4

Output of "ip rule show" : 

0:	from all lookup local 
1001:	from all iif eth0.2 lookup main 
1002:	from all iif eth0.3 lookup main 
1003:	from all iif eth0.4 lookup main 
2001:	from all fwmark 0x100 lookup 1 
2002:	from all fwmark 0x200 lookup 2 
2003:	from all fwmark 0x300 lookup 3 
2253:	from all fwmark 0xfd00 lookup unspec blackhole
2254:	from all fwmark 0xfe00 lookup unspec unreachable
32766:	from all lookup main 
32767:	from all lookup default

Output of "ip route list table 1-250" : 

1
default via 73.229.168.1 dev eth0.2 
2
default via 192.168.220.1 dev eth0.3 
3
default via 192.168.225.1 dev eth0.4

mwan3 is problematic when used alongside openvpn or other things that touch iptables. Try temporarily disabling openvpn, do a mwan3 restart, and check the failover again. You may have to try other IPs; maybe the ping to 8.8.8.8 stops working for some reason (ISP ping protection or something)... Check if you have custom rules in firewall.user or if you have some script that deals with iptables...

By the way, it is nice to see that someone is using a similar approach with mwan3 :slight_smile: https://gist.github.com/braian87b/97a186b2e11b5aa438d8fd17de0eab20

Thanks for the quick reply! I completely removed openvpn and did a reboot, also removed the custom firewall rules that were added along with my configuration of openvpn. That seemed like a really good possibility for what is causing my problems but alas, no change.

Ping goes out each interface just fine, and ifdown WILL trigger if the ping test fails, but the active interface does not change. I don't have anything else that messes with iptables to my knowledge. It is strange to me that, if some other software is conflicting with mwan, the interfaces will still recover and switch over in an upward direction but won't fail over to a lower priority metric; it seems to me that would be the same piece of code.

Might I be able to manually change the routes in one of the mwan user scripts? I am not that familiar with routing, but it seems to me that since I can detect when an interface pings out, I could manually force the route to change. I have never written iptables commands myself, so I will likely need some direction to go this route.

After doing a little more digging into this, here is what my routing tables look like when the primary WAN interface (eth0.2) is disabled. Traffic fails over to the secondary interface (eth0.3) and goes out that just fine, since the routes for eth0.2 are removed from the table.

Output of "route -n" :

Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         192.168.220.1   0.0.0.0         UG    20     0        0 eth0.3
0.0.0.0         192.168.225.1   0.0.0.0         UG    30     0        0 eth0.4
10.125.96.0     0.0.0.0         255.255.255.240 U     0      0        0 tun0
10.125.97.0     0.0.0.0         255.255.255.240 U     0      0        0 tun1
192.168.210.0   0.0.0.0         255.255.255.0   U     0      0        0 br-lan
192.168.220.0   0.0.0.0         255.255.255.0   U     20     0        0 eth0.3
192.168.225.0   0.0.0.0         255.255.255.0   U     30     0        0 eth0.4
192.168.225.1   0.0.0.0         255.255.255.255 UH    30     0        0 eth0.4


Output of "ip route list table 1-250" : 

1
2
default via 192.168.220.1 dev eth0.3 
3
default via 192.168.225.1 dev eth0.4

Here are the tables when the interface pings out. Clearly the routes for eth0.2 are still there, so mwan3 is not removing them as it should (I would assume?).

Output of "route -n" : 

Kernel IP routing table
Destination     Gateway         Genmask         Flags Metric Ref    Use Iface
0.0.0.0         73.229.168.1    0.0.0.0         UG    10     0        0 eth0.2
0.0.0.0         192.168.220.1   0.0.0.0         UG    20     0        0 eth0.3
0.0.0.0         192.168.225.1   0.0.0.0         UG    30     0        0 eth0.4
10.125.96.0     0.0.0.0         255.255.255.240 U     0      0        0 tun0
10.125.97.0     0.0.0.0         255.255.255.240 U     0      0        0 tun1
73.229.168.0    0.0.0.0         255.255.252.0   U     10     0        0 eth0.2
73.229.168.1    0.0.0.0         255.255.255.255 UH    10     0        0 eth0.2
192.168.210.0   0.0.0.0         255.255.255.0   U     0      0        0 br-lan
192.168.220.0   0.0.0.0         255.255.255.0   U     20     0        0 eth0.3
192.168.225.0   0.0.0.0         255.255.255.0   U     30     0        0 eth0.4
192.168.225.1   0.0.0.0         255.255.255.255 UH    30     0        0 eth0.4

Output of "ip rule show" : 

Output of "ip route list table 1-250" : 

1
default via 73.229.168.1 dev eth0.2 
2
default via 192.168.220.1 dev eth0.3 
3
default via 192.168.225.1 dev eth0.4

Sorry for not being able to give you a response on this... but even now I don't know the cause of your problem.
Were you able to fix it? How?