Troubleshooting Wireguard reliability

Having an issue with reliability of Wireguard tunnels used for management. They will randomly drop off after some period of hours/days. 'wg' on the client router shows last handshake was hours ago. They are configured with 60s keepalive.
I am trying to use watchcat to make the device "fix itself", so the first thing I tried was an interface restart. Unfortunately this doesn't do anything:

Thu Oct  3 09:22:37 2024 daemon.info watchcat[6386]: Could not reach 10.31.252.1 via "wg0" for "600" seconds. Restarting "wg0" after reaching "600" seconds
Thu Oct  3 09:22:37 2024 daemon.info watchcat[6386]: Restarting network interface: "wg0".
Thu Oct  3 09:22:37 2024 daemon.notice netifd: Network device 'wg0' link is down
Thu Oct  3 09:22:37 2024 daemon.notice netifd: Network device 'wg0' link is up
Thu Oct  3 09:23:07 2024 daemon.info watchcat[6386]: Could not reach 10.31.252.1 via "wg0" for "30" seconds. Restarting "wg0" after reaching "600" seconds
Thu Oct  3 09:23:37 2024 daemon.info watchcat[6386]: Could not reach 10.31.252.1 via "wg0" for "60" seconds. Restarting "wg0" after reaching "600" seconds

As you can see it restarts the interface but this makes no difference to connectivity. I have added a reboot step to watchcat and this fixes it but I really would like something less drastic than a reboot to get these going again.

I am pretty sure this is a wireguard issue client-side, because I can send UDP packets using netcat from the client to the server on the wireguard port and see the packets arriving at the server using a packet capture. No other traffic is seen from the client IP until after it reboots. I am expecting to see keepalive packets, at a minimum.

I found the wireguard_watchdog script that ships with wireguard-tools but this is intended for fixing connections to dynamic endpoints, which is not the case here. I have tried a few 'wg set ...' commands to try and poke the thing back into life but it doesn't make any difference.

I use a script that restarts the network (or reboots the router) or starts a second tunnel as a failover, see:

But you might post your settings so that we can see if we can spot anything out of the ordinary which could cause this kind of behaviour.

If so please connect to your OpenWRT device using ssh and copy the output of the following commands and post it here using the "Preformatted text </> " button:

Remember to redact keys, passwords, MAC addresses and any public IP addresses you may have:

ubus call system board
cat /etc/config/network
cat /etc/config/firewall
wg show
1 Like
ubus call system board

{
        "kernel": "5.15.150",
        "hostname": "Mango",
        "system": "MediaTek MT7628AN ver:1 eco:2",
        "model": "GL-MT300N-V2",
        "board_name": "glinet,gl-mt300n-v2",
        "rootfs_type": "squashfs",
        "release": {
                "distribution": "OpenWrt",
                "version": "23.05.3",
                "revision": "r23809-234f1a2efa",
                "target": "ramips/mt76x8",
                "description": "OpenWrt 23.05.3 r23809-234f1a2efa"
        }
}

cat /etc/config/network


config interface 'loopback'
        option device 'lo'
        option proto 'static'
        option ipaddr '127.0.0.1'
        option netmask '255.0.0.0'

config globals 'globals'
        option ula_prefix 'fdef:eaf3:b42d::/48'

config device 'device1'
        option name 'br-lan'
        list ports 'eth0.1'
        option type 'bridge'

config interface 'lan'
        option device 'br-lan'
        option proto 'static'
        option netmask '255.255.255.0'
        option ip6assign '60'
        option ipaddr '192.168.222.1'

config interface 'wan'
        option device 'eth0.2'
        option proto 'dhcp'

config interface 'wan6'
        option device 'eth0.2'
        option proto 'dhcpv6'

config switch 'switch0'
        option enable_vlan '1'
        option name 'switch0'
        option reset '1'

config switch_vlan 'switch0_vlan1'
        option device 'switch0'
        option ports '1 6t'
        option vlan '1'

config switch_vlan 'switch0_vlan2'
        option device 'switch0'
        option ports '0 6t'
        option vlan '2'

config interface 'wwan'
        option proto 'dhcp'

config wireguard_wg0 'wgpeer_wg0'
        option endpoint_host 'peerfqdn'
        option endpoint_port '51820'
        option persistent_keepalive '60'
        option public_key 'peerpubkey'
        option route_allowed_ips '1'
        list allowed_ips '10.31.252.1/32'

config interface 'wg0'
        option listen_port '51820'
        option mtu '1420'
        option nohostroute '0'
        option private_key 'myprivkey'
        option proto 'wireguard'
        list addresses '10.31.252.48/32'


cat /etc/config/firewall


config defaults 'defaults'
        option forward 'REJECT'
        option input 'ACCEPT'
        option output 'ACCEPT'
        option synflood_protect '1'

config zone 'zone1'
        option forward 'ACCEPT'
        option input 'ACCEPT'
        option name 'lan'
        list network 'lan'
        option output 'ACCEPT'

config zone 'zone2'
        option forward 'REJECT'
        option input 'REJECT'
        option masq '1'
        option mtu_fix '1'
        option name 'wan'
        list network 'wan'
        list network 'wan6'
        list network 'wwan'
        option output 'ACCEPT'

config forwarding 'forwarding1'
        option dest 'wan'
        option src 'lan'

config rule 'rule1'
        option dest_port '68'
        option family 'ipv4'
        option name 'Allow-DHCP-Renew'
        option proto 'udp'
        option src 'wan'
        option target 'ACCEPT'

config rule 'rule2'
        option family 'ipv4'
        option icmp_type 'echo-request'
        option name 'Allow-Ping'
        option proto 'icmp'
        option src 'wan'
        option target 'ACCEPT'

config rule 'rule3'
        option family 'ipv4'
        option name 'Allow-IGMP'
        option proto 'igmp'
        option src 'wan'
        option target 'ACCEPT'

config rule 'rule4'
        option dest_port '546'
        option family 'ipv6'
        option name 'Allow-DHCPv6'
        option proto 'udp'
        option src 'wan'
        option target 'ACCEPT'

config rule 'rule5'
        option family 'ipv6'
        list icmp_type '130/0'
        list icmp_type '131/0'
        list icmp_type '132/0'
        list icmp_type '143/0'
        option name 'Allow-MLD'
        option proto 'icmp'
        option src 'wan'
        option src_ip 'fe80::/10'
        option target 'ACCEPT'

config rule 'rule6'
        option family 'ipv6'
        list icmp_type 'echo-request'
        list icmp_type 'echo-reply'
        list icmp_type 'destination-unreachable'
        list icmp_type 'packet-too-big'
        list icmp_type 'time-exceeded'
        list icmp_type 'bad-header'
        list icmp_type 'unknown-header-type'
        list icmp_type 'router-solicitation'
        list icmp_type 'neighbour-solicitation'
        list icmp_type 'router-advertisement'
        list icmp_type 'neighbour-advertisement'
        option limit '1000/sec'
        option name 'Allow-ICMPv6-Input'
        option proto 'icmp'
        option src 'wan'
        option target 'ACCEPT'

config rule 'rule7'
        option dest '*'
        option family 'ipv6'
        list icmp_type 'echo-request'
        list icmp_type 'echo-reply'
        list icmp_type 'destination-unreachable'
        list icmp_type 'packet-too-big'
        list icmp_type 'time-exceeded'
        list icmp_type 'bad-header'
        list icmp_type 'unknown-header-type'
        option limit '1000/sec'
        option name 'Allow-ICMPv6-Forward'
        option proto 'icmp'
        option src 'wan'
        option target 'ACCEPT'

config rule 'rule8'
        option dest 'lan'
        option name 'Allow-IPSec-ESP'
        option proto 'esp'
        option src 'wan'
        option target 'ACCEPT'

config rule 'rule9'
        option dest 'lan'
        option dest_port '500'
        option name 'Allow-ISAKMP'
        option proto 'udp'
        option src 'wan'
        option target 'ACCEPT'

config rule 'rule10'
        option dest_port '80'
        option name 'http mgmt'
        option src 'wan'
        option target 'ACCEPT'

config rule 'rule11'
        option dest_port '22'
        option name 'ssh'
        option src '*'
        option target 'ACCEPT'

config rule 'rule12'
        option dest_port '10050'
        option name 'zabbix'
        list proto 'tcp'
        option src '*'
        option target 'ACCEPT'

config redirect 'redirect1'
        option dest 'lan'
        option dest_ip '192.168.0.1'
        option dest_port '80'
        option family 'ipv4'
        option name 'Tenda'
        list proto 'tcp'
        option src 'lan'
        option src_dip '10.31.252.48'
        option src_dport '8081'
        option target 'DNAT'

config nat 'nat1'
        option dest_ip '192.168.0.1'
        option dest_port '8080'
        option device 'br-lan'
        option name 'access2tenda'
        list proto 'tcp'
        option snat_ip '192.168.0.130'
        option snat_port '80'
        option src 'lan'
        option target 'SNAT'

config zone 'zone3'
        option forward 'REJECT'
        option input 'ACCEPT'
        option name 'VPN'
        list network 'wg0'
        option output 'ACCEPT'

config forwarding 'forwarding2'
        option dest 'lan'
        option src 'VPN'

config redirect 'redirect2'
        option dest 'lan'
        option dest_ip '192.168.0.2'
        option dest_port '80'
        option family 'ipv4'
        option name 'ATA'
        list proto 'tcp'
        option src 'lan'
        option src_dip '10.31.252.48'
        option src_dport '8082'
        option target 'DNAT'



wg show

interface: wg0
  public key: mypubkey
  private key: (hidden)
  listening port: 51820

peer: peerpubkey
  endpoint: thepublicip:51820
  allowed ips: 10.31.252.1/32
  transfer: 0 B received, 9.45 MiB sent
  persistent keepalive: every 1 minute

I have found an action-short-of-a-reboot that gets this working:

rmmod wireguard; modprobe wireguard; service network restart

Without reloading the kernel module, the network restart doesn't do anything.

What is the purpose of your WG setup?
Is it a server so that you can connect from outside to this router?

Or is it a client connected to a server on the internet?

Your setup is indeterminate

I am not sure what the relevance of the question is, but as I said in the first sentence of my first post, the purpose of the wireguard tunnel is to manage the OpenWRT device. You will also have seen some NAT rules; the additional use of those is to access other devices local to the OpenWRT device.

I've written a service to send a ping through the tunnel every 5 or 10 seconds. This seems more robust at keeping intermediate NAT and stateful firewalls open than persistent_keepalive. I'm not sure exactly what persistent_keepalive does. It does force an initial handshake when the interface is brought up, but it isn't clear what it does periodically. I also suggest a shorter period than 60 seconds.

Your setup is indeterminate about what purpose it has: on the one hand it has a listen port, but that port is not open; on the other hand it has an endpoint to connect to a server, but then it only allows one IP to connect back.

So let's rephrase my question: what is on the other side? Can you post a config from the other side?

As mentioned, a keepalive of 60 is way too much; use a max of 30 sec. It sends a packet through the tunnel, but only from the side that has established the connection.

(The problem could well be related to your wan which looks to be a wwan)

Pretty sure the wireguard docs say it sends a packet every X seconds (with X being the value defined in the config). Can't say I've ever checked with tcpdump or similar though to see if it does actually send.

Yes, persistent keepalive does this - I see packets of the telltale size arriving all the time at the server.

These are connected to a mixture of DSL and 4G routers. The 4G ones are behind CGN and have dynamic IPs. The DSL ones have fixed public IPs and a single layer of NAT. And the ones which are least reliable are...the DSL ones! At least the unreliable ones, I can get on to with a port forward to do troubleshooting.
I will try lowering the keepalive, but given that the 60s keepalive packets never get to the far end until after I fix the tunnel, then I am not optimistic that they will with a shorter interval :frowning:

At this point I have watchcat pinging every 30s and Zabbix agent trying to open a TCP connection to the server, probably on a similar schedule. Then coming back in the other direction we have pings and other Zabbix checks from Zabbix.
The tunnel is not going down for a want of traffic.

Using Monit with the following config seems to be doing the job of resuscitating the tunnel when needed.

set daemon  30              # check services at 30 seconds intervals
set log syslog

check host server with address 10.31.252.1
       if failed ping for 20 cycles
          then exec "/bin/ash -c '/sbin/rmmod wireguard ; /sbin/modprobe wireguard ; /sbin/service network restart'"
       if failed ping for 64 cycles
          then exec "/sbin/reboot"

@egc Is there a way for your watchdog script to restart only the Wireguard tunnel?

My Wireguard Tunnel has been down for a day so I thought it was a perfect time to test your script (manually).

I ran the script, it waited 120s as designed, then a further 30 seconds (as configured by me) before attempting the first ping.. It correctly detected that the tunnel was down - but then the internet dropped completely, before restarting everything, including the wan connection..

Ideally I wouldn't want the wan (pppoe) to be restarted; I only want the wg tunnel to be restarted.

Is there a reason for restarting the entire network service and not just the wg tunnel?

If you only restart the WG tunnel the routes are not restarted.
Therefore as default the network is restarted.

If you only restart the same WG tunnel with the same routes a simple ifup of the WG tunnel might work

You can try to comment this line:

( service network restart >/dev/null 2>&1 ) &

and add at that place:

ifup $WG1

to bring the WG1 interface up again

Let me know if that works

Understood. Yes, in my scenario I don't think a restart of the entire service is required.

I have made the change to the script and will test it the next time the tunnel has stopped.

Thanks for your response.

If you want you can test if the script works by triggering a restart as described in the script:

You can test the script by blocking the endpoint address of a tunnel with:
`nft insert rule inet fw4 output ip daddr <ip-endpoint-address> counter reject`
do not forget to reset the firewall (service firewall restart) or remove the rule

To test the ifup command I have added a rule to reject the wg end point IP as suggested and the wg tunnel was restarted without anything else going down.

Log entries of what happened, in case you are interested.

Thu Oct 24 20:03:00 2024 user.notice wireguard-watchdog.sh[7687]: WireGuard watchdog: tunnel wg_taa is DOWN, starting next tunnel
Thu Oct 24 20:03:01 2024 user.notice wireguard-watchdog.sh[7687]: WireGuard watchdog: all tunnels failed, starting over
Thu Oct 24 20:03:01 2024 daemon.notice netifd: Network device 'wg_taa' link is down
Thu Oct 24 20:03:01 2024 daemon.notice netifd: Interface 'wg_taa' is now down
Thu Oct 24 20:03:01 2024 daemon.notice netifd: Interface 'wg_taa' is setting up now
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: reading /tmp/resolv.conf.d/resolv.conf.auto
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using nameserver 192.168.0.1#53
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using nameserver 90.255.255.90#53
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using nameserver 90.255.255.255#53
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for test
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for onion
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for localhost
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for local
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for invalid
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for bind
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for lan
Thu Oct 24 20:03:01 2024 daemon.notice netifd: Interface 'wg_taa' is now up
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: reading /tmp/resolv.conf.d/resolv.conf.auto
Thu Oct 24 20:03:01 2024 daemon.notice netifd: Network device 'wg_taa' link is up
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using nameserver 192.168.0.1#53
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using nameserver 90.255.255.90#53
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using nameserver 90.255.255.255#53
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using nameserver 1.1.1.1#53
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for test
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for onion
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for localhost
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for local
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for invalid
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for bind
Thu Oct 24 20:03:01 2024 daemon.info dnsmasq[1]: using only locally-known addresses for lan
Thu Oct 24 20:03:02 2024 user.notice firewall: Reloading firewall due to ifup of wg_taa (wg_taa)
Thu Oct 24 20:03:02 2024 user.notice pbr [7825]: Reloading routing for 'wg_taa/0.0.0.0' [✓]
Thu Oct 24 20:03:03 2024 user.notice pbr [8535]: Reloading routing for 'wg_taa/10.8.0.5' [✓]
Thu Oct 24 20:03:03 2024 user.notice pbr: Sending reload signal to pbr due to firewall action: includes
Thu Oct 24 20:03:03 2024 user.notice pbr: Reusing the fw4 nft file.
Thu Oct 24 20:03:21 2024 user.notice wireguard-watchdog.sh[7687]: WireGuard watchdog: tunnel wg_taa is enabled
Thu Oct 24 20:03:21 2024 user.notice wireguard-watchdog.sh[7687]: WireGuard watchdog: started, pinging every 60 seconds to ping-host.lan on tunnel wg_taa with endpoint xxxx.org

The firewall reloaded itself once reconnected, so I didn't have to worry about restarting the firewall or removing the rule, as suggested in #8 in your script.

For my own benefit I have also added some code to send me a Pushover notification in the event the tunnel went down, just so I know. :slight_smile:

Thanks for your help.

1 Like

Thanks @egc I will try your script too because I have one proton tunnel that doesn’t come up and (sadly) watchcat isn’t able to fully restart WireGuard-Interfaces.

1 Like