Roaming fails with Zyxel switch

I have a problem with wireless roaming on my network. This is my setup:


(Note: in reality the correct switch ports are connected to the correct device for the VLAN tagging, this is just a demonstration of the topology)

  • All devices run OpenWRT Snapshot.
  • The Redmi AX6000 is the main router, DHCP, DNS etc.
  • There are 3 VLANs which appear to work correctly.
  • The AX6S's are set up as dumb APs with a separate SSID for each VLAN. They are a DHCP client on the management VLAN and unmanaged on the other VLANs. The WAN port is set up as the VLAN trunk uplink to the switch (appropriately tagged in all VLANs). 802.11r/k/v is set up and appears to be working correctly.

Configs

/etc/config/wireless

config wifi-device 'radio0'
        option type 'mac80211'
        option path 'Device path is different for each device model'
        option channel '1' # 1,6,11 on each AP
        option band '2g'
        option htmode 'HT20'
        option country 'GB'
        option cell_density '0'

config wifi-device 'radio1'
        option type 'mac80211'
        option path 'Device path is different for each device model'
        option channel 'auto'
        option band '5g'
        option htmode 'HE80'
        option country 'GB'
        option cell_density '2'
        option he_su_beamformee '1'
        option he_bss_color '36'  # Different on each AP

config wifi-iface 'wifinet0'
        option device 'radio0'
        option mode 'ap'
        option ssid 'SSID1'
        option encryption 'psk2'
        option key 'REDACTED'
        option network 'home'
        option ieee80211r '1'
        option mobility_domain 'b724'
        option reassociation_deadline '20000'
        option ft_over_ds '0'
        option ft_psk_generate_local '1'
        option ieee80211k '1'
        option time_advertisement '2'
        option time_zone 'GMT0BST,M3.5.0/1,M10.5.0'
        option wnm_sleep_mode '1'
        option bss_transition '1'

config wifi-iface 'wifinet1'
        option device 'radio0'
        option mode 'ap'
        option ssid 'SSID2'
        option encryption 'psk2'
        option key 'REDACTED'
        option ieee80211r '1'
        option mobility_domain '130b'
        option reassociation_deadline '20000'
        option ft_over_ds '0'
        option ft_psk_generate_local '1'
        option ieee80211k '1'
        option time_advertisement '2'
        option time_zone 'GMT0BST,M3.5.0/1,M10.5.0'
        option wnm_sleep_mode '1'
        option bss_transition '1'
        option network 'admin'
        option hidden '1'

config wifi-iface 'wifinet2'
        option device 'radio0'
        option mode 'ap'
        option ssid 'SSID3'
        option encryption 'psk2'
        option key 'REDACTED'
        option ieee80211r '1'
        option mobility_domain 'd4ec'
        option reassociation_deadline '20000'
        option ft_over_ds '0'
        option ft_psk_generate_local '1'
        option ieee80211k '1'
        option time_advertisement '2'
        option time_zone 'GMT0BST,M3.5.0/1,M10.5.0'
        option wnm_sleep_mode '1'
        option bss_transition '1'
        option network 'corporate'

# Repeat for 5G band...

/etc/config/network (on AX6000)

config interface 'loopback'
        option device 'lo'
        option proto 'static'
        option ipaddr '127.0.0.1'
        option netmask '255.0.0.0'

config globals 'globals'
        option ula_prefix 'REDACTED'
        option packet_steering '1'

config device
        option name 'wan'
        option macaddr 'REDACTED'

config interface 'wan'
        option device 'wan'
        option proto 'pppoe'
        option username 'REDACTED'
        option password 'REDACTED'
        option ipv6 'auto'

config device
        option type 'bridge'
        option name 'br-eth'
        list ports 'lan2'
        list ports 'lan3'
        list ports 'lan4'

config bridge-vlan
        option device 'br-eth'
        option vlan '1'
        list ports 'lan2:u*'
        list ports 'lan3:u*'
        list ports 'lan4:u*'

config bridge-vlan
        option device 'br-eth'
        option vlan '2'
        list ports 'lan2:t'
        list ports 'lan3:t'
        list ports 'lan4:t'

config bridge-vlan
        option device 'br-eth'
        option vlan '3'
        list ports 'lan2:t'
        list ports 'lan3:t'
        list ports 'lan4:t'

config device
        option type 'bridge'
        option name 'br-admin'
        list ports 'br-eth.1'

config device
        option type 'bridge'
        option name 'br-home'
        list ports 'br-eth.2'

config device
        option type 'bridge'
        option name 'br-corporate'
        list ports 'br-eth.3'

config interface 'admin'
        option proto 'static'
        option device 'br-admin'
        option ipaddr '10.1.1.1'
        option netmask '255.255.255.0'
        option ip6assign '60'

config interface 'home'
        option proto 'static'
        option device 'br-home'
        option ipaddr '10.2.1.1'
        option netmask '255.255.255.0'
        option ip6assign '60'

config interface 'corporate'
        option proto 'static'
        option device 'br-corporate'
        option ipaddr '10.3.1.1'
        option netmask '255.255.255.0'
        option ip6assign '60'

/etc/config/network (on switch)

config interface 'loopback'
        option device 'lo'
        option proto 'static'
        option ipaddr '127.0.0.1'
        option netmask '255.0.0.0'

config globals 'globals'
        option ula_prefix 'REDACTED'

config device 'switch'
        option name 'switch'
        option type 'bridge'
        option macaddr 'REDACTED'
        list ports 'lan1'
        list ports 'lan2'
        list ports 'lan3'
        list ports 'lan4'
        list ports 'lan5'
        list ports 'lan6'
        list ports 'lan7'
        list ports 'lan8'
        list ports 'lan9'
        list ports 'lan10'
        list ports 'lan11'
        list ports 'lan12'
        list ports 'lan13'
        list ports 'lan14'
        list ports 'lan15'
        list ports 'lan16'
        list ports 'lan17'
        list ports 'lan18'
        list ports 'lan19'
        list ports 'lan20'
        list ports 'lan21'
        list ports 'lan22'
        list ports 'lan23'
        list ports 'lan24'

config interface 'admin'
        option proto 'dhcp'
        option device 'br-admin'

config interface 'home'
        option proto 'none'
        option device 'br-home'

config interface 'corporate'
        option proto 'none'
        option device 'br-corporate'

config bridge-vlan
        option device 'switch'
        option vlan '1'
        list ports 'lan3:u*'
        list ports 'lan10:u*'
        list ports 'lan12:u*'
        list ports 'lan14:u*'
        list ports 'lan15:u*'
        list ports 'lan16:u*'
        list ports 'lan17:u*'
        list ports 'lan18:u*'
        list ports 'lan20:u*'
        list ports 'lan22:u*'
        list ports 'lan23:u*'
        list ports 'lan24:u*'

config bridge-vlan
        option device 'switch'
        option vlan '2'
        list ports 'lan1:u*'
        list ports 'lan2:u*'
        list ports 'lan3:t'
        list ports 'lan4:u*'
        list ports 'lan5:u*'
        list ports 'lan6:u*'
        list ports 'lan7:u*'
        list ports 'lan8:u*'
        list ports 'lan10:t'
        list ports 'lan11:u*'
        list ports 'lan12:t'
        list ports 'lan19:u*'
        list ports 'lan21:u*'
        list ports 'lan23:t'

config bridge-vlan
        option device 'switch'
        option vlan '3'
        list ports 'lan3:t'
        list ports 'lan9:u*'
        list ports 'lan12:t'
        list ports 'lan13:u*'
        list ports 'lan23:t'

config device
        option type 'bridge'
        option name 'br-home'
        list ports 'switch.2'

config device
        option type 'bridge'
        option name 'br-corporate'
        list ports 'switch.3'

config device
        option type 'bridge'
        option name 'br-admin'
        list ports 'switch.1'

/etc/config/network (on APs)

config interface 'loopback'
        option device 'lo'
        option proto 'static'
        option ipaddr '127.0.0.1'
        option netmask '255.0.0.0'

config globals 'globals'
        option ula_prefix 'REDACTED'

config device
        option name 'wan'
        option macaddr 'REDACTED'

config device
        option type 'bridge'
        option name 'br-eth'
        list ports 'lan1'
        list ports 'lan2'
        list ports 'lan3'
        list ports 'wan'

config bridge-vlan
        option device 'br-eth'
        option vlan '1'
        list ports 'wan:u*'

config bridge-vlan
        option device 'br-eth'
        option vlan '2'
        list ports 'lan1:u*'
        list ports 'lan2:u*'
        list ports 'lan3:u*'
        list ports 'wan:t'

config bridge-vlan
        option device 'br-eth'
        option vlan '3'
        list ports 'wan:t'

config device
        option type 'bridge'
        option name 'br-admin'
        list ports 'br-eth.1'

config device
        option type 'bridge'
        option name 'br-home'
        list ports 'br-eth.2'
        option ageing_time '10'

config device
        option type 'bridge'
        option name 'br-corporate'
        list ports 'br-eth.3'

config interface 'admin'
        option proto 'dhcp'
        option device 'br-admin'

config interface 'home'
        option proto 'none'
        option device 'br-home'

config interface 'corporate'
        option proto 'none'
        option device 'br-corporate'

Observations
The following tests were completed on an iPhone 13 with the wireless diagnostics profile installed to monitor the current BSSID.

When a wireless client roams from the router to either AP, the roaming initially succeeds and the client reports the BSSID of the new AP, but after a few seconds the client loses its IP address and configures a self-assigned IP (169.254...); the phone reverts to mobile data but stays associated with the AP. The same happens with my 2021 MacBook Pro: roaming succeeds, the device associates with the new BSSID, loses its IP address, self-assigns an IP, and shows a "!" over the Wi-Fi icon in the menu bar.

When a client (now in this self assigned state) roams back to the BSSID of the router, it obtains a new IP address and all is back to normal. When a client connects directly to either dumb AP after being disconnected from the network for a while, it obtains the relevant IP address for the VLAN successfully and all is normal (until it roams to the router and back, or to the other AP). Any client connecting directly to the router always obtains an IP address. This problem occurs on all VLANs/SSIDs.

What I think is happening
From what I've observed, I think the Zyxel switch is causing the problem. Let me explain...

When a client roams from router to AP, its MAC address appears on a new port on the Zyxel switch (the port that the AP is connected to). The MAC address isn't getting learnt on the new port by the switch and it continues to forward traffic to the port the client was seen on before. Therefore traffic doesn't reach the client in its new location on the network and it loses its IP. When a client is disconnected for a while, its entry in the switch MAC table expires and when it reconnects to the AP, its MAC is learnt on the new port and DHCP succeeds. To test this theory, I connected the APs directly to the AX6000, bypassing the switch:

Roaming succeeded!!

The Zyxel switch was running the OEM firmware, so I switched to OpenWRT to see if it would help, but it didn't. At least I now have the configuration options available in OpenWRT to try to fix this.

So, how can I make the switch immediately follow the client across ports when roaming occurs?

My two cents...

Do not mix tagged with untagged traffic on the same port. Dedicate one port on the main router for the trunk, and tag all traffic on that port. All other ports on the main router should only carry untagged traffic.

I cannot guarantee any of this is related to your issue, but I think it's better to start with a "clean" config.

Thanks for your feedback. I set it up this way so that any client connecting to a trunk port without VLANs configured would fall back to the untagged admin VLAN. I have now configured all trunk ports between routers/APs/switches to be fully tagged with no untagged VLANs; only client ports are now untagged, with a primary VLAN set. Whilst this has not resolved the issue, I will keep it this way for simplicity.

This problem seems related to the threads "Bridged wifi ap, DHCP Offers only reaching clients after 5 min period" and "Disappearing DHCP Offers & bridge ageing_time". After setting the ageing time of all bridge devices in LuCI on all network hardware to 10 instead of the default 30, the problem is less noticeable (but still there). Devices roam over and get an IP address within a couple of seconds, and sometimes the transition is seamless. So whilst this is not solved, there is a kind of workaround. I'm still looking for a more permanent solution.