Unable to get IP via dynamic VLAN on non-default networks

I'm trying to add dynamic VLANs to an SSID with RADIUS assignment. For the most part everything seems to be working, except that devices connecting via wireless on any VLAN other than the default (VLAN1) are not able to get IP addresses.

I am able to use any VLAN assigned to a physical port and it works fine. Assignments to VLAN1 work with no problems on the wireless network.
If I manually assign the VLAN's network interface to the wireless SSID, all VLANs can work.

My environment is as follows:
Xiaomi AX3600 running OpenWrt SNAPSHOT r28228+8-207bfee855
Code includes two modifications from master: 1) PR 16070 is brought in for Linksys MX4300 support. 2) ath11k is patched to support AP-VLAN

built with full wpad-openssl

Device is a dumb AP connected to a UniFi UXG-Max gateway.
This device is the DHCP server for VLANs 10, 20, 40, and 50
Windows 2019 DHCP server sits on the other side of this gateway running DHCP for VLAN1

RADIUS is running locally since the one built into UniFi won't let me default unknown clients to authenticate with a VLAN1 assignment. In the system logs I can see the authentications succeeding and the VLAN assignments being applied appropriately. If I remove the hostapd VLAN file, I can see the VLANs being auto-applied to the wlan interface as clients attempt to connect.

In these configs, I have 3 SSIDs set up across two different radios for the sake of testing. This includes a duplicate dynamic-VLAN configuration so I can test on both ath10k and ath11k (since ath11k is using an unofficial patch), and a 2nd SSID on the ath11k 5 GHz radio without dynamic VLAN, tied directly to one of the afflicted VLANs (this one works).

OpenWrt-10:~# cat /etc/config/network

config interface 'loopback'
        option device 'lo'
        option proto 'static'
        option ipaddr '127.0.0.1'
        option netmask '255.0.0.0'

config globals 'globals'
        option ula_prefix 'fd1f:1574:f95d::/48'

config interface 'lan'
        option device 'br-vlan1'
        option proto 'none'

config interface 'mgmt'
        option proto 'static'
        option device 'br-vlan10'
        option ipaddr '192.168.10.10'
        option netmask '255.255.255.0'
        option gateway '192.168.10.1'
        list dns '192.168.10.1'

config device
        option type 'bridge'
        option name 'br-vlan1'
        option igmp_snooping '1'
        list ports 'lan1'
        list ports 'lan3'
        list ports 'wan'

config device
        option type 'bridge'
        option name 'br-vlan10'
        option igmp_snooping '1'
        list ports 'wan.10'

config interface 'kids'
        option proto 'none'
        option device 'br-vlan20'

config interface 'Google'
        option proto 'none'
        option device 'br-vlan40'

config interface 'iot'
        option proto 'none'
        option device 'br-vlan50'

config device
        option type '8021q'
        option ifname 'wan'
        option vid '1'
        option name 'wan.1'

config device
        option type '8021q'
        option ifname 'wan'
        option vid '20'
        option name 'wan.20'

config device
        option type '8021q'
        option ifname 'wan'
        option vid '40'
        option name 'wan.40'

config device
        option type '8021q'
        option ifname 'wan'
        option vid '50'
        option name 'wan.50'

config device
        option type '8021q'
        option ifname 'wan'
        option vid '10'
        option name 'wan.10'

config device
        option type 'bridge'
        option name 'br-vlan20'
        option igmp_snooping '1'
        list ports 'lan2'
        list ports 'wan.20'

config device
        option type 'bridge'
        option name 'br-vlan40'
        list ports 'wan.40'
        option igmp_snooping '1'

config device
        option type 'bridge'
        option name 'br-vlan50'
        list ports 'wan.50'
        option igmp_snooping '1'

OpenWrt-10:~# cat /etc/config/wireless

config wifi-device 'radio0'
        option type 'mac80211'
        option path 'soc@0/20000000.pci/pci0000:00/0000:00:00.0/0000:01:00.0'
        option band '5g'
        option channel '36'
        option htmode 'VHT80'
        option country 'US'
        option cell_density '0'

config wifi-device 'radio1'
        option type 'mac80211'
        option path 'platform/soc@0/c000000.wifi'
        option band '5g'
        option channel '36'
        option htmode 'HE80'
        option country 'US'
        option cell_density '0'

config wifi-device 'radio2'
        option type 'mac80211'
        option path 'platform/soc@0/c000000.wifi+1'
        option band '2g'
        option channel '1'
        option htmode 'HE20'
        option disabled '1'

config wifi-iface 'wifinet0'
        option device 'radio1'
        option ifname 'wlan0'
        option mode 'ap'
        option ssid 'test1'
        option encryption 'psk2'
        option key '11111111'
        option auth_server '192.168.10.10'
        option auth_secret 'wowsecret'
        option vlan_file '/etc/config/hostapd0.vlan'
        option dynamic_vlan '2'
        option vlan_naming '0'
        option vlan_bridge 'br-vlan'

config wifi-iface 'wifinet1'
        option device 'radio1'
        option mode 'ap'
        option ssid 'test2'
        option encryption 'psk2'
        option key '11111111'
        option network 'kids'

config wifi-iface 'wifinet2'
        option device 'radio0'
        option mode 'ap'
        option ssid 'test_9887'
        option encryption 'psk2'
        option key '11111111'
        option ifname 'wlan1'
        option auth_server '192.168.10.10'
        option auth_secret 'wowsecret'
        option vlan_file '/etc/config/hostapd1.vlan'
        option dynamic_vlan '2'
        option vlan_naming '0'
        option vlan_bridge 'br-vlan'
OpenWrt-10:~# cat /etc/config/hostapd0.vlan
1 wlan0.1 br-vlan1
20 wlan0.20 br-vlan20
40 wlan0.40 br-vlan40
50 wlan0.50 br-vlan50
10 wlan0.10 br-vlan10
OpenWrt-10:~# cat /etc/config/hostapd1.vlan
1 wlan1.1 br-vlan1
20 wlan1.20 br-vlan20
40 wlan1.40 br-vlan40
50 wlan1.50 br-vlan50
10 wlan1.10 br-vlan10
OpenWrt-10:~# brctl show
bridge name     bridge id               STP enabled     interfaces
br-vlan20               7fff.88c397c155eb       no              wlan0.20
                                                        wan.20
                                                        phy1-ap0
                                                        lan2
                                                        wlan1.20
br-vlan10               7fff.88c397159787       no              wlan0.10
                                                        wan.10
                                                        wlan1.10
br-vlan1                7fff.88c397c155eb       no              wlan1.1
                                                        wan
                                                        wlan0.1
                                                        lan3
                                                        lan1
br-vlan50               7fff.88c397159787       no              wan.50
                                                        wlan1.50
                                                        wlan0.50
br-vlan40               7fff.88c397159787       no              wan.40
                                                        wlan1.40
                                                        wlan0.40

Any education here would be appreciated!

Did you create multiple instances for dnsmasq?

I have had multiple dnsmasq instances since that feature was introduced, but I found that it interferes with DHCP: each dnsmasq instance is listening on (bound to) 0.0.0.0:67. The DHCP part cannot bind to a single interface the way the DNS part of dnsmasq does. Dnsmasq uses a socket option called "SO_REUSEPORT" for this.

https://lists.thekelleys.org.uk/pipermail/dnsmasq-discuss/2017q4/011844.html

IMO this is a design flaw, and if I understand the discussion correctly, it is also kernel related:

I think so as well. It may create issues when someone runs 2 dnsmasq
processes with different configurations (for different interfaces for
example - 1 dnsmasq process per interface). Kernel may "balance" incoming
UDP packets to another dnsmasq instance (which AFAIR will just ignore it),
so there will be more retries in general for DHCP traffic.

--> So it is like gambling to get the correct IP address. I have already seen devices coming up with 0.0.0.0 for the gateway. Maybe this issue is coupled with VLANs.

But it works if you run just one instance of dnsmasq.

1 Like

Since it's a dumb ap, I typically have had dnsmasq disabled entirely. That's part of why I've decided to look deeper on the UniFi side because I think something is wrong there.

If I enable dnsmasq and set it up for the non-functioning interface, it actually does work but it of course then isn't using the settings and reservations defined in the real dhcp server or using my pi-hole. More recently, I've gotten everything to work completely for a small amount of time, only to end up totally locked out of the AP an hour later. Right now for instance, I'm able to connect to WiFi on any VLAN or plug in to the physical ports and have correctly assigned addresses, but I can't access the AP at all (though it was working last night)

Unfortunately, I'm finding that either UniFi's logging leaves a lot to be desired (their early-access packet captures are returning completely blank output files), or I've hit a significant skill issue on that end of it.

This is an interesting setup insofar as we are learning about more variables and systems. Since you have Unifi + PiHole + OpenWrt APs, let's start by getting an idea of the network topology... can you create a basic diagram that shows all the critical infrastructure, labeled with the IP addresses and the brand+model of each component? A photo of a sketch on paper is sufficient.

The Windows DHCP Server services 192.168.0.0/23
Most clients are assigned 192.168.1.x addresses
Eventually VLANs will function as:
1 - Default LAN (this is hardcoded in UniFi)
10 - APs and switches
20 - Child operated devices (isolated, content filtered internet)
40 - Google devices (Multicast to VLAN1/40 only, DNS masquerading, internet)
50 - IoT Devices (isolated, internet with heavy pi-hole restrictions)

Ok... so there's a lot more going on here than would have been apparent at the outset.

FWIW, I don't think that the flex mini can provide any packet captures, and the rest of the infrastructure obviously cannot feed into the Unifi statistics and monitoring within the UNA. So, I think it's unlikely that you'd get anything on the lan side unless you have a regular Unifi switch (i.e. not the flex mini series) and/or APs.

Meanwhile, your bridges look entirely wrong, so I think you need to refactor everything. My recommendation is to reset to defaults, then configure with separate SSIDs per VLAN. Once that is working, you know that you've got the basics set up correctly. Then, you can reconfigure just the wifi-related stuff (leaving the /etc/config/network file alone) in order to set up the dynamic VLAN functionality. I can help you with the former, but I've never set up the single SSID/multiple password dynamic VLAN stuff.

I agree. :smiley: I didn't expect such a heavy setup. I thought it was only VLAN related because of the discussion we had in the NSS thread.
But this setup is full of possible errors. It is very easy to overlook a tiny detail that keeps it from working.

I think (@tardeaux has to answer) he is running on NSS where the vlan setup is a bit different (https://github.com/qosmio/openwrt-ipq/blob/24.10-nss/nss-setup/example/04-vlans). But generally NSS vlan setup should work on vanilla OpenWrt also (at least in my experience it does work). I assume firewall filtering is off as it is a dumb AP setup. But maybe I'm wrong here regarding the interoperability (if he is running vanilla).

To find the error, making the setup easier to follow is probably the only way.

I should clarify that they are set up for NSS compatibility. NSS doesn't support VLAN filtering at all and there is hope of eventually moving the test router to that fork. For now it is running a very recent snapshot build from the main repo.

That said, I'm not opposed to switching to vlan filtered setup for now, as yesterday I tested the NSS version and think the dyn vlan support there is broken (the driver returns mac80211 faults on any wireless connection after radius kicks in), so dynamic vlan with NSS might be a pipe dream for now.

The wireless network that sits on the Flex Mini is the live network with no VLANs at all. The test AP (represented by the biohazard symbol) is directly attached to the UXG-Max gateway (which has pcap) directly alongside the VM host.

I have one guess regarding the config. It has been a while, but I remember that the option dynamic_vlan '2' was problematic in combination with FreeRADIUS. What is in these files:

/tmp/run/hostapd-phy1.conf
/tmp/run/hostapd-phy0.conf

Maybe some options are not recognized/used correctly (bugged)?