Strange behaviour towards stp in my setup

Hello!,

This surely will not be an easy task to solve, but I noticed very odd behaviour with STP, mainly with daisy-chained network switches. This makes it especially difficult to figure out, since it is a complex layer 2 issue somewhere on my network.

So first things first my topology:

To my issue:

When I download at my highest available rate on my PC via the lancache on Proxmox, with STP on, my whole network goes down. I'm able to log in via Wi-Fi, but it seems the router fails to make routes to the internet.

A few updates ago I actually got better output in the log, which told me that port 2 is sending traffic with its own source address. This is very strange, since none of these VLANs are connected to this Zyxel GS1900-8HP switch, except for the managed VLAN used to maintain the switches on their IP.

When I disable STP on br-lan it is magically fixed. I do know there are incompatibility issues between different versions of STP, so I tried to streamline this as much as possible, but I still managed to cause a full network outage.

But what I can't get my head around is how port 2 can be affected when the heavy traffic doesn't go to port 2 of the Flint 2.

Is this a bug or a misconfiguration?

here is the configuration of /etc/config/network:

config interface 'loopback'
        option device 'lo'
        option proto 'static'
        option ipaddr '127.0.0.1'
        option netmask '255.0.0.0'

config globals 'globals'
        option packet_steering '2'
        option ula_prefix 'skip'

config device
        option name 'br-lan'
        option type 'bridge'
        option ipv6 '0'
        option multicast '1'
        option bridge_empty '1'
        list ports 'lan1'
        list ports 'lan2'
        list ports 'lan3'
        list ports 'lan4'
        list ports 'lan5'
        option igmp_snooping '1'

config interface 'lan'
        option device 'br-lan.169'
        option proto 'static'
        option ipaddr '10.234.53.1'
        option netmask '255.255.255.0'
        option delegate '0'

config interface 'wan'
        option proto 'dhcp'
        option device 'eth1.300'
        option classlessroute '0'
        option keepalive '0 1'
        option delegate '0'
        option force_link '1'

config bridge-vlan
        option device 'br-lan'
        option vlan '169'
        list ports 'lan1:u*'
        list ports 'lan4:u*'
        list ports 'lan5:u*'

config bridge-vlan
        option device 'br-lan'
        option vlan '49'
        list ports 'lan1:t'
        list ports 'lan4:t'
        list ports 'lan5:t'

config bridge-vlan
        option device 'br-lan'
        option vlan '53'
        list ports 'lan1:t'
        list ports 'lan4:t'
        list ports 'lan5:t'

config bridge-vlan
        option device 'br-lan'
        option vlan '89'
        list ports 'lan1:t'
        list ports 'lan4:t'
        list ports 'lan5:t'

config interface 'pcnet'
        option proto 'static'
        option device 'br-lan.49'
        option ipaddr '10.34.79.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option delegate '0'

config interface 'wlan0'
        option proto 'static'
        option device 'br-lan.50'
        option ipaddr '10.234.80.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option delegate '0'

config interface 'wlan1'
        option proto 'static'
        option ipaddr '10.234.81.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option device 'br-lan.51'
        option delegate '0'

config interface 'iot'
        option proto 'static'
        option device 'br-lan.52'
        option ipaddr '10.33.77.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option delegate '0'

config interface 'tvnet'
        option proto 'static'
        option device 'br-lan.53'
        option ipaddr '172.22.33.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option delegate '0'

config interface 'ps5'
        option proto 'static'
        option device 'br-lan.89'
        option ipaddr '10.56.2.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option delegate '0'

config interface 'ayaneo'
        option proto 'static'
        option device 'br-lan.90'
        option ipaddr '10.87.32.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option delegate '0'

config interface 'aqaranet'
        option proto 'static'
        option device 'br-lan.178'
        option ipaddr '10.233.10.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        list ip6class 'local'
        option ip6assign '64'
        option ip6ifaceid '::a1'
        option delegate '0'

config interface 'hwnet'
        option proto 'static'
        option device 'br-lan.179'
        option ipaddr '10.182.32.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option delegate '0'

config interface 'wifivpn'
        option proto 'wireguard'
        option private_key 'skip'
        option listen_port '51820'
        option defaultroute '0'
        option multicast '1'
        list addresses '10.39.95.1/24'
        option delegate '0'
        option force_link '1'

config interface 'wgclient'
        option proto 'wireguard'
        option private_key 'skip'
        option defaultroute '0'
        option force_link '1'
        option mtu '1420'
        option delegate '0'
        list addresses '10.64.132.53/32'

config bridge-vlan
        option device 'br-lan'
        option vlan '23'
        list ports 'lan1:t'
        list ports 'lan5:t'

config interface 'tvboxnet'
        option proto 'static'
        option device 'br-lan.23'
        option ipaddr '192.168.59.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option delegate '0'

config interface 'wgserver'
        option proto 'wireguard'
        option private_key 'skip'
        option listen_port '4443'
        option force_link '1'
        option defaultroute '0'
        option delegate '0'
        list addresses '10.6.7.1/24'

config bridge-vlan
        option device 'br-lan'
        option vlan '90'
        list ports 'lan1:t'
        list ports 'lan4:t'
        list ports 'lan5:t'

config bridge-vlan
        option device 'br-lan'
        option vlan '52'
        list ports 'lan1:t'
        list ports 'lan4:t'
        list ports 'lan5:t'

config bridge-vlan
        option device 'br-lan'
        option vlan '178'
        list ports 'lan1:t'
        list ports 'lan4:t'
        list ports 'lan5:t'

config bridge-vlan
        option device 'br-lan'
        option vlan '50'
        list ports 'lan1:t'
        list ports 'lan4:t'
        list ports 'lan5:t'

config bridge-vlan
        option device 'br-lan'
        option vlan '51'
        list ports 'lan1:t'
        list ports 'lan4:t'
        list ports 'lan5:t'

config bridge-vlan
        option device 'br-lan'
        option vlan '179'
        list ports 'lan1:t'
        list ports 'lan4:t'
        list ports 'lan5:t'

config route
        option interface 'wifivpn'
        option target '224.0.0.0/8'
        option type 'multicast'
        option table 'main'

config interface 'vx0'
        option proto 'vxlan'
        option peeraddr '10.6.7.2'
        option defaultroute '0'
        option delegate '0'
        option vid '4921'
        option rxcsum '0'
        option txcsum '0'
        option force_link '1'
        option ipaddr '10.6.7.1'
        option tunlink 'wgserver'

config rule
        option action 'blackhole'
        option dest '8.8.8.8/32'
        option lookup 'main'

config rule
        option action 'blackhole'
        option dest '1.1.1.1/32'
        option lookup 'main'

config rule
        option action 'blackhole'
        option dest '8.8.4.4/32'
        option lookup 'main'

config rule
        option action 'blackhole'
        option dest '1.1.1.2/32'
        option lookup 'main'

config rule
        option action 'blackhole'
        option dest '1.1.1.3/32'
        option lookup 'main'

config bridge-vlan
        option device 'br-lan'
        option vlan '180'
        list ports 'lan1:t'
        list ports 'lan2:u*'
        list ports 'lan3:u*'
        list ports 'lan4'
        list ports 'lan5:t'

config interface 'kvmnet'
        option proto 'static'
        option device 'br-lan.180'
        option ipaddr '10.46.214.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option delegate '0'

config bridge-vlan
        option device 'br-lan'
        option vlan '62'
        list ports 'lan1:t'
        list ports 'lan4:t'
        list ports 'lan5:t'

config bridge-vlan
        option device 'br-lan'
        option vlan '70'
        list ports 'lan1:t'
        list ports 'lan4:t'
        list ports 'lan5:t'

config interface 'proxmox'
        option proto 'static'
        option device 'br-lan.70'
        option ipaddr '10.244.244.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option delegate '0'

config bridge-vlan
        option device 'br-lan'
        option vlan '71'
        list ports 'lan1:t'
        list ports 'lan4:t'
        list ports 'lan5:t'

config interface 'proxmox_vpn'
        option proto 'static'
        option device 'br-lan.71'
        option ipaddr '10.245.245.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option delegate '0'

config device
        option name 'wifivpn'
        option multicast '1'
        option acceptlocal '1'
        option igmpversion '3'


config route
        option interface 'wgclient'
        option target '10.64.0.1/32'
        option table 'main'

config interface 'vx1'
        option proto 'vxlan'
        option force_link '1'
        option peeraddr '10.6.7.3'
        option ipaddr '10.6.7.1'
        option vid '4922'
        option tunlink 'wgserver'
        option rxcsum '0'
        option txcsum '0'
        option defaultroute '0'
        option delegate '0'

config interface 'beta'
        option proto 'static'
        option device 'br-lan.62'
        option ipaddr '192.168.85.1'
        option netmask '255.255.255.0'
        option defaultroute '0'
        option delegate '0'

config interface 'sma'
        option proto 'static'
        option device 'br-lan.133'
        option defaultroute '0'
        option delegate '0'
        option ipaddr '10.178.178.1'
        option netmask '255.255.255.0'

config bridge-vlan
        option device 'br-lan'
        option vlan '133'
        list ports 'lan1:t'

config interface 'wgclient_yt'
        option proto 'wireguard'
        option private_key 'skip'
        option defaultroute '0'
        option delegate '0'
        list dns '10.64.0.1'
        list addresses '10.64.132.53/32'

config interface 'wgclient2'
        option proto 'wireguard'
        option private_key 'skip'
        list addresses '10.64.132.53/32'
        option defaultroute '0'

config interface 'wgclient3'
        option proto 'wireguard'
        option private_key 'skip'
        list addresses '10.64.132.53/32'
        option defaultroute '0'
        option delegate '0'

ubus system board (it is a custom build but afaik I haven't touched dnsmasq or netifd related things, checked out from commit: 5a42ed113):

root@X-SDK-Central:~# ubus call system board
{
        "kernel": "6.12.51",
        "hostname": "X-SDK-Central",
        "system": "ARMv8 Processor rev 4",
        "model": "GL.iNet GL-MT6000",
        "board_name": "glinet,gl-mt6000",
        "rootfs_type": "squashfs",
        "release": {
                "distribution": "OpenWrt",
                "version": "SNAPSHOT",
                "firmware_url": "https://downloads.openwrt.org/",
                "revision": "r0+31730-9453001379",
                "target": "mediatek/filogic",
                "description": "OpenWrt SNAPSHOT r0+31730-9453001379",
                "builddate": "1760537923"
        }
}

my zyxel gs1900-8hp has vlans this way managed:

managed vlan:

and a typical untag/tag scenario:

Since I let my switches listen on their own PVID (often untagged to 1) and the Flint 2 uses 169 for the native VLAN, all network switches are reachable in my LAN network, which I use for management. But could a network switch flag wrong traffic if it tags port 1 back? Do these switches still need to untag 169 on the way back so that the frame does not cease to exist?

The behaviour is so strange, and it is no fun when I download from my PC and everything goes down :slight_smile:. It is also not possible for me to restore the network: after restarting the network it acts like there is no internet, even though there is a route...

What I do see occasionally is that br-lan detected a topology change; this seems to point to some kind of misconfiguration somewhere.

Only a full router restart fixes it. Is there a reason not to use STP on bridge br-lan?

if there are special debug commands I can use let me know :slight_smile:

p.s:
I also host my UniFi controller on the NUC and made it listen on the interface of VLAN 169. I don't think that would be the issue, but I'm mentioning it just in case.

The only place where you can cross the cables is inside Proxmox. Check that bridges and ports are in the right places, or enable STP on their bridges; just note that it will kick out half of the loop.

1 Like

I'm going to look into this :+1:

Edit

After testing a little more, I changed Proxmox to use only vmbr0 instead of having both a DHCP vmbr0 and VLAN DHCP interfaces; I think that caused duplicates, as I misunderstood them :slight_smile:

vmbr0 alone already has these vlans, because the covered devices in vmbr0 are used as wan ones in my setup.

1 Like

Update: it was still happening, and again on lan2, which only has an untagged JetKVM on VLAN 180 on the Flint 2. I moved the Zyxel GS1900-8HP to my UniFi Flex switch, so this one is now on lan1 too.

My guess is that hardware offloading creates very unexpected behaviour when multiple links are very saturated:

Moonlight stream from tvnet to pcnet upstairs, pc downloading steam games where a lancache instance works as a reverse proxy.

^ here is only eth1, and lan1 affected, but the issue occurs on lan2.

I understand that if there is too much traffic a part of the LAN will go down, basically lan1, but when the issue occurs, renewing the DHCP lease on WAN results in no route, and it stays broken until a restart.

I guess hardware offloading creates some sort of glitch, which is a corner case. With it disabled I don't see the issue appear, fingers crossed.

Yes I would also disable offload.
And did you configure STP explicitly everywhere?
The STP root should also sit at the "topmost" switch.

1 Like

Yup, I have enabled it on all switches, all set to STP, and made sure none were on RSTP (the UniFi default); it's also configured on br-lan.

My suspicion for now is that the offloading is triggering a false STP loop. The own-source MAC address on lan2 comes from the masqueraded 0.0.0.0 interface; when I looked with tcpdump at 10.46.214.1 I only saw some ARP occasionally, so there is no bad traffic going on.

Must WAN masquerading be off?

I would inspect the stp issue first.
OK, you have enabled it, but did you set the bridge priority everywhere and ensure that the root can only be at the root?

Regarding masq. Only the edge router which connects wan with lan should do any masquerade.

1 Like

It seems I had not set a priority, but I have tested it with a priority set and it still occurs. When I change to software offloading I don't see it happening; it seems to happen purely with hardware offloading :+1:

These messages I see with hardware offloading are:

received packet on lan2 with own address as source address (addr:7a:9b:e8:xx:xx:xx, vlan:180)

HW offloading forwards packets outside the host CPU, without regard to STP (or ARP, changed-route, or LACP balancing) status; that is a documented limitation. Use SW offload if you really are short on host CPU power.

1 Like

This topic was automatically closed 10 days after the last reply. New replies are no longer allowed.