Bonding changing to "balance-rr" when configured for "active-backup"

Hello,

I have a bonding interface configured with 2 x 1Gbps interfaces in "active-backup" mode using the "luci-proto-bonding" package. For whatever reason it will occasionally switch over to "balance-rr" and start freaking out my switches.

image

ip -d -j link show bond-lan | jq gives me:

[
  {
    "ifindex": 9,
    "ifname": "bond-lan",
    "flags": [
      "BROADCAST",
      "MULTICAST",
      "MASTER",
      "UP",
      "LOWER_UP"
    ],
    "mtu": 1500,
    "qdisc": "noqueue",
    "master": "br0",
    "operstate": "UP",
    "linkmode": "DEFAULT",
    "group": "default",
    "txqlen": 1000,
    "link_type": "ether",
    "address": "52:54:00:bb:d2:93",
    "broadcast": "ff:ff:ff:ff:ff:ff",
    "promiscuity": 1,
    "min_mtu": 68,
    "max_mtu": 65535,
    "linkinfo": {
      "info_kind": "bond",
      "info_data": {
        "mode": "balance-rr",
        "miimon": 50,
        "updelay": 0,
        "downdelay": 0,
        "peer_notify_delay": 0,
        "use_carrier": 1,
        "arp_interval": 0,
        "arp_validate": null,
        "arp_all_targets": "any",
        "primary_reselect": "always",
        "fail_over_mac": "active",
        "xmit_hash_policy": "layer2",
        "resend_igmp": 1,
        "num_peer_notif": 1,
        "all_slaves_active": 0,
        "min_links": 0,
        "lp_interval": 1,
        "packets_per_slave": 1,
        "ad_lacp_rate": "slow",
        "ad_select": "stable",
        "tlb_dynamic_lb": 1
      },
      "info_slave_kind": "bridge",
      "info_slave_data": {
        "state": "forwarding",
        "priority": 32,
        "cost": 100,
        "hairpin": false,
        "guard": false,
        "root_block": false,
        "fastleave": false,
        "learning": true,
        "flood": true,
        "id": "0x8001",
        "no": "0x1",
        "designated_port": 32769,
        "designated_cost": 0,
        "bridge_id": "7fff.52:54:00:BB:D2:93",
        "root_id": "7fff.52:54:00:BB:D2:93",
        "hold_timer": 0,
        "message_age_timer": 0,
        "forward_delay_timer": 0,
        "topology_change_ack": 0,
        "config_pending": 0,
        "proxy_arp": false,
        "proxy_arp_wifi": false,
        "multicast_router": 1,
        "mcast_flood": true,
        "mcast_to_unicast": false,
        "neigh_suppress": false,
        "group_fwd_mask": "0",
        "group_fwd_mask_str": "0x0",
        "vlan_tunnel": false,
        "isolated": false
      }
    },
    "inet6_addr_gen_mode": "eui64",
    "num_tx_queues": 16,
    "num_rx_queues": 16,
    "gso_max_size": 65536,
    "gso_max_segs": 65535
  }
]

In the :point_up: above state, I have the primary unplugged which is why the non-primary is the active link.

Here is my /etc/config/network:


config interface 'loopback'
	option proto 'static'
	option ipaddr '127.0.0.1'
	option netmask '255.0.0.0'
	option device 'lo'

config globals 'globals'

config interface 'LAN5'
	option proto 'static'
	option device 'eth1'
	option ipaddr '172.16.15.1'
	option netmask '255.255.255.0'
	option ip6assign '64'
	option ip6hint '5'
	option ip6ifaceid '::1'
	option delegate '0'

config interface 'WAN_direct_link_notag'
	option proto 'dhcp'
	option device 'eth3'

config interface '##'
	option proto 'wireguard'
	option private_key '##'
	option listen_port '4790'
	option defaultroute '0'
	option peerdns '0'
	option delegate '0'
	list addresses '198.18.0.1/29'
	option force_link '1'

config wireguard_##
	option description '## router'
	option endpoint_host '##'
	option endpoint_port '4790'
	option public_key '##'
	option private_key '##'
	option persistent_keepalive '20'
	list allowed_ips '198.18.0.2'
	list allowed_ips '172.16.8.0/23'
	option route_allowed_ips '1'

config interface 'nordvpntun'
	option proto 'none'
	option device 'tun0'

config interface 'wan6'
	option proto '6rd'
	option peeraddr '##'
	option ip6prefix '2602::'
	option ip6prefixlen '24'
	list ip6class 'wan6'

config device
	option name 'eth3'

config device
	option name 'eth3'
	option macaddr '00:E0:67:26:38:87'

config device
	option type 'bridge'
	option name 'br0'
	option bridge_empty '1'
	list ports 'bond-lan'

config interface 'br0'
	option proto 'static'
	option device 'br0'
	option ipaddr '172.16.10.1'
	option netmask '255.255.254.0'
	option ip6ifaceid '::1'
	option delegate '0'

config interface 'lan'
	option proto 'bonding'
	option bonding_policy 'active-backup'
	option primary 'eth4'
	option primary_reselect 'always'
	option fail_over_mac 'active'
	option num_grat_arp__num_unsol_na '1'
	option all_slaves_active '0'
	option link_monitoring 'mii'
	option miimon '50'
	option downdelay '0'
	option updelay '0'
	option use_carrier '1'
	option ipaddr '169.254.0.1'
	option netmask '255.255.0.0'
	list slaves 'eth0'
	list slaves 'eth4'
	option delegate '0'
	option force_link '1'

I've used the scripted method of /etc/rc.local ... modprobe .... ip link set etc, but wanted a more transferable / upgradable config.
Am I doing something wrong or is there some problem with the luci-proto-bonding package not properly setting the mode?

@dukekautington3rd

I am using 802.3AD for my config, similar to yours, and noticed the same thing. Sometimes it will come up as "balance-rr".
Seems like this usually happens when it's a fresh boot (pulling the power plug).
What I was able to find was that if you do a
ifup bond0
(yes bond0 and not the name of my bond from the web interface)
It resets the policy back to what I want it to be.
So what I ended up doing was, in the web interface for the bond, I unchecked
Bring up on boot
And in my /etc/rc.local file I added
ifup bond0
And commented out
#exit 0

I will test some more but it seems like this brings up the interface correctly even with a fresh boot.

Let me know if you have found a different solution

After a little more research I came across

I think the kernel is starting the bond module with its default values, which happen to include balance-rr, before the OS starts the network.
https://docs.kernel.org/networking/bonding.html

So I went ahead and modified the module to use the values I want here
cat /etc/modules.d/40-bonding
bonding ad_select=0 lacp_rate=0 mode=4 max_bonds=0

I then set the interface to start on boot again in the GUI. I am now noticing these messages still showing during boot, because it's trying (and failing) to set the options of the module that has already been loaded. But since I set the options in the modules to what I want, it's coming up the right way.

[ 24.796972] bond-bond0: option mode: unable to set because the bond device is up
[ 24.831276] bond-bond0: option ad_select: unable to set because the bond device is up
[ 24.857106] bond-bond0: option lacp_rate: unable to set because the bond device is up

Hope this makes sense.