Interesting findings... great advice: above ~650 Mb/s, just drop nlbwmon and/or LuCI statistics, since their bursting messes up CPU utilisation at those levels, as you found...

Great that you don't need SQM... packet steering is sort of tied to that, AFAIK, so users of SQM (above roughly 550 Mb/s) would definitely be advised to use packet steering.

    # Pin the monitoring daemons and the web server to fixed CPUs:
    # nlbwmon and collectd go to CPU 3, uhttpd goes to CPU 2.
    # Flags as in util-linux taskset (-a all threads, -p existing PID,
    # -c CPU list) -- assumes taskset-aarch64 accepts the same options.
    TASKSET="$(command -v taskset-aarch64)"
    if [ -n "$TASKSET" ]; then
        # Each entry is <process name>:<cpu list>.
        for pinSPEC in nlbwmon:3 collectd:3 uhttpd:2; do
            # pidof may print several PIDs; word splitting is intended here.
            for thispid in $(pidof "${pinSPEC%%:*}"); do
                # Redirect stdout first and then point stderr at it, so
                # that both streams are discarded ("2>&1 >/dev/null" would
                # leave stderr attached to the terminal).
                "$TASKSET" -apc "${pinSPEC##*:}" "$thispid" >/dev/null 2>&1
            done
        done
    else
        # Without the binary an empty $TASKSET would make the shell try to
        # execute "-apc" as a command, so skip the pinning instead.
        echo "taskset-aarch64 not found; skipping CPU pinning" >&2
    fi

# findRUPT PATTERN [FILE]
# Print the IRQ identifier (the text before the first ':') of every line in
# FILE that contains the fixed string PATTERN. The identifiers are written
# on one line, each followed by a single space; nothing is printed when no
# line matches.
#   $1 - literal string to look for, e.g. an interface name
#   $2 - interrupt table to scan; defaults to /proc/interrupts
# Returns 1 without output when PATTERN is missing or empty.
findRUPT() {
	# An empty pattern would match every line of the table.
	[ -n "${1:-}" ] || return 1
	# grep -F replaces the deprecated fgrep; '--' keeps a pattern starting
	# with '-' from being parsed as an option. The sed expression strips
	# all leading whitespace, because IRQ numbers are right-aligned.
	grep -F -- "$1" "${2:-/proc/interrupts}" | \
		sed 's|^[[:space:]]*||' | cut -d':' -f1 | \
		tr -s '\n' ' '
}

# Walk over every IRQ that findRUPT reports for eth0 and write a running
# counter to its smp_affinity file. The counter starts at 1 unless coreSET
# already holds a value, and grows by one per IRQ.
# NOTE(review): the kernel documents smp_affinity as a hexadecimal CPU
# bitmask, so the values 1, 2, 3, 4 ... presumably select CPU0, CPU1,
# CPU0+CPU1, CPU2 ... rather than one CPU per IRQ -- confirm this is intended.
eth0INTs="$(findRUPT eth0)"
tRU=
# An empty list simply yields zero iterations; the expansion stays unquoted
# on purpose so the space-separated IRQ list is split into words.
for tRU in $eth0INTs; do
	: "${coreSET:=1}"
	printf '%s' "$coreSET" > "/proc/irq/$tRU/smp_affinity"
	coreSET=$((coreSET + 1))
done

# It would be good if you could also test all 'c', all 'f' and all '0' here without SQM (you can test with SQM too, but I'm mostly interested in the results without it).
# Write the transmit (xps_cpus) and receive (rps_cpus) steering masks.
# Each table row is "<mask> <path below /sys/class/net>"; rows are applied
# in order and a failed write does not stop the remaining ones.
while read -r steerMASK steerKNOB; do
	printf '%s' "$steerMASK" > "/sys/class/net/$steerKNOB"
done <<'EOF'
1 eth0/queues/tx-0/xps_cpus
2 eth0/queues/tx-1/xps_cpus
4 eth0/queues/tx-2/xps_cpus
4 eth0/queues/tx-3/xps_cpus
2 eth0/queues/tx-4/xps_cpus
7 eth0/queues/rx-0/rps_cpus
7 eth1/queues/rx-0/rps_cpus
EOF

# CPU frequency scaling: set policy0's scaling_min_freq to 1100000 and
# adjust two tunables of the ondemand governor.
cpufreqDIR=/sys/devices/system/cpu/cpufreq
printf '%s' "1100000" > "$cpufreqDIR/policy0/scaling_min_freq"
# Pause for two seconds, but only when the up_threshold write succeeded.
if printf '%s' 21 > "$cpufreqDIR/ondemand/up_threshold"; then
	sleep 2
fi
printf '%s' 5 > "$cpufreqDIR/ondemand/sampling_down_factor"