Oom-killer: dnsmasq when Physical Free RAM remains

I'm seeing dnsmasq getting killed by the oom-killer, but I'm not sure why.
According to the logs, there is still free physical RAM when the oom-killer is invoked.

Can anyone suggest a way to track down this leak?

Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.833478] sh invoked oom-killer: gfp_mask=0xcc0(GFP_KERNEL), order=1, oom_score_adj=0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.841683] CPU: 0 PID: 2334 Comm: sh Not tainted 5.10.64 #0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.847439] Stack : 003000140000091e 0000000000000008 b61482848ffe9f11 b61482848ffe9f11
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.855558]         0000000000000000 8000000006e737e0 ffffffff818a3150 8000000006e736b0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.863674]         0000000000000000 c0000000ffffefff 0000000000000003 ffffffffffffffea
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.871790]         0000000000000010 ffffffff814daee0 0000000000000001 8000000006e736d6
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.879906]         fffe000000000000 0000000000000001 0000000000000000 ffffffff818a0000
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.888022]         ffffffff81a49310 8000000006e73b98 0000000000001000 ffffffff8199b5b0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.896137]         00000000fffffffe ffffffff81521ba8 0000000000000000 ffffffff81b90000
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.904252]         8000000006e70000 8000000006e737e0 0000000000000001 ffffffff814c16c0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.912368]         0000000000000000 ffffffff814c15cc 0000000000000000 ffffffff818a3150
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.920483]         0000000000000000 ffffffff81117dc8 0000000000000000 0000000000000000
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.928598]         ...
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.931139] Call Trace:
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.933688] [<ffffffff81117dc8>] show_stack+0x30/0x100
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.938932] [<ffffffff814c16c0>] dump_stack+0xa0/0xd0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.944083] [<ffffffff81229c80>] dump_header+0x50/0x1d0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.949406] [<ffffffff8122a54c>] oom_kill_process+0x20c/0x218
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.955250] [<ffffffff8122af98>] out_of_memory+0x218/0x3a8
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.960837] [<ffffffff812754a8>] __alloc_pages_slowpath.constprop.0+0xa98/0xcd8
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.968247] [<ffffffff81275938>] __alloc_pages_nodemask+0x250/0x280
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.974612] [<ffffffff81275984>] __get_free_pages+0x1c/0x78
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.980284] [<ffffffff811261c8>] pgd_alloc+0x18/0x58
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.985346] [<ffffffff8113183c>] mm_init+0xec/0x1f8
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.990320] [<ffffffff81132344>] dup_mm+0xa4/0x488
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393372.995208] [<ffffffff811331dc>] copy_process+0x7ec/0x1568
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.000791] [<ffffffff81134008>] kernel_clone+0x58/0x358
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.006200] [<ffffffff8113451c>] sys_fork+0x4c/0x58
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.011174] [<ffffffff811204e8>] syscall_common+0x34/0x58
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.016670]
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.018432] Mem-Info:
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.020967] active_anon:5713 inactive_anon:17821 isolated_anon:0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.020967]  active_file:978 inactive_file:528 isolated_file:0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.020967]  unevictable:0 dirty:0 writeback:0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.020967]  slab_reclaimable:546 slab_unreclaimable:3156
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.020967]  mapped:1073 shmem:8092 pagetables:160 bounce:0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.020967]  free:98287 free_pcp:59 free_cma:0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.053451] Node 0 active_anon:22868kB inactive_anon:71288kB active_file:3860kB inactive_file:2160kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:4272kB dirty:0kB writeback:0kB shmem:32368kB writeback_tmp:0kB kernel_stack:1088kB all_unreclaimable? no
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.077295] DMA32 free:393376kB min:16384kB low:20480kB high:24576kB reserved_highatomic:0KB active_anon:22868kB inactive_anon:71324kB active_file:3860kB inactive_file:2360kB unevictable:0kB writepending:0kB present:1010540kB managed:965956kB mlocked:0kB pagetables:596kB bounce:0kB free_pcp:240kB local_pcp:236kB free_cma:0kB
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.106332] lowmem_reserve[]: 0 0 0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.110036] DMA32: 98336*4kB (UME) 0*8kB 6*16kB (U) 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 393440kB
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.121810] 9596 total pagecache pages
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.125678] 0 pages in swap cache
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.129107] Swap cache stats: add 0, delete 0, find 0/0
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.134435] Free swap  = 0kB
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.137409] Total swap = 0kB
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.140394] 252635 pages RAM
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.143368] 0 pages HighMem/MovableOnly
Mon Oct 18 01:53:27 2021 kern.warn kernel: [393373.147296] 11146 pages reserved
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.150625] Tasks state (memory values in pages):
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.155424] [  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.164195] [    537]    81   537      358       73    32768        0             0 ubusd
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.172524] [    538]     0   538      259        8    28672        0             0 askfirst
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.181118] [    572]     0   572      290        9    24576        0             0 urngd
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.189447] [    815]   514   815      357       44    36864        0             0 logd
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.197809] [    816]     0   816      386       22    32768        0             0 logread
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.206406] [    868]     0   868      619       99    36864        0             0 rpcd
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.214631] [   1087]     0  1087      312       13    28672        0             0 dropbear
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.223226] [   1197]     0  1197      511       71    32768        0             0 netifd
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.231641] [   1381]     0  1381      393       13    24576        0             0 udhcpc
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.240059] [   1395]     0  1395      412       32    32768        0             0 odhcpd
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.248469] [   1647]     0  1647      974      173    40960        0             0 uhttpd
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.256845] [   2259]     0  2259      393       14    28672        0             0 ntpd
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.265089] [   2334]     0  2334      481      318    36864        0             0 sh
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.273159] [   2876]     0  2876     1510      524    36864        0             0 tcpdump
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.281665] [  28356]   453 28356    15547    15040   151552        0             0 dnsmasq
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.290170] oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0,global_oom,task_memcg=/,task=dnsmasq,pid=28356,uid=453
Mon Oct 18 01:53:27 2021 kern.err kernel: [393373.303138] Out of memory: Killed process 28356 (dnsmasq) total-vm:62188kB, anon-rss:59076kB, file-rss:1084kB, shmem-rss:0kB, UID:453 pgtables:148kB oom_score_adj:0
Mon Oct 18 01:53:27 2021 kern.info kernel: [393373.335060] oom_reaper: reaped process 28356 (dnsmasq), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
Mon Oct 18 01:53:35 2021 daemon.info dnsmasq[32620]: started, version 2.86 cachesize 150

start debugging with the basics:

  • is there a steady, continuous leak, or does it simply explode suddenly?
    (you could monitor mem consumption with LuCI statistics)
  • does it happen on some known event, like adblock update/refresh (with lots of blocklists)?
  • dnsmasq is DNS and DHCP. Is there anything special regarding your config?

(ps. which OpenWrt version? which router? ... Just thinking that the ujail functionality has recently been enabled by default in master, and dnsmasq seems to be one of the jailed processes. )

2 Likes

I saw the ujail update, but it looked like that was triggering a SEGFAULT rather than a memory leak. (I was dealing with that dnsmasq issue on a newer build.)

This is a steady-state issue. I use Adblock, but as shown below, I still have ~350MB of RAM free (1GB total) after it loads and stabilizes.

root@gateway:~# uci show dhcp
dhcp.@dnsmasq[0]=dnsmasq
dhcp.@dnsmasq[0].domainneeded='1'
dhcp.@dnsmasq[0].localise_queries='1'
dhcp.@dnsmasq[0].rebind_protection='1'
dhcp.@dnsmasq[0].rebind_localhost='1'
dhcp.@dnsmasq[0].expandhosts='1'
dhcp.@dnsmasq[0].authoritative='1'
dhcp.@dnsmasq[0].readethers='1'
dhcp.@dnsmasq[0].leasefile='/tmp/dhcp.leases'
dhcp.@dnsmasq[0].resolvfile='/tmp/resolv.conf.d/resolv.conf.auto'
dhcp.@dnsmasq[0].localservice='1'
dhcp.@dnsmasq[0].ednspacket_max='1232'
dhcp.@dnsmasq[0].local='/xxxxx.com/'
dhcp.@dnsmasq[0].domain='xxxxx.com'
dhcp.@dnsmasq[0].confdir='/tmp/dnsmasq.d'
dhcp.@dnsmasq[0].sequential_ip='1'
dhcp.@dnsmasq[0].dnssec='1'
dhcp.@dnsmasq[0].nonegcache='1'
dhcp.lan=dhcp
dhcp.lan.interface='lan'
dhcp.lan.start='100'
dhcp.lan.limit='150'
dhcp.lan.leasetime='12h'
dhcp.lan.dhcpv4='server'
dhcp.lan.dhcpv6='server'
dhcp.lan.ra='server'
dhcp.lan.ra_slaac='1'
dhcp.lan.ra_flags='managed-config' 'other-config'
dhcp.wan=dhcp
dhcp.wan.interface='wan'
dhcp.wan.ignore='1'
dhcp.odhcpd=odhcpd
dhcp.odhcpd.maindhcp='0'
dhcp.odhcpd.leasefile='/tmp/hosts/odhcpd'
dhcp.odhcpd.leasetrigger='/usr/sbin/odhcpd-update'
dhcp.odhcpd.loglevel='4'

Itus Shield Octeon

This is my network edge device, so I try not to cycle thru the updates as much as one of the testing devices :smiley:

Sometimes the oom-killer hits and dnsmasq hangs, which means I have to kick the entire thing over to get the network back, so anything I can do to track it down would help.

I've had similar issues with Adblock and larger lists (though I run into lower limits, as I only have 128MB to play with).

At about 140k entries it OOMs, so I switched to using AdGuardHome instead. This seems to work OK, but I'm still hitting memory limits at times.

(edit) One of the other reasons I switched was that I could do DoH via AGH instead of using stubby etc.

1 Like

As I sent this last message, the death loop happened.
<30>1 2021-10-18T05:36:57-04:00 gateway procd - - - procd: Instance dnsmasq::cfg01411c s in a crash loop 6 crashes, 15 seconds since last crash

This happens when oom-killer fails to restart things I guess.

Yeah, I could understand having memory issues if I was actually out of RAM. Without any swap, I have roughly 350MB of physical RAM free. I turned off swap because I didn't know if it was causing the OOM condition, but with zram swap on with zstd compression, I get nearly another 650MB of compressed swap available that never gets touched :frowning:

It does sound like it's choking on something, or a memory leak... but then, if you have 350MB free, why is it killing it? Unless it's trying to allocate more RAM as a single contiguous block and cannot? Try removing one of your lists and see if it stabilises.

I do know there are some list cleaners for adblock to help with memory issues, but it's been a while since I've used adblock. (They basically import all the lists and merge duplicates etc., so it just has one master list built from all the ones you give it.)

2 Likes

I've pared back the Adblock lists, and now, I'm showing:

So, we will see what happens.

1 Like

At least it will help you narrow down whether it is dnsmasq or adblock causing the issues.
There is one thing I do remember having issues with: adblock, when it did an update. If it didn't have enough memory to hold 2x your lists in memory, it would fall over. I wonder if that is what might be happening? So while you have 350MB of RAM "free", if your lists in total are larger than that, then loading and swapping to the updated lists might be why it dies. (He may have patched this; I honestly don't know, but it is something I remembered.)

Could it be a variation of https://lists.thekelleys.org.uk/pipermail/dnsmasq-discuss/2021q2/014920.html ff.?

4 Likes

Well, your "used" goes from 548 MB down to 129 MB, by 420 MB...

During the blocklist update you might need 548+420 = 968 MB for that. And if you only have 350 MB free, the handling of the 420 MB list sounds unrealistic. (The new list needs to be prepared, while the old dnsmasq process still has the previous list.)

Almost half a gigabyte for adblock lists sounds extreme.

3 Likes

Here's a link to the original thread on this forum related to that particular report

3 Likes

I've taken the patch file you put in that thread and have updated it to reflect dnsmasq-2.86. I'll test it and post it there for people to use, or submit it as a PR patch if it works (unless you want to... It's your code, I just reworked it for the new source).

package/network/services/dnsmasq/patches/200-fix_max_procs.patch

--- a/src/dnsmasq.c
+++ b/src/dnsmasq.c
@@ -1036,7 +1036,7 @@ int main (int argc, char **argv)
   pid = getpid();

   daemon->pipe_to_parent = -1;
-  for (i = 0; i < MAX_PROCS; i++)
+  for (i = 0; i < daemon->max_procs; i++)
     daemon->tcp_pipes[i] = -1;

 #ifdef HAVE_INOTIFY
@@ -1497,7 +1497,7 @@ static void async_event(int pipe, time_t
                break;
            }
          else
-           for (i = 0 ; i < MAX_PROCS; i++)
+           for (i = 0 ; i < daemon->max_procs; i++)
              if (daemon->tcp_pids[i] == p)
                daemon->tcp_pids[i] = 0;
        break;
@@ -1561,7 +1561,7 @@ static void async_event(int pipe, time_t

       case EVENT_TERM:
        /* Knock all our children on the head. */
-       for (i = 0; i < MAX_PROCS; i++)
+       for (i = 0; i < daemon->max_procs; i++)
          if (daemon->tcp_pids[i] != 0)
            kill(daemon->tcp_pids[i], SIGALRM);

@@ -1732,7 +1732,7 @@ static void set_dns_listeners(void)
     poll_listen(rfl->rfd->fd, POLLIN);

   /* check to see if we have free tcp process slots. */
-  for (i = MAX_PROCS - 1; i >= 0; i--)
+  for (i = daemon->max_procs - 1; i >= 0; i--)
     if (daemon->tcp_pids[i] == 0 && daemon->tcp_pipes[i] == -1)
       break;

@@ -1756,7 +1756,7 @@ static void set_dns_listeners(void)
     }

   if (!option_bool(OPT_DEBUG))
-    for (i = 0; i < MAX_PROCS; i++)
+    for (i = 0; i < daemon->max_procs; i++)
       if (daemon->tcp_pipes[i] != -1)
        poll_listen(daemon->tcp_pipes[i], POLLIN);
 }
@@ -1791,7 +1791,7 @@ static void check_dns_listeners(time_t n
      to free the process slot. Once the child process has gone, poll()
      returns POLLHUP, not POLLIN, so have to check for both here. */
   if (!option_bool(OPT_DEBUG))
-    for (i = 0; i < MAX_PROCS; i++)
+    for (i = 0; i < daemon->max_procs; i++)
       if (daemon->tcp_pipes[i] != -1 &&
          poll_check(daemon->tcp_pipes[i], POLLIN | POLLHUP) &&
          !cache_recv_insert(now, daemon->tcp_pipes[i]))
@@ -1815,7 +1815,7 @@ static void check_dns_listeners(time_t n
         at least one a poll() time, that we still do.
         There may be more waiting connections after
         poll() returns then free process slots. */
-      for (i = MAX_PROCS - 1; i >= 0; i--)
+      for (i = daemon->max_procs - 1; i >= 0; i--)
        if (daemon->tcp_pids[i] == 0 && daemon->tcp_pipes[i] == -1)
          break;

--- a/src/dnsmasq.h
+++ b/src/dnsmasq.h
@@ -1240,6 +1240,9 @@ extern struct daemon {
   /* file for packet dumps. */
   int dumpfd;
 #endif
+
+  /* maximum number of child processes to fork */
+  unsigned int max_procs;
 } *daemon;

 /* cache.c */
--- a/src/option.c
+++ b/src/option.c
@@ -174,7 +174,8 @@ struct myoption {
 #define LOPT_CMARK_ALST_EN 365
 #define LOPT_CMARK_ALST    366
 #define LOPT_QUIET_TFTP    367
-
+#define LOPT_MAX_PROCS     368
+
 #ifdef HAVE_GETOPT_LONG
 static const struct option opts[] =
 #else
@@ -351,8 +352,9 @@ static const struct myoption opts[] =
     { "dhcp-ignore-clid", 0, 0,  LOPT_IGNORE_CLID },
     { "dynamic-host", 1, 0, LOPT_DYNHOST },
     { "log-debug", 0, 0, LOPT_LOG_DEBUG },
-       { "umbrella", 2, 0, LOPT_UMBRELLA },
+    { "umbrella", 2, 0, LOPT_UMBRELLA },
     { "quiet-tftp", 0, 0, LOPT_QUIET_TFTP },
+    { "max-procs", 1, 0, LOPT_MAX_PROCS },
     { NULL, 0, 0, 0 }
   };

@@ -539,8 +541,9 @@ static struct {
   { LOPT_SCRIPT_TIME, OPT_LEASE_RENEW, NULL, gettext_noop("Call dhcp-script when lease expiry changes."), NULL },
   { LOPT_UMBRELLA, ARG_ONE, "[=<optspec>]", gettext_noop("Send Cisco Umbrella identifiers including remote IP."), NULL },
   { LOPT_QUIET_TFTP, OPT_QUIET_TFTP, NULL, gettext_noop("Do not log routine TFTP."), NULL },
+  { LOPT_MAX_PROCS, ARG_ONE, "<number>", gettext_noop("Specify maximum number of child process to fork."), NULL },
   { 0, 0, NULL, NULL, NULL }
-};
+};

 /* We hide metacharacters in quoted strings by mapping them into the ASCII control
    character space. Note that the \0, \t \b \r \033 and \n characters are carefully placed in the
@@ -4800,7 +4803,12 @@ err:
        break;
       }
 #endif
-
+    case LOPT_MAX_PROCS:  /* --max-procs */
+      if (!atoi_check16(arg, &daemon->max_procs))
+       ret_err(gen_err);
+      if (daemon->max_procs > MAX_PROCS) daemon->max_procs = MAX_PROCS;
+        break;
+
     default:
       ret_err(_("unsupported option (check that dnsmasq was compiled with DHCP/TFTP/DNSSEC/DBus support)"));

@@ -5282,7 +5290,7 @@ void read_opts(int argc, char **argv, ch
   daemon->soa_refresh = SOA_REFRESH;
   daemon->soa_retry = SOA_RETRY;
   daemon->soa_expiry = SOA_EXPIRY;
-
+  daemon->max_procs = MAX_PROCS;
 #ifndef NO_ID
   add_txt("version.bind", "dnsmasq-" VERSION, 0 );
   add_txt("authors.bind", "Simon Kelley", 0);

Sure, I can submit a PR. Given that Simon Kelley didn't seem to think that a patch like this belongs in the upstream source, it probably makes sense to add it to Openwrt - the combination of large adblock lists and constrained router memory makes this a problem just waiting to reoccur.

Let us know if that patch actually solves your problem.

1 Like

Funny enough:

root@OpenWrt:/# service dnsmasq start
[  433.275260] do_page_fault(): sending SIGSEGV to ujail for invalid read access from 00000100f32271e3
[  433.284367] epc = 000000aaab68a828 in ujail[aaab680000+14000]
[  433.290141] ra  = 000000aaab68adc4 in ujail[aaab680000+14000]
root@OpenWrt:/#

Fresh rebase with master as of this post.. Any suggestions?

Edit: I'm going to try building without the procd-ujail and see if that helps

You should alert @daniel who has been the main author of the ujail functionality.

Please share platform and configuration which will allow me to reproduce this error.

Another suggestion is that you could build with debugging enabled, and you should then be able to get a core dump out of it, which you can analyse with gdb on your build host (where you have the non-stripped binaries in staging_dir).

If gdb core dump debugging is new to you, see a recent example in:

Successful debugging (finding the crashing source line) pretty much requires that you have the unstripped binary still available in the buildhost. (or you copy and use the unstripped binary into the router itself.)

Hi @daniel - MIPS64 Octeon3 Itus Shield, built from source (master), HEAD at c4e994011f. Swapping procd-ujail for just procd solved the ujail issue (obviously), and I'm more than happy to help test whatever once I get the image stabilized (this started with dnsmasq memory issues).

I've used remotegdb once before, but I don't use gdb all that much. I'll look into it, or maybe someone will be able to give me specific directions, if it comes to it :slight_smile:
I did remove sstrip stripping from the build to leave the symbols intact though, so it's an option going forward.

I did manage to get dnsmasq patched to allow for the --max-procs call, and updated /etc/init.d/dnsmasq to call it.

# auto-generated config file from /etc/config/dhcp
conf-file=/etc/dnsmasq.conf
dhcp-authoritative
domain-needed
localise-queries
read-ethers
enable-ubus=dnsmasq
expand-hosts
bind-dynamic
local-service
edns-packet-max=1232
domain=lan
local=/lan/
max-procs=1
addn-hosts=/tmp/hosts
dhcp-leasefile=/tmp/dhcp.leases
resolv-file=/tmp/resolv.conf.d/resolv.conf.auto
stop-dns-rebind
rebind-localhost-ok
dhcp-broadcast=tag:needs-broadcast
conf-dir=/tmp/dnsmasq.d
user=dnsmasq
group=dnsmasq
dhcp-ignore-names=tag:dhcp_bogus_hostname
conf-file=/usr/share/dnsmasq/dhcpbogushostname.conf
bogus-priv
conf-file=/usr/share/dnsmasq/rfc6761.conf
dhcp-range=set:lan,192.168.1.100,192.168.1.249,255.255.255.0,12h
no-dhcp-interface=eth0

We will see if this fixes it or not.. it usually takes a few days though.

@dl12345

OK, so: changes for dnsmasq. I'm currently testing to see if it dies, which will take a few days now that I'm stabilized, but these are the changes that will need to go into the PR you put up.

Patch to fix the dnsmasq init service

--- a/package/network/services/dnsmasq/files/dnsmasq.init
+++ b/package/network/services/dnsmasq/files/dnsmasq.init
@@ -937,6 +937,8 @@
        append_parm "$cfg" "maxport" "--max-port"
        append_parm "$cfg" "domain" "--domain"
        append_parm "$cfg" "local" "--local"
+       append_parm "$cfg" "maxprocs" "--max-procs"
+
        config_list_foreach "$cfg" "listen_address" append_listenaddress
        config_list_foreach "$cfg" "server" append_server
        config_list_foreach "$cfg" "rev_server" append_rev_server
--- a/package/network/services/dnsmasq/files/dhcp.conf
+++ b/package/network/services/dnsmasq/files/dhcp.conf
@@ -21,6 +21,7 @@
        #list bogusnxdomain     '64.94.110.11'
        option localservice     1  # disable to allow DNS requests from non-local subnets
        option ednspacket_max   1232
+       option maxprocs         1  # Max Forked Processes

 config dhcp lan
        option interface        lan

package/network/services/dnsmasq/patches/200-fix_max_procs.patch

--- a/src/dnsmasq.c
+++ b/src/dnsmasq.c
@@ -1036,7 +1036,7 @@ int main (int argc, char **argv)
   pid = getpid();

   daemon->pipe_to_parent = -1;
-  for (i = 0; i < MAX_PROCS; i++)
+  for (i = 0; i < daemon->max_procs; i++)
     daemon->tcp_pipes[i] = -1;

 #ifdef HAVE_INOTIFY
@@ -1497,7 +1497,7 @@ static void async_event(int pipe, time_t
                break;
            }
          else
-           for (i = 0 ; i < MAX_PROCS; i++)
+           for (i = 0 ; i < daemon->max_procs; i++)
              if (daemon->tcp_pids[i] == p)
                daemon->tcp_pids[i] = 0;
        break;
@@ -1561,7 +1561,7 @@ static void async_event(int pipe, time_t

       case EVENT_TERM:
        /* Knock all our children on the head. */
-       for (i = 0; i < MAX_PROCS; i++)
+       for (i = 0; i < daemon->max_procs; i++)
          if (daemon->tcp_pids[i] != 0)
            kill(daemon->tcp_pids[i], SIGALRM);

@@ -1732,7 +1732,7 @@ static void set_dns_listeners(void)
     poll_listen(rfl->rfd->fd, POLLIN);

   /* check to see if we have free tcp process slots. */
-  for (i = MAX_PROCS - 1; i >= 0; i--)
+  for (i = daemon->max_procs - 1; i >= 0; i--)
     if (daemon->tcp_pids[i] == 0 && daemon->tcp_pipes[i] == -1)
       break;

@@ -1756,7 +1756,7 @@ static void set_dns_listeners(void)
     }

   if (!option_bool(OPT_DEBUG))
-    for (i = 0; i < MAX_PROCS; i++)
+    for (i = 0; i < daemon->max_procs; i++)
       if (daemon->tcp_pipes[i] != -1)
        poll_listen(daemon->tcp_pipes[i], POLLIN);
 }
@@ -1791,7 +1791,7 @@ static void check_dns_listeners(time_t n
      to free the process slot. Once the child process has gone, poll()
      returns POLLHUP, not POLLIN, so have to check for both here. */
   if (!option_bool(OPT_DEBUG))
-    for (i = 0; i < MAX_PROCS; i++)
+    for (i = 0; i < daemon->max_procs; i++)
       if (daemon->tcp_pipes[i] != -1 &&
          poll_check(daemon->tcp_pipes[i], POLLIN | POLLHUP) &&
          !cache_recv_insert(now, daemon->tcp_pipes[i]))
@@ -1815,7 +1815,7 @@ static void check_dns_listeners(time_t n
         at least one a poll() time, that we still do.
         There may be more waiting connections after
         poll() returns then free process slots. */
-      for (i = MAX_PROCS - 1; i >= 0; i--)
+      for (i = daemon->max_procs - 1; i >= 0; i--)
        if (daemon->tcp_pids[i] == 0 && daemon->tcp_pipes[i] == -1)
          break;

--- a/src/dnsmasq.h
+++ b/src/dnsmasq.h
@@ -1240,6 +1240,9 @@ extern struct daemon {
   /* file for packet dumps. */
   int dumpfd;
 #endif
+
+  /* maximum number of child processes to fork */
+  unsigned int max_procs;
 } *daemon;

 /* cache.c */
--- a/src/option.c
+++ b/src/option.c
@@ -174,7 +174,8 @@ struct myoption {
 #define LOPT_CMARK_ALST_EN 365
 #define LOPT_CMARK_ALST    366
 #define LOPT_QUIET_TFTP    367
-
+#define LOPT_MAX_PROCS     368
+
 #ifdef HAVE_GETOPT_LONG
 static const struct option opts[] =
 #else
@@ -351,8 +352,9 @@ static const struct myoption opts[] =
     { "dhcp-ignore-clid", 0, 0,  LOPT_IGNORE_CLID },
     { "dynamic-host", 1, 0, LOPT_DYNHOST },
     { "log-debug", 0, 0, LOPT_LOG_DEBUG },
-       { "umbrella", 2, 0, LOPT_UMBRELLA },
+    { "umbrella", 2, 0, LOPT_UMBRELLA },
     { "quiet-tftp", 0, 0, LOPT_QUIET_TFTP },
+    { "max-procs", 1, 0, LOPT_MAX_PROCS },
     { NULL, 0, 0, 0 }
   };

@@ -539,8 +541,9 @@ static struct {
   { LOPT_SCRIPT_TIME, OPT_LEASE_RENEW, NULL, gettext_noop("Call dhcp-script when lease expiry changes."), NULL },
   { LOPT_UMBRELLA, ARG_ONE, "[=<optspec>]", gettext_noop("Send Cisco Umbrella identifiers including remote IP."), NULL },
   { LOPT_QUIET_TFTP, OPT_QUIET_TFTP, NULL, gettext_noop("Do not log routine TFTP."), NULL },
+  { LOPT_MAX_PROCS, ARG_ONE, "<number>", gettext_noop("Specify maximum number of child process to fork."), NULL },
   { 0, 0, NULL, NULL, NULL }
-};
+};

 /* We hide metacharacters in quoted strings by mapping them into the ASCII control
    character space. Note that the \0, \t \b \r \033 and \n characters are carefully placed in the
@@ -4800,7 +4803,12 @@ err:
        break;
       }
 #endif
-
+    case LOPT_MAX_PROCS:  /* --max-procs */
+      if (!atoi_check16(arg, &daemon->max_procs))
+       ret_err(gen_err);
+      if (daemon->max_procs > MAX_PROCS) daemon->max_procs = MAX_PROCS;
+        break;
+
     default:
       ret_err(_("unsupported option (check that dnsmasq was compiled with DHCP/TFTP/DNSSEC/DBus support)"));

@@ -5282,7 +5290,7 @@ void read_opts(int argc, char **argv, ch
   daemon->soa_refresh = SOA_REFRESH;
   daemon->soa_retry = SOA_RETRY;
   daemon->soa_expiry = SOA_EXPIRY;
-
+  daemon->max_procs = MAX_PROCS;
 #ifndef NO_ID
   add_txt("version.bind", "dnsmasq-" VERSION, 0 );
   add_txt("authors.bind", "Simon Kelley", 0);