Skip to content

Commit

Permalink
nscd: disable caching for real
Browse files Browse the repository at this point in the history
We noticed that nscd does cache things and sometimes causes weird bugs. Turning the TTL down to 0 does not help in (some? all?) cases, as the test was able to demonstrate even on the original commit where the TTL was lowered to 0.

This change ships a patch to completely short circuit nscd's cache functionality and a test that shows that calling `getaddrinfo`
twice in quick succession causes a proper resolver query each time.

Could be used as a temporary fix for NixOS#135888 and NixOS#55276
  • Loading branch information
ctheune committed Nov 18, 2021
1 parent 46251a7 commit 39abefe
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 8 deletions.
22 changes: 14 additions & 8 deletions nixos/modules/services/system/nscd.conf
Original file line number Diff line number Diff line change
@@ -1,34 +1,40 @@
# We basically use nscd as a proxy for forwarding nss requests to appropriate
# nss modules, as we run nscd with LD_LIBRARY_PATH set to the directory
# containing all such modules
# Note that we can not use `enable-cache no` As this will actually cause nscd
# Note that we can not use `enable-cache no` as this will actually cause nscd
# to just reject the nss requests it receives, which then causes glibc to
# fallback to trying to handle the request by itself. Which won't work as glibc
# is not aware of the path in which the nss modules live. As a workaround, we
# have `enable-cache yes` with an explicit ttl of 0
# is not aware of the path in which the nss modules live.
#
# We originally tried setting the TTLs to 0, but that proved unreliable, so we
# added a patch to nscd that short-circuits the cache functionality.
#
# Disabling shared access is a minor optimization that keeps clients from
# looking at the (empty) cache.
#
server-user nscd

enable-cache passwd yes
positive-time-to-live passwd 0
negative-time-to-live passwd 0
shared passwd yes
shared passwd no

enable-cache group yes
positive-time-to-live group 0
negative-time-to-live group 0
shared group yes
shared group no

enable-cache netgroup yes
positive-time-to-live netgroup 0
negative-time-to-live netgroup 0
shared netgroup yes
shared netgroup no

enable-cache hosts yes
positive-time-to-live hosts 0
negative-time-to-live hosts 0
shared hosts yes
shared hosts no

enable-cache services yes
positive-time-to-live services 0
negative-time-to-live services 0
shared services yes
shared services no
1 change: 1 addition & 0 deletions nixos/tests/all-tests.nix
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,7 @@ in
ndppd = handleTest ./ndppd.nix {};
nebula = handleTest ./nebula.nix {};
neo4j = handleTest ./neo4j.nix {};
nscd = handleTest ./nscd.nix {};
netdata = handleTest ./netdata.nix {};
networking.networkd = handleTest ./networking.nix { networkd = true; };
networking.scripted = handleTest ./networking.nix { networkd = false; };
Expand Down
41 changes: 41 additions & 0 deletions nixos/tests/nscd.nix
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# NixOS VM test: checks that two getaddrinfo() calls made in quick succession
# each reach the resolver, i.e. that the second one is not answered from a
# cache. A local dnsmasq with query logging serves as the resolver, and the
# test counts the log lines mentioning the looked-up name.
import ./make-test-python.nix {
  name = "nscd";

  machine = { pkgs, ... }: {

    # Python is only needed so the test script can call getaddrinfo().
    environment.systemPackages = [
      pkgs.python3Full
    ];

    # Local resolver that answers for example.com and logs its queries; the
    # test script reads that log via `journalctl -u dnsmasq.service`.
    services.dnsmasq = {
      enable = true;
      extraConfig = ''
        log-queries
        log-facility=-
        address=/example.com/127.0.0.1
      '';
    };
    networking.nameservers = [ "127.0.0.1" ];
  };

  testScript = ''
    lookup_cmd = "python -c 'import socket; socket.getaddrinfo(\"example.com\", 0, socket.AF_INET)'"
    count_cmd = "journalctl -u dnsmasq.service | grep -c example.com"


    def resolve_example_com():
        # succeed() fails the test right away if the lookup itself errors out,
        # instead of silently carrying on to the log check.
        print(machine.succeed(lookup_cmd))


    def assert_logged_lines(expected):
        # grep -c exits non-zero when nothing matches but still prints the
        # count, so use execute() and judge by the printed number only.
        actual = int(machine.execute(count_cmd)[1])
        assert actual == expected, f"expected {expected} dnsmasq log lines, got {actual}"


    machine.wait_for_unit("multi-user.target")

    # The expected counts (2, then 4) mean every lookup must add two matching
    # log lines -- presumably one for the query and one for the reply. If the
    # second lookup were served from a cache, the count would stay at 2.
    resolve_example_com()
    print("checking query log (1/2)")
    assert_logged_lines(2)
    print("OK")

    resolve_example_com()
    print("checking query log (2/2)")
    assert_logged_lines(4)
    print("OK")
  '';
}
2 changes: 2 additions & 0 deletions pkgs/development/libraries/glibc/common.nix
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ stdenv.mkDerivation ({
*/
./2.32-master.patch.gz

./disable-nscd-cache.patch

/* Allow NixOS and Nix to handle the locale-archive. */
./nix-locale-archive.patch

Expand Down
90 changes: 90 additions & 0 deletions pkgs/development/libraries/glibc/disable-nscd-cache.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
diff --git a/nscd/cache.c b/nscd/cache.c
index 85090a1641..5ffd578d48 100644
--- a/nscd/cache.c
+++ b/nscd/cache.c
@@ -153,83 +153,9 @@ cache_add (int type, const void *key, size_t len, struct datahead *packet,
first ? _(" (first)") : "");
}

- unsigned long int hash = __nss_hash (key, len) % table->head->module;
- struct hashentry *newp;
-
- newp = mempool_alloc (table, sizeof (struct hashentry), 0);
- /* If we cannot allocate memory, just do not do anything. */
- if (newp == NULL)
- {
- /* If necessary mark the entry as unusable so that lookups will
- not use it. */
- if (first)
+ // Completely bypass adding things to the cache.
packet->usable = false;
-
- return -1;
- }
-
- newp->type = type;
- newp->first = first;
- newp->len = len;
- newp->key = (char *) key - table->data;
- assert (newp->key + newp->len <= table->head->first_free);
- newp->owner = owner;
- newp->packet = (char *) packet - table->data;
- assert ((newp->packet & BLOCK_ALIGN_M1) == 0);
-
- /* Put the new entry in the first position. */
- /* TODO Review concurrency. Use atomic_exchange_release. */
- newp->next = atomic_load_relaxed (&table->head->array[hash]);
- while (!atomic_compare_exchange_weak_release (&table->head->array[hash],
- (ref_t *) &newp->next,
- (ref_t) ((char *) newp
- - table->data)));
-
- /* Update the statistics. */
- if (packet->notfound)
- ++table->head->negmiss;
- else if (first)
- ++table->head->posmiss;
-
- /* We depend on this value being correct and at least as high as the
- real number of entries. */
- atomic_increment (&table->head->nentries);
-
- /* It does not matter that we are not loading the just increment
- value, this is just for statistics. */
- unsigned long int nentries = table->head->nentries;
- if (nentries > table->head->maxnentries)
- table->head->maxnentries = nentries;
-
- if (table->persistent)
- // XXX async OK?
- msync ((void *) table->head,
- (char *) &table->head->array[hash] - (char *) table->head
- + sizeof (ref_t), MS_ASYNC);
-
- /* We do not have to worry about the pruning thread if we are
- re-adding the data since this is done by the pruning thread. We
- also do not have to do anything in case this is not the first
- time the data is entered since different data heads all have the
- same timeout. */
- if (first && prune_wakeup)
- {
- /* Perhaps the prune thread for the table is not running in a long
- time. Wake it if necessary. */
- pthread_mutex_lock (&table->prune_lock);
- time_t next_wakeup = table->wakeup_time;
- bool do_wakeup = false;
- if (next_wakeup > packet->timeout + CACHE_PRUNE_INTERVAL)
- {
- table->wakeup_time = packet->timeout;
- do_wakeup = true;
- }
- pthread_mutex_unlock (&table->prune_lock);
- if (do_wakeup)
- pthread_cond_signal (&table->prune_cond);
- }
-
- return 0;
+ return -1;
}

/* Walk through the table and remove all entries which lifetime ended.

0 comments on commit 39abefe

Please sign in to comment.