diff --git a/fs/proc/base.c b/fs/proc/base.c
index 719c2e943ea102..08c5e3fd8235c8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3019,6 +3019,10 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
 	char buf[PROC_NUMBUF];
 	struct qstr name;
 
+	/* procfs is not mounted. There is nothing to unhash. */
+	if (!mnt)
+		return;
+
 	name.name = buf;
 	name.len = snprintf(buf, sizeof(buf), "%d", pid);
 	/* no ->d_hash() rejects on procfs */
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 4d179316e43108..d9d087b555ee4d 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -97,6 +97,7 @@ extern void transfer_pid(struct task_struct *old, struct task_struct *new,
 
 struct pid_namespace;
 extern struct pid_namespace init_pid_ns;
+extern int pid_max;
 
 /*
  * look up a PID in the hash table. Must be called with the tasklist_lock
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index b09136f88cf45c..1ac6e85dd6c2f1 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -9,6 +9,7 @@
 #include <linux/nsproxy.h>
 #include <linux/kref.h>
 #include <linux/ns_common.h>
+#include <linux/idr.h>
 
 struct pidmap {
 	atomic_t nr_free;
@@ -29,7 +30,7 @@ enum { /* definitions for pid_namespace's hide_pid field */
 
 struct pid_namespace {
 	struct kref kref;
-	struct pidmap pidmap[PIDMAP_ENTRIES];
+	struct idr idr;
 	struct rcu_head rcu;
 	int last_pid;
 	unsigned int nr_hashed;
@@ -105,6 +106,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
 
 extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
 void pidhash_init(void);
-void pidmap_init(void);
+void pid_idr_init(void);
 
 #endif /* _LINUX_PID_NS_H */
diff --git a/init/main.c b/init/main.c
index 052481fbe3633f..7e64f5e7c8b320 100644
--- a/init/main.c
+++ b/init/main.c
@@ -16,6 +16,7 @@
 #include
 #include
 #include
+#include <linux/idr.h>
 #include
 #include
 #include
@@ -658,7 +659,7 @@ asmlinkage __visible void __init start_kernel(void)
 	if (late_time_init)
 		late_time_init();
 	calibrate_delay();
-	pidmap_init();
+	pid_idr_init();
 	anon_vma_init();
 	acpi_early_init();
 #ifdef CONFIG_X86
@@ -980,7 +981,6 @@ static inline void mark_readonly(void)
 static int __ref kernel_init(void *unused)
 {
 	int ret;
-
 	kernel_init_freeable();
 	/* need to finish all async __init code before freeing the memory */
 	async_synchronize_full();
@@ -1053,7 +1053,6 @@ static noinline void __init kernel_init_freeable(void)
 	sched_init_smp();
 
 	page_alloc_init_late();
-
 	do_basic_setup();
 
 	/* Open the /dev/console on the rootfs, this should never fail */
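The core of the conversion is the struct pid_namespace change above: the fixed
array of bitmap pages becomes a single struct idr. As a rough illustration of
the allocation idiom the patch builds on (a minimal sketch for reference, not
part of the patch; the cyclic_idr/cyclic_alloc names are invented):

	#include <linux/idr.h>
	#include <linux/threads.h>

	static DEFINE_IDR(cyclic_idr);

	/*
	 * idr_alloc_cyclic() hands out the lowest free ID at or after the
	 * previous allocation point and wraps around at 'end', which is
	 * what alloc_pidmap() used to do by scanning the bitmap pages
	 * starting from last_pid.
	 */
	static int cyclic_alloc(void *ptr)
	{
		return idr_alloc_cyclic(&cyclic_idr, ptr, 1,
					PID_MAX_DEFAULT, GFP_KERNEL);
	}

Freeing an ID is then a single idr_remove() call, which is why free_pidmap()
and the per-page nr_free accounting in kernel/pid.c below can go away
entirely.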
diff --git a/kernel/pid.c b/kernel/pid.c
index c69c30d827e5a2..508d569d0e22dc 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -39,6 +39,7 @@
 #include <linux/proc_ns.h>
 #include <linux/proc_fs.h>
 #include <linux/sched/task.h>
+#include <linux/idr.h>
 
 #define pid_hashfn(nr, ns)	\
 	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
@@ -53,12 +54,6 @@ int pid_max = PID_MAX_DEFAULT;
 int pid_max_min = RESERVED_PIDS + 1;
 int pid_max_max = PID_MAX_LIMIT;
 
-static inline int mk_pid(struct pid_namespace *pid_ns,
-		struct pidmap *map, int off)
-{
-	return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
-
 #define find_next_offset(map, off)	\
 	find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
 
@@ -68,12 +63,11 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
  * value does not cause lots of bitmaps to be allocated, but
  * the scheme scales to up to 4 million PIDs, runtime.
  */
+
 struct pid_namespace init_pid_ns = {
 	.kref = KREF_INIT(2),
-	.pidmap = {
-		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
-	},
 	.last_pid = 0,
+	.idr = IDR_INIT,
 	.nr_hashed = PIDNS_HASH_ADDING,
 	.level = 0,
 	.child_reaper = &init_task,
@@ -101,138 +95,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
 
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
-static void free_pidmap(struct upid *upid)
-{
-	int nr = upid->nr;
-	struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
-	int offset = nr & BITS_PER_PAGE_MASK;
-
-	clear_bit(offset, map->page);
-	atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
-	/*
-	 * This is the same as saying
-	 *
-	 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
-	 * and that mapping orders 'a' and 'b' with respect to 'base'.
-	 */
-	return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value.  We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
-	int prev;
-	int last_write = base;
-	do {
-		prev = last_write;
-		last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
-	} while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
-	int i, offset, max_scan, pid, last = pid_ns->last_pid;
-	struct pidmap *map;
-
-	pid = last + 1;
-	if (pid >= pid_max)
-		pid = RESERVED_PIDS;
-	offset = pid & BITS_PER_PAGE_MASK;
-	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
-	/*
-	 * If last_pid points into the middle of the map->page we
-	 * want to scan this bitmap block twice, the second time
-	 * we start with offset == 0 (or RESERVED_PIDS).
-	 */
-	max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
-	for (i = 0; i <= max_scan; ++i) {
-		if (unlikely(!map->page)) {
-			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-			/*
-			 * Free the page if someone raced with us
-			 * installing it:
-			 */
-			spin_lock_irq(&pidmap_lock);
-			if (!map->page) {
-				map->page = page;
-				page = NULL;
-			}
-			spin_unlock_irq(&pidmap_lock);
-			kfree(page);
-			if (unlikely(!map->page))
-				return -ENOMEM;
-		}
-		if (likely(atomic_read(&map->nr_free))) {
-			for ( ; ; ) {
-				if (!test_and_set_bit(offset, map->page)) {
-					atomic_dec(&map->nr_free);
-					set_last_pid(pid_ns, last, pid);
-					return pid;
-				}
-				offset = find_next_offset(map, offset);
-				if (offset >= BITS_PER_PAGE)
-					break;
-				pid = mk_pid(pid_ns, map, offset);
-				if (pid >= pid_max)
-					break;
-			}
-		}
-		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
-			++map;
-			offset = 0;
-		} else {
-			map = &pid_ns->pidmap[0];
-			offset = RESERVED_PIDS;
-			if (unlikely(last == offset))
-				break;
-		}
-		pid = mk_pid(pid_ns, map, offset);
-	}
-	return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
-	int offset;
-	struct pidmap *map, *end;
-
-	if (last >= PID_MAX_LIMIT)
-		return -1;
-
-	offset = (last + 1) & BITS_PER_PAGE_MASK;
-	map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
-	end = &pid_ns->pidmap[PIDMAP_ENTRIES];
-	for (; map < end; map++, offset = 0) {
-		if (unlikely(!map->page))
-			continue;
-		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
-		if (offset < BITS_PER_PAGE)
-			return mk_pid(pid_ns, map, offset);
-	}
-	return -1;
-}
-
 void put_pid(struct pid *pid)
 {
 	struct pid_namespace *ns;
@@ -285,10 +147,14 @@ void free_pid(struct pid *pid)
 			break;
 		}
 	}
-	spin_unlock_irqrestore(&pidmap_lock, flags);
 
-	for (i = 0; i <= pid->level; i++)
-		free_pidmap(pid->numbers + i);
+	for (i = 0; i <= pid->level; i++) {
+		struct upid *upid = pid->numbers + i;
+		struct pid_namespace *ns = upid->ns;
+
+		idr_remove(&ns->idr, upid->nr);
+	}
 
+	spin_unlock_irqrestore(&pidmap_lock, flags);
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
@@ -309,7 +175,22 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	tmp = ns;
 	pid->level = ns->level;
 	for (i = ns->level; i >= 0; i--) {
-		nr = alloc_pidmap(tmp);
+		int pid_min = 1;
+		idr_preload(GFP_KERNEL);
+		spin_lock_irq(&pidmap_lock);
+
+		/*
+		 * init really needs pid 1, but after reaching the maximum
+		 * wrap back to RESERVED_PIDS
+		 */
+		if (tmp->idr.idr_next > RESERVED_PIDS)
+			pid_min = RESERVED_PIDS;
+
+		nr = idr_alloc_cyclic(&tmp->idr, ns, pid_min,
+				      pid_max, GFP_ATOMIC);
+		spin_unlock_irq(&pidmap_lock);
+		idr_preload_end();
+
 		if (nr < 0) {
 			retval = nr;
 			goto out_free;
@@ -346,12 +227,14 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	return pid;
 
 out_unlock:
-	spin_unlock_irq(&pidmap_lock);
 	put_pid_ns(ns);
+	spin_unlock_irq(&pidmap_lock);
 
 out_free:
+	spin_lock_irq(&pidmap_lock);
 	while (++i <= ns->level)
-		free_pidmap(pid->numbers + i);
+		idr_remove(&ns->idr, (pid->numbers + i)->nr);
+	spin_unlock_irq(&pidmap_lock);
 
 	kmem_cache_free(ns->pid_cachep, pid);
 	return ERR_PTR(retval);
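Because the IDR is now manipulated under pidmap_lock, the allocation in
alloc_pid() above cannot sleep, yet growing the underlying radix tree may
need memory. The patch handles this with the kernel's standard preload
pattern; in generic form (a sketch of the idiom, not code from the patch;
preload_idr/preload_lock are invented names):

	static DEFINE_IDR(preload_idr);
	static DEFINE_SPINLOCK(preload_lock);

	static int preload_alloc(void *ptr, int min, int max)
	{
		int id;

		idr_preload(GFP_KERNEL);	/* may sleep: preallocate nodes */
		spin_lock_irq(&preload_lock);
		id = idr_alloc_cyclic(&preload_idr, ptr, min, max, GFP_ATOMIC);
		spin_unlock_irq(&preload_lock);
		idr_preload_end();

		return id;
	}

With tree nodes preallocated into the per-CPU cache, the GFP_ATOMIC attempt
inside the lock rarely needs to allocate at all.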
@@ -562,7 +445,7 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 		pid = find_pid_ns(nr, ns);
 		if (pid)
 			break;
-		nr = next_pidmap(ns, nr);
+		idr_get_next(&ns->idr, &nr);
 	} while (nr > 0);
 
 	return pid;
@@ -575,13 +458,16 @@
  */
 void __init pidhash_init(void)
 {
+	unsigned int pidhash_size;
+
 	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
 					   HASH_EARLY | HASH_SMALL | HASH_ZERO,
 					   &pidhash_shift, NULL,
 					   0, 4096);
+	pidhash_size = 1U << pidhash_shift;
 }
 
-void __init pidmap_init(void)
+void __init pid_idr_init(void)
 {
 	/* Verify no one has done anything silly: */
 	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
@@ -593,10 +479,9 @@ void __init pidmap_init(void)
 			     PIDS_PER_CPU_MIN * num_possible_cpus());
 	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
 
-	init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	/* Reserve PID 0. We never call free_pidmap(0) */
-	set_bit(0, init_pid_ns.pidmap[0].page);
-	atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+	idr_init(&init_pid_ns.idr);
+	/* Reserve PID 0. */
+	idr_alloc_cyclic(&init_pid_ns.idr, &init_pid_ns, 0, 0, GFP_KERNEL);
 
 	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
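find_ge_pid() above and zap_pid_ns_processes() below both lean on
idr_get_next(): it returns the entry stored at the lowest ID greater than or
equal to *nr and writes that ID back through the pointer, or returns NULL when
no higher ID is populated. The resulting iteration idiom looks like this
(generic sketch, not code from the patch; iter_idr and iter_all are invented
names):

	static DEFINE_IDR(iter_idr);

	static void iter_all(void)
	{
		int nr = 0;
		void *entry;

		while ((entry = idr_get_next(&iter_idr, &nr)) != NULL) {
			/* 'entry' is the pointer stored at ID 'nr' */
			nr++;	/* step past the ID just visited */
		}
	}

Unlike next_pidmap()'s bitmap walk, this only visits populated slots of the
radix tree rather than scanning whole pages bit by bit.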
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 74a5a7255b4d9c..d0dc231fa6f709 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
 #include <linux/export.h>
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
+#include <linux/idr.h>
 
 struct pid_cache {
 	int nr_ids;
@@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	struct pid_namespace *ns;
 	unsigned int level = parent_pid_ns->level + 1;
 	struct ucounts *ucounts;
-	int i;
 	int err;
 
 	err = -ENOSPC;
@@ -113,17 +113,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	if (ns == NULL)
 		goto out_dec;
 
-	ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!ns->pidmap[0].page)
-		goto out_free;
+	idr_init(&ns->idr);
 
 	ns->pid_cachep = create_pid_cachep(level + 1);
 	if (ns->pid_cachep == NULL)
-		goto out_free_map;
+		goto out_free_idr;
 
 	err = ns_alloc_inum(&ns->ns);
 	if (err)
-		goto out_free_map;
+		goto out_free_idr;
 	ns->ns.ops = &pidns_operations;
 
 	kref_init(&ns->kref);
@@ -134,17 +132,10 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->nr_hashed = PIDNS_HASH_ADDING;
 	INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
-	set_bit(0, ns->pidmap[0].page);
-	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
-	for (i = 1; i < PIDMAP_ENTRIES; i++)
-		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-
 	return ns;
 
-out_free_map:
-	kfree(ns->pidmap[0].page);
-out_free:
+out_free_idr:
+	idr_destroy(&ns->idr);
 	kmem_cache_free(pid_ns_cachep, ns);
 out_dec:
 	dec_pid_namespaces(ucounts);
@@ -164,11 +155,9 @@ static void delayed_free_pidns(struct rcu_head *p)
 
 static void destroy_pid_namespace(struct pid_namespace *ns)
 {
-	int i;
-
 	ns_free_inum(&ns->ns);
-	for (i = 0; i < PIDMAP_ENTRIES; i++)
-		kfree(ns->pidmap[i].page);
+
+	idr_destroy(&ns->idr);
 
 	call_rcu(&ns->rcu, delayed_free_pidns);
 }
@@ -205,10 +194,11 @@ EXPORT_SYMBOL_GPL(put_pid_ns);
 
 void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 {
-	int nr;
+	int nr = 2;
 	int rc;
 	struct task_struct *task, *me = current;
 	int init_pids = thread_group_leader(me) ? 1 : 2;
+	struct pid *pid;
 
 	/* Don't allow any more processes into the pid namespace */
 	disable_pid_allocation(pid_ns);
@@ -236,8 +226,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 *
 	 */
 	read_lock(&tasklist_lock);
-	nr = next_pidmap(pid_ns, 1);
-	while (nr > 0) {
+	pid = idr_get_next(&pid_ns->idr, &nr);
+	while (pid) {
 		rcu_read_lock();
 
 		task = pid_task(find_vpid(nr), PIDTYPE_PID);
@@ -246,7 +236,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 
 		rcu_read_unlock();
 
-		nr = next_pidmap(pid_ns, nr);
+		nr++;
+		pid = idr_get_next(&pid_ns->idr, &nr);
 	}
 	read_unlock(&tasklist_lock);
@@ -307,7 +298,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	 * it should synchronize its usage with external means.
 	 */
 
-	tmp.data = &pid_ns->last_pid;
+	tmp.data = &pid_max;
 	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 }
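One detail in pid_idr_init() earlier is easy to misread: the reservation of
PID 0 passes an 'end' of 0 to idr_alloc_cyclic(). In the IDR API an end value
of 0 (or anything <= 0) means "no upper bound", so the call simply grabs the
lowest free ID, which at boot is 0. Schematically (sketch only; boot_idr is an
invented name):

	static DEFINE_IDR(boot_idr);

	/* end == 0 means "no limit"; the first cyclic allocation yields 0 */
	WARN_ON(idr_alloc_cyclic(&boot_idr, &boot_idr, 0, 0, GFP_KERNEL) != 0);

PID 0 therefore stays permanently allocated in init_pid_ns, mirroring what the
old set_bit(0, ...) on the first bitmap page achieved.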