Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

1 #468

Closed
wants to merge 26 commits into from
Closed

1 #468

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
c57cdfe
Replace bitmap PID implementation with IDR API implementation
gs0510 Aug 21, 2017
68c4af5
idr_preload returns in a non-interruptible context, while
Aug 25, 2017
9cb5ff7
Add locking around the idr_remove calls in the out_free
Aug 25, 2017
ec07a32
The function free_pid is called with irqs and preemption disabled,
Aug 27, 2017
ec9ae10
GFP_KERNEL can sleep, and cannot be used from an atomic context.
Aug 27, 2017
d7885c2
If a task exits before procfs is mounted, proc_flush_task_mnt will
Aug 28, 2017
ca41e8f
Ensure PID allocation starts at 1 during bootup, and wraps back around
Aug 28, 2017
bd03215
The strange looking do { ... } while loop in find_ge_pid turns
Aug 28, 2017
546e83a
x86/asm/64: Clear AC on NMI entries
amluto Aug 8, 2017
b64dade
x86/smpboot: Unbreak CPU0 hotplug
vittyvk Aug 3, 2017
bb90296
x86/cpufeature, kvm/svm: Rename (shorten) the new "virtualized VMSAVE…
suryasaimadhu Aug 1, 2017
fb21e77
x86: Mark various structures and functions as 'static'
Aug 10, 2017
38aa83e
x86/mtrr: Prevent CPU hotplug lock recursion
KAGA-KOKO Aug 15, 2017
39d8752
x86: Fix norandmaps/ADDR_NO_RANDOMIZE
oleg-nesterov Aug 15, 2017
5adf3f6
x86/elf: Remove the unnecessary ADDR_NO_RANDOMIZE checks
oleg-nesterov Aug 15, 2017
52db166
x86/boot/64/clang: Use fixup_pointer() to access 'next_early_pgt'
ramosian-glider Aug 16, 2017
bd711e6
x86: Constify attribute_group structures
ArvindYadavCs Jul 20, 2017
93c9bf4
Sanitize 'move_pages()' permission checks
torvalds Aug 20, 2017
d0e11c3
Linux 4.13-rc6
torvalds Aug 20, 2017
57d5003
Replace bitmap PID implementation with IDR API implementation
gs0510 Aug 21, 2017
395168a
idr_preload returns in a non-interruptible context, while
Aug 25, 2017
55c21c5
Add locking around the idr_remove calls in the out_free
Aug 25, 2017
6efcefe
The function free_pid is called with irqs and preemption disabled,
Aug 27, 2017
fde89ce
GFP_KERNEL can sleep, and cannot be used from an atomic context.
Aug 27, 2017
5dcb605
Merge branch 'master' into master
gs0510 Aug 29, 2017
4c49fe7
Merge pull request #2 from rikvanriel/master
gs0510 Aug 29, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions fs/proc/base.c
Original file line number Diff line number Diff line change
Expand Up @@ -3019,6 +3019,10 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
char buf[PROC_NUMBUF];
struct qstr name;

/* procfs is not mounted. There is nothing to unhash. */
if (!mnt)
return;

name.name = buf;
name.len = snprintf(buf, sizeof(buf), "%d", pid);
/* no ->d_hash() rejects on procfs */
Expand Down
1 change: 1 addition & 0 deletions include/linux/pid.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ extern void transfer_pid(struct task_struct *old, struct task_struct *new,

struct pid_namespace;
extern struct pid_namespace init_pid_ns;
extern int pid_max;

/*
* look up a PID in the hash table. Must be called with the tasklist_lock
Expand Down
5 changes: 3 additions & 2 deletions include/linux/pid_namespace.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <linux/nsproxy.h>
#include <linux/kref.h>
#include <linux/ns_common.h>
#include <linux/idr.h>

struct pidmap {
atomic_t nr_free;
Expand All @@ -29,7 +30,7 @@ enum { /* definitions for pid_namespace's hide_pid field */

struct pid_namespace {
struct kref kref;
struct pidmap pidmap[PIDMAP_ENTRIES];
struct idr idr;
struct rcu_head rcu;
int last_pid;
unsigned int nr_hashed;
Expand Down Expand Up @@ -105,6 +106,6 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)

extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pidmap_init(void);
void pid_idr_init(void);

#endif /* _LINUX_PID_NS_H */
5 changes: 2 additions & 3 deletions init/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/binfmts.h>
#include <linux/delay.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/stackprotector.h>
Expand Down Expand Up @@ -658,7 +659,7 @@ asmlinkage __visible void __init start_kernel(void)
if (late_time_init)
late_time_init();
calibrate_delay();
pidmap_init();
pid_idr_init();
anon_vma_init();
acpi_early_init();
#ifdef CONFIG_X86
Expand Down Expand Up @@ -980,7 +981,6 @@ static inline void mark_readonly(void)
static int __ref kernel_init(void *unused)
{
int ret;

kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
Expand Down Expand Up @@ -1053,7 +1053,6 @@ static noinline void __init kernel_init_freeable(void)
sched_init_smp();

page_alloc_init_late();

do_basic_setup();

/* Open the /dev/console on the rootfs, this should never fail */
Expand Down
191 changes: 38 additions & 153 deletions kernel/pid.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
#include <linux/sched/task.h>
#include <linux/idr.h>

#define pid_hashfn(nr, ns) \
hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
Expand All @@ -53,12 +54,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;

static inline int mk_pid(struct pid_namespace *pid_ns,
struct pidmap *map, int off)
{
return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
}

#define find_next_offset(map, off) \
find_next_zero_bit((map)->page, BITS_PER_PAGE, off)

Expand All @@ -68,12 +63,11 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
* value does not cause lots of bitmaps to be allocated, but
* the scheme scales to up to 4 million PIDs, runtime.
*/

struct pid_namespace init_pid_ns = {
.kref = KREF_INIT(2),
.pidmap = {
[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
},
.last_pid = 0,
.idr = IDR_INIT,
.nr_hashed = PIDNS_HASH_ADDING,
.level = 0,
.child_reaper = &init_task,
Expand Down Expand Up @@ -101,138 +95,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

/*
 * Release the bitmap slot backing one pid number in its owning
 * namespace, making the number available for reallocation.
 */
static void free_pidmap(struct upid *upid)
{
	struct pidmap *map;
	int bit;

	map = upid->ns->pidmap + upid->nr / BITS_PER_PAGE;
	bit = upid->nr & BITS_PER_PAGE_MASK;

	/* Mark the slot reusable, then account for the freed entry. */
	clear_bit(bit, map->page);
	atomic_inc(&map->nr_free);
}

/*
 * Walking the pid number space starting at 'base' (and wrapping at the
 * top), would we encounter 'a' before 'b'?
 */
static int pid_before(int base, int a, int b)
{
	/*
	 * Distance from 'base' in wrapping (modular) unsigned
	 * arithmetic; the pid with the smaller distance is seen first.
	 */
	unsigned int dist_a = (unsigned int)(a - base);
	unsigned int dist_b = (unsigned int)(b - base);

	return dist_a < dist_b;
}

/*
* We might be racing with someone else trying to set pid_ns->last_pid
* at the pid allocation time (there's also a sysctl for this, but racing
* with this one is OK, see comment in kernel/pid_namespace.c about it).
* We want the winner to have the "later" value, because if the
* "earlier" value prevails, then a pid may get reused immediately.
*
* Since pids rollover, it is not sufficient to just pick the bigger
* value. We have to consider where we started counting from.
*
* 'base' is the value of pid_ns->last_pid that we observed when
* we started looking for a pid.
*
* 'pid' is the pid that we eventually found.
*/
static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
{
	int prev;
	int last_write = base;
	do {
		prev = last_write;
		/* cmpxchg returns the value seen in last_pid before the swap. */
		last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
		/*
		 * Retry only while (a) somebody raced an update in
		 * (prev != last_write) AND (b) the racing value is still
		 * "earlier" than ours relative to 'base' — if the racer's
		 * value is later, it wins and we stop (see comment above).
		 */
	} while ((prev != last_write) && (pid_before(base, last_write, pid)));
}

/*
 * Allocate a free pid number in @pid_ns, scanning the namespace's
 * bitmap pages starting just after last_pid and wrapping around to
 * RESERVED_PIDS at pid_max.  Returns the new pid number, -ENOMEM if a
 * bitmap page could not be allocated, or -EAGAIN if the whole space
 * was scanned without finding a free slot.
 */
static int alloc_pidmap(struct pid_namespace *pid_ns)
{
	int i, offset, max_scan, pid, last = pid_ns->last_pid;
	struct pidmap *map;

	pid = last + 1;
	if (pid >= pid_max)
		pid = RESERVED_PIDS;
	offset = pid & BITS_PER_PAGE_MASK;
	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
	/*
	 * If last_pid points into the middle of the map->page we
	 * want to scan this bitmap block twice, the second time
	 * we start with offset == 0 (or RESERVED_PIDS).
	 */
	max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
	for (i = 0; i <= max_scan; ++i) {
		/* Bitmap pages are allocated lazily on first use. */
		if (unlikely(!map->page)) {
			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
			/*
			 * Free the page if someone raced with us
			 * installing it:
			 */
			spin_lock_irq(&pidmap_lock);
			if (!map->page) {
				map->page = page;
				page = NULL;
			}
			spin_unlock_irq(&pidmap_lock);
			kfree(page);
			if (unlikely(!map->page))
				return -ENOMEM;
		}
		/* Only scan pages that still advertise a free slot. */
		if (likely(atomic_read(&map->nr_free))) {
			for ( ; ; ) {
				/*
				 * test_and_set_bit atomically claims the
				 * slot; losing a race just means we try
				 * the next zero bit.
				 */
				if (!test_and_set_bit(offset, map->page)) {
					atomic_dec(&map->nr_free);
					set_last_pid(pid_ns, last, pid);
					return pid;
				}
				offset = find_next_offset(map, offset);
				if (offset >= BITS_PER_PAGE)
					break;
				pid = mk_pid(pid_ns, map, offset);
				if (pid >= pid_max)
					break;
			}
		}
		/* Advance to the next bitmap page, or wrap to the start. */
		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
			++map;
			offset = 0;
		} else {
			map = &pid_ns->pidmap[0];
			offset = RESERVED_PIDS;
			/* Wrapped all the way back to where we began. */
			if (unlikely(last == offset))
				break;
		}
		pid = mk_pid(pid_ns, map, offset);
	}
	return -EAGAIN;
}

/*
 * Return the smallest allocated pid number strictly greater than
 * 'last' in @pid_ns, or -1 if there is none.
 */
int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
{
	struct pidmap *map;
	int bit;

	if (last >= PID_MAX_LIMIT)
		return -1;

	bit = (last + 1) & BITS_PER_PAGE_MASK;
	map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
	while (map < &pid_ns->pidmap[PIDMAP_ENTRIES]) {
		/* A page that was never allocated holds no pids. */
		if (map->page) {
			bit = find_next_bit(map->page, BITS_PER_PAGE, bit);
			if (bit < BITS_PER_PAGE)
				return mk_pid(pid_ns, map, bit);
		}
		map++;
		bit = 0;
	}
	return -1;
}

void put_pid(struct pid *pid)
{
struct pid_namespace *ns;
Expand Down Expand Up @@ -285,10 +147,14 @@ void free_pid(struct pid *pid)
break;
}
}
spin_unlock_irqrestore(&pidmap_lock, flags);

for (i = 0; i <= pid->level; i++)
free_pidmap(pid->numbers + i);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;

idr_remove(&ns->idr, upid->nr);
}
spin_unlock_irqrestore(&pidmap_lock, flags);

call_rcu(&pid->rcu, delayed_put_pid);
}
Expand All @@ -309,7 +175,22 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
nr = alloc_pidmap(tmp);
int pid_min = 1;
idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock);

/*
* init really needs pid 1, but after reaching the maximum
* wrap back to RESERVED_PIDS
*/
if (tmp->idr.idr_next > RESERVED_PIDS)
pid_min = RESERVED_PIDS;

nr = idr_alloc_cyclic(&tmp->idr, ns, pid_min,
pid_max, GFP_ATOMIC);
spin_unlock_irq(&pidmap_lock);
idr_preload_end();

if (nr < 0) {
retval = nr;
goto out_free;
Expand Down Expand Up @@ -346,12 +227,14 @@ struct pid *alloc_pid(struct pid_namespace *ns)
return pid;

out_unlock:
spin_unlock_irq(&pidmap_lock);
put_pid_ns(ns);
spin_unlock_irq(&pidmap_lock);

out_free:
spin_lock_irq(&pidmap_lock);
while (++i <= ns->level)
free_pidmap(pid->numbers + i);
idr_remove(&ns->idr, (pid->numbers + i)->nr);
spin_unlock_irq(&pidmap_lock);

kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
Expand Down Expand Up @@ -562,7 +445,7 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
pid = find_pid_ns(nr, ns);
if (pid)
break;
nr = next_pidmap(ns, nr);
idr_get_next(&ns->idr, &nr);
} while (nr > 0);

return pid;
Expand All @@ -575,13 +458,16 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
/*
 * Allocate the global pid hash table at boot.  The bucket count is
 * chosen by alloc_large_system_hash() based on available memory
 * (capped at 2^18, with a 4096 lower limit) and the chosen shift is
 * written back into pidhash_shift.
 *
 * The original body also computed a local 'pidhash_size' from
 * pidhash_shift but never read it (a dead store that trips
 * -Wunused-but-set-variable); it is removed here.
 */
void __init pidhash_init(void)
{
	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
					   HASH_EARLY | HASH_SMALL | HASH_ZERO,
					   &pidhash_shift, NULL,
					   0, 4096);
}

void __init pidmap_init(void)
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
Expand All @@ -593,10 +479,9 @@ void __init pidmap_init(void)
PIDS_PER_CPU_MIN * num_possible_cpus());
pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);

init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
/* Reserve PID 0. We never call free_pidmap(0) */
set_bit(0, init_pid_ns.pidmap[0].page);
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
idr_init(&init_pid_ns.idr);
/* Reserve PID 0. */
idr_alloc_cyclic(&init_pid_ns.idr, &init_pid_ns, 0, 0, GFP_KERNEL);

init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
Expand Down
Loading