Newer
Older
return 0;
}
#endif
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
/*
 * memcg_charge_kmem - charge @size bytes of kernel memory to @memcg.
 *
 * The kmem counter is charged first. The same amount is then charged through
 * __mem_cgroup_try_charge(), expressed as size >> PAGE_SHIFT pages.
 *
 * Returns 0 on success, or the error reported by res_counter_charge() or
 * __mem_cgroup_try_charge(). When the second step fails with anything other
 * than -EINTR, the kmem charge taken in the first step is given back.
 */
static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
{
	struct res_counter *over_limit;
	struct mem_cgroup *target = memcg;
	bool wait_for_oom;
	int err;

	err = res_counter_charge(&memcg->kmem, size, &over_limit);
	if (err)
		return err;

	/*
	 * We may wait for the OOM killer only under the conditions that the
	 * core page allocator tests for as well.
	 */
	wait_for_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);

	err = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
				      &target, wait_for_oom);
	if (!err)
		return 0;

	if (err != -EINTR) {
		/* Ordinary failure: undo the kmem charge and report it. */
		res_counter_uncharge(&memcg->kmem, size);
		return err;
	}

	/*
	 * -EINTR means __mem_cgroup_try_charge() chose to bypass to root
	 * because of an OOM kill or a fatal signal. We cannot follow it there:
	 * from a kmem/slab perspective the cache has already been selected by
	 * mem_cgroup_kmem_get_cache(), so it is too late to change our minds.
	 * The only options left are failing the allocation or charging this
	 * cgroup anyway, and we must not fail, so force the charge as a
	 * temporary condition.
	 *
	 * This only triggers for a task that entered memcg_charge_kmem() in a
	 * sane state and was OOM-killed during the call above. Tasks that were
	 * already dying when the allocation started should have been directed
	 * to the root cgroup in memcontrol.h.
	 */
	res_counter_charge_nofail(&memcg->res, size, &over_limit);
	if (do_swap_account)
		res_counter_charge_nofail(&memcg->memsw, size, &over_limit);

	return 0;
}
static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
{
res_counter_uncharge(&memcg->res, size);
if (do_swap_account)
res_counter_uncharge(&memcg->memsw, size);
/* Not down to 0 */
if (res_counter_uncharge(&memcg->kmem, size))
return;
/*
* Releases a reference taken in kmem_cgroup_css_offline in case
* this last uncharge is racing with the offlining code or it is
* outliving the memcg existence.
*
* The memory barrier imposed by test&clear is paired with the
* explicit one in memcg_kmem_mark_dead().
*/
if (memcg_kmem_test_and_clear_dead(memcg))
/*
 * Link @cachep (through its memcg_params->list node) into @memcg's
 * memcg_slab_caches list, holding memcg->slab_caches_mutex around the
 * insertion. A NULL @memcg is ignored.
 */
void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
{
	if (memcg) {
		mutex_lock(&memcg->slab_caches_mutex);
		list_add(&cachep->memcg_params->list,
			 &memcg->memcg_slab_caches);
		mutex_unlock(&memcg->slab_caches_mutex);
	}
}
/*
 * Helper for accessing a memcg's index. The index is used to address the
 * child cache array in kmem_cache, and also to derive the cache's name.
 *
 * Returns -1 for a NULL @memcg and memcg->kmemcg_id otherwise; -1 stands for
 * "this is not a kmem-limited memcg".
 */
int memcg_cache_id(struct mem_cgroup *memcg)
{
	if (!memcg)
		return -1;

	return memcg->kmemcg_id;
}
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
/*
 * Reserve a kmem cache index for @memcg and make room for it in every cache.
 *
 * During normal operation this ends up being protected by the set_limit
 * mutex, because that is its main call site. It can also be reached when a
 * new cache is created and its parent is kmem-limited; that path will have
 * to hold set_limit_mutex as well.
 *
 * Returns 0 on success. A negative value from ida_simple_get() or a non-zero
 * value from memcg_update_all_caches() is passed back unchanged; in the
 * latter case the index is released and the "activated" mark is cleared.
 */
int memcg_update_cache_sizes(struct mem_cgroup *memcg)
{
	int id = ida_simple_get(&kmem_limited_groups, 0,
				MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
	int err;

	if (id < 0)
		return id;

	/*
	 * From here on kmem_accounted is no longer 0, which guarantees that
	 * only one process will set the following boolean to true. A
	 * test_and_set is unnecessary because we are protected by the
	 * set_limit_mutex anyway.
	 */
	memcg_kmem_set_activated(memcg);

	/* Index @id needs arrays with at least id + 1 slots. */
	err = memcg_update_all_caches(id + 1);
	if (!err) {
		memcg->kmemcg_id = id;
		INIT_LIST_HEAD(&memcg->memcg_slab_caches);
		mutex_init(&memcg->slab_caches_mutex);
		return 0;
	}

	/* Growing the arrays failed: roll back the two steps above. */
	ida_simple_remove(&kmem_limited_groups, id);
	memcg_kmem_clear_activated(memcg);
	return err;
}
/*
 * Number of slots a per-cache memcg array should have for @num_groups groups:
 * twice the group count, kept within
 * [MEMCG_CACHES_MIN_SIZE, MEMCG_CACHES_MAX_SIZE]. A non-positive @num_groups
 * yields 0.
 */
static size_t memcg_caches_array_size(int num_groups)
{
	ssize_t slots;

	if (num_groups <= 0)
		return 0;

	slots = 2 * num_groups;

	if (slots < MEMCG_CACHES_MIN_SIZE) {
		slots = MEMCG_CACHES_MIN_SIZE;
	} else if (slots > MEMCG_CACHES_MAX_SIZE) {
		slots = MEMCG_CACHES_MAX_SIZE;
	}

	return slots;
}
/*
 * Raise memcg_limited_groups_array_size so that it can hold @num groups; the
 * value never shrinks.
 *
 * The current array size should be updated iff all cache updates succeed,
 * which can only be decided from the slab side. The slab mutex needs to be
 * held when calling this.
 */
void memcg_update_array_size(int num)
{
	if (num <= memcg_limited_groups_array_size)
		return;

	memcg_limited_groups_array_size = memcg_caches_array_size(num);
}
/*
 * Forward declaration: the work handler is defined further down in this file
 * but is referenced earlier (it is the handler passed to INIT_WORK() for a
 * cache's memcg_params->destroy work item).
 */
static void kmem_cache_destroy_work_func(struct work_struct *w);
int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
struct memcg_cache_params *cur_params = s->memcg_params;
VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
if (num_groups > memcg_limited_groups_array_size) {
int i;
ssize_t size = memcg_caches_array_size(num_groups);
size *= sizeof(void *);
size += offsetof(struct memcg_cache_params, memcg_caches);
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
s->memcg_params = kzalloc(size, GFP_KERNEL);
if (!s->memcg_params) {
s->memcg_params = cur_params;
return -ENOMEM;
}
s->memcg_params->is_root_cache = true;
/*
* There is the chance it will be bigger than
* memcg_limited_groups_array_size, if we failed an allocation
* in a cache, in which case all caches updated before it, will
* have a bigger array.
*
* But if that is the case, the data after
* memcg_limited_groups_array_size is certainly unused
*/
for (i = 0; i < memcg_limited_groups_array_size; i++) {
if (!cur_params->memcg_caches[i])
continue;
s->memcg_params->memcg_caches[i] =
cur_params->memcg_caches[i];
}
/*
* Ideally, we would wait until all caches succeed, and only
* then free the old one. But this is not worth the extra
* pointer per-cache we'd have to have for this.
*
* It is not a big deal if some caches are left with a size
* bigger than the others. And all updates will reset this
* anyway.
*/
kfree(cur_params);
}
return 0;
}
int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
struct kmem_cache *root_cache)
size_t size;
if (!memcg_kmem_enabled())
return 0;
if (!memcg) {
size = offsetof(struct memcg_cache_params, memcg_caches);
size += memcg_limited_groups_array_size * sizeof(void *);
} else
size = sizeof(struct memcg_cache_params);
s->memcg_params = kzalloc(size, GFP_KERNEL);
if (!s->memcg_params)
return -ENOMEM;
s->memcg_params->memcg = memcg;
s->memcg_params->root_cache = root_cache;
INIT_WORK(&s->memcg_params->destroy,
kmem_cache_destroy_work_func);
} else
s->memcg_params->is_root_cache = true;
return 0;
}
/*
 * memcg_release_cache - tear down the memcg bookkeeping attached to @s.
 *
 * For a root cache only the memcg_params structure is freed. For a per-memcg
 * cache the function additionally
 *   - clears the cache's slot in its root cache's memcg_caches array,
 *   - unlinks it from the owning memcg's slab cache list (under
 *     slab_caches_mutex), and
 *   - drops a reference on the memcg's css.
 *
 * The previous body jumped to an "out" label that did not exist in the
 * function; the root-cache shortcut is now expressed as a plain conditional
 * so that both kinds of cache reach the final kfree().
 */
void memcg_release_cache(struct kmem_cache *s)
{
	struct kmem_cache *root;
	struct mem_cgroup *memcg;
	int id;

	/*
	 * This happens, for instance, when a root cache goes away before we
	 * add any memcg.
	 */
	if (!s->memcg_params)
		return;

	if (!s->memcg_params->is_root_cache) {
		memcg = s->memcg_params->memcg;
		id = memcg_cache_id(memcg);

		root = s->memcg_params->root_cache;
		root->memcg_params->memcg_caches[id] = NULL;

		mutex_lock(&memcg->slab_caches_mutex);
		list_del(&s->memcg_params->list);
		mutex_unlock(&memcg->slab_caches_mutex);

		css_put(&memcg->css);
	}

	kfree(s->memcg_params);
}
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
/*
* During the creation of a new cache, we need to disable our accounting
* mechanism altogether. This is true even if we are not creating, but rather
* just enqueuing new caches to be created.
*
* This is because that process will trigger allocations; some visible, like
* explicit kmallocs to auxiliary data structures, name strings and internal
* cache structures; some well concealed, like INIT_WORK() that can allocate
* objects during debug.
*
* If any allocation happens during memcg_kmem_get_cache, we will recurse back
* to it. This may not be a bounded recursion: since the first cache creation
* failed to complete (waiting on the allocation), we'll just try to create the
* cache again, failing at the same point.
*
* memcg_kmem_get_cache is prepared to abort after seeing a positive count of
* memcg_kmem_skip_account. So we enclose anything that might allocate memory
* inside the following two functions.
*/
/*
 * Open a section in which kmem accounting is skipped for the current task by
 * raising current->memcg_kmem_skip_account. Meant to be closed again with
 * memcg_resume_kmem_account(). The current task must have an mm.
 */
static inline void memcg_stop_kmem_account(void)
{
	VM_BUG_ON(!current->mm);
	++current->memcg_kmem_skip_account;
}
/*
 * Close a section opened with memcg_stop_kmem_account() by lowering
 * current->memcg_kmem_skip_account again. The current task must have an mm.
 */
static inline void memcg_resume_kmem_account(void)
{
	VM_BUG_ON(!current->mm);
	--current->memcg_kmem_skip_account;
}
static void kmem_cache_destroy_work_func(struct work_struct *w)
{
struct kmem_cache *cachep;
struct memcg_cache_params *p;
p = container_of(w, struct memcg_cache_params, destroy);
cachep = memcg_params_to_cache(p);
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
/*
* If we get down to 0 after shrink, we could delete right away.
* However, memcg_release_pages() already puts us back in the workqueue
* in that case. If we proceed deleting, we'll get a dangling
* reference, and removing the object from the workqueue in that case
* is unnecessary complication. We are not a fast path.
*
* Note that this case is fundamentally different from racing with
* shrink_slab(): if memcg_cgroup_destroy_cache() is called in
* kmem_cache_shrink, not only we would be reinserting a dead cache
* into the queue, but doing so from inside the worker racing to
* destroy it.
*
* So if we aren't down to zero, we'll just schedule a worker and try
* again
*/
if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
kmem_cache_shrink(cachep);
if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
return;
} else
kmem_cache_destroy(cachep);
}
/*
 * Schedule the deferred destruction of @cachep.
 *
 * Nothing is done unless the cache is marked dead, and nothing is done if its
 * destroy work item is already pending.
 */
void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
{
	/*
	 * There are many ways in which we can get here.
	 *
	 * We can get to a memory-pressure situation while the delayed work is
	 * still pending to run. The vmscan shrinkers can then release all
	 * cache memory and get us to destruction. If this is the case, we'll
	 * be executed twice, which is a bug (the second time will execute over
	 * bogus data). In this case, cancelling the work should be fine.
	 *
	 * But we can also get here from the worker itself, if
	 * kmem_cache_shrink is enough to shake all the remaining objects and
	 * get the page count to 0. In this case, we'll deadlock if we try to
	 * cancel the work (the worker runs with an internal lock held, which
	 * is the same lock we would hold for cancel_work_sync().)
	 *
	 * Since we can't possibly know who got us here, just refrain from
	 * running if there is already work pending.
	 *
	 * The destruction itself has to be deferred to a workqueue, because
	 * we might currently be in a context that cannot sleep.
	 */
	if (cachep->memcg_params->dead &&
	    !work_pending(&cachep->memcg_params->destroy))
		schedule_work(&cachep->memcg_params->destroy);
}
/*
* This lock protects updaters, not readers. We want readers to be as fast as
* they can, and they will either see NULL or a valid cache value. Our model
* allow them to see NULL, in which case the root memcg will be selected.
*
* We need this lock because multiple allocations to the same cache from a
* memcg that does not have its own copy yet will spawn more than one worker.
* Only one of them can create the cache.
*/
static DEFINE_MUTEX(memcg_cache_mutex);
/*
* Called with memcg_cache_mutex held
*/
static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
struct kmem_cache *s)
{
struct kmem_cache *new;
static char *tmp_name = NULL;
lockdep_assert_held(&memcg_cache_mutex);
/*
* kmem_cache_create_memcg duplicates the given name and
* cgroup_name for this name requires RCU context.
* This static temporary buffer is used to prevent from
* pointless shortliving allocation.
*/
if (!tmp_name) {
tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
if (!tmp_name)
return NULL;
}
rcu_read_lock();
snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
rcu_read_unlock();
new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
if (new)
new->allocflags |= __GFP_KMEMCG;
return new;
}
/*
 * memcg_create_kmem_cache - get or create @memcg's copy of root cache @cachep.
 *
 * Runs under memcg_cache_mutex so that only one caller creates the copy.
 *
 * Returns
 *   - the already registered per-memcg cache, if the slot is occupied,
 *   - the newly duplicated cache, after publishing it in the root cache's
 *     memcg_caches array, or
 *   - @cachep itself if duplication failed.
 * In the first and the last case a reference on memcg->css is dropped.
 *
 * The closing brace of the "slot already occupied" branch was missing, which
 * made the rest of the function part of that branch; it is restored here.
 */
static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
						  struct kmem_cache *cachep)
{
	struct kmem_cache *new_cachep;
	int idx;

	BUG_ON(!memcg_can_account_kmem(memcg));

	idx = memcg_cache_id(memcg);

	mutex_lock(&memcg_cache_mutex);

	/* Somebody else may have created the cache in the meantime. */
	new_cachep = cachep->memcg_params->memcg_caches[idx];
	if (new_cachep) {
		css_put(&memcg->css);
		goto out;
	}

	new_cachep = kmem_cache_dup(memcg, cachep);
	if (new_cachep == NULL) {
		/* Fall back to the root cache. */
		new_cachep = cachep;
		css_put(&memcg->css);
		goto out;
	}

	atomic_set(&new_cachep->memcg_params->nr_pages, 0);

	cachep->memcg_params->memcg_caches[idx] = new_cachep;
	/*
	 * the readers won't lock, make sure everybody sees the updated value,
	 * so they won't put stuff in the queue again for no reason
	 */
	wmb();
out:
	mutex_unlock(&memcg_cache_mutex);
	return new_cachep;
}
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
/*
 * Destroy every per-memcg child cache registered in root cache @s.
 *
 * Does nothing when @s has no memcg_params or is not a root cache.
 */
void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
{
	int slot;

	if (!s->memcg_params || !s->memcg_params->is_root_cache)
		return;

	/*
	 * If the cache is being destroyed, we trust that there is no one else
	 * requesting objects from it. Even if there are, the sanity checks in
	 * kmem_cache_destroy should catch this ill-case.
	 *
	 * Still, we don't want anyone else freeing memcg_caches under our
	 * noses, which can happen if a new memcg comes to life. As usual,
	 * we'll take the set_limit_mutex to protect ourselves against this.
	 */
	mutex_lock(&set_limit_mutex);
	for (slot = 0; slot < memcg_limited_groups_array_size; slot++) {
		struct kmem_cache *child = s->memcg_params->memcg_caches[slot];

		if (child) {
			/*
			 * We will now manually delete the caches, so to avoid
			 * races we need to cancel all pending destruction
			 * workers and proceed with destruction ourselves.
			 *
			 * kmem_cache_destroy() will call kmem_cache_shrink
			 * internally, and that could spawn the workers again:
			 * it is likely that the cache still has active pages
			 * until this very moment. This would lead us back to
			 * mem_cgroup_destroy_cache.
			 *
			 * But that will not execute at all if the "dead" flag
			 * is not set, so flip it down to guarantee we are in
			 * control.
			 */
			child->memcg_params->dead = false;
			cancel_work_sync(&child->memcg_params->destroy);
			kmem_cache_destroy(child);
		}
	}
	mutex_unlock(&set_limit_mutex);
}
/*
 * Request to create a per-memcg copy of a cache from a workqueue. One is
 * allocated per request when the creation is enqueued and freed by the work
 * handler once the creation attempt has been made.
 */
struct create_work {
/* memcg the new cache is created for */
struct mem_cgroup *memcg;
/* cache to be duplicated for @memcg */
struct kmem_cache *cachep;
/* work item; the handler recovers this structure via container_of() */
struct work_struct work;
};
/*
 * Mark every slab cache on @memcg's memcg_slab_caches list as dead and
 * schedule its destroy work. The list is walked under slab_caches_mutex.
 * Does nothing if kmem accounting is not active for @memcg.
 */
static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
	struct memcg_cache_params *pos;

	if (!memcg_kmem_is_active(memcg))
		return;

	mutex_lock(&memcg->slab_caches_mutex);
	list_for_each_entry(pos, &memcg->memcg_slab_caches, list) {
		struct kmem_cache *victim = memcg_params_to_cache(pos);

		victim->memcg_params->dead = true;
		schedule_work(&victim->memcg_params->destroy);
	}
	mutex_unlock(&memcg->slab_caches_mutex);
}
/*
 * Work handler for a queued cache-creation request: @w is embedded in a
 * struct create_work. Performs the creation and then frees the request.
 */
static void memcg_create_cache_work_func(struct work_struct *w)
{
	struct create_work *req = container_of(w, struct create_work, work);

	memcg_create_kmem_cache(req->memcg, req->cachep);
	kfree(req);
}
/*
 * Enqueue the creation of a per-memcg kmem_cache.
 *
 * A request describing (@memcg, @cachep) is allocated with GFP_NOWAIT and
 * handed to the workqueue. If that allocation fails nothing is queued and a
 * reference on memcg->css is dropped instead.
 */
static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
					 struct kmem_cache *cachep)
{
	struct create_work *req = kmalloc(sizeof(*req), GFP_NOWAIT);

	if (!req) {
		css_put(&memcg->css);
		return;
	}

	req->memcg = memcg;
	req->cachep = cachep;

	INIT_WORK(&req->work, memcg_create_cache_work_func);
	schedule_work(&req->work);
}
/*
 * Enqueue the creation of @memcg's copy of @cachep with kmem accounting
 * switched off for the current task while the request is being set up.
 */
static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
				       struct kmem_cache *cachep)
{
	/*
	 * Accounting has to be stopped around the kmalloc: if the matching
	 * kmalloc cache has not been created yet, the first allocation in
	 * __memcg_create_cache_enqueue would recurse.
	 *
	 * It is better to cover the whole helper rather than just that call.
	 * Depending on the debugging options enabled, INIT_WORK(), for
	 * instance, can trigger an allocation, and that would make us recurse
	 * too. Since we cannot allow ourselves back into memcg_kmem_get_cache
	 * at this point, wrapping everything is the safest choice.
	 */
	memcg_stop_kmem_account();

	__memcg_create_cache_enqueue(memcg, cachep);

	memcg_resume_kmem_account();
}
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
/*
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
*
* If the cache does not exist yet, if we are the first user of it,
* we either create it immediately, if possible, or create it asynchronously
* in a workqueue.
* In the latter case, we will let the current allocation go through with
* the original cache.
*
* Can't be called in interrupt context or from kernel threads.
* This function needs to be called with rcu_read_lock() held.
*/
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
gfp_t gfp)
{
struct mem_cgroup *memcg;
int idx;
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
if (!current->mm || current->memcg_kmem_skip_account)
return cachep;
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
if (!memcg_can_account_kmem(memcg))
idx = memcg_cache_id(memcg);
/*
* barrier to mare sure we're always seeing the up to date value. The
* code updating memcg_caches will issue a write barrier to match this.
*/
read_barrier_depends();
if (likely(cachep->memcg_params->memcg_caches[idx])) {
cachep = cachep->memcg_params->memcg_caches[idx];
goto out;
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
/* The corresponding put will be done in the workqueue. */
if (!css_tryget(&memcg->css))
goto out;
rcu_read_unlock();
/*
* If we are in a safe context (can wait, and not in interrupt
* context), we could be be predictable and return right away.
* This would guarantee that the allocation being performed
* already belongs in the new cache.
*
* However, there are some clashes that can arrive from locking.
* For instance, because we acquire the slab_mutex while doing
* kmem_cache_dup, this means no further allocation could happen
* with the slab_mutex held.
*
* Also, because cache creation issue get_online_cpus(), this
* creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
* that ends up reversed during cpu hotplug. (cpuset allocates
* a bunch of GFP_KERNEL memory during cpuup). Due to all that,
* better to defer everything.
*/
memcg_create_cache_enqueue(memcg, cachep);
return cachep;
out:
rcu_read_unlock();
return cachep;
}
EXPORT_SYMBOL(__memcg_kmem_get_cache);
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
/*
* We need to verify if the allocation against current->mm->owner's memcg is
* possible for the given order. But the page is not allocated yet, so we'll
* need a further commit step to do the final arrangements.
*
* It is possible for the task to switch cgroups in this mean time, so at
* commit time, we can't rely on task conversion any longer. We'll then use
* the handle argument to return to the caller which cgroup we should commit
* against. We could also return the memcg directly and avoid the pointer
* passing, but a boolean return value gives better semantics considering
* the compiled-out case as well.
*
* Returning true means the allocation is possible.
*/
/*
 * @gfp:	allocation flags, passed on to memcg_charge_kmem()
 * @_memcg:	out parameter; set to the memcg that was charged, or to NULL
 *		when no charge was made
 * @order:	allocation order; PAGE_SIZE << order bytes are charged
 *
 * Returns true when the allocation may proceed: either the charge succeeded
 * or accounting was skipped (in which case *@_memcg stays NULL). Returns
 * false when memcg_charge_kmem() failed.
 *
 * (Stray line-number text that had ended up inside the function body has been
 * removed; the logic is unchanged.)
 */
bool
__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
{
	struct mem_cgroup *memcg;
	int ret;

	*_memcg = NULL;

	/*
	 * Disabling accounting is only relevant for some specific memcg
	 * internal allocations. Therefore we would initially not have such
	 * check here, since direct calls to the page allocator that are marked
	 * with GFP_KMEMCG only happen outside memcg core. We are mostly
	 * concerned with cache allocations, and by having this test at
	 * memcg_kmem_get_cache, we are already able to relay the allocation to
	 * the root cache and bypass the memcg cache altogether.
	 *
	 * There is one exception, though: the SLUB allocator does not create
	 * large order caches, but rather services large kmallocs directly from
	 * the page allocator. Therefore, the following sequence when backed by
	 * the SLUB allocator:
	 *
	 *	memcg_stop_kmem_account();
	 *	kmalloc(<large_number>)
	 *	memcg_resume_kmem_account();
	 *
	 * would effectively ignore the fact that we should skip accounting,
	 * since it will drive us directly to this function without passing
	 * through the cache selector memcg_kmem_get_cache. Such large
	 * allocations are extremely rare but can happen, for instance, for the
	 * cache arrays. We bring this test here.
	 */
	if (!current->mm || current->memcg_kmem_skip_account)
		return true;

	memcg = try_get_mem_cgroup_from_mm(current->mm);

	/*
	 * very rare case described in mem_cgroup_from_task. Unfortunately there
	 * isn't much we can do without complicating this too much, and it would
	 * be gfp-dependent anyway. Just let it go
	 */
	if (unlikely(!memcg))
		return true;

	if (!memcg_can_account_kmem(memcg)) {
		css_put(&memcg->css);
		return true;
	}

	ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
	if (!ret)
		*_memcg = memcg;

	/* The css reference is dropped on success and on failure alike. */
	css_put(&memcg->css);
	return (ret == 0);
}
/*
 * Finish a kmem page charge made against @memcg (which must not be the root
 * memcg).
 *
 * With a valid @page, the page's page_cgroup is bound to @memcg and marked
 * used, under the page_cgroup lock. With a NULL @page (the allocation
 * failed), the PAGE_SIZE << order bytes charged earlier are given back.
 */
void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
				int order)
{
	struct page_cgroup *pc;

	VM_BUG_ON(mem_cgroup_is_root(memcg));

	if (page) {
		pc = lookup_page_cgroup(page);

		lock_page_cgroup(pc);
		pc->mem_cgroup = memcg;
		SetPageCgroupUsed(pc);
		unlock_page_cgroup(pc);
		return;
	}

	/* The page allocation failed. Revert */
	memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
}
/*
 * Undo the kmem charge of @page, if it carries one.
 *
 * If the page's page_cgroup is marked used, the mark is cleared under the
 * page_cgroup lock and PAGE_SIZE << order bytes are uncharged from the memcg
 * recorded there. Pages that are not marked used are left alone.
 */
void __memcg_kmem_uncharge_pages(struct page *page, int order)
{
	struct page_cgroup *pc = lookup_page_cgroup(page);
	struct mem_cgroup *owner = NULL;

	/*
	 * Fast unlocked return. Theoretically might have changed, have to
	 * check again after locking.
	 */
	if (!PageCgroupUsed(pc))
		return;

	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		owner = pc->mem_cgroup;
		ClearPageCgroupUsed(pc);
	}
	unlock_page_cgroup(pc);

	/*
	 * We trust that only if there is a memcg associated with the page, it
	 * is a valid allocation
	 */
	if (owner) {
		VM_BUG_ON(mem_cgroup_is_root(owner));
		memcg_uncharge_kmem(owner, PAGE_SIZE << order);
	}
}
#else
/*
 * Empty stand-in used on the #else side of the CONFIG_MEMCG_KMEM conditional,
 * so that callers do not need their own #ifdef.
 */
static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
/*
* Because tail pages are not marked as "used", set it. We're under
* zone->lru_lock, 'splitting on pmd' and compound_lock.
* charge/uncharge will be never happen and move_account() is done under
* compound_lock(), so we don't have to take care of races.
void mem_cgroup_split_huge_fixup(struct page *head)
{
struct page_cgroup *head_pc = lookup_page_cgroup(head);
struct page_cgroup *pc;
if (mem_cgroup_disabled())
return;
memcg = head_pc->mem_cgroup;
for (i = 1; i < HPAGE_PMD_NR; i++) {
pc = head_pc + i;
smp_wmb();/* see __commit_charge() */
pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
}
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
HPAGE_PMD_NR);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
 * Move @nr_pages worth of statistic @idx from @from to @to.
 *
 * The counters are updated through the __this_cpu_* accessors with
 * preemption disabled, so both updates hit the same CPU's slot.
 *
 * Two defects of the previous version are fixed:
 *
 *  - The decrement was written as __this_cpu_add(..., -nr_pages). @nr_pages
 *    is an unsigned int, so -nr_pages is a large positive unsigned value
 *    (UINT_MAX for one page); added to a counter wider than int it increases
 *    the counter instead of decreasing it. The value is now converted to a
 *    signed long before it is negated.
 *
 *  - The WARN_ON_ONCE(from->stat->count[idx] < nr_pages) underflow check is
 *    gone. It read the counter through a plain pointer dereference although
 *    the counter is only ever accessed through per-cpu accessors here, and a
 *    single CPU's slot being smaller than @nr_pages would not indicate an
 *    underflow of the overall statistic anyway.
 */
static inline
void mem_cgroup_move_account_page_stat(struct mem_cgroup *from,
					struct mem_cgroup *to,
					unsigned int nr_pages,
					enum mem_cgroup_stat_index idx)
{
	/* Update stat data for mem_cgroup */
	preempt_disable();
	__this_cpu_add(from->stat->count[idx], -(long)nr_pages);
	__this_cpu_add(to->stat->count[idx], nr_pages);
	preempt_enable();
}
* mem_cgroup_move_account - move account of the page
* @nr_pages: number of regular pages (>1 for huge pages)
* @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
* The caller must confirm following.
* - page is not on LRU (isolate_page() is useful.)
* - compound_lock is held when nr_pages > 1
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
static int mem_cgroup_move_account(struct page *page,
unsigned int nr_pages,
struct page_cgroup *pc,
struct mem_cgroup *from,
struct mem_cgroup *to)
unsigned long flags;
int ret;
bool anon = PageAnon(page);
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(page));
/*
* The page is isolated from LRU. So, collapse function
* will not handle this page. But page splitting can happen.
* Do this check under compound_page_lock(). The caller should
* hold it.
*/
ret = -EBUSY;
if (nr_pages > 1 && !PageTransHuge(page))
goto out;
lock_page_cgroup(pc);
ret = -EINVAL;
if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
goto unlock;
move_lock_mem_cgroup(from, &flags);
if (!anon && page_mapped(page))
mem_cgroup_move_account_page_stat(from, to, nr_pages,
MEM_CGROUP_STAT_FILE_MAPPED);
if (PageWriteback(page))
mem_cgroup_move_account_page_stat(from, to, nr_pages,
MEM_CGROUP_STAT_WRITEBACK);
mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
/* caller should have done css_get */
mem_cgroup_charge_statistics(to, page, anon, nr_pages);
move_unlock_mem_cgroup(from, &flags);
ret = 0;
unlock:
/*
* check events
*/
memcg_check_events(to, page);
memcg_check_events(from, page);
return ret;
}
/**
* mem_cgroup_move_parent - moves page to the parent group
* @page: the page to move
* @pc: page_cgroup of the page
* @child: page's cgroup
*
* move charges to its parent or the root cgroup if the group has no
* parent (aka use_hierarchy==0).
* Although this might fail (get_page_unless_zero, isolate_lru_page or
* mem_cgroup_move_account fails) the failure is always temporary and
* it signals a race with a page removal/uncharge or migration. In the
* first case the page is on the way out and it will vanish from the LRU
* on the next attempt and the call should be retried later.
* Isolation from the LRU fails only if page has been isolated from
* the LRU since we looked at it and that usually means either global
* reclaim or migration going on. The page will either get back to the
* LRU or vanish.
* Finaly mem_cgroup_move_account fails only if the page got uncharged
* (!PageCgroupUsed) or moved to a different group. The page will
* disappear in the next attempt.
static int mem_cgroup_move_parent(struct page *page,
struct page_cgroup *pc,
struct mem_cgroup *child)
{
struct mem_cgroup *parent;
unsigned int nr_pages;
unsigned long uninitialized_var(flags);
VM_BUG_ON(mem_cgroup_is_root(child));
ret = -EBUSY;
if (!get_page_unless_zero(page))
goto out;
if (isolate_lru_page(page))
goto put;
nr_pages = hpage_nr_pages(page);
parent = parent_mem_cgroup(child);
/*
* If no parent, move charges to root cgroup.
*/
if (!parent)
parent = root_mem_cgroup;
if (nr_pages > 1) {
VM_BUG_ON(!PageTransHuge(page));
flags = compound_lock_irqsave(page);
ret = mem_cgroup_move_account(page, nr_pages,
pc, child, parent);
if (!ret)
__mem_cgroup_cancel_local_charge(child, nr_pages);
if (nr_pages > 1)
compound_unlock_irqrestore(page, flags);
return ret;
}
/*
* Charge the memory controller for page usage.
* Return
* 0 if the charge was successful
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, enum charge_type ctype)
unsigned int nr_pages = 1;
nr_pages <<= compound_order(page);
/*
* Never OOM-kill a process for a huge page. The
* fault handler will fall back to regular pages.
*/
oom = false;
ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
return ret;
__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
/*
 * Charge a newly allocated anonymous page to the memcg of @mm.
 *
 * The page must not be mapped yet, must not belong to a non-anonymous
 * mapping, and @mm must be non-NULL; each of these is asserted. The charge is
 * delegated to mem_cgroup_charge_common() with MEM_CGROUP_CHARGE_TYPE_ANON,
 * whose return value is passed back unchanged.
 *
 * (Stray non-code text that had ended up in the middle of the return
 * statement has been removed.)
 */
int mem_cgroup_newpage_charge(struct page *page,
			      struct mm_struct *mm, gfp_t gfp_mask)
{
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping && !PageAnon(page));
	VM_BUG_ON(!mm);

	return mem_cgroup_charge_common(page, mm, gfp_mask,
					MEM_CGROUP_CHARGE_TYPE_ANON);
}
/*
* While swap-in, try_charge -> commit or cancel, the page is locked.
* And when try_charge() successfully returns, one refcnt to memcg without
* struct page_cgroup is acquired. This refcnt will be consumed by
* "commit()" or removed by "cancel()"
*/