Newer
Older
return size;
}
/*
* We should update the current array size iff all caches updates succeed. This
* can only be done from the slab side. The slab mutex needs to be held when
* calling this.
*/
void memcg_update_array_size(int num)
{
if (num > memcg_limited_groups_array_size)
memcg_limited_groups_array_size = memcg_caches_array_size(num);
}
static void kmem_cache_destroy_work_func(struct work_struct *w);
int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
struct memcg_cache_params *cur_params = s->memcg_params;
VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
if (num_groups > memcg_limited_groups_array_size) {
int i;
ssize_t size = memcg_caches_array_size(num_groups);
size *= sizeof(void *);
size += sizeof(struct memcg_cache_params);
s->memcg_params = kzalloc(size, GFP_KERNEL);
if (!s->memcg_params) {
s->memcg_params = cur_params;
return -ENOMEM;
}
INIT_WORK(&s->memcg_params->destroy,
kmem_cache_destroy_work_func);
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
s->memcg_params->is_root_cache = true;
/*
* There is the chance it will be bigger than
* memcg_limited_groups_array_size, if we failed an allocation
* in a cache, in which case all caches updated before it, will
* have a bigger array.
*
* But if that is the case, the data after
* memcg_limited_groups_array_size is certainly unused
*/
for (i = 0; i < memcg_limited_groups_array_size; i++) {
if (!cur_params->memcg_caches[i])
continue;
s->memcg_params->memcg_caches[i] =
cur_params->memcg_caches[i];
}
/*
* Ideally, we would wait until all caches succeed, and only
* then free the old one. But this is not worth the extra
* pointer per-cache we'd have to have for this.
*
* It is not a big deal if some caches are left with a size
* bigger than the others. And all updates will reset this
* anyway.
*/
kfree(cur_params);
}
return 0;
}
int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
struct kmem_cache *root_cache)
{
size_t size = sizeof(struct memcg_cache_params);
if (!memcg_kmem_enabled())
return 0;
if (!memcg)
size += memcg_limited_groups_array_size * sizeof(void *);
s->memcg_params = kzalloc(size, GFP_KERNEL);
if (!s->memcg_params)
return -ENOMEM;
INIT_WORK(&s->memcg_params->destroy,
kmem_cache_destroy_work_func);
s->memcg_params->memcg = memcg;
s->memcg_params->root_cache = root_cache;
} else
s->memcg_params->is_root_cache = true;
return 0;
}
void memcg_release_cache(struct kmem_cache *s)
{
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
struct kmem_cache *root;
struct mem_cgroup *memcg;
int id;
/*
* This happens, for instance, when a root cache goes away before we
* add any memcg.
*/
if (!s->memcg_params)
return;
if (s->memcg_params->is_root_cache)
goto out;
memcg = s->memcg_params->memcg;
id = memcg_cache_id(memcg);
root = s->memcg_params->root_cache;
root->memcg_params->memcg_caches[id] = NULL;
mem_cgroup_put(memcg);
mutex_lock(&memcg->slab_caches_mutex);
list_del(&s->memcg_params->list);
mutex_unlock(&memcg->slab_caches_mutex);
out:
kfree(s->memcg_params);
}
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
/*
* During the creation a new cache, we need to disable our accounting mechanism
* altogether. This is true even if we are not creating, but rather just
* enqueing new caches to be created.
*
* This is because that process will trigger allocations; some visible, like
* explicit kmallocs to auxiliary data structures, name strings and internal
* cache structures; some well concealed, like INIT_WORK() that can allocate
* objects during debug.
*
* If any allocation happens during memcg_kmem_get_cache, we will recurse back
* to it. This may not be a bounded recursion: since the first cache creation
* failed to complete (waiting on the allocation), we'll just try to create the
* cache again, failing at the same point.
*
* memcg_kmem_get_cache is prepared to abort after seeing a positive count of
* memcg_kmem_skip_account. So we enclose anything that might allocate memory
* inside the following two functions.
*/
static inline void memcg_stop_kmem_account(void)
{
VM_BUG_ON(!current->mm);
current->memcg_kmem_skip_account++;
}
static inline void memcg_resume_kmem_account(void)
{
VM_BUG_ON(!current->mm);
current->memcg_kmem_skip_account--;
}
static void kmem_cache_destroy_work_func(struct work_struct *w)
{
struct kmem_cache *cachep;
struct memcg_cache_params *p;
p = container_of(w, struct memcg_cache_params, destroy);
cachep = memcg_params_to_cache(p);
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
/*
* If we get down to 0 after shrink, we could delete right away.
* However, memcg_release_pages() already puts us back in the workqueue
* in that case. If we proceed deleting, we'll get a dangling
* reference, and removing the object from the workqueue in that case
* is unnecessary complication. We are not a fast path.
*
* Note that this case is fundamentally different from racing with
* shrink_slab(): if memcg_cgroup_destroy_cache() is called in
* kmem_cache_shrink, not only we would be reinserting a dead cache
* into the queue, but doing so from inside the worker racing to
* destroy it.
*
* So if we aren't down to zero, we'll just schedule a worker and try
* again
*/
if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
kmem_cache_shrink(cachep);
if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
return;
} else
kmem_cache_destroy(cachep);
}
void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
{
if (!cachep->memcg_params->dead)
return;
/*
* There are many ways in which we can get here.
*
* We can get to a memory-pressure situation while the delayed work is
* still pending to run. The vmscan shrinkers can then release all
* cache memory and get us to destruction. If this is the case, we'll
* be executed twice, which is a bug (the second time will execute over
* bogus data). In this case, cancelling the work should be fine.
*
* But we can also get here from the worker itself, if
* kmem_cache_shrink is enough to shake all the remaining objects and
* get the page count to 0. In this case, we'll deadlock if we try to
* cancel the work (the worker runs with an internal lock held, which
* is the same lock we would hold for cancel_work_sync().)
*
* Since we can't possibly know who got us here, just refrain from
* running if there is already work pending
*/
if (work_pending(&cachep->memcg_params->destroy))
return;
/*
* We have to defer the actual destroying to a workqueue, because
* we might currently be in a context that cannot sleep.
*/
schedule_work(&cachep->memcg_params->destroy);
}
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s)
{
char *name;
struct dentry *dentry;
rcu_read_lock();
dentry = rcu_dereference(memcg->css.cgroup->dentry);
rcu_read_unlock();
BUG_ON(dentry == NULL);
name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name,
memcg_cache_id(memcg), dentry->d_name.name);
return name;
}
static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
struct kmem_cache *s)
{
char *name;
struct kmem_cache *new;
name = memcg_cache_name(memcg, s);
if (!name)
return NULL;
new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align,
if (new)
new->allocflags |= __GFP_KMEMCG;
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
kfree(name);
return new;
}
/*
* This lock protects updaters, not readers. We want readers to be as fast as
* they can, and they will either see NULL or a valid cache value. Our model
* allow them to see NULL, in which case the root memcg will be selected.
*
* We need this lock because multiple allocations to the same cache from a non
* will span more than one worker. Only one of them can create the cache.
*/
static DEFINE_MUTEX(memcg_cache_mutex);
static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
{
struct kmem_cache *new_cachep;
int idx;
BUG_ON(!memcg_can_account_kmem(memcg));
idx = memcg_cache_id(memcg);
mutex_lock(&memcg_cache_mutex);
new_cachep = cachep->memcg_params->memcg_caches[idx];
if (new_cachep)
goto out;
new_cachep = kmem_cache_dup(memcg, cachep);
if (new_cachep == NULL) {
new_cachep = cachep;
goto out;
}
mem_cgroup_get(memcg);
atomic_set(&new_cachep->memcg_params->nr_pages , 0);
cachep->memcg_params->memcg_caches[idx] = new_cachep;
/*
* the readers won't lock, make sure everybody sees the updated value,
* so they won't put stuff in the queue again for no reason
*/
wmb();
out:
mutex_unlock(&memcg_cache_mutex);
return new_cachep;
}
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
{
struct kmem_cache *c;
int i;
if (!s->memcg_params)
return;
if (!s->memcg_params->is_root_cache)
return;
/*
* If the cache is being destroyed, we trust that there is no one else
* requesting objects from it. Even if there are, the sanity checks in
* kmem_cache_destroy should caught this ill-case.
*
* Still, we don't want anyone else freeing memcg_caches under our
* noses, which can happen if a new memcg comes to life. As usual,
* we'll take the set_limit_mutex to protect ourselves against this.
*/
mutex_lock(&set_limit_mutex);
for (i = 0; i < memcg_limited_groups_array_size; i++) {
c = s->memcg_params->memcg_caches[i];
if (!c)
continue;
/*
* We will now manually delete the caches, so to avoid races
* we need to cancel all pending destruction workers and
* proceed with destruction ourselves.
*
* kmem_cache_destroy() will call kmem_cache_shrink internally,
* and that could spawn the workers again: it is likely that
* the cache still have active pages until this very moment.
* This would lead us back to mem_cgroup_destroy_cache.
*
* But that will not execute at all if the "dead" flag is not
* set, so flip it down to guarantee we are in control.
*/
c->memcg_params->dead = false;
cancel_work_sync(&c->memcg_params->destroy);
kmem_cache_destroy(c);
}
mutex_unlock(&set_limit_mutex);
}
struct create_work {
struct mem_cgroup *memcg;
struct kmem_cache *cachep;
struct work_struct work;
};
static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
struct kmem_cache *cachep;
struct memcg_cache_params *params;
if (!memcg_kmem_is_active(memcg))
return;
mutex_lock(&memcg->slab_caches_mutex);
list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
cachep = memcg_params_to_cache(params);
cachep->memcg_params->dead = true;
schedule_work(&cachep->memcg_params->destroy);
}
mutex_unlock(&memcg->slab_caches_mutex);
}
static void memcg_create_cache_work_func(struct work_struct *w)
{
struct create_work *cw;
cw = container_of(w, struct create_work, work);
memcg_create_kmem_cache(cw->memcg, cw->cachep);
/* Drop the reference gotten when we enqueued. */
css_put(&cw->memcg->css);
kfree(cw);
}
/*
* Enqueue the creation of a per-memcg kmem_cache.
* Called with rcu_read_lock.
*/
static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
{
struct create_work *cw;
cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
if (cw == NULL)
return;
/* The corresponding put will be done in the workqueue. */
if (!css_tryget(&memcg->css)) {
kfree(cw);
return;
}
cw->memcg = memcg;
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_create_cache_work_func);
schedule_work(&cw->work);
}
static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
{
/*
* We need to stop accounting when we kmalloc, because if the
* corresponding kmalloc cache is not yet created, the first allocation
* in __memcg_create_cache_enqueue will recurse.
*
* However, it is better to enclose the whole function. Depending on
* the debugging options enabled, INIT_WORK(), for instance, can
* trigger an allocation. This too, will make us recurse. Because at
* this point we can't allow ourselves back into memcg_kmem_get_cache,
* the safest choice is to do it like this, wrapping the whole function.
*/
memcg_stop_kmem_account();
__memcg_create_cache_enqueue(memcg, cachep);
memcg_resume_kmem_account();
}
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
/*
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
*
* If the cache does not exist yet, if we are the first user of it,
* we either create it immediately, if possible, or create it asynchronously
* in a workqueue.
* In the latter case, we will let the current allocation go through with
* the original cache.
*
* Can't be called in interrupt context or from kernel threads.
* This function needs to be called with rcu_read_lock() held.
*/
struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
gfp_t gfp)
{
struct mem_cgroup *memcg;
int idx;
VM_BUG_ON(!cachep->memcg_params);
VM_BUG_ON(!cachep->memcg_params->is_root_cache);
if (!current->mm || current->memcg_kmem_skip_account)
return cachep;
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
rcu_read_unlock();
if (!memcg_can_account_kmem(memcg))
return cachep;
idx = memcg_cache_id(memcg);
/*
* barrier to mare sure we're always seeing the up to date value. The
* code updating memcg_caches will issue a write barrier to match this.
*/
read_barrier_depends();
if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) {
/*
* If we are in a safe context (can wait, and not in interrupt
* context), we could be be predictable and return right away.
* This would guarantee that the allocation being performed
* already belongs in the new cache.
*
* However, there are some clashes that can arrive from locking.
* For instance, because we acquire the slab_mutex while doing
* kmem_cache_dup, this means no further allocation could happen
* with the slab_mutex held.
*
* Also, because cache creation issue get_online_cpus(), this
* creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
* that ends up reversed during cpu hotplug. (cpuset allocates
* a bunch of GFP_KERNEL memory during cpuup). Due to all that,
* better to defer everything.
*/
memcg_create_cache_enqueue(memcg, cachep);
return cachep;
}
return cachep->memcg_params->memcg_caches[idx];
}
EXPORT_SYMBOL(__memcg_kmem_get_cache);
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
/*
* We need to verify if the allocation against current->mm->owner's memcg is
* possible for the given order. But the page is not allocated yet, so we'll
* need a further commit step to do the final arrangements.
*
* It is possible for the task to switch cgroups in this mean time, so at
* commit time, we can't rely on task conversion any longer. We'll then use
* the handle argument to return to the caller which cgroup we should commit
* against. We could also return the memcg directly and avoid the pointer
* passing, but a boolean return value gives better semantics considering
* the compiled-out case as well.
*
* Returning true means the allocation is possible.
*/
bool
__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
{
struct mem_cgroup *memcg;
int ret;
*_memcg = NULL;
memcg = try_get_mem_cgroup_from_mm(current->mm);
/*
* very rare case described in mem_cgroup_from_task. Unfortunately there
* isn't much we can do without complicating this too much, and it would
* be gfp-dependent anyway. Just let it go
*/
if (unlikely(!memcg))
return true;
if (!memcg_can_account_kmem(memcg)) {
css_put(&memcg->css);
return true;
}
ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
if (!ret)
*_memcg = memcg;
css_put(&memcg->css);
return (ret == 0);
}
void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
int order)
{
struct page_cgroup *pc;
VM_BUG_ON(mem_cgroup_is_root(memcg));
/* The page allocation failed. Revert */
if (!page) {
memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
return;
}
pc = lookup_page_cgroup(page);
lock_page_cgroup(pc);
pc->mem_cgroup = memcg;
SetPageCgroupUsed(pc);
unlock_page_cgroup(pc);
}
void __memcg_kmem_uncharge_pages(struct page *page, int order)
{
struct mem_cgroup *memcg = NULL;
struct page_cgroup *pc;
pc = lookup_page_cgroup(page);
/*
* Fast unlocked return. Theoretically might have changed, have to
* check again after locking.
*/
if (!PageCgroupUsed(pc))
return;
lock_page_cgroup(pc);
if (PageCgroupUsed(pc)) {
memcg = pc->mem_cgroup;
ClearPageCgroupUsed(pc);
}
unlock_page_cgroup(pc);
/*
* We trust that only if there is a memcg associated with the page, it
* is a valid allocation
*/
if (!memcg)
return;
VM_BUG_ON(mem_cgroup_is_root(memcg));
memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
}
#else
static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
/*
* Because tail pages are not marked as "used", set it. We're under
* zone->lru_lock, 'splitting on pmd' and compound_lock.
* charge/uncharge will be never happen and move_account() is done under
* compound_lock(), so we don't have to take care of races.
void mem_cgroup_split_huge_fixup(struct page *head)
{
struct page_cgroup *head_pc = lookup_page_cgroup(head);
struct page_cgroup *pc;
int i;
if (mem_cgroup_disabled())
return;
for (i = 1; i < HPAGE_PMD_NR; i++) {
pc = head_pc + i;
pc->mem_cgroup = head_pc->mem_cgroup;
smp_wmb();/* see __commit_charge() */
pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
* mem_cgroup_move_account - move account of the page
* @nr_pages: number of regular pages (>1 for huge pages)
* @pc: page_cgroup of the page.
* @from: mem_cgroup which the page is moved from.
* @to: mem_cgroup which the page is moved to. @from != @to.
*
* The caller must confirm following.
* - page is not on LRU (isolate_page() is useful.)
* - compound_lock is held when nr_pages > 1
* This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
* from old cgroup.
static int mem_cgroup_move_account(struct page *page,
unsigned int nr_pages,
struct page_cgroup *pc,
struct mem_cgroup *from,
struct mem_cgroup *to)
unsigned long flags;
int ret;
bool anon = PageAnon(page);
VM_BUG_ON(from == to);
VM_BUG_ON(PageLRU(page));
/*
* The page is isolated from LRU. So, collapse function
* will not handle this page. But page splitting can happen.
* Do this check under compound_page_lock(). The caller should
* hold it.
*/
ret = -EBUSY;
if (nr_pages > 1 && !PageTransHuge(page))
goto out;
lock_page_cgroup(pc);
ret = -EINVAL;
if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
goto unlock;
move_lock_mem_cgroup(from, &flags);
/* Update mapped_file data for mem_cgroup */
preempt_disable();
__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
preempt_enable();
mem_cgroup_charge_statistics(from, anon, -nr_pages);
/* caller should have done css_get */
mem_cgroup_charge_statistics(to, anon, nr_pages);
move_unlock_mem_cgroup(from, &flags);
ret = 0;
unlock:
/*
* check events
*/
memcg_check_events(to, page);
memcg_check_events(from, page);
return ret;
}
/**
* mem_cgroup_move_parent - moves page to the parent group
* @page: the page to move
* @pc: page_cgroup of the page
* @child: page's cgroup
*
* move charges to its parent or the root cgroup if the group has no
* parent (aka use_hierarchy==0).
* Although this might fail (get_page_unless_zero, isolate_lru_page or
* mem_cgroup_move_account fails) the failure is always temporary and
* it signals a race with a page removal/uncharge or migration. In the
* first case the page is on the way out and it will vanish from the LRU
* on the next attempt and the call should be retried later.
* Isolation from the LRU fails only if page has been isolated from
* the LRU since we looked at it and that usually means either global
* reclaim or migration going on. The page will either get back to the
* LRU or vanish.
* Finaly mem_cgroup_move_account fails only if the page got uncharged
* (!PageCgroupUsed) or moved to a different group. The page will
* disappear in the next attempt.
static int mem_cgroup_move_parent(struct page *page,
struct page_cgroup *pc,
struct mem_cgroup *child)
{
struct mem_cgroup *parent;
unsigned int nr_pages;
unsigned long uninitialized_var(flags);
VM_BUG_ON(mem_cgroup_is_root(child));
ret = -EBUSY;
if (!get_page_unless_zero(page))
goto out;
if (isolate_lru_page(page))
goto put;
nr_pages = hpage_nr_pages(page);
parent = parent_mem_cgroup(child);
/*
* If no parent, move charges to root cgroup.
*/
if (!parent)
parent = root_mem_cgroup;
if (nr_pages > 1) {
VM_BUG_ON(!PageTransHuge(page));
flags = compound_lock_irqsave(page);
ret = mem_cgroup_move_account(page, nr_pages,
pc, child, parent);
if (!ret)
__mem_cgroup_cancel_local_charge(child, nr_pages);
if (nr_pages > 1)
compound_unlock_irqrestore(page, flags);
return ret;
}
/*
* Charge the memory controller for page usage.
* Return
* 0 if the charge was successful
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask, enum charge_type ctype)
unsigned int nr_pages = 1;
nr_pages <<= compound_order(page);
/*
* Never OOM-kill a process for a huge page. The
* fault handler will fall back to regular pages.
*/
oom = false;
ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
return ret;
__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
int mem_cgroup_newpage_charge(struct page *page,
struct mm_struct *mm, gfp_t gfp_mask)
{
VM_BUG_ON(page_mapped(page));
VM_BUG_ON(page->mapping && !PageAnon(page));
VM_BUG_ON(!mm);
return mem_cgroup_charge_common(page, mm, gfp_mask,

Kamezawa Hiroyuki
committed
MEM_CGROUP_CHARGE_TYPE_ANON);
}
/*
* While swap-in, try_charge -> commit or cancel, the page is locked.
* And when try_charge() successfully returns, one refcnt to memcg without
* struct page_cgroup is acquired. This refcnt will be consumed by
* "commit()" or removed by "cancel()"
*/
static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
struct page *page,
gfp_t mask,
struct mem_cgroup **memcgp)
struct page_cgroup *pc;
pc = lookup_page_cgroup(page);
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
* already charged pages, too. The USED bit is protected by
* the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
*/
if (PageCgroupUsed(pc))
return 0;
if (!do_swap_account)
goto charge_cur_mm;
memcg = try_get_mem_cgroup_from_page(page);
if (!memcg)
*memcgp = memcg;
ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
if (ret == -EINTR)
ret = 0;
ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
if (ret == -EINTR)
ret = 0;
return ret;
int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
gfp_t gfp_mask, struct mem_cgroup **memcgp)
{
*memcgp = NULL;
if (mem_cgroup_disabled())
return 0;
/*
* A racing thread's fault, or swapoff, may have already
* updated the pte, and even removed page from swap cache: in
* those cases unuse_pte()'s pte_same() test will fail; but
* there's also a KSM case which does need to charge the page.
*/
if (!PageSwapCache(page)) {
int ret;
ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
if (ret == -EINTR)
ret = 0;
return ret;
}
return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
}
void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
{
if (mem_cgroup_disabled())
return;
if (!memcg)
return;
__mem_cgroup_cancel_charge(memcg, 1);
}
__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
if (!memcg)
__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
/*
* Now swap is on-memory. This means this page may be
* counted both as mem and swap....double count.
* Fix it by uncharging from memsw. Basically, this SwapCache is stable
* under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
* may call delete_from_swap_cache() before reach here.
if (do_swap_account && PageSwapCache(page)) {
swp_entry_t ent = {.val = page_private(page)};
mem_cgroup_uncharge_swap(ent);
void mem_cgroup_commit_charge_swapin(struct page *page,
struct mem_cgroup *memcg)
__mem_cgroup_commit_charge_swapin(page, memcg,

Kamezawa Hiroyuki
committed
MEM_CGROUP_CHARGE_TYPE_ANON);
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
struct mem_cgroup *memcg = NULL;
enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
int ret;
return 0;
if (PageCompound(page))
return 0;
if (!PageSwapCache(page))
ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
else { /* page is swapcache/shmem */
ret = __mem_cgroup_try_charge_swapin(mm, page,
gfp_mask, &memcg);
if (!ret)
__mem_cgroup_commit_charge_swapin(page, memcg, type);
}
return ret;
static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
unsigned int nr_pages,
const enum charge_type ctype)
{
struct memcg_batch_info *batch = NULL;
bool uncharge_memsw = true;
/* If swapout, usage of swap doesn't decrease */
if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
uncharge_memsw = false;
batch = ¤t->memcg_batch;
/*
* In usual, we do css_get() when we remember memcg pointer.
* But in this case, we keep res->usage until end of a series of
* uncharges. Then, it's ok to ignore memcg's refcnt.
*/
if (!batch->memcg)
/*
* do_batch > 0 when unmapping pages or inode invalidate/truncate.
* In those cases, all pages freed continuously can be expected to be in
* the same cgroup and we have chance to coalesce uncharges.
* But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
* because we want to do uncharge as soon as possible.
*/
if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
goto direct_uncharge;
if (nr_pages > 1)
/*
* In typical case, batch->memcg == mem. This means we can
* merge a series of uncharges to an uncharge of res_counter.
* If not, we uncharge res_counter ony by one.
*/
goto direct_uncharge;
/* remember freed charge and uncharge it later */
batch->nr_pages++;
batch->memsw_nr_pages++;
return;
direct_uncharge:
res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
if (unlikely(batch->memcg != memcg))
memcg_oom_recover(memcg);
* uncharge if !page_mapped(page)
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
bool end_migration)
unsigned int nr_pages = 1;
struct page_cgroup *pc;
VM_BUG_ON(PageSwapCache(page));
nr_pages <<= compound_order(page);