-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplacement.c
executable file
·607 lines (466 loc) · 14.4 KB
/
placement.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
/**
* @file placement.c
* @author INESC-ID
* @date 26 jul 2023
* @version 2.2.0
* @brief Page walker for finding page table entries' r/m bits. Intended for
* the 5.10.0 linux kernel. Adapted from the code provided by ilia kuzmin
* <[email protected]>, adapted from the code provided by reza
* karimi <[email protected]>, adapted from the code implemented by miguel
* marques <[email protected]>
*/
#define pr_fmt(fmt) "ambix.PLACEMENT: " fmt
#include <generated/utsrelease.h>
#include <linux/delay.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/seq_file.h>
#include <linux/shmem_fs.h>
#include <linux/signal.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/timekeeping.h>
#include <linux/uaccess.h>
#include <linux/version.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmzone.h>
#include <linux/mutex.h>
#include <linux/pagewalk.h>
#include <linux/string.h>
#include "config.h"
#include "find_kallsyms_lookup_name.h"
#include "sys_mem_info.h"
#include "perf_counters.h"
#include "vm_management.h"
#include "placement.h"
#include "tsc.h"
#include "kernel_symbols.h"
#include "migrate.h"
#include "ambix_types.h"
// Maximum number of pages that can be walked during a call to g_walk_page_range
#define FAST_TIER_WALK_THRESHOLD 11184810
// Upper bound on migration candidates handled per cycle
#define MAX_N_FIND 65536U
#define PMM_MIXED 1
// Walk/migration mode selectors (only some are referenced in this file)
#define DEMOTE_PAGES 0
#define EVICT_FAST_TIER 1
#define PROMOTE_YOUNG_PAGES 2
#define SWITCH_MODE 3
#define PROMOTE_DIRTY_PAGES 4
// Signature of the per-PTE callback handed to g_walk_page_range
typedef int (*pte_entry_handler_t)(pte_t *, unsigned long addr,
				   unsigned long next, struct mm_walk *);
// Shared scan state, threaded through a page walk via mm_walk->private
struct pte_callback_context_t {
	u32 n_found; // pages found so far in the current cycle
	u32 n_to_find; // requested number of candidate pages
	u32 walk_iter; // PTEs visited; walk aborts at FAST_TIER_WALK_THRESHOLD
	unsigned long last_addr; // last address visited (resume point)
	struct pid *curr_pid; // pid of the program currently being walked
	struct vm_heat_map fast_tier_pages; // cold DRAM pages (demotion candidates)
	struct vm_heat_map slow_tier_pages; // hot NVRAM pages (promotion candidates)
	struct memory_range_t *tracking_range; // ambix-managed range covering the cursor
} static g_context = { 0 };
// Resume cursors: where the previous scan of each tier stopped
struct vm_area_walk_t last_fast_tier_scan;
struct vm_area_walk_t last_slow_tier_scan;
// Lifetime migration counters
unsigned long long g_promotion_count = 0;
unsigned long long g_demotion_count = 0;
unsigned long long pages_walked = 0;
unsigned long long total_migrations = 0;
unsigned long long dram_migrations[5];
unsigned long long nvram_migrations[5];
int migration_type = 0;
const int g_switch_act = 0;
const int g_thresh_act = 0;
int current_average = 0;
int value_count = 0;
#define SCALE_FACTOR 1000 // For precision
// Per-cycle scratch accumulators; reported and reset by do_page_walk()
unsigned long long temp_dram_usage = 0;
unsigned long long temp_nvram_usage = 0;
unsigned long long temp_cold_page_count = 0;
unsigned long long temp_hot_page_count = 0;
// Per-score page histograms used to derive the thresholds below;
// note only indices 0-255 are ever scanned/reset by calculate_treshold()
unsigned long long slow_tier_hist[1024] = { 0 };
unsigned long long fast_tier_hist[1024] = { 0 };
// Score thresholds, recomputed after each complete pass by calculate_treshold()
int g_threshold = 4;
int g_fast_tier_threshold = 4;
int g_slow_tier_threshold = 4;
/*
 * Atomically replace the cpupid field packed into page->flags.
 *
 * Re-reads the flags word and retries the cmpxchg until no concurrent
 * writer races the update. Returns the cpupid value held before the
 * exchange.
 */
int page_cpupid_xchg_last(struct page *page, int cpupid)
{
	unsigned long snapshot, updated;
	int prev_cpupid;

	for (;;) {
		snapshot = page->flags;
		prev_cpupid = page_cpupid_last(page);
		updated = snapshot &
			  ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
		updated |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
		if (likely(cmpxchg(&page->flags, snapshot, updated) ==
			   snapshot))
			break;
	}
	return prev_cpupid;
}
// ==================================================================================
// CALLBACK FUNCTIONS
// ==================================================================================
/*
 * The page's cpupid field is repurposed to hold a packed (age, score)
 * pair: the access score lives in bits [0, SCORE_BITS) and the age
 * counter in the AGE_BITS immediately above it.
 */
#define CPUPID_BITS 21
#define CPUPID_MAX ((1 << CPUPID_BITS) - 1) // 0x1FFFFF - also the "never written" sentinel
#define SCORE_BITS 13
#define AGE_BITS 8
#define SCORE_MASK ((1 << SCORE_BITS) - 1) // 0x1FFF
#define AGE_MASK ((1 << AGE_BITS) - 1) // 0xFF
#define SCORE_THRESHOLD 128 // score stops growing once it reaches this
#define AGE_THRESHOLD 64 // age saturates at this value
#define SCORE_SHIFT 0
#define AGE_SHIFT SCORE_BITS
/**
 * page_scan_callback() - per-PTE visitor for the Ambix heat scan.
 * @ptep: PTE being visited (may be NULL).
 * @addr: virtual address the PTE maps.
 * @next: end of the range covered by this entry (unused).
 * @walk: walk state; ->private carries a struct pte_callback_context_t.
 *
 * Updates the (age, score) pair packed in the page's cpupid field based
 * on the PTE accessed/dirty bits, accumulates tier usage statistics,
 * and records demotion/promotion candidates in the context heat maps.
 *
 * Returns 1 to abort the walk when the page budget is exhausted,
 * 0 otherwise.
 */
static int page_scan_callback(pte_t *ptep, unsigned long addr,
			      unsigned long next, struct mm_walk *walk)
{
	struct pte_callback_context_t *ctx =
		(struct pte_callback_context_t *)walk->private;
	pte_t old_pte, new_pte;
	struct page *page = NULL;
	struct memory_range_t *next_vm_area;
	bool managed_by_ambix = 0;
	int last, age, score, new_cpupid, cold_candidate;

	ctx->last_addr = addr;
	ctx->walk_iter++;
	/* Abort once the per-cycle page budget is spent. */
	if (ctx->walk_iter == FAST_TIER_WALK_THRESHOLD) {
		return 1;
	}
	// If page is not present or read-only
	if ((ptep == NULL) || !pte_present(*ptep) || !pte_write(*ptep)) {
		return 0;
	}
	/* Advance tracking_range along the circular range list until it
	 * covers addr, or until we detect a wrap back to the list head. */
	if (ctx->tracking_range && addr >= ctx->tracking_range->end_addr) {
next_area:
		next_vm_area = list_next_entry(ctx->tracking_range, node);
		// Looped back to the start of the circular list
		if (next_vm_area->start_addr <
		    ctx->tracking_range->start_addr) {
			ctx->tracking_range = NULL;
		} else {
			if (next_vm_area->end_addr < addr) {
				ctx->tracking_range = next_vm_area;
				goto next_area;
			}
			ctx->tracking_range = next_vm_area;
		}
	}
	if (ctx->tracking_range && ctx->tracking_range->start_addr <= addr &&
	    ctx->tracking_range->end_addr > addr)
		managed_by_ambix = 1;

	page = g_vm_normal_page(walk->vma, addr, *ptep);
	/* vm_normal_page() returns NULL for special/zero-page mappings;
	 * passing NULL to page_cpupid_last() would oops the kernel. */
	if (!page)
		return 0;

	last = page_cpupid_last(page);
	age = (last >> AGE_SHIFT) & AGE_MASK;
	score = (last >> SCORE_SHIFT) & SCORE_MASK;
	new_cpupid = last;
	cold_candidate = 0;
	/* A cpupid field that was never written reads as all ones:
	 * seed the page with neutral defaults. */
	if (last == CPUPID_MAX) {
		age = 0;
		score = g_threshold;
	}
	if (!pte_young(*ptep) && !pte_dirty(*ptep)) {
		/* Not touched since the previous scan: cool down. */
		if (score > 0)
			score--;
		cold_candidate = 1;
	} else if (score < SCORE_THRESHOLD) {
		/* Recent access weighs more than a recent write. */
		if (pte_young(*ptep))
			score += 8;
		if (pte_dirty(*ptep))
			score += 4;
	}
	/* Saturating age increment (stops at AGE_THRESHOLD). */
	age += (age < AGE_THRESHOLD);
	new_cpupid = (age << AGE_SHIFT) | (score << SCORE_SHIFT);
	page_cpupid_xchg_last(page, new_cpupid);

	if (is_page_in_pool(*ptep, DRAM_POOL)) {
		temp_dram_usage += PAGE_SIZE;
		fast_tier_hist[score % 1024]++;
		if (managed_by_ambix && age == 0)
			ctx->tracking_range->fast_tier_bytes += PAGE_SIZE;
		/* Mature, low-score, idle DRAM page: demotion candidate. */
		if (age > 10 && score <= g_fast_tier_threshold &&
		    cold_candidate) {
			heat_map_add_page(&ctx->fast_tier_pages, addr,
					  ctx->curr_pid, COLD_PAGE);
			temp_cold_page_count++;
		}
	} else if (is_page_in_pool(*ptep, NVRAM_POOL)) {
		slow_tier_hist[score % 1024]++;
		temp_nvram_usage += PAGE_SIZE;
		if (managed_by_ambix && age == 0)
			ctx->tracking_range->slow_tier_bytes += PAGE_SIZE;
		/* Mature, high-score, active NVRAM page: promotion candidate. */
		if (age > 10 && score >= g_slow_tier_threshold &&
		    !cold_candidate) {
			heat_map_add_page(&ctx->slow_tier_pages, addr,
					  ctx->curr_pid, HOT_PAGE);
			temp_hot_page_count++;
		}
	}
	if (!cold_candidate) {
		/* Reset accessed/dirty bits so the next scan measures
		 * fresh activity rather than stale history. */
		old_pte = ptep_modify_prot_start(walk->vma, addr, ptep);
		new_pte = pte_mkold(old_pte); // unset accessed bit
		new_pte = pte_mkclean(new_pte); // unset dirty bit
		ptep_modify_prot_commit(walk->vma, addr, ptep, old_pte,
					new_pte);
	}
	return 0;
}
/*
-------------------------------------------------------------------------------
PAGE WALKERS
-------------------------------------------------------------------------------
*/
/*
 * Recompute the global score thresholds from the per-score histograms
 * collected during the last complete pass, then zero the histograms.
 *
 * g_threshold:           highest score bucket at which the combined page
 *                        population still fits in DRAM.
 * g_slow_tier_threshold: score above which the hottest MAX_N_FIND
 *                        slow-tier pages sit (never below g_threshold).
 * g_fast_tier_threshold: score below which the coldest MAX_N_FIND
 *                        fast-tier pages sit (never above g_threshold).
 */
void calculate_treshold(void)
{
	u64 dram_page_capacity = get_memory_total(DRAM_POOL) / PAGE_SIZE;
	u64 accumulated = 0;
	int idx;

	/* Walk buckets hottest-first until DRAM capacity is exceeded. */
	for (idx = 255; idx > 0; idx--) {
		accumulated += fast_tier_hist[idx] + slow_tier_hist[idx];
		if (accumulated > dram_page_capacity)
			break;
	}
	g_threshold = idx + 1;

	/* Promotion cut: hottest MAX_N_FIND pages on the slow tier. */
	accumulated = 0;
	for (idx = 255; idx > 0; idx--) {
		accumulated += slow_tier_hist[idx];
		if (accumulated >= MAX_N_FIND)
			break;
	}
	g_slow_tier_threshold = max(idx + 1, g_threshold);

	/* Demotion cut: coldest MAX_N_FIND pages on the fast tier. */
	accumulated = 0;
	for (idx = 0; idx < 255; idx++) {
		accumulated += fast_tier_hist[idx];
		if (accumulated >= MAX_N_FIND)
			break;
	}
	g_fast_tier_threshold = min(idx, g_threshold);

	/* Start the next pass with clean histograms. */
	for (idx = 0; idx < 256; idx++) {
		fast_tier_hist[idx] = 0;
		slow_tier_hist[idx] = 0;
	}

	pr_info("g_threshold: %d; demote treshold: %d; promote treshold: %d\n", g_threshold, g_fast_tier_threshold, g_slow_tier_threshold);
}
/*
 * Walk [start_addr, end_addr) of @program's address space, applying
 * @pte_handler to every PTE under mmap_read_lock.
 *
 * Publishes the per-cycle usage/heat counters into @program when the
 * walk completed (i.e. was not cut short by the page budget) and drops
 * any collected candidates when migrations are disabled for @program.
 *
 * Returns 1 when a walk ran, 0 when the task or its mm could not be
 * pinned.
 */
static int do_page_walk(struct bound_program_t *program,
			unsigned long start_addr, unsigned long end_addr,
			pte_entry_handler_t pte_handler,
			struct pte_callback_context_t *ctx)
{
	struct mm_walk_ops walk_ops = { .pte_entry = pte_handler };
	struct task_struct *task;
	struct mm_struct *mm;

	ctx->curr_pid = program->__pid;

	task = get_pid_task(program->__pid, PIDTYPE_PID);
	if (task == NULL)
		return 0;

	mm = get_task_mm(task);
	if (mm == NULL) {
		put_task_struct(task);
		return 0;
	}

	ctx->last_addr = start_addr;

	mmap_read_lock(mm);
	g_walk_page_range(mm, start_addr, end_addr, &walk_ops, ctx);
	mmap_read_unlock(mm);

	/* Only report statistics for walks that finished their range. */
	if (ctx->walk_iter < FAST_TIER_WALK_THRESHOLD) {
		program->fast_tier_bytes = temp_dram_usage;
		program->slow_tier_bytes = temp_nvram_usage;
		pr_info("Usage (bytes) dram: %llu pmem: %llu\n",
			temp_dram_usage, temp_nvram_usage);
		temp_nvram_usage = 0;
		temp_dram_usage = 0;
		pr_info("Pid: %d, Hot: %llu, Cold: %llu\n",
			pid_nr(program->__pid), temp_hot_page_count,
			temp_cold_page_count);
		temp_hot_page_count = 0;
		temp_cold_page_count = 0;
	}

	/* Discard candidates for programs that opted out of migration. */
	if (!program->migrations_enabled) {
		heat_map_clear(&ctx->fast_tier_pages);
		heat_map_clear(&ctx->slow_tier_pages);
	}

	mmput(mm);
	put_task_struct(task);
	return 1;
}
/*
 * Walk the bound-program list starting at (@start_pid, @start_addr),
 * applying @pte_handler until the per-cycle page budget is exhausted or
 * a bounded number of passes completes. @end_pid/@end_addr are accepted
 * for interface compatibility but are currently unused.
 *
 * Returns the window actually walked; end_pid/end_addr form the resume
 * point for the next cycle.
 *
 * Fixes over the previous version: ret.end_pid/ret.end_addr are now
 * initialized (they were returned uninitialized when no program was
 * walked), and an empty program list no longer spins forever in the
 * repeat loop while holding bound_list_mutex.
 */
struct vm_area_walk_t
walk_vm_ranges_constrained(int start_pid, unsigned long start_addr, int end_pid,
			   unsigned long end_addr,
			   pte_entry_handler_t pte_handler,
			   struct pte_callback_context_t *ctx)
{
	struct bound_program_t *bound_program;
	struct vm_area_walk_t ret;
	int walk_count = 0;

	ret.start_pid = start_pid;
	ret.start_addr = start_addr;
	/* Defined resume point even if nothing gets walked. */
	ret.end_pid = start_pid;
	ret.end_addr = start_addr;

	mutex_lock(&bound_list_mutex);
	ctx->walk_iter = 0;

	/* An empty list can never exhaust the budget: bail out instead of
	 * looping forever under the mutex. */
	if (list_empty(&bound_program_list))
		goto out;

repeat:
	list_for_each_entry (bound_program, &bound_program_list, node) {
		/* On the first pass, skip entries until the resume pid. */
		if (pid_nr(bound_program->__pid) != start_pid &&
		    walk_count == 0)
			continue;
		ctx->tracking_range = find_memory_range_for_address(
			pid_nr(bound_program->__pid), start_addr);
		pr_info("pid %d, start %lu", pid_nr(bound_program->__pid),
			start_addr);
		do_page_walk(bound_program, start_addr, MAX_ADDRESS,
			     pte_handler, ctx);
		walk_count++;
		if (ctx->walk_iter < FAST_TIER_WALK_THRESHOLD) {
			/* Budget left: next program starts from address 0. */
			start_addr = 0;
		} else {
			ret.end_pid = pid_nr(bound_program->__pid);
			ret.end_addr = ctx->last_addr;
			goto out;
		}
		/* Hard cap on walks per call to bound the cycle length. */
		if (walk_count > 10) {
			ret.end_pid = pid_nr(bound_program->__pid);
			ret.end_addr = ctx->last_addr;
			goto out;
		}
	}
	if (ctx->walk_iter < FAST_TIER_WALK_THRESHOLD) {
		/* A full pass finished with budget to spare: refresh the
		 * thresholds and keep scanning. */
		calculate_treshold();
		walk_count++;
		goto repeat;
	}
out:
	mutex_unlock(&bound_list_mutex);
	return ret;
}
/*
 * Convenience wrapper: scan every bound program's full address space,
 * resuming from (@start_pid, @start_addr). Implemented as a constrained
 * walk whose end point equals its start point.
 */
struct vm_area_walk_t walk_all_vm_ranges(int start_pid,
					 unsigned long start_addr,
					 pte_entry_handler_t pte_handler,
					 struct pte_callback_context_t *ctx)
{
	struct vm_area_walk_t walked = walk_vm_ranges_constrained(
		start_pid, start_addr, start_pid, start_addr, pte_handler,
		ctx);
	return walked;
}
/*
 * Run one scan cycle, resuming where the previous cycle stopped, then
 * advance the global resume cursor. Returns the number of cold
 * fast-tier pages collected in the context heat map.
 */
static int do_ambix_page_walk(pte_entry_handler_t pte_handler,
			      struct pte_callback_context_t *ctx)
{
	struct vm_area_walk_t scanned;

	scanned = walk_all_vm_ranges(last_fast_tier_scan.end_pid,
				     last_fast_tier_scan.end_addr,
				     pte_handler, ctx);

	/* Slide the window: the old end becomes the new start. */
	last_fast_tier_scan.start_pid = last_fast_tier_scan.end_pid;
	last_fast_tier_scan.start_addr = last_fast_tier_scan.end_addr;
	last_fast_tier_scan.end_pid = scanned.end_pid;
	last_fast_tier_scan.end_addr = scanned.end_addr;

	return heat_map_size(&ctx->fast_tier_pages);
}
/**
 * mem_walk() - run one scan cycle and migrate pages between tiers.
 * @ctx:  walk context; both heat maps are cleared and refilled here.
 * @n:    number of candidate pages to look for (stored in ctx->n_to_find).
 * @mode: unused by the current implementation.
 *
 * Always returns 1 (errors from the walk/migration helpers are not
 * propagated to the caller).
 **/
int mem_walk(struct pte_callback_context_t *ctx, const int n, const int mode)
{
	pte_entry_handler_t pte_handler;
	ctx->n_found = 0;
	ctx->n_to_find = n;
	heat_map_clear(&ctx->fast_tier_pages);
	heat_map_clear(&ctx->slow_tier_pages);
	pte_handler = page_scan_callback;
	/* Fill the heat maps by walking all bound programs. */
	do_ambix_page_walk(pte_handler, ctx);
	pr_info("found %d cold pages in dram \n",
		heat_map_size(&ctx->fast_tier_pages));
	pr_info("found %d hot pages in optane \n",
		heat_map_size(&ctx->slow_tier_pages));
	/* Pages on the slow tier hotter than their fast-tier counterparts;
	 * sizes the exchange performed in the last branch below. */
	u32 slow_tier_hotter_pages =
		heat_map_compare(&ctx->slow_tier_pages, &ctx->fast_tier_pages);
	int demoted = 0;
	int promoted = 0;
	int ram_usage = get_real_memory_usage_per(DRAM_POOL);
	/* DRAM nearly full: demote cold fast-tier pages to NVRAM.
	 * Heat-map sizes are re-read at each call because do_migration()
	 * presumably consumes entries — do not reorder these branches. */
	if (ram_usage > 95 && heat_map_size(&ctx->fast_tier_pages)) {
		demoted += do_migration(
			&ctx->fast_tier_pages,
			min(MAX_N_FIND, heat_map_size(&ctx->fast_tier_pages)),
			NVRAM_POOL, COLDER_PAGES_FIRST);
	}
	/* DRAM has headroom: promote hot slow-tier pages. */
	if (ram_usage < 95 && heat_map_size(&ctx->slow_tier_pages)) {
		promoted += do_migration(
			&ctx->slow_tier_pages,
			min(MAX_N_FIND, heat_map_size(&ctx->slow_tier_pages)),
			DRAM_POOL, WARMER_PAGES_FIRST);
	}
	/* DRAM at or above the limit: exchange equal numbers of pages.
	 * NOTE(review): when ram_usage > 95 this runs in addition to the
	 * demotion branch above — confirm the extra demotion is intended. */
	if (ram_usage >= 95 && slow_tier_hotter_pages > 0) {
		demoted += do_migration(&ctx->fast_tier_pages,
					min(MAX_N_FIND, slow_tier_hotter_pages),
					NVRAM_POOL, COLDER_PAGES_FIRST);
		promoted +=
			do_migration(&ctx->slow_tier_pages,
				     min(MAX_N_FIND, slow_tier_hotter_pages),
				     DRAM_POOL, WARMER_PAGES_FIRST);
	}
	g_promotion_count += promoted;
	g_demotion_count += demoted;
	pr_info("Promoted: %d, Demoted: %d\n", promoted, demoted);
	return 1;
}
// MAIN ENTRY POINT
/*
 * Periodic management routine: refresh the bound-program list and, when
 * any programs remain bound, run a full scan-and-migrate cycle.
 * Always returns 0 (no per-call migration count is reported).
 */
int ambix_check_memory(void)
{
	u32 n_migrated = 0;
	struct pte_callback_context_t *ctx = &g_context;

	pr_info("Memory management routine\n");

	mutex_lock(&bound_list_mutex);
	refresh_bound_programs();
	if (list_empty(&bound_program_list)) {
		pr_info("No bound processes...\n");
		mutex_unlock(&bound_list_mutex);
		return n_migrated;
	}
	mutex_unlock(&bound_list_mutex);

	mem_walk(ctx, MAX_N_FIND, 1);

	return n_migrated;
}
/*
-------------------------------------------------------------------------------
MODULE INIT/EXIT
-------------------------------------------------------------------------------
*/
int ambix_init(void)
{
pr_debug("Initializing\n");
import_symbols();
size_t i;
for (i = 0; i < get_pool_size(DRAM_POOL); ++i) {
int n = get_pool_nodes(DRAM_POOL)[i];
if (!node_online(n)) {
pr_err("DRAM node %d is not online.\n", n);
return -1;
}
}
for (i = 0; i < get_pool_size(NVRAM_POOL); ++i) {
int n = get_pool_nodes(NVRAM_POOL)[i];
if (!node_online(n)) {
pr_err("NVRAM node %d is not online.\n", n);
return -1;
}
}
return 0;
}
/* Module teardown hook; currently nothing to release. */
void ambix_cleanup(void)
{
	pr_debug("Cleaning up\n");
}