mem_table.h
#pragma once
#include <linux/mman.h>
#include <tbb/concurrent_vector.h>
#include <cerrno>
#include <cstdint>
#include <iosfwd>
#include <tuple>
#include "block/block.h"
#include "config.h"
#include "const.h"
#include "idx.h"
#include "posix.h"
#include "utils/logging.h"
#include "utils/tbb.h"
#include "utils/timer.h"
#include "utils/utils.h"
namespace madfs::dram {

constexpr static uint32_t GROW_UNIT_IN_BLOCK_SHIFT =
    GROW_UNIT_SHIFT - BLOCK_SHIFT;
constexpr static uint32_t GROW_UNIT_IN_BLOCK_MASK =
    (1 << GROW_UNIT_IN_BLOCK_SHIFT) - 1;
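
// Worked example of the constants above, assuming 4 KB blocks
// (BLOCK_SHIFT == 12) and a 2 MB grow unit (GROW_UNIT_SHIFT == 21); the
// actual values come from const.h. Under these assumptions:
//   GROW_UNIT_IN_BLOCK_SHIFT == 21 - 12 == 9  (512 blocks per grow unit)
//   GROW_UNIT_IN_BLOCK_MASK  == (1 << 9) - 1 == 0x1ff
// so `idx >> GROW_UNIT_IN_BLOCK_SHIFT` selects a grow unit and
// `idx & GROW_UNIT_IN_BLOCK_MASK` selects the block within it.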

// MemTable maps a LogicalBlockIdx into a memory address.
// It is a lower-level data structure than Allocator and maintains the
// illusion of an infinitely large file. Every time it is given a
// LogicalBlockIdx:
// - if the block is already mapped, return its address
// - if the block is allocated by the kernel filesystem but not mapped yet,
//   mmap it and return its address
// - if the block is not even allocated by the kernel filesystem, grow the
//   file to fit (grow_to_fit), map it, and return its address
class MemTable : noncopyable {
  pmem::MetaBlock* meta;
  int fd;
  int prot;

  // immutable after ctor
  pmem::Block* first_region;
  uint32_t first_region_num_blocks;

  // map a chunk_idx to addr, where chunk_idx =
  // (lidx - first_region_num_blocks) >> GROW_UNIT_IN_BLOCK_SHIFT
  tbb::concurrent_vector<std::atomic<pmem::Block*>,
                         zero_allocator<std::atomic<pmem::Block*>>>
      table;
  static_assert(std::atomic<pmem::Block*>::is_always_lock_free);

  // a vector of <addr, length> pairs recording every mapped region, so that
  // the destructor can munmap all of them
  tbb::concurrent_vector<std::tuple<void*, size_t>> mmap_regions;

 public:
  MemTable(int fd, off_t init_file_size, bool read_only)
      : fd(fd), prot(read_only ? PROT_READ : PROT_READ | PROT_WRITE) {
    bool is_empty = init_file_size == 0;
    // grow to a multiple of GROW_UNIT_SIZE if the file is empty or its size
    // is not grow-unit aligned
    bool should_grow = is_empty || !IS_ALIGNED(init_file_size, GROW_UNIT_SIZE);
    off_t file_size = init_file_size;
    if (should_grow) {
      file_size =
          is_empty ? PREALLOC_SIZE : ALIGN_UP(init_file_size, GROW_UNIT_SIZE);
      int ret = posix::fallocate(fd, 0, 0, file_size);
      PANIC_IF(ret < 0, "fallocate failed");
    }

    first_region = mmap_file(static_cast<size_t>(file_size), 0, 0);
    first_region_num_blocks = BLOCK_SIZE_TO_IDX(file_size);
    meta = &first_region[0].meta_block;
    if (!is_empty && !meta->is_valid())
      throw FileInitException("invalid meta block");

    // update the meta block if necessary
    if (should_grow)
      meta->set_num_logical_blocks_if_larger(first_region_num_blocks);
  }

  ~MemTable() {
    for (const auto& [addr, length] : mmap_regions) {
      munmap(addr, length);
      VALGRIND_PMC_REMOVE_PMEM_MAPPING(addr, length);
    }
  }

  [[nodiscard]] pmem::MetaBlock* get_meta() const { return meta; }

  /**
   * translate a logical block index into a memory address: first check
   * whether the block has been mapped into the address space and, if not,
   * do the mapping; if the file does not even have the corresponding data
   * block, allocate it from the kernel filesystem first.
   *
   * @param idx the logical block index
   * @return the Block pointer if idx is not 0; nullptr for idx == 0, and the
   *         caller should handle this case
   */
  pmem::Block* lidx_to_addr_rw(LogicalBlockIdx idx) {
    if (unlikely(idx == 0)) return nullptr;

    // super fast path: within first_region, no need to touch the concurrent
    // vector
    if (idx < first_region_num_blocks) return &first_region[idx.get()];

    // fast path: the chunk is already mapped; just look it up
    uint32_t chunk_idx =
        (idx - first_region_num_blocks) >> GROW_UNIT_IN_BLOCK_SHIFT;
    uint32_t chunk_local_idx = idx & GROW_UNIT_IN_BLOCK_MASK;
    if (chunk_idx < table.size()) {
      pmem::Block* chunk_addr = table[chunk_idx];
      if (chunk_addr) return chunk_addr + chunk_local_idx;
    } else {
      table.grow_to_at_least(next_pow2(chunk_idx));
    }

    // slow path: ensure this idx has real blocks allocated; do the
    // allocation and the mapping if not
    grow_to_fit(idx);
    LogicalBlockIdx chunk_begin_lidx = idx & ~GROW_UNIT_IN_BLOCK_MASK;
    pmem::Block* chunk_addr = mmap_file(
        GROW_UNIT_SIZE, static_cast<off_t>(BLOCK_IDX_TO_SIZE(chunk_begin_lidx)),
        MAP_POPULATE);
    // benign race: if two threads map the same chunk concurrently, both
    // mappings remain valid (each is recorded in mmap_regions) and the
    // last store wins
    table[chunk_idx] = chunk_addr;
    return chunk_addr + chunk_local_idx;
  }
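
  // A minimal worked example of the translation above, with illustrative
  // numbers (4 KB blocks, a 2 MB grow unit of 512 blocks, and
  // first_region_num_blocks == 1024; none of these are taken from const.h):
  //   idx = 1500
  //   chunk_idx        = (1500 - 1024) >> 9 = 0
  //   chunk_local_idx  = 1500 & 0x1ff       = 476
  //   chunk_begin_lidx = 1500 & ~0x1ff      = 1024
  // so block 1500 resolves to table[0] + 476, where table[0] maps
  // GROW_UNIT_SIZE bytes starting at file offset
  // BLOCK_IDX_TO_SIZE(1024) = 4 MB.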

  // same as lidx_to_addr_rw for nonzero indices, but a read of logical
  // block 0 is served from a static zero-filled block instead of nullptr
  [[nodiscard]] const pmem::Block* lidx_to_addr_ro(LogicalBlockIdx lidx) {
    constexpr static const char __attribute__((aligned(BLOCK_SIZE)))
        empty_block[BLOCK_SIZE]{};
    if (lidx == 0) return reinterpret_cast<const pmem::Block*>(&empty_block);
    return lidx_to_addr_rw(lidx);
  }

 private:
  // ask the kernel filesystem for more blocks, so that idx becomes valid
  void grow_to_fit(LogicalBlockIdx idx) {
    // fast path: idx is already smaller than the number of logical blocks
    if (idx < meta->get_num_logical_blocks()) return;

    // slow path: grow the file
    // the new file size must be a multiple of the grow unit; we use
    // `idx + 1` because the file must grow as soon as idx reaches a
    // multiple of the number of blocks per grow unit (e.g., 512 for a
    // 2 MB grow unit)
    uint64_t file_size = ALIGN_UP(BLOCK_IDX_TO_SIZE(idx + 1), GROW_UNIT_SIZE);
    int ret = posix::fallocate(fd, 0, 0, static_cast<off_t>(file_size));
    PANIC_IF(ret, "fd %d: fallocate failed", fd);
    meta->set_num_logical_blocks_if_larger(BLOCK_SIZE_TO_IDX(file_size));
  }
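
  // Worked example (again with illustrative 4 KB blocks and a 2 MB grow
  // unit): for idx == 512, BLOCK_IDX_TO_SIZE(512 + 1) == 513 * 4 KB, which
  // aligns up to 4 MB, so touching the first block past a grow-unit
  // boundary extends the file by one full grow unit; for idx == 511 with a
  // 2 MB file (512 logical blocks), the fast path returns without growing.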

  /**
   * a private helper that calls mmap internally and records the new region
   * in mmap_regions
   * @return the pointer to the first block of the newly mapped region on
   *         the persistent memory
   */
  pmem::Block* mmap_file(size_t length, off_t offset, int flags = 0) {
    TimerGuard<Event::MMAP> guard;
    if constexpr (BuildOptions::map_sync)
      flags |= MAP_SHARED_VALIDATE | MAP_SYNC;
    else
      flags |= MAP_SHARED;
    if constexpr (BuildOptions::map_populate) flags |= MAP_POPULATE;

    void* addr = posix::mmap(nullptr, length, prot, flags, fd, offset);
    if (unlikely(addr == MAP_FAILED)) {
      // MAP_SYNC requires a DAX-capable filesystem; fall back to a plain
      // shared mapping if the kernel rejects it
      if constexpr (BuildOptions::map_sync) {
        if (errno == EOPNOTSUPP) {
          LOG_WARN("MAP_SYNC not supported for fd = %d. Retry w/o MAP_SYNC",
                   fd);
          flags &= ~(MAP_SHARED_VALIDATE | MAP_SYNC);
          flags |= MAP_SHARED;
          addr = posix::mmap(nullptr, length, prot, flags, fd, offset);
        }
      }
      PANIC_IF(addr == MAP_FAILED, "mmap fd = %d failed", fd);
    }

    VALGRIND_PMC_REGISTER_PMEM_MAPPING(addr, length);
    mmap_regions.emplace_back(addr, length);
    return static_cast<pmem::Block*>(addr);
  }

 public:
  friend std::ostream& operator<<(std::ostream& out, const MemTable& m) {
    out << "MemTable:\n";
    out << "\t" << 0 << " - " << m.first_region_num_blocks << ": "
        << m.first_region << "\n";
    uint32_t chunk_idx = m.first_region_num_blocks >> GROW_UNIT_IN_BLOCK_SHIFT;
    for (const auto& mem_addr : m.table) {
      LogicalBlockIdx chunk_begin_lidx = chunk_idx << GROW_UNIT_IN_BLOCK_SHIFT;
      out << "\t" << chunk_begin_lidx << " - "
          << chunk_begin_lidx + NUM_BLOCKS_PER_GROW << ": " << mem_addr
          << "\n";
      ++chunk_idx;
    }
    return out;
  }
};

}  // namespace madfs::dram
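
// A minimal usage sketch (hypothetical driver code, not part of this header;
// posix::open/posix::fstat and the LogicalBlockIdx constructor are assumed
// to exist with these shapes):
//
//   int fd = posix::open("/mnt/pmem/file", O_RDWR);
//   struct stat st;
//   posix::fstat(fd, &st);
//   madfs::dram::MemTable mem_table(fd, st.st_size, /*read_only=*/false);
//   // writable address of logical block 42; nullptr only for idx == 0
//   pmem::Block* block = mem_table.lidx_to_addr_rw(LogicalBlockIdx(42));
//   // read-only address; idx == 0 maps to a static zero-filled block
//   const pmem::Block* ro = mem_table.lidx_to_addr_ro(LogicalBlockIdx(0));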