Skip to content

Commit

Permalink
MDEV-34062: Implement innodb_log_file_mmap on 64-bit systems
Browse files Browse the repository at this point in the history
When using the default innodb_log_buffer_size=2m, mariadb-backup --backup
would spend a lot of time re-reading and re-parsing the log. For reads,
it would be beneficial to memory-map the entire ib_logfile0 to the
address space (typically 48 bits or 256 TiB) and read it from there,
both during --backup and --prepare.

We will introduce the Boolean read-only parameter innodb_log_file_mmap
that will be OFF by default on most platforms, to avoid aggressive
read-ahead of the entire ib_logfile0 when only a tiny portion would be
accessed. On Linux and FreeBSD the default is innodb_log_file_mmap=ON,
because those platforms define a specific mmap(2) option for enabling
such read-ahead and therefore it can be assumed that the default would
be on-demand paging. This parameter will only have impact on the initial
InnoDB startup and recovery. Any writes to the log will use regular I/O,
except when the ib_logfile0 is stored in a specially configured file system
that is backed by persistent memory (Linux "mount -o dax").

We also experimented with allowing writes of the ib_logfile0 via a
memory mapping and decided against it. A fundamental problem would be
unnecessary read-before-write in case of a major page fault, that is,
when a new, not yet cached, virtual memory page in the circular
ib_logfile0 is being written to. There appears to be no way to tell
the operating system that we do not care about the previous contents of
the page, or that the page fault handler should just zero it out.

Many references to HAVE_PMEM have been replaced with references to
HAVE_INNODB_MMAP.

The predicate log_sys.is_pmem() has been replaced with
log_sys.is_mmap() && !log_sys.is_opened().

Memory-mapped regular files differ from MAP_SYNC (PMEM) mappings in
that an open file handle to ib_logfile0 will be retained. In both
code paths, log_sys.is_mmap() will hold. Holding a file handle open will
allow log_t::clear_mmap() to disable the interface with fewer operations.

It should be noted that ever since
commit 685d958 (MDEV-14425)
most 64-bit Linux platforms in our CI environment
(s390x a.k.a. IBM System Z being a notable exception) read and write
/dev/shm/*/ib_logfile0 via a memory mapping, pretending that it is
persistent memory (mount -o dax). So, the memory mapping based log
parsing that this change is enabling by default on Linux and FreeBSD
has already been extensively tested on Linux.

::log_mmap(): If a log cannot be opened as PMEM and the desired access
is read-only, try to open a read-only memory mapping.

xtrabackup_copy_mmap_snippet(), xtrabackup_copy_mmap_logfile():
Copy the InnoDB log in mariadb-backup --backup from a memory
mapped file.
  • Loading branch information
dr-m committed Sep 26, 2024
1 parent 971cf59 commit 4b49835
Show file tree
Hide file tree
Showing 22 changed files with 572 additions and 289 deletions.
1 change: 0 additions & 1 deletion cmake/os/WindowsCache.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ SET(HAVE_GETIFADDRS CACHE INTERNAL "")
SET(HAVE_GETCWD 1 CACHE INTERNAL "")
SET(HAVE_GETHOSTBYADDR_R CACHE INTERNAL "")
SET(HAVE_GETHRTIME CACHE INTERNAL "")
SET(HAVE_GETPAGESIZE CACHE INTERNAL "")
SET(HAVE_GETPASS CACHE INTERNAL "")
SET(HAVE_GETMNTENT CACHE INTERNAL "")
SET(HAVE_GETMNTENT_IN_SYS_MNTAB CACHE INTERNAL "")
Expand Down
1 change: 0 additions & 1 deletion config.h.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@
#cmakedefine HAVE_GETCWD 1
#cmakedefine HAVE_GETHOSTBYADDR_R 1
#cmakedefine HAVE_GETHRTIME 1
#cmakedefine HAVE_GETPAGESIZE 1
#cmakedefine HAVE_GETPAGESIZES 1
#cmakedefine HAVE_GETPASS 1
#cmakedefine HAVE_GETPASSPHRASE 1
Expand Down
1 change: 0 additions & 1 deletion configure.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,6 @@ CHECK_SYMBOL_EXISTS(madvise "sys/mman.h" HAVE_DECL_MADVISE)
CHECK_SYMBOL_EXISTS(getpagesizes "sys/mman.h" HAVE_GETPAGESIZES)
CHECK_SYMBOL_EXISTS(tzname "time.h" HAVE_TZNAME)
CHECK_SYMBOL_EXISTS(lrand48 "stdlib.h" HAVE_LRAND48)
CHECK_SYMBOL_EXISTS(getpagesize "unistd.h" HAVE_GETPAGESIZE)
CHECK_SYMBOL_EXISTS(TIOCGWINSZ "sys/ioctl.h" GWINSZ_IN_SYS_IOCTL)
CHECK_SYMBOL_EXISTS(FIONREAD "sys/ioctl.h" FIONREAD_IN_SYS_IOCTL)
CHECK_SYMBOL_EXISTS(TIOCSTAT "sys/ioctl.h" TIOCSTAT_IN_SYS_IOCTL)
Expand Down
142 changes: 124 additions & 18 deletions extra/mariabackup/xtrabackup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ lsn_t checkpoint_lsn_start;
lsn_t checkpoint_no_start;
/** whether log_copying_thread() is active; protected by recv_sys.mutex */
static bool log_copying_running;
/** for --backup, target LSN to copy the log to; protected by recv_sys.mutex */
lsn_t metadata_to_lsn;

uint xtrabackup_parallel;

Expand Down Expand Up @@ -236,7 +238,6 @@ my_bool opt_encrypted_backup;
#define XTRABACKUP_METADATA_FILENAME "xtrabackup_checkpoints"
char metadata_type[30] = ""; /*[full-backuped|log-applied|incremental]*/
static lsn_t metadata_from_lsn;
lsn_t metadata_to_lsn;
static lsn_t metadata_last_lsn;

static ds_file_t* dst_log_file;
Expand Down Expand Up @@ -282,9 +283,6 @@ my_bool xtrabackup_incremental_force_scan = FALSE;
*/
ulong xtrabackup_innodb_force_recovery = 0;

/* The flushed lsn which is read from data files */
lsn_t flushed_lsn= 0;

ulong xb_open_files_limit= 0;
char *xb_plugin_dir;
char *xb_plugin_load;
Expand Down Expand Up @@ -1329,6 +1327,9 @@ enum options_xtrabackup
OPT_INNODB_BUFFER_POOL_FILENAME,
OPT_INNODB_LOCK_WAIT_TIMEOUT,
OPT_INNODB_LOG_BUFFER_SIZE,
#ifdef HAVE_INNODB_MMAP
OPT_INNODB_LOG_FILE_MMAP,
#endif
#if defined __linux__ || defined _WIN32
OPT_INNODB_LOG_FILE_BUFFERING,
#endif
Expand Down Expand Up @@ -1890,6 +1891,13 @@ struct my_option xb_server_options[] =
(G_PTR*) &log_sys.buf_size, (G_PTR*) &log_sys.buf_size, 0,
GET_UINT, REQUIRED_ARG, 2U << 20,
2U << 20, log_sys.buf_size_max, 0, 4096, 0},
#ifdef HAVE_INNODB_MMAP
  /* Boolean option stored in log_sys.log_mmap. It is registered under its
  own identifier OPT_INNODB_LOG_FILE_MMAP so that option handling does not
  confuse it with innodb_log_file_size. */
  {"innodb_log_file_mmap", OPT_INNODB_LOG_FILE_MMAP,
   "Whether ib_logfile0 should be memory-mapped",
   (G_PTR*) &log_sys.log_mmap,
   (G_PTR*) &log_sys.log_mmap, 0, GET_BOOL, NO_ARG,
   log_sys.log_mmap_default, 0, 0, 0, 0, 0},
#endif
#if defined __linux__ || defined _WIN32
{"innodb_log_file_buffering", OPT_INNODB_LOG_FILE_BUFFERING,
"Whether the file system cache for ib_logfile0 is enabled during --backup",
Expand Down Expand Up @@ -3368,25 +3376,126 @@ static my_bool xtrabackup_copy_datafile(ds_ctxt *ds_data,
return(FALSE);
}

#ifdef HAVE_INNODB_MMAP
/** Write a range of the memory-mapped log to a datasink file.
If start > end, the range is taken to wrap around the end of the mapped
file: the bytes from start up to log_sys.buf + log_sys.file_size are
written first, followed by the bytes from
log_sys.buf + log_sys.START_OFFSET up to end.
@param ds     destination file
@param start  first byte to copy
@param end    one past the last byte to copy
@return the result of the ds_write() call that failed, or of the last one */
static int
xtrabackup_copy_mmap_snippet(ds_file_t *ds, const byte *start, const byte *end)
{
  /* Common case: the range is contiguous in the mapping */
  if (!UNIV_UNLIKELY(start > end))
    return ds_write(ds, start, end - start);

  /* Wrapped range: first the tail of the file ... */
  const byte *const file_end= log_sys.buf + log_sys.file_size;
  if (int err= ds_write(ds, start, file_end - start))
    return err;

  /* ... then the part that follows START_OFFSET */
  const byte *const payload_begin= log_sys.buf + log_sys.START_OFFSET;
  return ds_write(ds, payload_begin, end - payload_begin);
}

/** Copy memory-mapped log until the end of the log is reached
or the log_copying_stop signal is received.
The caller must hold recv_sys.mutex (asserted below); that mutex is
handed to the condition waits in the retry path.
Copied bytes are written to dst_log_file.
@return whether the operation failed */
static bool xtrabackup_copy_mmap_logfile()
{
mysql_mutex_assert_owner(&recv_sys.mutex);
/* Position the parser at the file offset that corresponds to
recv_sys.lsn, and set recv_sys.len to the size of the whole log file. */
recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn));
recv_sys.len= size_t(log_sys.file_size);
/* Distance from the parse position after a record back to that
record's sequence byte; 8 bytes more when the log is encrypted.
NOTE(review): presumably 1 sequence byte + 4 checksum bytes, plus an
8-byte nonce when encrypted -- confirm against the log format. */
const size_t seq_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
/* The byte that is written in place of a zero sequence byte */
const char one{'\1'};

for (unsigned retry_count{0};;)
{
recv_sys_t::parse_mtr_result r;
/* Start of the region that still has to be copied */
const byte *start= &log_sys.buf[recv_sys.offset];

/* parse_mmap() reported OK: something was parsed, so there is data to
copy. Keep parsing in the do-while loop until it stops returning OK. */
if (recv_sys.parse_mmap<false>(false) == recv_sys_t::OK)
{
const byte *end;

do
{
/* Set the sequence bit (the backed-up log will not wrap around) */
/* Step back seq_offset bytes from the current parse position to find
the sequence byte; if that lands before START_OFFSET, wrap around
to the corresponding position at the end of the file. */
size_t seqo= recv_sys.offset - seq_offset;
if (seqo < log_sys.START_OFFSET)
seqo+= log_sys.file_size - log_sys.START_OFFSET;
const byte *seq= &log_sys.buf[seqo];
ut_ad(*seq == log_sys.get_sequence_bit(recv_sys.lsn - seq_offset));
/* The mapping is not modified. When the stored sequence byte is 0,
copy everything before it as is, write a 1 in its place, and
continue copying from the byte after it. */
if (!*seq)
{
if (xtrabackup_copy_mmap_snippet(dst_log_file, start, seq) ||
ds_write(dst_log_file, &one, 1))
goto write_error;
start = seq + 1;
}
}
while ((r= recv_sys.parse_mmap<false>(false)) == recv_sys_t::OK);

/* Copy whatever remains up to the current parse position */
end= &log_sys.buf[recv_sys.offset];

if (xtrabackup_copy_mmap_snippet(dst_log_file, start, end))
{
/* Also the target of the goto in the loop above */
write_error:
msg("Error: write to ib_logfile0 failed");
return true;
}

start= end;

/* Notify waiters on scanned_lsn_cond now that more log was copied */
pthread_cond_broadcast(&scanned_lsn_cond);

/* r holds the last non-OK parse result from the do-while loop */
if (r == recv_sys_t::GOT_EOF)
break;

/* Progress was made; start counting retries from scratch */
retry_count= 0;
}
else
{
/* Nothing could be parsed at the current position */
if (metadata_to_lsn)
{
/* A target LSN has been set; finish successfully once it is reached */
if (metadata_to_lsn <= recv_sys.lsn)
return false;
}
else if (xtrabackup_throttle && io_ticket-- < 0)
mysql_cond_wait(&wait_throttle, &recv_sys.mutex);

/* Report the first retry, give up when the counter reaches 100, and
otherwise wait for up to 1 ms on log_copying_stop before retrying. */
if (!retry_count++)
msg("Retrying read of log at LSN=" LSN_PF, recv_sys.lsn);
else if (retry_count == 100)
break;
else
{
timespec abstime;
set_timespec_nsec(abstime, 1000000ULL /* 1 ms */);
/* A zero return (presumably: log_copying_stop was signalled rather
than the wait timing out) ends the copy with a "failed" result. */
if (!mysql_cond_timedwait(&log_copying_stop, &recv_sys.mutex,
&abstime))
return true;
}
}
}

/* Reached via GOT_EOF or after exhausting the retries; neither is
reported as a failure. */
if (verbose)
msg(">> log scanned up to (" LSN_PF ")", recv_sys.lsn);
return false;
}
#endif

/** Copy redo log until the current end of the log is reached
@return whether the operation failed */
@return whether the operation failed */
static bool xtrabackup_copy_logfile()
{
mysql_mutex_assert_owner(&recv_sys.mutex);
DBUG_EXECUTE_IF("log_checksum_mismatch", return false;);

ut_a(dst_log_file);
ut_ad(recv_sys.is_initialised());

#ifdef HAVE_INNODB_MMAP
if (log_sys.is_mmap())
return xtrabackup_copy_mmap_logfile();
#endif
const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
const size_t block_size_1{log_sys.write_size - 1};

ut_ad(!log_sys.is_pmem());

{
recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) &
block_size_1;
recv_sys.len= 0;
}
recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) &
block_size_1;
recv_sys.len= 0;

for (unsigned retry_count{0};;)
{
Expand Down Expand Up @@ -5376,9 +5485,8 @@ static bool xtrabackup_backup_func()
goto fail;
}

if (!log_sys.create()) {
goto fail;
}
log_sys.create();

/* get current checkpoint_lsn */
{
log_sys.latch.wr_lock(SRW_LOCK_CALL);
Expand Down Expand Up @@ -6730,9 +6838,7 @@ static bool xtrabackup_prepare_func(char** argv)
}

recv_sys.create();
if (!log_sys.create()) {
goto error;
}
log_sys.create();
recv_sys.recovery_on = true;

xb_fil_io_init();
Expand Down
4 changes: 0 additions & 4 deletions include/my_sys.h
Original file line number Diff line number Diff line change
Expand Up @@ -1017,11 +1017,7 @@ extern int my_win_pclose(FILE*);
#endif

/* my_getpagesize */
#ifdef HAVE_GETPAGESIZE
#define my_getpagesize() getpagesize()
#else
int my_getpagesize(void);
#endif

int my_msync(int, void *, size_t, int);

Expand Down
6 changes: 6 additions & 0 deletions mysql-test/suite/innodb/r/log_file_size_online.result
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ SHOW VARIABLES LIKE 'innodb_log_file_size';
Variable_name Value
innodb_log_file_size 4194304
FOUND 1 /InnoDB: Resized log to 4\.000MiB/ in mysqld.1.err
SET @save=@@GLOBAL.innodb_log_file_buffering;
SET GLOBAL innodb_log_file_buffering=OFF;
SET GLOBAL innodb_log_file_buffering=ON;
SET GLOBAL innodb_log_file_buffering=@save;
SET GLOBAL innodb_log_file_mmap=OFF;
Got one of the listed errors
SET GLOBAL innodb_log_file_size=5242880;
connect con1,localhost,root;
UPDATE t SET b='' WHERE a<10;
Expand Down
11 changes: 11 additions & 0 deletions mysql-test/suite/innodb/t/log_file_size_online.test
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,17 @@ SHOW VARIABLES LIKE 'innodb_log_file_size';
let SEARCH_PATTERN = InnoDB: Resized log to 4\\.000MiB;
--source include/search_pattern_in_file.inc

--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET @save=@@GLOBAL.innodb_log_file_buffering;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_buffering=OFF;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_buffering=ON;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_buffering=@save;
--error ER_INCORRECT_GLOBAL_LOCAL_VAR,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_mmap=OFF;

send SET GLOBAL innodb_log_file_size=5242880;

--connect con1,localhost,root
Expand Down
1 change: 1 addition & 0 deletions mysql-test/suite/sys_vars/r/sysvars_innodb.result
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ variable_name not in (
'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS
'innodb_log_file_mmap', # only available on 64-bit
'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name;
Expand Down
1 change: 1 addition & 0 deletions mysql-test/suite/sys_vars/t/sysvars_innodb.test
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP
'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS
'innodb_log_file_mmap', # only available on 64-bit
'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name;
12 changes: 7 additions & 5 deletions mysys/my_getpagesize.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

#include "mysys_priv.h"

#ifndef HAVE_GETPAGESIZE

#if defined _WIN32

int my_getpagesize(void)
Expand All @@ -27,6 +25,13 @@ int my_getpagesize(void)
return si.dwPageSize;
}

#elif defined _SC_PAGESIZE

/* Return the value that sysconf(_SC_PAGESIZE) reports, converted to int */
int my_getpagesize(void)
{
  const long page_size= sysconf(_SC_PAGESIZE);
  return (int) page_size;
}

#else

/* Default implementation */
Expand All @@ -36,6 +41,3 @@ int my_getpagesize(void)
}

#endif

#endif

4 changes: 1 addition & 3 deletions mysys/my_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,7 @@ my_bool my_init(void)
my_umask= 0660; /* Default umask for new files */
my_umask_dir= 0700; /* Default umask for new directories */
my_global_flags= 0;
#ifdef _SC_PAGESIZE
my_system_page_size= sysconf(_SC_PAGESIZE);
#endif
my_system_page_size= my_getpagesize();

/* Default creation of new files */
if ((str= getenv("UMASK")) != 0)
Expand Down
Loading

0 comments on commit 4b49835

Please sign in to comment.