Skip to content

Commit

Permalink
MDEV-34062: Implement innodb_log_file_mmap on 64-bit systems
Browse files Browse the repository at this point in the history
When using the default innodb_log_buffer_size=2m, mariadb-backup --backup
would spend a lot of time re-reading and re-parsing the log. For reads,
it would be beneficial to memory-map the entire ib_logfile0 to the
address space (typically 48 bits or 256 TiB) and read it from there,
both during --backup and --prepare. That is what we will be doing by
default.

We can also enable memory-mapped log writes in case the new parameter
innodb_log_file_mmap is set to ON. This could speed up I/O and allow
the log data to be shared between mariadbd and mariadb-backup --backup
in the RAM buffer.

Memory-mapped regular files differ from the log_sys.is_pmem() case in
that an open file handle to ib_logfile0 will be retained. That allows
log_t::set_mmap() to enable or disable the interface with fewer
operations.

On log checkpoint we will invoke madvise() with MADV_DONTNEED in order
to reduce the memory pressure. This could lead to reads of old
garbage contents of the circular log file when a page fault occurs
while writing a record. There does not seem to be any way around this;
on Linux, invoking fallocate() with FALLOC_FL_ZERO_RANGE would make
things even worse by triggering additional metadata writes.

Most references to HAVE_PMEM or log_sys.is_pmem() are replaced with
HAVE_INNODB_MMAP or log_sys.is_mmap(). The main difference is that
PMEM skips the use of write_lock and flush_lock and uses pmem_persist(),
while the memory-mapped interface will use a combination of msync()
and fdatasync().

Starting with Linux 2.6.19, msync(MS_ASYNC) is a no-op, so we will not
invoke it on Linux. For durable writes, we will invoke msync(MS_SYNC).

Note: It is probably not advisable to enable memory-mapped log writes.
It could make sense with a small innodb_log_file_size that fits in RAM.

TODO: Can we avoid aggressive read-ahead of an entire huge ib_logfile0 in
mariadb-backup when only a tiny portion of it would be accessed?
This seems to be an issue on Microsoft Windows.
  • Loading branch information
dr-m committed Jun 19, 2024
1 parent 34813c1 commit 043745b
Show file tree
Hide file tree
Showing 22 changed files with 755 additions and 307 deletions.
1 change: 0 additions & 1 deletion cmake/os/WindowsCache.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ SET(HAVE_GETIFADDRS CACHE INTERNAL "")
SET(HAVE_GETCWD 1 CACHE INTERNAL "")
SET(HAVE_GETHOSTBYADDR_R CACHE INTERNAL "")
SET(HAVE_GETHRTIME CACHE INTERNAL "")
SET(HAVE_GETPAGESIZE CACHE INTERNAL "")
SET(HAVE_GETPASS CACHE INTERNAL "")
SET(HAVE_GETMNTENT CACHE INTERNAL "")
SET(HAVE_GETMNTENT_IN_SYS_MNTAB CACHE INTERNAL "")
Expand Down
1 change: 0 additions & 1 deletion config.h.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@
#cmakedefine HAVE_GETCWD 1
#cmakedefine HAVE_GETHOSTBYADDR_R 1
#cmakedefine HAVE_GETHRTIME 1
#cmakedefine HAVE_GETPAGESIZE 1
#cmakedefine HAVE_GETPAGESIZES 1
#cmakedefine HAVE_GETPASS 1
#cmakedefine HAVE_GETPASSPHRASE 1
Expand Down
1 change: 0 additions & 1 deletion configure.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,6 @@ CHECK_SYMBOL_EXISTS(madvise "sys/mman.h" HAVE_DECL_MADVISE)
CHECK_SYMBOL_EXISTS(getpagesizes "sys/mman.h" HAVE_GETPAGESIZES)
CHECK_SYMBOL_EXISTS(tzname "time.h" HAVE_TZNAME)
CHECK_SYMBOL_EXISTS(lrand48 "stdlib.h" HAVE_LRAND48)
CHECK_SYMBOL_EXISTS(getpagesize "unistd.h" HAVE_GETPAGESIZE)
CHECK_SYMBOL_EXISTS(TIOCGWINSZ "sys/ioctl.h" GWINSZ_IN_SYS_IOCTL)
CHECK_SYMBOL_EXISTS(FIONREAD "sys/ioctl.h" FIONREAD_IN_SYS_IOCTL)
CHECK_SYMBOL_EXISTS(TIOCSTAT "sys/ioctl.h" TIOCSTAT_IN_SYS_IOCTL)
Expand Down
142 changes: 124 additions & 18 deletions extra/mariabackup/xtrabackup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ lsn_t checkpoint_lsn_start;
lsn_t checkpoint_no_start;
/** whether log_copying_thread() is active; protected by recv_sys.mutex */
static bool log_copying_running;
/** for --backup, target LSN to copy the log to; protected by recv_sys.mutex */
lsn_t metadata_to_lsn;

uint xtrabackup_parallel;

Expand Down Expand Up @@ -236,7 +238,6 @@ my_bool opt_encrypted_backup;
#define XTRABACKUP_METADATA_FILENAME "xtrabackup_checkpoints"
char metadata_type[30] = ""; /*[full-backuped|log-applied|incremental]*/
static lsn_t metadata_from_lsn;
lsn_t metadata_to_lsn;
static lsn_t metadata_last_lsn;

static ds_file_t* dst_log_file;
Expand Down Expand Up @@ -282,9 +283,6 @@ my_bool xtrabackup_incremental_force_scan = FALSE;
*/
ulong xtrabackup_innodb_force_recovery = 0;

/* The flushed lsn which is read from data files */
lsn_t flushed_lsn= 0;

ulong xb_open_files_limit= 0;
char *xb_plugin_dir;
char *xb_plugin_load;
Expand Down Expand Up @@ -1329,6 +1327,9 @@ enum options_xtrabackup
OPT_INNODB_BUFFER_POOL_FILENAME,
OPT_INNODB_LOCK_WAIT_TIMEOUT,
OPT_INNODB_LOG_BUFFER_SIZE,
#ifdef HAVE_INNODB_MMAP
OPT_INNODB_LOG_FILE_MMAP,
#endif
#if defined __linux__ || defined _WIN32
OPT_INNODB_LOG_FILE_BUFFERING,
#endif
Expand Down Expand Up @@ -1890,6 +1891,13 @@ struct my_option xb_server_options[] =
(G_PTR*) &log_sys.buf_size, (G_PTR*) &log_sys.buf_size, 0,
GET_UINT, REQUIRED_ARG, 2U << 20,
2U << 20, log_sys.buf_size_max, 0, 4096, 0},
#ifdef HAVE_INNODB_MMAP
  /* Register the option under its own identifier from
  enum options_xtrabackup, so that it does not share an id with the
  unrelated innodb_log_file_size option. */
  {"innodb_log_file_mmap", OPT_INNODB_LOG_FILE_MMAP,
   "Whether ib_logfile0 should be memory-mapped",
   (G_PTR*) &log_sys.log_mmap,
   (G_PTR*) &log_sys.log_mmap, 0, GET_BOOL, NO_ARG,
   TRUE, 0, 0, 0, 0, 0},
#endif
#if defined __linux__ || defined _WIN32
{"innodb_log_file_buffering", OPT_INNODB_LOG_FILE_BUFFERING,
"Whether the file system cache for ib_logfile0 is enabled during --backup",
Expand Down Expand Up @@ -3360,25 +3368,126 @@ static my_bool xtrabackup_copy_datafile(ds_ctxt *ds_data,
return(FALSE);
}

#ifdef HAVE_INNODB_MMAP
/** Write the bytes between two positions of the memory-mapped log buffer
to a data sink. If start is after end, the range is treated as wrapping:
first [start, log_sys.buf + log_sys.file_size) is written, and then
[log_sys.buf + log_sys.START_OFFSET, end).
@param ds     destination to write to
@param start  first byte to copy
@param end    one past the last byte to copy
@return the result of ds_write(); nonzero from the first failing write */
static int
xtrabackup_copy_mmap_snippet(ds_file_t *ds, const byte *start, const byte *end)
{
  const byte *from= start;

  if (UNIV_UNLIKELY(from > end))
  {
    /* The range wraps around: flush the tail of the mapping first. */
    const byte *const mapping_end= log_sys.buf + log_sys.file_size;
    const int err= ds_write(ds, from, mapping_end - from);
    if (err)
      return err;
    /* Continue from the first byte after the file header area. */
    from= log_sys.buf + log_sys.START_OFFSET;
  }

  return ds_write(ds, from, end - from);
}

/** Copy memory-mapped log until the end of the log is reached
or the log_copying_stop signal is received.

The caller must hold recv_sys.mutex. Parsing starts at recv_sys.lsn;
every successfully parsed chunk is written to dst_log_file, with its
sequence byte forced to 1. When parsing cannot make progress, the read
is retried (with 1 ms waits on log_copying_stop) until the retry counter
reaches 100.

@return true if a write to dst_log_file failed, or if the timed wait on
log_copying_stop returned 0; false otherwise */
static bool xtrabackup_copy_mmap_logfile()
{
mysql_mutex_assert_owner(&recv_sys.mutex);
/* Position the parser at recv_sys.lsn within the mapping, and let it
see the whole file. */
recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn));
recv_sys.len= size_t(log_sys.file_size);
/* Distance from recv_sys.offset (after a successful parse) back to the
sequence byte; 8 bytes larger when the log is encrypted. */
const size_t seq_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
/* Replacement value written in place of a zero sequence byte. */
const char one{'\1'};

for (unsigned retry_count{0};;)
{
recv_sys_t::parse_mtr_result r;
/* First byte that has not yet been written to dst_log_file. */
const byte *start= &log_sys.buf[recv_sys.offset];

if (recv_sys.parse_mmap<false>(false) == recv_sys_t::OK)
{
const byte *end;

do
{
/* Set the sequence bit (the backed-up log will not wrap around) */
size_t seqo= recv_sys.offset - seq_offset;
/* The sequence byte may lie before the wrap-around point; map it
back to the end of the circular file. */
if (seqo < log_sys.START_OFFSET)
seqo+= log_sys.file_size - log_sys.START_OFFSET;
const byte *seq= &log_sys.buf[seqo];
ut_ad(*seq == log_sys.get_sequence_bit(recv_sys.lsn - seq_offset));
if (!*seq)
{
/* Write everything up to the zero sequence byte, then a 1 byte
instead of it, and resume copying right after it. */
if (xtrabackup_copy_mmap_snippet(dst_log_file, start, seq) ||
ds_write(dst_log_file, &one, 1))
goto write_error;
start = seq + 1;
}
}
while ((r= recv_sys.parse_mmap<false>(false)) == recv_sys_t::OK);

end= &log_sys.buf[recv_sys.offset];

/* Write the remainder up to the current parse position. */
if (xtrabackup_copy_mmap_snippet(dst_log_file, start, end))
{
/* Also the target of the goto in the loop above. */
write_error:
msg("Error: write to ib_logfile0 failed");
return true;
}

start= end;

pthread_cond_broadcast(&scanned_lsn_cond);

if (r == recv_sys_t::GOT_EOF)
break;

/* Progress was made; start counting retries afresh. */
retry_count= 0;
}
else
{
/* Nothing could be parsed at the current position. */
if (metadata_to_lsn)
{
/* A target LSN has been set; finish once it has been reached. */
if (metadata_to_lsn <= recv_sys.lsn)
return false;
}
else if (xtrabackup_throttle && io_ticket-- < 0)
mysql_cond_wait(&wait_throttle, &recv_sys.mutex);

/* The first retry only logs a message; later retries wait 1 ms each,
and the loop is abandoned when the counter reaches 100. */
if (!retry_count++)
msg("Retrying read of log at LSN=" LSN_PF, recv_sys.lsn);
else if (retry_count == 100)
break;
else
{
timespec abstime;
set_timespec_nsec(abstime, 1000000ULL /* 1 ms */);
/* NOTE(review): a zero return presumably means that log_copying_stop
was signalled rather than timed out - confirm. */
if (!mysql_cond_timedwait(&log_copying_stop, &recv_sys.mutex,
&abstime))
return true;
}
}
}

if (verbose)
msg(">> log scanned up to (" LSN_PF ")", recv_sys.lsn);
return false;
}
#endif

/** Copy redo log until the current end of the log is reached
@return whether the operation failed */
@return whether the operation failed */
static bool xtrabackup_copy_logfile()
{
mysql_mutex_assert_owner(&recv_sys.mutex);
DBUG_EXECUTE_IF("log_checksum_mismatch", return false;);

ut_a(dst_log_file);
ut_ad(recv_sys.is_initialised());

#ifdef HAVE_INNODB_MMAP
if (log_sys.is_mmap())
return xtrabackup_copy_mmap_logfile();
#endif
const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
const size_t block_size_1{log_sys.get_block_size() - 1};

ut_ad(!log_sys.is_pmem());

{
recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) &
block_size_1;
recv_sys.len= 0;
}
recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) &
block_size_1;
recv_sys.len= 0;

for (unsigned retry_count{0};;)
{
Expand Down Expand Up @@ -5299,9 +5408,8 @@ static bool xtrabackup_backup_func()
goto fail;
}

if (!log_sys.create()) {
goto fail;
}
log_sys.create();

/* get current checkpoint_lsn */
{
log_sys.latch.wr_lock(SRW_LOCK_CALL);
Expand Down Expand Up @@ -6658,9 +6766,7 @@ static bool xtrabackup_prepare_func(char** argv)
}

recv_sys.create();
if (!log_sys.create()) {
goto error;
}
log_sys.create();
recv_sys.recovery_on = true;

xb_fil_io_init();
Expand Down
4 changes: 0 additions & 4 deletions include/my_sys.h
Original file line number Diff line number Diff line change
Expand Up @@ -1017,11 +1017,7 @@ extern int my_win_pclose(FILE*);
#endif

/* my_getpagesize */
#ifdef HAVE_GETPAGESIZE
#define my_getpagesize() getpagesize()
#else
int my_getpagesize(void);
#endif

int my_msync(int, void *, size_t, int);

Expand Down
8 changes: 8 additions & 0 deletions mysql-test/suite/innodb/r/log_file_size_online.result
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ Variable_name Value
innodb_log_file_size 4194304
FOUND 1 /InnoDB: Resized log to 4\.000MiB/ in mysqld.1.err
UPDATE t SET b='' WHERE a<10;
SET @save=@@GLOBAL.innodb_log_file_buffering;
SET GLOBAL innodb_log_file_buffering=OFF;
SET GLOBAL innodb_log_file_buffering=ON;
SET GLOBAL innodb_log_file_buffering=@save;
SET @save=@@GLOBAL.innodb_log_file_mmap;
SET GLOBAL innodb_log_file_mmap=OFF;
SET GLOBAL innodb_log_file_mmap=ON;
SET GLOBAL innodb_log_file_mmap=@save;
SET GLOBAL innodb_log_file_size=5242880;
SHOW VARIABLES LIKE 'innodb_log_file_size';
Variable_name Value
Expand Down
17 changes: 17 additions & 0 deletions mysql-test/suite/innodb/t/log_file_size_online.test
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,23 @@ let SEARCH_PATTERN = InnoDB: Resized log to 4\\.000MiB;

UPDATE t SET b='' WHERE a<10;

--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET @save=@@GLOBAL.innodb_log_file_buffering;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_buffering=OFF;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_buffering=ON;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_buffering=@save;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET @save=@@GLOBAL.innodb_log_file_mmap;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_mmap=OFF;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_mmap=ON;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_mmap=@save;

SET GLOBAL innodb_log_file_size=5242880;
SHOW VARIABLES LIKE 'innodb_log_file_size';
SELECT global_value FROM information_schema.system_variables
Expand Down
1 change: 1 addition & 0 deletions mysql-test/suite/sys_vars/r/sysvars_innodb.result
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ variable_name not in (
'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS
'innodb_log_file_mmap', # only available on 64-bit
'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name;
Expand Down
1 change: 1 addition & 0 deletions mysql-test/suite/sys_vars/t/sysvars_innodb.test
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP
'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS
'innodb_log_file_mmap', # only available on 64-bit
'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name;
12 changes: 7 additions & 5 deletions mysys/my_getpagesize.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

#include "mysys_priv.h"

#ifndef HAVE_GETPAGESIZE

#if defined _WIN32

int my_getpagesize(void)
Expand All @@ -27,6 +25,13 @@ int my_getpagesize(void)
return si.dwPageSize;
}

#elif defined _SC_PAGESIZE

/**
  Report the system memory page size.

  @return the value of sysconf(_SC_PAGESIZE), converted to int
*/
int my_getpagesize(void)
{
  const long page_size= sysconf(_SC_PAGESIZE);
  return (int) page_size;
}

#else

/* Default implementation */
Expand All @@ -36,6 +41,3 @@ int my_getpagesize(void)
}

#endif

#endif

4 changes: 1 addition & 3 deletions mysys/my_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,7 @@ my_bool my_init(void)
my_umask= 0660; /* Default umask for new files */
my_umask_dir= 0700; /* Default umask for new directories */
my_global_flags= 0;
#ifdef _SC_PAGESIZE
my_system_page_size= sysconf(_SC_PAGESIZE);
#endif
my_system_page_size= my_getpagesize();

/* Default creation of new files */
if ((str= getenv("UMASK")) != 0)
Expand Down
5 changes: 3 additions & 2 deletions sql/sql_class.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5164,10 +5164,11 @@ extern "C" MYSQL_THD thd_increment_pending_ops(MYSQL_THD thd)
end of async operation (such as end of group commit
write flush)

@param thd THD
@param t THD
*/
extern "C" void thd_decrement_pending_ops(MYSQL_THD thd)
extern "C" void thd_decrement_pending_ops(void *t)
{
THD *thd= static_cast<THD*>(t);
DBUG_ASSERT(thd);
DBUG_ASSERT(thd->system_thread == NON_SYSTEM_THREAD);

Expand Down
Loading

0 comments on commit 043745b

Please sign in to comment.