Skip to content

Commit

Permalink
MDEV-34062: Implement innodb_log_file_mmap on 64-bit systems
Browse files Browse the repository at this point in the history
When using the default innodb_log_buffer_size=2m, mariadb-backup --backup
would spend a lot of time re-reading and re-parsing the log. For reads,
it would be beneficial to memory-map the entire ib_logfile0 to the
address space (typically 48 bits or 256 TiB) and read it from there,
both during --backup and --prepare.

We will introduce the Boolean read-only parameter innodb_log_file_mmap
that will be OFF by default on most platforms, to avoid aggressive
read-ahead of the entire ib_logfile0 when only a tiny portion would be
accessed. On Linux and FreeBSD the default is innodb_log_file_mmap=ON,
because those platforms define a specific mmap(2) option for enabling
such read-ahead.

We also experimented with allowing writes of the ib_logfile0 via a
memory mapping. A fundamental problem with that would be unnecessary
read-before-write in case of a major page fault, that is, when a new,
not yet cached, virtual memory page in the circular ib_logfile0 is
being written to. There appears to be no way to tell the operating system
that we do not care about the previous contents of the page, or that the
page fault handler should just zero out the contents.

Most references to HAVE_PMEM or log_sys.is_pmem() are replaced with
references to HAVE_INNODB_MMAP or log_sys.is_mmap().

Memory-mapped regular files differ from log_sys.is_pmem() in the way that
an open file handle to ib_logfile0 will be retained. In both code paths,
log_sys.is_mmap() will hold. Holding a file handle open will allow
log_t::clear_mmap() to disable the interface with fewer operations.
  • Loading branch information
dr-m committed Aug 5, 2024
1 parent e515e80 commit 1b03db0
Show file tree
Hide file tree
Showing 23 changed files with 563 additions and 296 deletions.
1 change: 0 additions & 1 deletion cmake/os/WindowsCache.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ SET(HAVE_GETIFADDRS CACHE INTERNAL "")
SET(HAVE_GETCWD 1 CACHE INTERNAL "")
SET(HAVE_GETHOSTBYADDR_R CACHE INTERNAL "")
SET(HAVE_GETHRTIME CACHE INTERNAL "")
SET(HAVE_GETPAGESIZE CACHE INTERNAL "")
SET(HAVE_GETPASS CACHE INTERNAL "")
SET(HAVE_GETMNTENT CACHE INTERNAL "")
SET(HAVE_GETMNTENT_IN_SYS_MNTAB CACHE INTERNAL "")
Expand Down
1 change: 0 additions & 1 deletion config.h.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,6 @@
#cmakedefine HAVE_GETCWD 1
#cmakedefine HAVE_GETHOSTBYADDR_R 1
#cmakedefine HAVE_GETHRTIME 1
#cmakedefine HAVE_GETPAGESIZE 1
#cmakedefine HAVE_GETPAGESIZES 1
#cmakedefine HAVE_GETPASS 1
#cmakedefine HAVE_GETPASSPHRASE 1
Expand Down
1 change: 0 additions & 1 deletion configure.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -463,7 +463,6 @@ CHECK_SYMBOL_EXISTS(madvise "sys/mman.h" HAVE_DECL_MADVISE)
CHECK_SYMBOL_EXISTS(getpagesizes "sys/mman.h" HAVE_GETPAGESIZES)
CHECK_SYMBOL_EXISTS(tzname "time.h" HAVE_TZNAME)
CHECK_SYMBOL_EXISTS(lrand48 "stdlib.h" HAVE_LRAND48)
CHECK_SYMBOL_EXISTS(getpagesize "unistd.h" HAVE_GETPAGESIZE)
CHECK_SYMBOL_EXISTS(TIOCGWINSZ "sys/ioctl.h" GWINSZ_IN_SYS_IOCTL)
CHECK_SYMBOL_EXISTS(FIONREAD "sys/ioctl.h" FIONREAD_IN_SYS_IOCTL)
CHECK_SYMBOL_EXISTS(TIOCSTAT "sys/ioctl.h" TIOCSTAT_IN_SYS_IOCTL)
Expand Down
142 changes: 124 additions & 18 deletions extra/mariabackup/xtrabackup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,8 @@ lsn_t checkpoint_lsn_start;
lsn_t checkpoint_no_start;
/** whether log_copying_thread() is active; protected by recv_sys.mutex */
static bool log_copying_running;
/** for --backup, target LSN to copy the log to; protected by recv_sys.mutex */
lsn_t metadata_to_lsn;

uint xtrabackup_parallel;

Expand Down Expand Up @@ -236,7 +238,6 @@ my_bool opt_encrypted_backup;
#define XTRABACKUP_METADATA_FILENAME "xtrabackup_checkpoints"
char metadata_type[30] = ""; /*[full-backuped|log-applied|incremental]*/
static lsn_t metadata_from_lsn;
lsn_t metadata_to_lsn;
static lsn_t metadata_last_lsn;

static ds_file_t* dst_log_file;
Expand Down Expand Up @@ -282,9 +283,6 @@ my_bool xtrabackup_incremental_force_scan = FALSE;
*/
ulong xtrabackup_innodb_force_recovery = 0;

/* The flushed lsn which is read from data files */
lsn_t flushed_lsn= 0;

ulong xb_open_files_limit= 0;
char *xb_plugin_dir;
char *xb_plugin_load;
Expand Down Expand Up @@ -1329,6 +1327,9 @@ enum options_xtrabackup
OPT_INNODB_BUFFER_POOL_FILENAME,
OPT_INNODB_LOCK_WAIT_TIMEOUT,
OPT_INNODB_LOG_BUFFER_SIZE,
#ifdef HAVE_INNODB_MMAP
OPT_INNODB_LOG_FILE_MMAP,
#endif
#if defined __linux__ || defined _WIN32
OPT_INNODB_LOG_FILE_BUFFERING,
#endif
Expand Down Expand Up @@ -1890,6 +1891,13 @@ struct my_option xb_server_options[] =
(G_PTR*) &log_sys.buf_size, (G_PTR*) &log_sys.buf_size, 0,
GET_UINT, REQUIRED_ARG, 2U << 20,
2U << 20, log_sys.buf_size_max, 0, 4096, 0},
#ifdef HAVE_INNODB_MMAP
/* --innodb-log-file-mmap: Boolean switch stored in log_sys.log_mmap,
defaulting to log_sys.log_mmap_default. It must carry its own identifier
OPT_INNODB_LOG_FILE_MMAP (declared in enum options_xtrabackup under the
same HAVE_INNODB_MMAP guard) rather than reuse the identifier that
belongs to innodb_log_file_size. */
{"innodb_log_file_mmap", OPT_INNODB_LOG_FILE_MMAP,
 "Whether ib_logfile0 should be memory-mapped",
 (G_PTR*) &log_sys.log_mmap,
 (G_PTR*) &log_sys.log_mmap, 0, GET_BOOL, NO_ARG,
 log_sys.log_mmap_default, 0, 0, 0, 0, 0},
#endif
#if defined __linux__ || defined _WIN32
{"innodb_log_file_buffering", OPT_INNODB_LOG_FILE_BUFFERING,
"Whether the file system cache for ib_logfile0 is enabled during --backup",
Expand Down Expand Up @@ -3368,25 +3376,126 @@ static my_bool xtrabackup_copy_datafile(ds_ctxt *ds_data,
return(FALSE);
}

#ifdef HAVE_INNODB_MMAP
/** Write a range of the memory-mapped log to a datasink file.
If start is greater than end, the range is treated as wrapping around:
first the bytes from start up to log_sys.buf + log_sys.file_size are
written, and then the bytes from log_sys.buf + log_sys.START_OFFSET
up to end.
@param ds     destination passed to ds_write()
@param start  first byte to copy
@param end    one past the last byte to copy
@return the result of the last ds_write() call that was made
(a nonzero result of the first write is returned immediately) */
static int
xtrabackup_copy_mmap_snippet(ds_file_t *ds, const byte *start, const byte *end)
{
  /* Common case: the range is contiguous in the mapping. */
  if (!UNIV_UNLIKELY(start > end))
    return ds_write(ds, start, end - start);

  /* Wrapped range: flush the tail of the mapping first ... */
  const byte *const map_end= log_sys.buf + log_sys.file_size;
  const int tail_result= ds_write(ds, start, map_end - start);
  if (tail_result)
    return tail_result;

  /* ... and then continue from the first byte after the header area. */
  const byte *const wrapped_start= log_sys.buf + log_sys.START_OFFSET;
  return ds_write(ds, wrapped_start, end - wrapped_start);
}

/** Copy memory-mapped log until the end of the log is reached
or the log_copying_stop signal is received.

The caller must hold recv_sys.mutex (asserted below). The function
repeatedly calls recv_sys.parse_mmap<false>(false) starting at
recv_sys.lsn and writes the parsed bytes of log_sys.buf to dst_log_file
via xtrabackup_copy_mmap_snippet(). Whenever the sequence byte that
precedes the current parse position is 0, a byte with value 1 is written
to the copy in its place.

When nothing can be parsed, the read is retried (with a 1 ms timed wait
on log_copying_stop between attempts) until 100 consecutive attempts have
been made, or until metadata_to_lsn is nonzero and has been reached.

@return whether the operation failed
@retval true  if a write to dst_log_file failed, or if the timed wait on
              log_copying_stop returned 0
@retval false otherwise */
static bool xtrabackup_copy_mmap_logfile()
{
mysql_mutex_assert_owner(&recv_sys.mutex);
/* Position the parser at recv_sys.lsn within the mapping, and let it
see the whole file. */
recv_sys.offset= size_t(log_sys.calc_lsn_offset(recv_sys.lsn));
recv_sys.len= size_t(log_sys.file_size);
/* Distance from the parse position back to the sequence byte: 13 bytes
if the log is encrypted, 5 bytes otherwise.
NOTE(review): presumably 8 extra bytes of encryption metadata precede
the checksum; confirm against the mini-transaction format. */
const size_t seq_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
/* Replacement value that is written for a sequence byte of 0 */
const char one{'\1'};

for (unsigned retry_count{0};;)
{
recv_sys_t::parse_mtr_result r;
/* First byte that has not been written to dst_log_file yet */
const byte *start= &log_sys.buf[recv_sys.offset];

if (recv_sys.parse_mmap<false>(false) == recv_sys_t::OK)
{
const byte *end;

/* Each iteration handles the record that was just parsed, then
attempts to parse the next one; r holds the first non-OK result. */
do
{
/* Set the sequence bit (the backed-up log will not wrap around) */
size_t seqo= recv_sys.offset - seq_offset;
/* The sequence byte may be located before a wrap-around of the
parse position; map it back to the end of the file in that case. */
if (seqo < log_sys.START_OFFSET)
seqo+= log_sys.file_size - log_sys.START_OFFSET;
const byte *seq= &log_sys.buf[seqo];
ut_ad(*seq == log_sys.get_sequence_bit(recv_sys.lsn - seq_offset));
if (!*seq)
{
/* Write everything up to the sequence byte, then write 1 instead
of the 0 byte, and resume copying after it. */
if (xtrabackup_copy_mmap_snippet(dst_log_file, start, seq) ||
ds_write(dst_log_file, &one, 1))
goto write_error;
start = seq + 1;
}
}
while ((r= recv_sys.parse_mmap<false>(false)) == recv_sys_t::OK);

end= &log_sys.buf[recv_sys.offset];

/* Write the remainder up to the current parse position. */
if (xtrabackup_copy_mmap_snippet(dst_log_file, start, end))
{
/* Also reached by goto from the loop above */
write_error:
msg("Error: write to ib_logfile0 failed");
return true;
}

/* NOTE(review): start is declared inside the loop body and is
re-initialized from recv_sys.offset on the next iteration, so this
assignment has no lasting effect. */
start= end;

pthread_cond_broadcast(&scanned_lsn_cond);

if (r == recv_sys_t::GOT_EOF)
break;

/* Progress was made; start counting retries afresh. */
retry_count= 0;
}
else
{
/* Nothing could be parsed at the current position. */
if (metadata_to_lsn)
{
/* A target LSN has been set; we are done once it has been reached. */
if (metadata_to_lsn <= recv_sys.lsn)
return false;
}
else if (xtrabackup_throttle && io_ticket-- < 0)
mysql_cond_wait(&wait_throttle, &recv_sys.mutex);

/* The first retry only reports; the 100th gives up; the ones in
between wait for up to 1 ms on log_copying_stop. */
if (!retry_count++)
msg("Retrying read of log at LSN=" LSN_PF, recv_sys.lsn);
else if (retry_count == 100)
break;
else
{
timespec abstime;
set_timespec_nsec(abstime, 1000000ULL /* 1 ms */);
/* NOTE(review): a return value of 0 presumably means that
log_copying_stop was signalled rather than the wait timing out;
confirm against mysql_cond_timedwait(). */
if (!mysql_cond_timedwait(&log_copying_stop, &recv_sys.mutex,
&abstime))
return true;
}
}
}

if (verbose)
msg(">> log scanned up to (" LSN_PF ")", recv_sys.lsn);
return false;
}
#endif

/** Copy redo log until the current end of the log is reached
@return whether the operation failed */
@return whether the operation failed */
static bool xtrabackup_copy_logfile()
{
mysql_mutex_assert_owner(&recv_sys.mutex);
DBUG_EXECUTE_IF("log_checksum_mismatch", return false;);

ut_a(dst_log_file);
ut_ad(recv_sys.is_initialised());

#ifdef HAVE_INNODB_MMAP
if (log_sys.is_mmap())
return xtrabackup_copy_mmap_logfile();
#endif
const size_t sequence_offset{log_sys.is_encrypted() ? 8U + 5U : 5U};
const size_t block_size_1{log_sys.write_size - 1};

ut_ad(!log_sys.is_pmem());

{
recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) &
block_size_1;
recv_sys.len= 0;
}
recv_sys.offset= size_t(recv_sys.lsn - log_sys.get_first_lsn()) &
block_size_1;
recv_sys.len= 0;

for (unsigned retry_count{0};;)
{
Expand Down Expand Up @@ -5307,9 +5416,8 @@ static bool xtrabackup_backup_func()
goto fail;
}

if (!log_sys.create()) {
goto fail;
}
log_sys.create();

/* get current checkpoint_lsn */
{
log_sys.latch.wr_lock(SRW_LOCK_CALL);
Expand Down Expand Up @@ -6666,9 +6774,7 @@ static bool xtrabackup_prepare_func(char** argv)
}

recv_sys.create();
if (!log_sys.create()) {
goto error;
}
log_sys.create();
recv_sys.recovery_on = true;

xb_fil_io_init();
Expand Down
4 changes: 0 additions & 4 deletions include/my_sys.h
Original file line number Diff line number Diff line change
Expand Up @@ -1017,11 +1017,7 @@ extern int my_win_pclose(FILE*);
#endif

/* my_getpagesize */
#ifdef HAVE_GETPAGESIZE
#define my_getpagesize() getpagesize()
#else
int my_getpagesize(void);
#endif

int my_msync(int, void *, size_t, int);

Expand Down
6 changes: 6 additions & 0 deletions mysql-test/suite/innodb/r/log_file_size_online.result
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ Variable_name Value
innodb_log_file_size 4194304
FOUND 1 /InnoDB: Resized log to 4\.000MiB/ in mysqld.1.err
UPDATE t SET b='' WHERE a<10;
SET @save=@@GLOBAL.innodb_log_file_buffering;
SET GLOBAL innodb_log_file_buffering=OFF;
SET GLOBAL innodb_log_file_buffering=ON;
SET GLOBAL innodb_log_file_buffering=@save;
SET GLOBAL innodb_log_file_mmap=OFF;
Got one of the listed errors
SET GLOBAL innodb_log_file_size=5242880;
SHOW VARIABLES LIKE 'innodb_log_file_size';
Variable_name Value
Expand Down
11 changes: 11 additions & 0 deletions mysql-test/suite/innodb/t/log_file_size_online.test
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,17 @@ let SEARCH_PATTERN = InnoDB: Resized log to 4\\.000MiB;

UPDATE t SET b='' WHERE a<10;

--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET @save=@@GLOBAL.innodb_log_file_buffering;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_buffering=OFF;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_buffering=ON;
--error 0,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_buffering=@save;
--error ER_INCORRECT_GLOBAL_LOCAL_VAR,ER_UNKNOWN_SYSTEM_VARIABLE
SET GLOBAL innodb_log_file_mmap=OFF;

SET GLOBAL innodb_log_file_size=5242880;
SHOW VARIABLES LIKE 'innodb_log_file_size';
SELECT global_value FROM information_schema.system_variables
Expand Down
1 change: 1 addition & 0 deletions mysql-test/suite/sys_vars/r/sysvars_innodb.result
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ variable_name not in (
'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS
'innodb_log_file_mmap', # only available on 64-bit
'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name;
Expand Down
1 change: 1 addition & 0 deletions mysql-test/suite/sys_vars/t/sysvars_innodb.test
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP
'innodb_numa_interleave', # only available WITH_NUMA
'innodb_evict_tables_on_commit_debug', # one may want to override this
'innodb_use_native_aio', # default value depends on OS
'innodb_log_file_mmap', # only available on 64-bit
'innodb_log_file_buffering', # only available on Linux and Windows
'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing
order by variable_name;
12 changes: 7 additions & 5 deletions mysys/my_getpagesize.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

#include "mysys_priv.h"

#ifndef HAVE_GETPAGESIZE

#if defined _WIN32

int my_getpagesize(void)
Expand All @@ -27,6 +25,13 @@ int my_getpagesize(void)
return si.dwPageSize;
}

#elif defined _SC_PAGESIZE

/**
  Return the memory page size reported by sysconf(_SC_PAGESIZE).

  @return the value of sysconf(_SC_PAGESIZE), converted to int
*/
int my_getpagesize(void)
{
  const long page_size= sysconf(_SC_PAGESIZE);
  return (int) page_size;
}

#else

/* Default implementation */
Expand All @@ -36,6 +41,3 @@ int my_getpagesize(void)
}

#endif

#endif

4 changes: 1 addition & 3 deletions mysys/my_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -151,9 +151,7 @@ my_bool my_init(void)
my_umask= 0660; /* Default umask for new files */
my_umask_dir= 0700; /* Default umask for new directories */
my_global_flags= 0;
#ifdef _SC_PAGESIZE
my_system_page_size= sysconf(_SC_PAGESIZE);
#endif
my_system_page_size= my_getpagesize();

/* Default creation of new files */
if ((str= getenv("UMASK")) != 0)
Expand Down
5 changes: 3 additions & 2 deletions sql/sql_class.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5223,10 +5223,11 @@ extern "C" MYSQL_THD thd_increment_pending_ops(MYSQL_THD thd)
end of async operation (such as end of group commit
write flush)

@param thd THD
@param t THD
*/
extern "C" void thd_decrement_pending_ops(MYSQL_THD thd)
extern "C" void thd_decrement_pending_ops(void *t)
{
THD *thd= static_cast<THD*>(t);
DBUG_ASSERT(thd);
DBUG_ASSERT(thd->system_thread == NON_SYSTEM_THREAD);

Expand Down
Loading

0 comments on commit 1b03db0

Please sign in to comment.