diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..4b6d8f7
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,60 @@
+name: CI
+
+on:
+
+  push:
+    branches:
+      - main
+    paths:
+      - include/**
+      - src/**
+  workflow_dispatch:
+
+jobs:
+
+  main:
+    name: Create branch 2D and 3D which only contains the dimension and compile each
+    permissions:
+      contents: write  # needed by the final step, which pushes a branch
+    strategy:
+      matrix:
+        dimension: [2, 3]  # one job per dimension, each pushing its own "<n>d" branch
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4  # release tag instead of the moving "main" branch
+        with:
+          repository: "NaokiHori/SimpleBubblyFlowSolver"
+          ref: "main"
+          submodules: "recursive"
+      - name: Install dependencies
+        run: |
+          sudo apt-get -y update && \
+          sudo apt-get -y install make libopenmpi-dev libfftw3-dev
+      - name: Remove another dimension
+        run: |
+          # -e: abort on the first failing command, -x: echo each command
+          set -ex
+          python3 .github/workflows/extract_nd.py ${{ matrix.dimension }}
+      - name: Modify Makefile
+        run: |
+          # switch the -DNDIMS compiler flag to the dimension of this job
+          set -ex
+          sed -i "s/DNDIMS=2/DNDIMS=${{ matrix.dimension }}/g" Makefile
+      - name: Compile
+        run: |
+          make all
+      - name: Commit and push change
+        run: |
+          # the "<n>d" branch is recreated from main on every run, hence "push -f"
+          set -ex
+          git switch -c ${{ matrix.dimension }}d
+          git config --local user.email "36466440+NaokiHori@users.noreply.github.com"
+          git config --local user.name "NaokiHori"
+          # add, commit, and push
+          git add Makefile
+          git add src
+          git add include
+          git commit -m "Extract ${{ matrix.dimension }}d sources" -a || true
+          git push -f origin ${{ matrix.dimension }}d
+
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
new file mode 100644
index 0000000..3052c06
--- /dev/null
+++ b/.github/workflows/documentation.yml
@@ -0,0 +1,49 @@
+name: Documentation
+
+on:
+
+  push:
+    branches:
+      - main
+    paths:
+      - docs/source/**
+
+jobs:
+
+  main:
+    name: Build and deploy documentation
+    permissions:
+      contents: read
+      pages: write  # deploy to GitHub Pages
+      id-token: write  # lets the deploy step request an OIDC token
+    concurrency:
+      group: "pages"
+      cancel-in-progress: true  # a newer run cancels one that is still in progress
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}  # output of the last step
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4  # release tags instead of the moving "main" branch
+        with:
+          repository: "NaokiHori/SimpleBubblyFlowSolver"
+          ref: "main"
+      - name: Build documentation using Sphinx
+        run: |
+          docker run \
+            --rm \
+            --volume "${PWD}:/project" \
+            --workdir /project \
+            sphinxdoc/sphinx:latest \
+            sphinx-build "docs/source" "docs/build"
+      - name: Setup GitHub Pages
+        uses: actions/configure-pages@v5
+      - name: Upload HTML
+        uses: actions/upload-pages-artifact@v3  # v3 artifacts are what deploy-pages@v4 reads
+        with:
+          path: docs/build
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
+
diff --git a/.github/workflows/extract_nd.py b/.github/workflows/extract_nd.py
new file mode 100644
index 0000000..a5be4cd
--- /dev/null
+++ b/.github/workflows/extract_nd.py
@@ -0,0 +1,132 @@
+import os
+import sys
+import glob
+import enum
+
+
+def get_filenames(root):
+    results = glob.glob(f"{root}/**", recursive=True)
+    retvals = list()
+    for result in results:
+        if result.endswith(".c") or result.endswith(".h"):
+            retvals.append(result)
+    return retvals
+
+
+class NdimsType(enum.Enum):
+    IN_2D = enum.auto()
+    IN_3D = enum.auto()
+    OTHER = enum.auto()
+
+
+def extract_given_dim(ndims, lines):
+    state = NdimsType.OTHER
+    if_level = 0
+    if_level_ndims = 0
+    newlines = list()
+    for line in lines:
+        is_on_ndims_macro = False
+        if "#if" in line:
+            # found "if", increase nest counter
+            if_level += 1
+            if " NDIMS" in line:
+                if "NDIMS==2" in line.replace(" ", ""):
+                    is_on_ndims_macro = True
+                    # now in 2D condition
+                    state = NdimsType.IN_2D
+                    if_level_ndims = if_level
+                if "NDIMS==3" in line.replace(" ", ""):
+                    is_on_ndims_macro = True
+                    # now in 3D condition
+                    state = NdimsType.IN_3D
+                    if_level_ndims = if_level
+        elif "#else" in line:
+            # check this "else" is for ndims
+            if if_level == if_level_ndims:
+                is_on_ndims_macro = True
+                # if it is, swap state (3d if now 2d, vice versa)
+                if state == NdimsType.IN_2D:
+                    state = NdimsType.IN_3D
+                elif state == NdimsType.IN_3D:
+                    state = NdimsType.IN_2D
+                else:
+                    print("else found but if not found beforehand")
+                    sys.exit()
+        elif "#endif" in line:
+            if if_level == if_level_ndims:
+                is_on_ndims_macro = True
+                state = NdimsType.OTHER
+            # found "endif", reduce nest counter
+            if_level -= 1
+        if not is_on_ndims_macro:
+            # we do not include macro about ndims
+            if ndims == 2 and state != NdimsType.IN_3D:
+                newlines.append(line)
+            if ndims == 3 and state != NdimsType.IN_2D:
+                newlines.append(line)
+    return newlines
+
+
+def modify_comments(lines):
+    """
+        there are weird comments which are used by Sphinx, which look like
+          // <comment> | <number of lines><cr>
+        I use this function to modify this kind of stuffs as
+          // <comment><cr>
+    """
+    delim = " | "
+    newlines = list()
+    for line in lines:
+        if "//" in line and delim in line:
+            line = line.split(delim)[0] + "\n"
+        newlines.append(line)
+    return newlines
+
+
+def adjust_blank_lines(lines):
+    """
+        this function merges two (and more) successive blank lines
+        into one blank
+    """
+    nitems = len(lines)
+    flags = [True for _ in range(nitems)]
+    for n in range(1, nitems):
+        # check two neighbouring lines
+        l0 = lines[n - 1]
+        l1 = lines[n]
+        if "\n" == l0 and "\n" == l1:
+            flags[n] = False
+    newlines = list()
+    for line, flag in zip(lines, flags):
+        if flag:
+            newlines.append(line)
+    return newlines
+
+
+def main():
+    argv = sys.argv
+    # sanitise input
+    assert 2 == len(argv)
+    ndims = int(argv[1])
+    # input source files
+    fnames = list()
+    fnames += get_filenames("src")
+    fnames += get_filenames("include")
+    for fname in fnames:
+        with open(fname, "r") as f:
+            lines = f.readlines()
+        lines = extract_given_dim(ndims, lines)
+        lines = modify_comments(lines)
+        lines = adjust_blank_lines(lines)
+        if 0 == len(lines):
+            # nothing remains, delete file
+            os.system(f"rm {fname}")
+            continue
+        # dump
+        lines = "".join(lines)
+        with open(fname, "w") as f:
+            f.write(lines)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..0b313fe
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,8 @@
+[submodule "SimpleDecomp"]
+	path = SimpleDecomp
+	url = https://github.com/NaokiHori/SimpleDecomp
+	branch = submodule
+[submodule "SimpleNpyIO"]
+	path = SimpleNpyIO
+	url = https://github.com/NaokiHori/SimpleNpyIO
+	branch = submodule
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d17d187
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 NaokiHori
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..cf1b5c0
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,53 @@
+CC     := mpicc
+# the CI workflow rewrites the NDIMS definition below with sed
+# (.github/workflows/ci.yml), so keep it spelled without spaces
+CFLAG  := -std=c99 -Wall -Wextra -Werror -O3 -DNDIMS=2
+INC    := -Iinclude -ISimpleDecomp/include -ISimpleNpyIO/include
+LIB    := -lfftw3 -lm
+SRCDIR := src SimpleDecomp/src SimpleNpyIO/src
+OBJDIR := obj
+# the pattern is quoted so that the shell hands it to find untouched
+# instead of expanding it against .c files in the current directory
+SRCS   := $(shell find $(SRCDIR) -type f -name '*.c')
+OBJS   := $(patsubst %.c,$(OBJDIR)/%.o,$(SRCS))
+DEPS   := $(patsubst %.c,$(OBJDIR)/%.d,$(SRCS))
+OUTDIR := output
+TARGET := a.out
+
+# first target in the file, i.e. what a bare "make" runs
+help:
+	@echo "all     : create \"$(TARGET)\""
+	@echo "clean   : remove \"$(TARGET)\" and object files under \"$(OBJDIR)\""
+	@echo "output  : create \"$(OUTDIR)\" to store output"
+	@echo "datadel : clean-up \"$(OUTDIR)\""
+	@echo "help    : show this message"
+
+all: $(TARGET)
+
+# link every object file into the executable
+$(TARGET): $(OBJS)
+	$(CC) $(CFLAG) -o $@ $^ $(LIB)
+
+# -MMD makes the compiler write a .d dependency file beside each object
+$(OBJDIR)/%.o: %.c
+	@mkdir -p $(dir $@)
+	$(CC) $(CFLAG) -MMD $(INC) -c $< -o $@
+
+clean:
+	$(RM) -r $(OBJDIR) $(TARGET)
+
+# mkdir -p succeeds silently when a directory already exists
+output:
+	@mkdir -p $(OUTDIR)/log $(OUTDIR)/save $(OUTDIR)/stat
+
+# empty the output directories but keep the directories themselves
+datadel:
+	$(RM) -r $(OUTDIR)/log/*
+	$(RM) -r $(OUTDIR)/save/*
+	$(RM) -r $(OUTDIR)/stat/*
+
+# read the generated dependency files; "-" silences errors for missing ones
+-include $(DEPS)
+
+.PHONY : all clean output datadel help
+
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..9b9bbba
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,74 @@
+#########################
+Simple Bubbly Flow Solver
+#########################
+
+|License|_ |LastCommit|_ |CI| |DOCS|_
+
+.. |License| image:: https://img.shields.io/github/license/NaokiHori/SimpleBubblyFlowSolver
+.. _License: https://opensource.org/licenses/MIT
+
+.. |LastCommit| image:: https://img.shields.io/github/last-commit/NaokiHori/SimpleBubblyFlowSolver/main
+.. _LastCommit: https://github.com/NaokiHori/SimpleBubblyFlowSolver/commits/main
+
+.. |CI| image:: https://github.com/NaokiHori/SimpleBubblyFlowSolver/actions/workflows/ci.yml/badge.svg
+
+.. |DOCS| image:: https://github.com/NaokiHori/SimpleBubblyFlowSolver/actions/workflows/documentation.yml/badge.svg
+.. _DOCS: https://naokihori.github.io/SimpleBubblyFlowSolver
+
+.. image:: https://github.com/NaokiHori/SimpleBubblyFlowSolver/blob/main/docs/source/thumbnail.jpg
+   :target: https://youtu.be/Xr14Kw61ByA
+   :width: 100%
+
+********
+Overview
+********
+
+This library numerically solves the motion of two-liquid mixtures separated by free surfaces using finite-difference and volume-of-fluid methods.
+
+Specifically, its aim is to simulate air-water flows characterised by significant contrasts in density and viscosity at high Reynolds numbers.
+
+***********
+Quick start
+***********
+
+Fetch source
+
+.. code-block:: console
+
+   git clone --recurse-submodules https://github.com/NaokiHori/SimpleBubblyFlowSolver
+   cd SimpleBubblyFlowSolver
+
+Initialise flow fields (needs ``Python3`` with ``NumPy``)
+
+.. code-block:: console
+
+   cd initial_condition
+   make output
+   sh main.sh
+   cd ..
+
+Build and run
+
+.. code-block:: console
+
+   make output
+   make all
+   sh exec/main.sh
+
+This simulates the motion of a 2D rising bubble:
+
+.. image:: https://github.com/NaokiHori/SimpleBubblyFlowSolver/blob/main/docs/source/sample.jpg
+
+*************
+Documentation
+*************
+
+The governing equations, the numerical methods employed, and the discretisations are briefly discussed `here <https://naokihori.github.io/SimpleBubblyFlowSolver/index.html>`_.
+
+**********
+3D version
+**********
+
+Check out the ``3d`` branch.
+You need to initialise the flow fields yourself.
+
diff --git a/SimpleDecomp b/SimpleDecomp
new file mode 160000
index 0000000..7e10fd0
--- /dev/null
+++ b/SimpleDecomp
@@ -0,0 +1 @@
+Subproject commit 7e10fd0cbfc592e6ceaf04bf91107b27c1de1b0d
diff --git a/SimpleNpyIO b/SimpleNpyIO
new file mode 160000
index 0000000..0338413
--- /dev/null
+++ b/SimpleNpyIO
@@ -0,0 +1 @@
+Subproject commit 03384133f7fefd10b702ae0d659f9c49311fd4d8
diff --git a/docs/source/_static/alabaster.css_t b/docs/source/_static/alabaster.css_t
new file mode 100644
index 0000000..a6a57d1
--- /dev/null
+++ b/docs/source/_static/alabaster.css_t
@@ -0,0 +1,776 @@
+{%- set theme_body_bg = theme_body_bg or theme_base_bg %}
+{%- set theme_code_highlight_bg = theme_code_highlight_bg or theme_body_bg %}
+{%- set theme_sidebar_header = theme_sidebar_header or theme_gray_1 %}
+{%- set theme_sidebar_link = theme_sidebar_link or theme_gray_1 %}
+{%- set theme_anchor_hover_fg = theme_anchor_hover_fg or theme_gray_1 %}
+
+{%- set theme_footnote_border = theme_footnote_border or theme_gray_2 %}
+{%- set theme_pre_bg = theme_pre_bg or theme_gray_2 %}
+
+{%- set theme_head_font_family = theme_head_font_family or theme_font_family %}
+
+{#- set up admonition styling #}
+{#- - basic level #}
+{%- set theme_admonition_xref_bg = theme_admonition_xref_bg or theme_xref_bg %}
+{%- set theme_admonition_bg = theme_admonition_bg or theme_gray_2 %}
+{%- set theme_note_bg = theme_note_bg or theme_gray_2 %}
+{%- set theme_seealso_bg = theme_seealso_bg or theme_gray_2 %}
+
+{#- - critical level #}
+{%- set theme_danger_bg = theme_danger_bg or theme_pink_1 %}
+{%- set theme_danger_border = theme_danger_border or theme_pink_2 %}
+{%- set theme_danger_shadow = theme_danger_shadow or theme_pink_3 %}
+
+{%- set theme_error_bg = theme_error_bg or theme_pink_1 %}
+{%- set theme_error_border = theme_error_border or theme_pink_2 %}
+{%- set theme_error_shadow = theme_error_shadow or theme_pink_3 %}
+
+{#- - warning level #}
+{%- set theme_caution_bg = theme_caution_bg or theme_pink_1 %}
+{%- set theme_caution_border = theme_caution_border or theme_pink_2 %}
+
+{%- set theme_attention_bg = theme_attention_bg or theme_pink_1 %}
+{%- set theme_attention_border = theme_attention_border or theme_pink_2 %}
+
+{%- set theme_warn_bg = theme_warn_bg or theme_pink_1 %}
+{%- set theme_warn_border = theme_warn_border or theme_pink_2 %}
+
+{#- - normal level #}
+{%- set theme_important_bg = theme_important_bg or theme_gray_2 %}
+{%- set theme_tip_bg = theme_tip_bg or theme_gray_2 %}
+{%- set theme_hint_bg = theme_hint_bg or theme_gray_2 %}
+
+{#- /set up admonition styling #}
+
+{%- set theme_shadow = theme_shadow or theme_gray_2 %}
+
+
+{%- set theme_topic_bg = theme_topic_bg or theme_gray_2 %}
+
+{%- set theme_narrow_sidebar_link = theme_narrow_sidebar_link or theme_gray_3 %}
+{%- set theme_sidebar_hr = theme_sidebar_hr or theme_gray_3 %}
+
+{%- set theme_relbar_border = theme_relbar_border or theme_gray_2 -%}
+
+
+@import url("basic.css");
+
+/* -- page layout ----------------------------------------------------------- */
+
+body {
+    font-family: {{ theme_font_family }};
+    font-size: {{ theme_font_size }};
+    background-color: {{ theme_base_bg }};
+    color: {{ theme_base_text }};
+    margin: 0;
+    padding: 0;
+}
+
+
+div.document {
+    width: {{ theme_page_width }};
+    margin: 30px auto 0 auto;
+}
+
+div.documentwrapper {
+    float: left;
+    width: 100%;
+}
+
+div.bodywrapper {
+    margin: 0 0 0 {{ theme_sidebar_width }};
+}
+
+div.sphinxsidebar {
+    width: {{ theme_sidebar_width }};
+    font-size: 14px;
+    line-height: 1.5;
+}
+
+hr {
+    border: 1px solid {{ theme_hr_border }};
+}
+
+div.body {
+    background-color: {{ theme_body_bg  }};
+    color: {{ theme_body_text }};
+    padding: 0 30px 0 30px;
+}
+
+div.body > .section {
+    text-align: {{ theme_body_text_align }};
+}
+
+div.footer {
+    width: {{ theme_page_width }};
+    margin: 20px auto 30px auto;
+    font-size: 14px;
+    color: {{ theme_footer_text }};
+    text-align: right;
+}
+
+div.footer a {
+    color: {{ theme_footer_text }};
+}
+
+p.caption {
+    font-family: {{ theme_caption_font_family }};
+    font-size: {{ theme_caption_font_size }};
+}
+
+{% if theme_show_related|lower == 'false' %}
+div.relations {
+    display: none;
+}
+{% endif %}
+
+div.sphinxsidebar a {
+    color: {{ theme_sidebar_link }};
+    text-decoration: none;
+    border-bottom: 1px dotted {{ theme_sidebar_link_underscore }};
+}
+
+div.sphinxsidebar a:hover {
+    border-bottom: 1px solid {{ theme_sidebar_link_underscore }};
+}
+
+div.sphinxsidebarwrapper {
+    padding: 18px 10px;
+}
+
+div.sphinxsidebarwrapper p.logo {
+    padding: 0;
+    margin: -10px 0 0 0px;
+    text-align: center;
+}
+
+div.sphinxsidebarwrapper h1.logo {
+    margin-top: -10px;
+    text-align: center;
+    margin-bottom: 5px;
+    text-align: {{ theme_logo_text_align }};
+}
+
+div.sphinxsidebarwrapper h1.logo-name {
+    margin-top: 0px;
+}
+
+div.sphinxsidebarwrapper p.blurb {
+    margin-top: 0;
+    font-style: {{ theme_description_font_style }};
+}
+
+div.sphinxsidebar h3,
+div.sphinxsidebar h4 {
+    font-family: {{ theme_head_font_family }};
+    color: {{ theme_sidebar_header }};
+    font-size: 24px;
+    font-weight: normal;
+    margin: 0 0 5px 0;
+    padding: 0;
+}
+
+div.sphinxsidebar h4 {
+    font-size: 20px;
+}
+
+div.sphinxsidebar h3 a {
+    color: {{ theme_sidebar_link }};
+}
+
+div.sphinxsidebar p.logo a,
+div.sphinxsidebar h3 a,
+div.sphinxsidebar p.logo a:hover,
+div.sphinxsidebar h3 a:hover {
+    border: none;
+}
+
+div.sphinxsidebar p {
+    color: {{ theme_sidebar_text }};
+    margin: 10px 0;
+}
+
+div.sphinxsidebar ul {
+    margin: 10px 0;
+    padding: 0;
+    color: {{ theme_sidebar_list }};
+}
+
+div.sphinxsidebar ul li.toctree-l1 > a {
+    font-size: 120%;
+}
+
+div.sphinxsidebar ul li.toctree-l2 > a {
+    font-size: 110%;
+}
+
+div.sphinxsidebar input {
+    border: 1px solid {{ theme_sidebar_search_button }};
+    font-family: {{ theme_font_family }};
+    font-size: 1em;
+}
+
+div.sphinxsidebar hr {
+    border: none;
+    height: 1px;
+    color: {{ theme_sidebar_hr }};
+    background: {{ theme_sidebar_hr }};
+
+    text-align: left;
+    margin-left: 0;
+    width: 50%;
+}
+
+div.sphinxsidebar .badge {
+    border-bottom: none;
+}
+
+div.sphinxsidebar .badge:hover {
+    border-bottom: none;
+}
+
+/* To address an issue with donation coming after search */
+div.sphinxsidebar h3.donation {
+    margin-top: 10px;
+}
+
+/* -- body styles ----------------------------------------------------------- */
+
+a {
+    color: {{ theme_link }};
+    text-decoration: underline;
+}
+
+a:visited {
+    color: {{ theme_link }};
+    text-decoration: underline;
+}
+
+a:hover {
+    color: {{ theme_link_hover }};
+    text-decoration: underline;
+}
+
+div.body h1,
+div.body h2,
+div.body h3,
+div.body h4,
+div.body h5,
+div.body h6 {
+    font-family: {{ theme_head_font_family }};
+    font-weight: normal;
+    margin: 30px 0px 10px 0px;
+    padding: 0;
+}
+
+div.body h1 { margin-top: 0; padding-top: 0; font-size: 240%; }
+div.body h2 { font-size: 180%; }
+div.body h3 { font-size: 150%; }
+div.body h4 { font-size: 130%; }
+div.body h5 { font-size: 100%; }
+div.body h6 { font-size: 100%; }
+
+a.headerlink {
+    color: {{ theme_anchor }};
+    padding: 0 4px;
+    text-decoration: none;
+}
+
+a.headerlink:hover {
+    color: {{ theme_anchor_hover_fg }};
+    background: {{ theme_anchor_hover_bg }};
+}
+
+div.body p, div.body dd, div.body li {
+    line-height: 1.4em;
+}
+
+div.admonition {
+    margin: 20px 0px;
+    padding: 10px 30px;
+    background-color: {{ theme_admonition_bg }};
+    border: 1px solid {{ theme_admonition_border }};
+}
+
+div.admonition tt.xref, div.admonition code.xref, div.admonition a tt {
+    background-color: {{ theme_admonition_xref_bg }};
+    border-bottom: 1px solid {{ theme_admonition_xref_border }};
+}
+
+div.admonition p.admonition-title {
+    font-family: {{ theme_head_font_family }};
+    font-weight: normal;
+    font-size: 24px;
+    margin: 0 0 10px 0;
+    padding: 0;
+    line-height: 1;
+}
+
+div.admonition p.last {
+    margin-bottom: 0;
+}
+
+div.highlight {
+    background-color: {{ theme_code_highlight_bg }};
+}
+
+dt:target, .highlight {
+    background: {{ theme_highlight_bg }};
+}
+
+div.warning {
+    background-color: {{ theme_warn_bg }};
+    border: 1px solid {{ theme_warn_border }};
+}
+
+div.danger {
+    background-color: {{ theme_danger_bg }};
+    border: 1px solid {{ theme_danger_border }};
+    -moz-box-shadow: 2px 2px 4px {{ theme_danger_shadow }};
+    -webkit-box-shadow: 2px 2px 4px {{ theme_danger_shadow }};
+    box-shadow: 2px 2px 4px {{ theme_danger_shadow }};
+}
+
+div.error {
+    background-color: {{ theme_error_bg }};
+    border: 1px solid {{ theme_error_border }};
+    -moz-box-shadow: 2px 2px 4px {{ theme_error_shadow }};
+    -webkit-box-shadow: 2px 2px 4px {{ theme_error_shadow }};
+    box-shadow: 2px 2px 4px {{ theme_error_shadow }};
+}
+
+div.caution {
+    background-color: {{ theme_caution_bg }};
+    border: 1px solid {{ theme_caution_border }};
+}
+
+div.attention {
+    background-color: {{ theme_attention_bg }};
+    border: 1px solid {{ theme_attention_border }};
+}
+
+div.important {
+    background-color: {{ theme_important_bg }};
+    border: 1px solid {{ theme_important_border }};
+}
+
+div.note {
+    background-color: {{ theme_note_bg }};
+    border: 1px solid {{ theme_note_border }};
+}
+
+div.tip {
+    background-color: {{ theme_tip_bg }};
+    border: 1px solid {{ theme_tip_border }};
+}
+
+div.hint {
+    background-color: {{ theme_hint_bg }};
+    border: 1px solid {{ theme_hint_border }};
+}
+
+div.seealso {
+    background-color: {{ theme_seealso_bg }};
+    border: 1px solid {{ theme_seealso_border }};
+}
+
+div.topic {
+    background-color: {{ theme_topic_bg }};
+}
+
+p.admonition-title {
+    display: inline;
+}
+
+p.admonition-title:after {
+    content: ":";
+}
+
+pre, tt, code {
+    font-family: {{theme_code_font_family}};
+    font-size: {{ theme_code_font_size }};
+}
+
+.hll {
+    background-color: {{theme_code_highlight}};
+    margin: 0 -12px;
+    padding: 0 12px;
+    display: block;
+}
+
+img.screenshot {
+}
+
+tt.descname, tt.descclassname, code.descname, code.descclassname {
+    font-size: 0.95em;
+}
+
+tt.descname, code.descname {
+    padding-right: 0.08em;
+}
+
+img.screenshot {
+    -moz-box-shadow: 2px 2px 4px {{ theme_shadow }};
+    -webkit-box-shadow: 2px 2px 4px {{ theme_shadow }};
+    box-shadow: 2px 2px 4px {{ theme_shadow }};
+}
+
+table.docutils {
+    border: 1px solid {{ theme_table_border }};
+    -moz-box-shadow: 2px 2px 4px {{ theme_shadow }};
+    -webkit-box-shadow: 2px 2px 4px {{ theme_shadow }};
+    box-shadow: 2px 2px 4px {{ theme_shadow }};
+}
+
+table.docutils td, table.docutils th {
+    border: 1px solid {{ theme_table_border }};
+    padding: 0.25em 0.7em;
+}
+
+table.field-list, table.footnote {
+    border: none;
+    -moz-box-shadow: none;
+    -webkit-box-shadow: none;
+    box-shadow: none;
+}
+
+table.footnote {
+    margin: 15px 0;
+    width: 100%;
+    border: 1px solid {{ theme_footnote_border }};
+    background: {{ theme_footnote_bg }};
+    font-size: 0.9em;
+}
+
+table.footnote + table.footnote {
+    margin-top: -15px;
+    border-top: none;
+}
+
+table.field-list th {
+    padding: 0 0.8em 0 0;
+}
+
+table.field-list td {
+    padding: 0;
+}
+
+table.field-list p {
+    margin-bottom: 0.8em;
+}
+
+/* Cloned from
+ * https://github.com/sphinx-doc/sphinx/commit/ef60dbfce09286b20b7385333d63a60321784e68
+ */
+.field-name {
+    -moz-hyphens: manual;
+    -ms-hyphens: manual;
+    -webkit-hyphens: manual;
+    hyphens: manual;
+}
+
+table.footnote td.label {
+    width: .1px;
+    padding: 0.3em 0 0.3em 0.5em;
+}
+
+table.footnote td {
+    padding: 0.3em 0.5em;
+}
+
+dl {
+    margin-left: 0;
+    margin-right: 0;
+    margin-top: 0;
+    padding: 0;
+}
+
+dl dd {
+    margin-left: 30px;
+}
+
+blockquote {
+    margin: 0 0 0 30px;
+    padding: 0;
+}
+
+ul, ol {
+    /* Matches the 30px from the narrow-screen "li > ul" selector below */
+    margin: 10px 0 10px 30px;
+    padding: 0;
+}
+
+pre {
+    background: {{ theme_pre_bg }};
+    padding: 7px 30px;
+    margin: 15px 0px;
+    line-height: 1.3em;
+}
+
+div.viewcode-block:target {
+    background: {{ theme_viewcode_target_bg }};
+}
+
+dl pre, blockquote pre, li pre {
+    margin-left: 0;
+    padding-left: 30px;
+}
+
+tt, code {
+    background-color: {{ theme_code_bg }};
+    color: {{ theme_code_text }};
+    /* padding: 1px 2px; */
+}
+
+tt.xref, code.xref, a tt {
+    background-color: {{ theme_xref_bg }};
+    border-bottom: 1px solid {{ theme_xref_border }};
+}
+
+a.reference {
+    text-decoration: none;
+    border-bottom: 1px dotted {{ theme_link }};
+}
+
+/* Don't put an underline on images */
+a.image-reference, a.image-reference:hover {
+    border-bottom: none;
+}
+
+a.reference:hover {
+    border-bottom: 1px solid {{ theme_link_hover }};
+}
+
+a.footnote-reference {
+    text-decoration: none;
+    font-size: 0.7em;
+    vertical-align: top;
+    border-bottom: 1px dotted {{ theme_link }};
+}
+
+a.footnote-reference:hover {
+    border-bottom: 1px solid {{ theme_link_hover }};
+}
+
+a:hover tt, a:hover code {
+    background: {{ theme_code_hover }};
+}
+
+
+@media screen and (max-width: 870px) {
+
+    div.sphinxsidebar {
+    	display: none;
+    }
+
+    div.document {
+       width: 100%;
+
+    }
+
+    div.documentwrapper {
+    	margin-left: 0;
+    	margin-top: 0;
+    	margin-right: 0;
+    	margin-bottom: 0;
+    }
+
+    div.bodywrapper {
+    	margin-top: 0;
+    	margin-right: 0;
+    	margin-bottom: 0;
+    	margin-left: 0;
+    }
+
+    ul {
+    	margin-left: 0;
+    }
+
+	li > ul {
+        /* Matches the 30px from the "ul, ol" selector above */
+		margin-left: 30px;
+	}
+
+    .document {
+    	width: auto;
+    }
+
+    .footer {
+    	width: auto;
+    }
+
+    .bodywrapper {
+    	margin: 0;
+    }
+
+    .footer {
+    	width: auto;
+    }
+
+    .github {
+        display: none;
+    }
+
+
+
+}
+
+
+
+@media screen and (max-width: 875px) {
+
+    body {
+        margin: 0;
+        padding: 20px 30px;
+    }
+
+    div.documentwrapper {
+        float: none;
+        background: {{ theme_base_bg }};
+    }
+
+    div.sphinxsidebar {
+        display: block;
+        float: none;
+        width: 102.5%;
+        {%- if theme_fixed_sidebar|lower == 'true' %}
+        margin: -20px -30px 20px -30px;
+        {%- else %}
+        margin: 50px -30px -20px -30px;
+        {%- endif %}
+        padding: 10px 20px;
+        background: {{ theme_narrow_sidebar_bg }};
+        color: {{ theme_narrow_sidebar_fg }};
+    }
+
+    div.sphinxsidebar h3, div.sphinxsidebar h4, div.sphinxsidebar p,
+    div.sphinxsidebar h3 a {
+        color: {{ theme_base_bg }};
+    }
+
+    div.sphinxsidebar a {
+        color: {{ theme_narrow_sidebar_link }};
+    }
+
+    div.sphinxsidebar p.logo {
+        display: none;
+    }
+
+    div.document {
+        width: 100%;
+        margin: 0;
+    }
+
+    div.footer {
+        display: none;
+    }
+
+    div.bodywrapper {
+        margin: 0;
+    }
+
+    div.body {
+        min-height: 0;
+        padding: 0;
+    }
+
+    .rtd_doc_footer {
+        display: none;
+    }
+
+    .document {
+        width: auto;
+    }
+
+    .footer {
+        width: auto;
+    }
+
+    .footer {
+        width: auto;
+    }
+
+    .github {
+        display: none;
+    }
+}
+
+{%- if theme_fixed_sidebar|lower == 'true' %}
+@media screen and (min-width: 876px) {
+    div.sphinxsidebar {
+        position: fixed;
+        margin-left: 0;
+    }
+}
+{%- endif %}
+
+
+/* misc. */
+
+.revsys-inline {
+    display: none!important;
+}
+
+/* Make nested-list/multi-paragraph items look better in Releases changelog
+ * pages. Without this, the way docutils renders lists causes inconsistent
+ * formatting between different release sub-lists.
+ */
+div#changelog > div.section > ul > li > p:only-child {
+    margin-bottom: 0;
+}
+
+/* Hide unsightly table cell borders in ..bibliography:: directive output */
+table.docutils.citation, table.docutils.citation td, table.docutils.citation th {
+  border: none;
+  /* Below needed in some edge cases; if not applied, bottom shadows appear */
+  -moz-box-shadow: none;
+  -webkit-box-shadow: none;
+  box-shadow: none;
+}
+
+
+/* relbar */
+
+.related {
+    line-height: 30px;
+    width: 100%;
+    font-size: 0.9rem;
+}
+
+.related.top {
+    border-bottom: 1px solid {{ theme_relbar_border }};
+    margin-bottom: 20px;
+}
+
+.related.bottom {
+    border-top: 1px solid {{ theme_relbar_border }};
+}
+
+.related ul {
+    padding: 0;
+    margin: 0;
+    list-style: none;
+}
+
+.related li {
+    display: inline;
+}
+
+nav#rellinks {
+    float: right;
+}
+
+nav#rellinks li+li:before {
+    content: "|";
+}
+
+nav#breadcrumbs li+li:before {
+    content: "\00BB";
+}
+
+/* Hide certain items when printing */
+@media print {
+    div.related {
+        display: none;
+    }
+}
diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css
new file mode 100644
index 0000000..9f14e12
--- /dev/null
+++ b/docs/source/_static/custom.css
@@ -0,0 +1,8 @@
+div.body {
+  max-width: none; /* no upper limit on the width of the body */
+}
+/* scroll wide equations; NOTE(review): class name looks MathJax-v2, conf.py imports mathjax3_config - confirm it matches */
+div.MathJax_Display {
+  overflow-x: scroll;
+  overflow-y: hidden;
+}
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..b9c9b75
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,23 @@
+import os
+import sys
+
+sys.path.append(os.path.abspath("./ext"))
+sys.path.append(os.path.abspath("./conf_params"))
+
+project = "Simple Bubbly Flow Solver"
+author = "Naoki Hori"
+copyright = f"2024, {author}"
+
+extensions = [
+    "myliteralinclude",
+]
+
+from alabaster_params import html_theme
+from alabaster_params import html_static_path
+from alabaster_params import html_theme_options
+
+from mathjax_params import mathjax_path
+from mathjax_params import mathjax3_config
+
+pygments_style = "pstyle.MyAlabaster"
+
diff --git a/docs/source/conf_params/alabaster_params.py b/docs/source/conf_params/alabaster_params.py
new file mode 100644
index 0000000..23da676
--- /dev/null
+++ b/docs/source/conf_params/alabaster_params.py
@@ -0,0 +1,60 @@
+html_theme = "alabaster"  # name of the Sphinx HTML theme
+html_static_path = ["_static"]  # directory that holds custom.css (docs/source/_static)
+html_theme_options = {  # options for the theme; flags are written as the strings "true" / "false"
+        "description": "NS Solver for bubbly flows",
+        "fixed_sidebar": "false",
+        "github_banner": "false",
+        "github_button": "true",
+        "github_count": "true",
+        "github_repo": "SimpleBubblyFlowSolver",
+        "github_type": "star",
+        "github_user": "NaokiHori",
+        "navigation_with_keys": "true",
+        "nosidebar": "false",
+        "page_width": "95vw",  # widths are given in viewport units
+        "show_powered_by": "true",
+        "show_related": "false",
+        "show_relbars": "false",
+        "sidebar_collapse": "true",
+        "sidebar_includehidden": "false",
+        "sidebar_width": "25vw",
+        "gray_1": "#bbb",  # colour overrides start here
+        "gray_2": "#111",
+        "gray_3": "#555",
+        "pink_1": "#033",
+        "pink_2": "#055",
+        "pink_3": "#2ad3d3",
+        "base_bg": "#000",  # black background with white base text, i.e. a dark colour scheme
+        "base_text": "#fff",
+        "hr_border": "#4e4b49",
+        "body_text": "#c1bcb6",
+        "footer_text": "#777",
+        "link": "#ffb494",
+        "link_hover": "#92beff",
+        "sidebar_text": "#aaa",
+        "sidebar_link_underscore": "#666",
+        "sidebar_search_button": "#333",
+        "sidebar_list": "#fff",
+        "anchor": "#222",
+        "anchor_hover_bg": "#151515",
+        "table_border": "#777",
+        "admonition_border": "#333",
+        "note_border": "#333",
+        "seealso_border": "#333",
+        "tip_border": "#333",
+        "hint_border": "#333",
+        "important_border": "#333",
+        "highlight_bg": "#050c17",
+        "xref_border": "#000",
+        "xref_bg": "#040404",
+        "admonition_xref_border": "#050505",
+        "footnote_bg": "#020202",
+        "narrow_sidebar_bg": "#ccc",
+        "narrow_sidebar_fg": "#000",
+        "viewcode_target_bg": "#002",
+        "code_bg": "#130f0c",  # code-block colours
+        "code_text": "#ddd",
+        "code_hover": "#111",
+        "code_highlight": "#003",
+}
+
diff --git a/docs/source/conf_params/mathjax_macro/discrete.py b/docs/source/conf_params/mathjax_macro/discrete.py
new file mode 100644
index 0000000..c68e1cd
--- /dev/null
+++ b/docs/source/conf_params/mathjax_macro/discrete.py
@@ -0,0 +1,44 @@
+def add(macros):  # register the discrete-equation MathJax macros into the dict `macros` (mutated in place, returns None)
+    # number of grid points in each direction
+    macros["nx"] = "{N_{\\vx}}"
+    macros["ny"] = "{N_{\\vy}}"
+    macros["nz"] = "{N_{\\vz}}"
+    # summations over cell faces (f) and cell centers (c); x faces run up to nx + 1/2, y and z faces stop at n - 1/2
+    macros["sumxf"] = "\\sum_{i = \\frac{1}{2}}^{\\nx + \\frac{1}{2}}"
+    macros["sumxc"] = "\\sum_{i = 1}^{\\nx}"
+    macros["sumyf"] = "\\sum_{j = \\frac{1}{2}}^{\\ny - \\frac{1}{2}}"
+    macros["sumyc"] = "\\sum_{j = 1}^{\\ny}"
+    macros["sumzf"] = "\\sum_{k = \\frac{1}{2}}^{\\nz - \\frac{1}{2}}"
+    macros["sumzc"] = "\\sum_{k = 1}^{\\nz}"
+    # discrete momentum balance terms; each entry is [LaTeX body, number of arguments]
+    macros["dmomadv"] = [
+            "-"
+            "\\frac{1}{J}"
+            "\\dif{"
+            "  \\left("
+            "    \\ave{"
+            "      \\frac{J}{h_{\\gcs^{#2}}}"
+            "      \\rho u_{#2}"
+            "    }{\\gcs^{#1}}"
+            "    \\ave{u_{#1}}{\\gcs^{#2}}"
+            "  \\right)"
+            "}{\\gcs^{#2}}"
+            , 2
+    ]
+    macros["dmompre"] = [
+            "-"
+            "\\frac{1}{h_{\\gcs^{#1}}}"  # scale factor and coordinate use the superscript form, matching dmomadv / dmomdif
+            "\\dif{p}{\\gcs^{#1}}"
+            , 1
+    ]
+    macros["dmomdif"] = [
+            "+"
+            "\\frac{1}{J}"
+            "\\dif{}{\\gcs^{#2}}"
+            "\\left("
+            "   \\frac{J}{h_{\\gcs^{#2}}}"
+            "   \\tau_{#1 #2}"
+            "\\right)"
+            , 2
+    ]
+
diff --git a/docs/source/conf_params/mathjax_macro/energy.py b/docs/source/conf_params/mathjax_macro/energy.py
new file mode 100644
index 0000000..6223f6b
--- /dev/null
+++ b/docs/source/conf_params/mathjax_macro/energy.py
@@ -0,0 +1,69 @@
+def add(macros):  # register the "ene" MathJax macro into the dict `macros` (mutated in place, returns None)
+    # continuous energy equation for k_{#1}; the entry is [LaTeX body, number of arguments]
+    macros["ene"] = [
+        "\\pder{k_{#1}}{t}"
+        "="
+        "-"
+        "\\frac{1}{J}"
+        "\\pder{}{\\gx}"
+        "\\left("
+        "   \\jhx"
+        "   \\ux k_{#1}"
+        "\\right)"
+        "-"
+        "\\frac{1}{J}"
+        "\\pder{}{\\gy}"
+        "\\left("
+        "   \\jhy"
+        "   \\uy k_{#1}"
+        "\\right)"
+        "-"
+        "\\frac{1}{J}"
+        "\\pder{}{\\gz}"
+        "\\left("
+        "   \\jhz"
+        "   \\uz k_{#1}"
+        "\\right)"
+        "-"
+        "u_{#1}"
+        "\\frac{1}{h_{\\gcs^{#1}}}"  # superscript form, consistent with \hx / \gx used by the other terms
+        "\\pder{p}{\\gcs^{#1}}"
+        "+"
+        "\\frac{1}{J}"
+        "\\pder{}{\\gx}"
+        "\\left("
+        "   \\jhx"
+        "   u_{#1}"
+        "   \\tau_{#1 \\vx}"
+        "\\right)"
+        "+"
+        "\\frac{1}{J}"
+        "\\pder{}{\\gy}"
+        "\\left("
+        "   \\jhy"
+        "   u_{#1}"
+        "   \\tau_{#1 \\vy}"
+        "\\right)"
+        "+"
+        "\\frac{1}{J}"
+        "\\pder{}{\\gz}"
+        "\\left("
+        "   \\jhz"
+        "   u_{#1}"
+        "   \\tau_{#1 \\vz}"
+        "\\right)"
+        "-"
+        "\\frac{1}{\\hx}"
+        "\\pder{u_{#1}}{\\gx}"
+        "\\tau_{#1 \\vx}"
+        "-"
+        "\\frac{1}{\\hy}"
+        "\\pder{u_{#1}}{\\gy}"
+        "\\tau_{#1 \\vy}"
+        "-"
+        "\\frac{1}{\\hz}"
+        "\\pder{u_{#1}}{\\gz}"
+        "\\tau_{#1 \\vz}"
+        "+"
+        "f_{#1} u_{#1}"
+    , 1]
diff --git a/docs/source/conf_params/mathjax_macro/general.py b/docs/source/conf_params/mathjax_macro/general.py
new file mode 100644
index 0000000..c17025e
--- /dev/null
+++ b/docs/source/conf_params/mathjax_macro/general.py
@@ -0,0 +1,33 @@
+def add(macros):  # register the basic notation macros into the dict `macros` (mutated in place)
+    # coordinate labels
+    macros["vx"] = "{x}"
+    macros["vy"] = "{y}"
+    macros["vz"] = "{z}"
+    # symbol denoting a general coordinate (\xi), and its x / y / z components written with superscripts
+    macros["gcs"] = "{\\xi}"
+    macros["gx"] = "{\\gcs^{\\vx}}"
+    macros["gy"] = "{\\gcs^{\\vy}}"
+    macros["gz"] = "{\\gcs^{\\vz}}"
+    # velocity components
+    macros["ux"] = "{u_{\\vx}}"
+    macros["uy"] = "{u_{\\vy}}"
+    macros["uz"] = "{u_{\\vz}}"
+    # quadratic quantities k_x, k_y, k_z
+    macros["kx"] = "{k_{\\vx}}"
+    macros["ky"] = "{k_{\\vy}}"
+    macros["kz"] = "{k_{\\vz}}"
+    # scale factors, subscripted by the general coordinate of each direction
+    macros["hx"] = "{h_{\\gx}}"
+    macros["hy"] = "{h_{\\gy}}"
+    macros["hz"] = "{h_{\\gz}}"
+    # Jacobian determinant J divided by each scale factor
+    macros["jhx"] = "{\\frac{J}{\\hx}}"
+    macros["jhy"] = "{\\frac{J}{\\hy}}"
+    macros["jhz"] = "{\\frac{J}{\\hz}}"
+    # differentiations written as fractions (partial and delta); [LaTeX body, number of arguments]
+    macros["pder"] = ["{\\frac{\\partial #1}{\\partial #2}}", 2]
+    macros["dder"] = ["{\\frac{\\delta #1}{\\delta #2}}", 2]
+    # discrete operators, two arguments each: overline average, delta difference, evaluation at a location
+    macros["ave"] = ["{\\overline{#1}^{#2}}", 2]
+    macros["dif"] = ["{\\delta_{#2} {#1}}", 2]
+    macros["vat"] = ["{\\left. {#1} \\right|_{#2}}", 2]
diff --git a/docs/source/conf_params/mathjax_macro/momentum.py b/docs/source/conf_params/mathjax_macro/momentum.py
new file mode 100644
index 0000000..f1d6930
--- /dev/null
+++ b/docs/source/conf_params/mathjax_macro/momentum.py
@@ -0,0 +1,53 @@
+def add(macros):  # register the "mom" MathJax macro into the dict `macros` (mutated in place, returns None)
+    # continuous momentum equation for rho u_{#1}; the entry is [LaTeX body, number of arguments]
+    macros["mom"] = [
+        "\\pder{\\rho u_{#1}}{t}"
+        "="
+        "-"
+        "\\frac{1}{J}"
+        "\\pder{}{\\gx}"
+        "\\left("
+        "   \\jhx"
+        "   \\rho \\ux u_{#1}"
+        "\\right)"
+        "-"
+        "\\frac{1}{J}"
+        "\\pder{}{\\gy}"
+        "\\left("
+        "   \\jhy"
+        "   \\rho \\uy u_{#1}"
+        "\\right)"
+        "-"
+        "\\frac{1}{J}"
+        "\\pder{}{\\gz}"
+        "\\left("
+        "   \\jhz"
+        "   \\rho \\uz u_{#1}"
+        "\\right)"
+        "-"
+        "\\frac{1}{h_{\\gcs^{#1}}}"  # superscript form, consistent with \jhx / \gx used by the other terms
+        "\\pder{p}{\\gcs^{#1}}"
+        "+"
+        "\\frac{1}{J}"
+        "\\pder{}{\\gx}"
+        "\\left("
+        "   \\jhx"
+        "   \\tau_{#1 \\vx}"
+        "\\right)"
+        "+"
+        "\\frac{1}{J}"
+        "\\pder{}{\\gy}"
+        "\\left("
+        "   \\jhy"
+        "   \\tau_{#1 \\vy}"
+        "\\right)"
+        "+"
+        "\\frac{1}{J}"
+        "\\pder{}{\\gz}"
+        "\\left("
+        "   \\jhz"
+        "   \\tau_{#1 \\vz}"
+        "\\right)"
+        "+"
+        "f_{#1}"
+    , 1]
diff --git a/docs/source/conf_params/mathjax_params.py b/docs/source/conf_params/mathjax_params.py
new file mode 100644
index 0000000..abdafbe
--- /dev/null
+++ b/docs/source/conf_params/mathjax_params.py
@@ -0,0 +1,15 @@
+from mathjax_macro.general import add as add_general
+from mathjax_macro.momentum import add as add_momentum
+from mathjax_macro.energy import add as add_energy
+from mathjax_macro.discrete import add as add_discrete
+
+mathjax_path = "https://cdn.jsdelivr.net/npm/mathjax@2/MathJax.js?config=TeX-AMS-MML_HTMLorMML"  # loads MathJax 2 from the jsDelivr CDN
+
+macros = dict()  # macro name -> LaTeX body, or [LaTeX body, number of arguments]
+
+add_general(macros)  # each add_* call fills `macros` in place
+add_momentum(macros)
+add_energy(macros)
+add_discrete(macros)
+
+mathjax3_config = {"TeX": {"Macros": macros}}  # NOTE(review): the path above is MathJax 2 and these keys look like the MathJax 2 layout, yet the name says mathjax3 - confirm
diff --git a/docs/source/conf_params/pstyle.py b/docs/source/conf_params/pstyle.py
new file mode 100644
index 0000000..0bd4d02
--- /dev/null
+++ b/docs/source/conf_params/pstyle.py
@@ -0,0 +1,85 @@
+# ref: alabaster/support.py
+
+from pygments.style import Style
+from pygments.token import (
+    Keyword,
+    Name,
+    Comment,
+    String,
+    Error,
+    Number,
+    Operator,
+    Generic,
+    Whitespace,
+    Punctuation,
+    Other,
+    Literal,
+)
+
+# colors flipped (presumably relative to the Alabaster style referenced at the top of this file - verify)
+
+class MyAlabaster(Style):  # Pygments style selected in conf.py via pygments_style = "pstyle.MyAlabaster"
+    background_color = "#070707"  # near-black background
+    default_style = ""
+    styles = {  # maps Pygments token types to style strings (optional bold / italic / border plus a hex colour)
+        Error: "#5BFFFF border:#10D6D6",
+        Other: "#FFFFFF",
+        Comment: "italic #70A6FD",
+        Comment.Preproc: "noitalic",
+        Keyword: "bold #FFBB9E",
+        Keyword.Constant: "bold #FFBB9E",
+        Keyword.Declaration: "bold #FFBB9E",
+        Keyword.Namespace: "bold #FFBB9E",
+        Keyword.Pseudo: "bold #FFBB9E",
+        Keyword.Reserved: "bold #FFBB9E",
+        Keyword.Type: "bold #FFBB9E",
+        Operator: "#A7D7FF",
+        Operator.Word: "bold #FFBB9E",
+        Punctuation: "bold #FFFFFF",
+        Name: "#FFFFFF",
+        Name.Attribute: "#3B5FFF",
+        Name.Builtin: "#FFBB9E",
+        Name.Builtin.Pseudo: "#CB9A5B",
+        Name.Class: "#FFFFFF",
+        Name.Constant: "#FFFFFF",
+        Name.Decorator: "#888",
+        Name.Entity: "#31A3FF",
+        Name.Exception: "bold #33FFFF",
+        Name.Function: "#FFFFFF",
+        Name.Property: "#FFFFFF",
+        Name.Label: "#0A86FF",
+        Name.Namespace: "#FFFFFF",
+        Name.Other: "#FFFFFF",
+        Name.Tag: "bold #FFBB9E",
+        Name.Variable: "#FFFFFF",
+        Name.Variable.Class: "#FFFFFF",
+        Name.Variable.Global: "#FFFFFF",
+        Name.Variable.Instance: "#FFFFFF",
+        Number: "#66FFFF",
+        Literal: "#FFFFFF",
+        Literal.Date: "#FFFFFF",
+        String: "#B165F9",  # every String sub-type shares this colour except String.Doc
+        String.Backtick: "#B165F9",
+        String.Char: "#B165F9",
+        String.Doc: "italic #70A6FD",  # same style string as Comment
+        String.Double: "#B165F9",
+        String.Escape: "#B165F9",
+        String.Heredoc: "#B165F9",
+        String.Interpol: "#B165F9",
+        String.Other: "#B165F9",
+        String.Regex: "#B165F9",
+        String.Single: "#B165F9",
+        String.Symbol: "#B165F9",
+        Generic: "#FFFFFF",
+        Generic.Deleted: "#5BFFFF",
+        Generic.Emph: "italic #FFFFFF",
+        Generic.Error: "#10D6D6",
+        Generic.Heading: "bold #FFFF7F",
+        Generic.Inserted: "#FF5FFF",
+        Generic.Output: "#888",
+        Generic.Prompt: "#8BACCB",
+        Generic.Strong: "bold #FFFFFF",
+        Generic.Subheading: "bold #7FFF7F",
+        Generic.Traceback: "bold #5BFFFF",
+    }
+
diff --git a/docs/source/discretisation/main.rst b/docs/source/discretisation/main.rst
new file mode 100644
index 0000000..705c546
--- /dev/null
+++ b/docs/source/discretisation/main.rst
@@ -0,0 +1,12 @@
+##############
+Discretisation
+##############
+
+The equations are discretised as follows in this project.
+
+.. toctree::
+   :maxdepth: 1
+
+   spatial/main
+   temporal/main
+
diff --git a/docs/source/discretisation/spatial/energy/advective.rst b/docs/source/discretisation/spatial/energy/advective.rst
new file mode 100644
index 0000000..478fb6f
--- /dev/null
+++ b/docs/source/discretisation/spatial/energy/advective.rst
@@ -0,0 +1,140 @@
+I consider the effects of
+
+.. math::
+
+   \pder{\rho u_i}{t}
+   +
+   \frac{1}{J}
+   \dif{
+      \left(
+         \ave{
+            \frac{J}{h_{\gcs^j}}
+            \rho u_j
+         }{\gcs^i}
+         \ave{
+            u_i
+         }{\gcs^j}
+      \right)
+   }{\gcs^j}
+
+on the global kinetic energy balance; namely:
+
+.. math::
+
+   \sum_{i\,\text{face}}
+   J
+   u_i
+   \left\{
+      \pder{\rho u_i}{t}
+      +
+      \frac{1}{J}
+      \dif{
+         \left(
+            \ave{
+               \frac{J}{h_{\gcs^j}}
+               \rho u_j
+            }{\gcs^i}
+            \ave{
+               u_i
+            }{\gcs^j}
+         \right)
+      }{\gcs^j}
+   \right\}.
+
+Contribution of the first term leads to
+
+.. math::
+
+   \sum_{i\,\text{face}}
+   J
+   u_i
+   \pder{\rho u_i}{t}
+   =
+   \sum_{i\,\text{face}}
+   J
+   \pder{\rho u_i u_i}{t}
+   -
+   \sum_{i\,\text{face}}
+   J
+   \rho u_i
+   \pder{u_i}{t},
+
+while the second term gives
+
+.. math::
+
+   \sum_{i\,\text{face}}
+   J
+   u_i
+   \frac{1}{J}
+   \dif{
+      \left(
+         \ave{
+            \frac{J}{h_{\gcs^j}}
+            \rho u_j
+         }{\gcs^i}
+         \ave{
+            u_i
+         }{\gcs^j}
+      \right)
+   }{\gcs^j}
+   &
+   =
+   -
+   \sum_{i\,\text{center}}
+   \ave{
+      u_i
+   }{\gcs^j}
+   \ave{
+      \frac{J}{h_{\gcs^j}}
+      \rho u_j
+   }{\gcs^i}
+   \dif{
+      u_i
+   }{\gcs^j} \\
+   &
+   =
+   -
+   \sum_{i\,\text{face}}
+   J
+   u_i
+   \frac{1}{J}
+   \ave{
+      \ave{
+         \frac{J}{h_{\gcs^j}}
+         \rho u_j
+      }{\gcs^i}
+      \dif{
+         u_i
+      }{\gcs^j}
+   }{\gcs^j}.
+
+As a result, I obtain
+
+.. math::
+
+   \sum_{i\,\text{face}}
+   J
+   \pder{\rho u_i u_i}{t}
+   -
+   \sum_{i\,\text{face}}
+   J
+   u_i
+   \left(
+      \rho
+      \pder{u_i}{t}
+      +
+      \frac{1}{J}
+      \ave{
+         \ave{
+            \frac{J}{h_{\gcs^j}}
+            \rho u_j
+         }{\gcs^i}
+         \dif{
+            u_i
+         }{\gcs^j}
+      }{\gcs^j}
+   \right).
+
+What I have inside the parentheses of the second term is the so-called gradient form of the advective terms.
+
diff --git a/docs/source/discretisation/spatial/energy/diffusive.rst b/docs/source/discretisation/spatial/energy/diffusive.rst
new file mode 100644
index 0000000..8412f90
--- /dev/null
+++ b/docs/source/discretisation/spatial/energy/diffusive.rst
@@ -0,0 +1,242 @@
+In general, the diffusive terms are written as
+
+.. math::
+
+   \frac{1}{J}
+   \dif{}{\gcs^j}
+   \left(
+      \frac{J}{h_{\gcs^j}}
+      \tau_{i j}
+   \right),
+
+which is the diffusion of the :math:`i`-th momentum in the :math:`j`-th direction.
+
+The global contribution
+
+.. math::
+
+   \sum_{i\,\text{face}}
+   J
+   u_i
+   \frac{1}{J}
+   \dif{}{\gcs^j}
+   \left(
+      \frac{J}{h_{\gcs^j}}
+      \tau_{i j}
+   \right)
+   =
+   \sum_{i\,\text{face}}
+   u_i
+   \dif{}{\gcs^j}
+   \left(
+      \frac{J}{h_{\gcs^j}}
+      \tau_{i j}
+   \right)
+
+is as follows.
+
+:math:`x` momentum contribution:
+
+.. math::
+
+   &
+   \sumzc
+   \sumyc
+   \sumxf
+   \ux
+   \dif{}{\gx}
+   \left(
+      \jhx
+      \tau_{\vx \vx}
+   \right)
+   =
+   -
+   \sumzc
+   \sumyc
+   \sumxc
+   J
+   l_{\vx \vx}
+   \tau_{\vx \vx}
+
+   &
+   \sumzc
+   \sumyc
+   \sumxf
+   \ux
+   \dif{}{\gy}
+   \left(
+      \jhy
+      \tau_{\vx \vy}
+   \right)
+   =
+   -
+   \sumzc
+   \sumyf
+   \sumxf
+   J
+   l_{\vx \vy}
+   \tau_{\vx \vy}
+
+   &
+   \sumzc
+   \sumyc
+   \sumxf
+   \ux
+   \dif{}{\gz}
+   \left(
+      \jhz
+      \tau_{\vx \vz}
+   \right)
+   =
+   -
+   \sumzf
+   \sumyc
+   \sumxf
+   J
+   l_{\vx \vz}
+   \tau_{\vx \vz}
+
+:math:`y` momentum contribution:
+
+.. math::
+
+   &
+   \sumzc
+   \sumyf
+   \sumxc
+   \uy
+   \dif{}{\gx}
+   \left(
+      \jhx
+      \tau_{\vy \vx}
+   \right)
+   =
+   -
+   \sumzc
+   \sumyf
+   \vat{
+      \left(
+         \uy
+         \jhx
+         \tau_{\vy \vx}
+      \right)
+   }{\frac{1}{2}}
+   +
+   \sumzc
+   \sumyf
+   \vat{
+      \left(
+         \uy
+         \jhx
+         \tau_{\vy \vx}
+      \right)
+   }{\nx + \frac{1}{2}}
+   -
+   \sumzc
+   \sumyf
+   \sumxf
+   J
+   l_{\vy \vx}
+   \tau_{\vy \vx}
+
+   &
+   \sumzc
+   \sumyf
+   \sumxc
+   \uy
+   \dif{}{\gy}
+   \left(
+      \jhy
+      \tau_{\vy \vy}
+   \right)
+   =
+   -
+   \sumzc
+   \sumyc
+   \sumxc
+   J
+   l_{\vy \vy}
+   \tau_{\vy \vy}
+
+   &
+   \sumzc
+   \sumyf
+   \sumxc
+   \uy
+   \dif{}{\gz}
+   \left(
+      \jhz
+      \tau_{\vy \vz}
+   \right)
+   =
+   -
+   \sumzf
+   \sumyf
+   \sumxc
+   J
+   l_{\vy \vz}
+   \tau_{\vy \vz}
+
+:math:`z` momentum contribution:
+
+.. math::
+
+   &
+   \sumzf
+   \sumyc
+   \sumxc
+   \uz
+   \dif{}{\gx}
+   \left(
+      \jhx
+      \tau_{\vz \vx}
+   \right)
+   =
+   -
+   \sumzf
+   \sumyc
+   \sumxf
+   J
+   l_{\vz \vx}
+   \tau_{\vz \vx}
+
+   &
+   \sumzf
+   \sumyc
+   \sumxc
+   \uz
+   \dif{}{\gy}
+   \left(
+      \jhy
+      \tau_{\vz \vy}
+   \right)
+   =
+   -
+   \sumzf
+   \sumyf
+   \sumxc
+   J
+   l_{\vz \vy}
+   \tau_{\vz \vy}
+
+   &
+   \sumzf
+   \sumyc
+   \sumxc
+   \uz
+   \dif{}{\gz}
+   \left(
+      \jhz
+      \tau_{\vz \vz}
+   \right)
+   =
+   -
+   \sumzc
+   \sumyc
+   \sumxc
+   J
+   l_{\vz \vz}
+   \tau_{\vz \vz}
+
+All terms are dissipative, except for the two boundary terms in the :math:`y` contribution, which represent the energy throughput on the walls.
+
diff --git a/docs/source/discretisation/spatial/energy/main.rst b/docs/source/discretisation/spatial/energy/main.rst
new file mode 100644
index 0000000..5b40b6d
--- /dev/null
+++ b/docs/source/discretisation/spatial/energy/main.rst
@@ -0,0 +1,24 @@
+##############
+Energy balance
+##############
+
+To examine the global, discrete energy balance, :ref:`the momentum balance <discrete_momentum>` for each direction is multiplied by the corresponding local velocity and then integrated over the whole volume.
+
+********************************
+Temporal evolution and advection
+********************************
+
+.. include:: ./advective.rst
+
+*****************
+Pressure gradient
+*****************
+
+.. include:: ./pressure.rst
+
+*********
+Diffusion
+*********
+
+.. include:: ./diffusive.rst
+
diff --git a/docs/source/discretisation/spatial/energy/pressure.rst b/docs/source/discretisation/spatial/energy/pressure.rst
new file mode 100644
index 0000000..437f9fb
--- /dev/null
+++ b/docs/source/discretisation/spatial/energy/pressure.rst
@@ -0,0 +1,119 @@
+The pressure-gradient terms
+
+.. math::
+
+   -
+   \frac{1}{h_{\gcs^i}}
+   \dif{p}{\gcs^i}
+
+contribute to the energy balance as follows:
+
+.. math::
+
+   -
+   \sumzc
+   \sumyc
+   \sumxf
+   J
+   \ux
+   \frac{1}{\hx}
+   \dif{p}{\gx}
+   =
+   \sumzc
+   \sumyc
+   \sumxc
+   J
+   p
+   \frac{1}{J}
+   \dif{
+      \left(
+         \jhx
+         \ux
+      \right)
+   }{\gx}
+
+.. math::
+
+   -
+   \sumzc
+   \sumyf
+   \sumxc
+   J
+   \uy
+   \frac{1}{\hy}
+   \dif{p}{\gy}
+   =
+   \sumzc
+   \sumyc
+   \sumxc
+   J
+   p
+   \frac{1}{J}
+   \dif{
+      \left(
+         \jhy
+         \uy
+      \right)
+   }{\gy}
+
+.. math::
+
+   -
+   \sumzf
+   \sumyc
+   \sumxc
+   J
+   \uz
+   \frac{1}{\hz}
+   \dif{p}{\gz}
+   =
+   \sumzc
+   \sumyc
+   \sumxc
+   J
+   p
+   \frac{1}{J}
+   \dif{
+      \left(
+         \jhz
+         \uz
+      \right)
+   }{\gz}
+
+The sum is
+
+.. math::
+
+   \sumzc
+   \sumyc
+   \sumxc
+   J
+   p
+   \left\{
+      \frac{1}{J}
+      \dif{
+         \left(
+            \jhx
+            \ux
+         \right)
+      }{\gx}
+      +
+      \frac{1}{J}
+      \dif{
+         \left(
+            \jhy
+            \uy
+         \right)
+      }{\gy}
+      +
+      \frac{1}{J}
+      \dif{
+         \left(
+            \jhz
+            \uz
+         \right)
+      }{\gz}
+   \right\},
+
+which is zero because the expression inside the curly braces is the left-hand side of :ref:`the incompressibility constraint <discrete_incompressibility>`.
+
diff --git a/docs/source/discretisation/spatial/incompressibility.rst b/docs/source/discretisation/spatial/incompressibility.rst
new file mode 100644
index 0000000..92f2c81
--- /dev/null
+++ b/docs/source/discretisation/spatial/incompressibility.rst
@@ -0,0 +1,42 @@
+
+.. _discrete_incompressibility:
+
+############################
+Incompressibility constraint
+############################
+
+The incompressibility constraint is defined at each cell center:
+
+.. math::
+
+   \frac{1}{J}
+   \dif{}{\gx}
+   \left(
+      \jhx
+      \ux
+   \right)
+   +
+   \frac{1}{J}
+   \dif{}{\gy}
+   \left(
+      \jhy
+      \uy
+   \right)
+   +
+   \frac{1}{J}
+   \dif{}{\gz}
+   \left(
+      \jhz
+      \uz
+   \right)
+   =
+   0.
+
+.. myliteralinclude:: /../../src/logging/divergence.c
+   :language: c
+   :tag: compute local divergence
+
+.. myliteralinclude:: /../../src/fluid/compute_potential.c
+   :language: c
+   :tag: compute local divergence
+
diff --git a/docs/source/discretisation/spatial/main.rst b/docs/source/discretisation/spatial/main.rst
new file mode 100644
index 0000000..15ad2b8
--- /dev/null
+++ b/docs/source/discretisation/spatial/main.rst
@@ -0,0 +1,16 @@
+######################
+Spatial discretisation
+######################
+
+In this project, grids are equidistantly positioned in all directions, as there is no reason to use stretched grids, even in the wall-normal direction.
+
+.. toctree::
+   :maxdepth: 1
+
+   symbol/main
+   shear_stress_tensor
+   incompressibility
+   mass
+   momentum/main
+   energy/main
+
diff --git a/docs/source/discretisation/spatial/mass.rst b/docs/source/discretisation/spatial/mass.rst
new file mode 100644
index 0000000..aa22fc8
--- /dev/null
+++ b/docs/source/discretisation/spatial/mass.rst
@@ -0,0 +1,77 @@
+
+.. _discrete_mass:
+
+#################
+Mass conservation
+#################
+
+The mass conservation is defined at each cell center and described as:
+
+.. math::
+
+   \pder{\rho}{t}
+   +
+   \frac{1}{J}
+   \dif{}{\gx}
+   \left(
+      \jhx
+      \rho
+      \ux
+   \right)
+   +
+   \frac{1}{J}
+   \dif{}{\gy}
+   \left(
+      \jhy
+      \rho
+      \uy
+   \right)
+   +
+   \frac{1}{J}
+   \dif{}{\gz}
+   \left(
+      \jhz
+      \rho
+      \uz
+   \right)
+   =
+   0.
+
+Specifically the rescaled version
+
+.. math::
+
+   \pder{H}{t}
+   +
+   \frac{1}{J}
+   \dif{}{\gx}
+   \left(
+      \jhx
+      \ux
+      H
+   \right)
+   +
+   \frac{1}{J}
+   \dif{}{\gy}
+   \left(
+      \jhy
+      \uy
+      H
+   \right)
+   +
+   \frac{1}{J}
+   \dif{}{\gz}
+   \left(
+      \jhz
+      \uz
+      H
+   \right)
+   =
+   0
+
+is considered.
+
+.. myliteralinclude:: /../../src/interface/update/main.c
+   :language: c
+   :tag: compute source of volume-of-fluid
+
diff --git a/docs/source/discretisation/spatial/momentum/main.rst b/docs/source/discretisation/spatial/momentum/main.rst
new file mode 100644
index 0000000..0f45b01
--- /dev/null
+++ b/docs/source/discretisation/spatial/momentum/main.rst
@@ -0,0 +1,18 @@
+
+.. _discrete_momentum:
+
+################
+Momentum balance
+################
+
+The wall-normal, stream-wise, and span-wise momentum equations are defined at the :math:`x`, :math:`y`, and :math:`z` cell faces, respectively.
+
+Note that the body force terms are omitted as adding their contributions is straightforward.
+
+.. toctree::
+   :maxdepth: 1
+
+   x
+   y
+   z
+
diff --git a/docs/source/discretisation/spatial/momentum/x.rst b/docs/source/discretisation/spatial/momentum/x.rst
new file mode 100644
index 0000000..9cae457
--- /dev/null
+++ b/docs/source/discretisation/spatial/momentum/x.rst
@@ -0,0 +1,89 @@
+#####################
+Wall-normal direction
+#####################
+
+.. math::
+
+   \pder{\rho \ux}{t}
+   =
+   &
+   \dmomadv{\vx}{\vx}
+   \dmomadv{\vx}{\vy}
+   \dmomadv{\vx}{\vz}
+
+   &
+   \dmompre{\vx}
+
+   &
+   \dmomdif{\vx}{\vx}
+   \dmomdif{\vx}{\vy}
+   \dmomdif{\vx}{\vz}
+
+*********
+Advection
+*********
+
+.. math::
+
+   \dmomadv{\vx}{\vx}
+
+.. myliteralinclude:: /../../src/fluid/predict/ux.c
+   :language: c
+   :tag: ux is advected in x
+
+.. math::
+
+   \dmomadv{\vx}{\vy}
+
+.. myliteralinclude:: /../../src/fluid/predict/ux.c
+   :language: c
+   :tag: ux is advected in y
+
+.. math::
+
+   \dmomadv{\vx}{\vz}
+
+.. myliteralinclude:: /../../src/fluid/predict/ux.c
+   :language: c
+   :tag: ux is advected in z
+
+*****************
+Pressure-gradient
+*****************
+
+.. math::
+
+   \dmompre{\vx}
+
+.. myliteralinclude:: /../../src/fluid/predict/ux.c
+   :language: c
+   :tag: pressure-gradient contribution
+
+*********
+Diffusion
+*********
+
+.. math::
+
+   \dmomdif{\vx}{\vx}
+
+.. myliteralinclude:: /../../src/fluid/predict/ux.c
+   :language: c
+   :tag: ux is diffused in x
+
+.. math::
+
+   \dmomdif{\vx}{\vy}
+
+.. myliteralinclude:: /../../src/fluid/predict/ux.c
+   :language: c
+   :tag: ux is diffused in y
+
+.. math::
+
+   \dmomdif{\vx}{\vz}
+
+.. myliteralinclude:: /../../src/fluid/predict/ux.c
+   :language: c
+   :tag: ux is diffused in z
+
diff --git a/docs/source/discretisation/spatial/momentum/y.rst b/docs/source/discretisation/spatial/momentum/y.rst
new file mode 100644
index 0000000..51ab163
--- /dev/null
+++ b/docs/source/discretisation/spatial/momentum/y.rst
@@ -0,0 +1,89 @@
+#####################
+Stream-wise direction
+#####################
+
+.. math::
+
+   \pder{\rho \uy}{t}
+   =
+   &
+   \dmomadv{\vy}{\vx}
+   \dmomadv{\vy}{\vy}
+   \dmomadv{\vy}{\vz}
+
+   &
+   \dmompre{\vy}
+
+   &
+   \dmomdif{\vy}{\vx}
+   \dmomdif{\vy}{\vy}
+   \dmomdif{\vy}{\vz}
+
+*********
+Advection
+*********
+
+.. math::
+
+   \dmomadv{\vy}{\vx}
+
+.. myliteralinclude:: /../../src/fluid/predict/uy.c
+   :language: c
+   :tag: uy is advected in x
+
+.. math::
+
+   \dmomadv{\vy}{\vy}
+
+.. myliteralinclude:: /../../src/fluid/predict/uy.c
+   :language: c
+   :tag: uy is advected in y
+
+.. math::
+
+   \dmomadv{\vy}{\vz}
+
+.. myliteralinclude:: /../../src/fluid/predict/uy.c
+   :language: c
+   :tag: uy is advected in z
+
+*****************
+Pressure-gradient
+*****************
+
+.. math::
+
+   \dmompre{\vy}
+
+.. myliteralinclude:: /../../src/fluid/predict/uy.c
+   :language: c
+   :tag: pressure-gradient contribution
+
+*********
+Diffusion
+*********
+
+.. math::
+
+   \dmomdif{\vy}{\vx}
+
+.. myliteralinclude:: /../../src/fluid/predict/uy.c
+   :language: c
+   :tag: uy is diffused in x
+
+.. math::
+
+   \dmomdif{\vy}{\vy}
+
+.. myliteralinclude:: /../../src/fluid/predict/uy.c
+   :language: c
+   :tag: uy is diffused in y
+
+.. math::
+
+   \dmomdif{\vy}{\vz}
+
+.. myliteralinclude:: /../../src/fluid/predict/uy.c
+   :language: c
+   :tag: uy is diffused in z
+
diff --git a/docs/source/discretisation/spatial/momentum/z.rst b/docs/source/discretisation/spatial/momentum/z.rst
new file mode 100644
index 0000000..f4592af
--- /dev/null
+++ b/docs/source/discretisation/spatial/momentum/z.rst
@@ -0,0 +1,89 @@
+###################
+Span-wise direction
+###################
+
+.. math::
+
+   \pder{\rho \uz}{t}
+   =
+   &
+   \dmomadv{\vz}{\vx}
+   \dmomadv{\vz}{\vy}
+   \dmomadv{\vz}{\vz}
+
+   &
+   \dmompre{\vz}
+
+   &
+   \dmomdif{\vz}{\vx}
+   \dmomdif{\vz}{\vy}
+   \dmomdif{\vz}{\vz}
+
+*********
+Advection
+*********
+
+.. math::
+
+   \dmomadv{\vz}{\vx}
+
+.. myliteralinclude:: /../../src/fluid/predict/uz.c
+   :language: c
+   :tag: uz is advected in x
+
+.. math::
+
+   \dmomadv{\vz}{\vy}
+
+.. myliteralinclude:: /../../src/fluid/predict/uz.c
+   :language: c
+   :tag: uz is advected in y
+
+.. math::
+
+   \dmomadv{\vz}{\vz}
+
+.. myliteralinclude:: /../../src/fluid/predict/uz.c
+   :language: c
+   :tag: uz is advected in z
+
+*****************
+Pressure-gradient
+*****************
+
+.. math::
+
+   \dmompre{\vz}
+
+.. myliteralinclude:: /../../src/fluid/predict/uz.c
+   :language: c
+   :tag: pressure-gradient contribution
+
+*********
+Diffusion
+*********
+
+.. math::
+
+   \dmomdif{\vz}{\vx}
+
+.. myliteralinclude:: /../../src/fluid/predict/uz.c
+   :language: c
+   :tag: uz is diffused in x
+
+.. math::
+
+   \dmomdif{\vz}{\vy}
+
+.. myliteralinclude:: /../../src/fluid/predict/uz.c
+   :language: c
+   :tag: uz is diffused in y
+
+.. math::
+
+   \dmomdif{\vz}{\vz}
+
+.. myliteralinclude:: /../../src/fluid/predict/uz.c
+   :language: c
+   :tag: uz is diffused in z
+
diff --git a/docs/source/discretisation/spatial/shear_stress_tensor.rst b/docs/source/discretisation/spatial/shear_stress_tensor.rst
new file mode 100644
index 0000000..b2309c0
--- /dev/null
+++ b/docs/source/discretisation/spatial/shear_stress_tensor.rst
@@ -0,0 +1,52 @@
+###################
+Shear-stress tensor
+###################
+
+********************
+:math:`xx` component
+********************
+
+.. myliteralinclude:: /../../src/fluid/predict/txx.c
+   :language: c
+   :tag: txx at cell center
+
+********************
+:math:`xy` component
+********************
+
+.. myliteralinclude:: /../../src/fluid/predict/txy.c
+   :language: c
+   :tag: txy at cell corner
+
+********************
+:math:`xz` component
+********************
+
+.. myliteralinclude:: /../../src/fluid/predict/txz.c
+   :language: c
+   :tag: txz at cell corner
+
+********************
+:math:`yy` component
+********************
+
+.. myliteralinclude:: /../../src/fluid/predict/tyy.c
+   :language: c
+   :tag: tyy at cell center
+
+********************
+:math:`yz` component
+********************
+
+.. myliteralinclude:: /../../src/fluid/predict/tyz.c
+   :language: c
+   :tag: tyz at cell corner
+
+********************
+:math:`zz` component
+********************
+
+.. myliteralinclude:: /../../src/fluid/predict/tzz.c
+   :language: c
+   :tag: tzz at cell center
+
diff --git a/docs/source/discretisation/spatial/symbol/main.rst b/docs/source/discretisation/spatial/symbol/main.rst
new file mode 100644
index 0000000..ca2d940
--- /dev/null
+++ b/docs/source/discretisation/spatial/symbol/main.rst
@@ -0,0 +1,72 @@
+######
+Symbol
+######
+
+Averages, differentiations, and summations are denoted as follows.
+
+.. toctree::
+   :maxdepth: 1
+
+   x
+   y
+   z
+
+**********************************************
+Discrete scale factor and Jacobian determinant
+**********************************************
+
+In Cartesian coordinate systems, scale factors are simply equal to the grid sizes:
+
+.. math::
+
+   \hx
+   &
+   \equiv
+   \Delta x,
+
+   \hy
+   &
+   \equiv
+   \Delta y,
+
+   \hz
+   &
+   \equiv
+   \Delta z.
+
+.. myliteralinclude:: /../../src/domain.c
+   :language: c
+   :tag: scale factor in x, defined at x cell faces
+
+.. myliteralinclude:: /../../src/domain.c
+   :language: c
+   :tag: scale factor in x, defined at x cell centers
+
+Note that the wall-normal grid sizes are halved on the walls:
+
+.. math::
+
+   \vat{\Delta x}{\frac{1}{2}}
+   =
+   \vat{\Delta x}{\nx + \frac{1}{2}}
+   =
+   \frac{1}{2} \Delta x_{\text{rest}}.
+
+The Jacobian determinants, which are also defined at cell centers and faces, are simply the product of the local scale factors:
+
+.. math::
+
+   J
+   \equiv
+   \hx
+   \hy
+   \hz.
+
+.. myliteralinclude:: /../../src/domain.c
+   :language: c
+   :tag: Jacobian determinant, defined at x cell faces
+
+.. myliteralinclude:: /../../src/domain.c
+   :language: c
+   :tag: Jacobian determinant, defined at x cell centers
+
diff --git a/docs/source/discretisation/spatial/symbol/x.rst b/docs/source/discretisation/spatial/symbol/x.rst
new file mode 100644
index 0000000..27eec76
--- /dev/null
+++ b/docs/source/discretisation/spatial/symbol/x.rst
@@ -0,0 +1,125 @@
+####################
+Wall-normal operator
+####################
+
+*******
+Average
+*******
+
+Average at wall-normal cell faces:
+
+.. math::
+
+   \vat{
+      \ave{
+         q
+      }{
+         \gx
+      }
+   }{
+      i + \frac{1}{2}
+   }
+   =
+   \left\{
+      \begin{alignedat}{2}
+         & \text{Negative wall:} & \vat{q}{\frac{1}{2}}, \\
+         & \text{Positive wall:} & \vat{q}{\nx + \frac{1}{2}}, \\
+         & \text{Otherwise:} & \frac{1}{2} \vat{q}{i} + \frac{1}{2} \vat{q}{i + 1}
+      \end{alignedat}
+   \right.
+
+Average at wall-normal cell centers:
+
+.. math::
+
+   \vat{
+      \ave{
+         q
+      }{
+         \gx
+      }
+   }{
+      i
+   }
+   =
+   \frac{1}{2} \vat{q}{i - \frac{1}{2}} + \frac{1}{2} \vat{q}{i + \frac{1}{2}}
+
+***************
+Differentiation
+***************
+
+Differentiation at wall-normal cell faces:
+
+.. math::
+
+   \vat{
+      \dif{
+         q
+      }{
+         \gx
+      }
+   }{
+      i + \frac{1}{2}
+   }
+   =
+   \left\{
+      \begin{alignedat}{2}
+         & \text{Negative wall:} & - \vat{q}{\frac{1}{2}} + \vat{q}{1}, \\
+         & \text{Positive wall:} & - \vat{q}{\nx}+ \vat{q}{\nx + \frac{1}{2}} , \\
+         & \text{Otherwise:} & - \vat{q}{i} + \vat{q}{i + 1}
+      \end{alignedat}
+   \right.
+
+Differentiation at wall-normal cell centers:
+
+.. math::
+
+   \vat{
+      \dif{
+         q
+      }{
+         \gx
+      }
+   }{
+      i
+   }
+   =
+   - \vat{q}{i - \frac{1}{2}}
+   + \vat{q}{i + \frac{1}{2}}
+
+*********
+Summation
+*********
+
+Summation of a quantity defined at wall-normal cell faces:
+
+.. math::
+
+   \sumxf q
+   \equiv
+   \vat{q}{\frac{1}{2}}
+   +
+   \vat{q}{\frac{3}{2}}
+   +
+   \cdots
+   +
+   \vat{q}{\nx - \frac{1}{2}}
+   +
+   \vat{q}{\nx + \frac{1}{2}}
+
+Summation of a quantity defined at wall-normal cell centers:
+
+.. math::
+
+   \sumxc q
+   \equiv
+   \vat{q}{1}
+   +
+   \vat{q}{2}
+   +
+   \cdots
+   +
+   \vat{q}{\nx - 1}
+   +
+   \vat{q}{\nx}
+
diff --git a/docs/source/discretisation/spatial/symbol/y.rst b/docs/source/discretisation/spatial/symbol/y.rst
new file mode 100644
index 0000000..6f7fae0
--- /dev/null
+++ b/docs/source/discretisation/spatial/symbol/y.rst
@@ -0,0 +1,118 @@
+####################
+Stream-wise operator
+####################
+
+*******
+Average
+*******
+
+Average at stream-wise cell faces:
+
+.. math::
+
+   \vat{
+      \ave{
+         q
+      }{
+         \gy
+      }
+   }{
+      j + \frac{1}{2}
+   }
+   =
+   \frac{1}{2} \vat{q}{j} + \frac{1}{2} \vat{q}{j + 1}
+
+Average at stream-wise cell centers:
+
+.. math::
+
+   \vat{
+      \ave{
+         q
+      }{
+         \gy
+      }
+   }{
+      j
+   }
+   =
+   \frac{1}{2} \vat{q}{j - \frac{1}{2}} + \frac{1}{2} \vat{q}{j + \frac{1}{2}}
+
+***************
+Differentiation
+***************
+
+Differentiation at stream-wise cell faces:
+
+.. math::
+
+   \vat{
+      \dif{
+         q
+      }{
+         \gy
+      }
+   }{
+      j + \frac{1}{2}
+   }
+   =
+   -
+   \vat{q}{j}
+   +
+   \vat{q}{j + 1}
+
+Differentiation at stream-wise cell centers:
+
+.. math::
+
+   \vat{
+      \dif{
+         q
+      }{
+         \gy
+      }
+   }{
+      j
+   }
+   =
+   -
+   \vat{q}{j - \frac{1}{2}}
+   +
+   \vat{q}{j + \frac{1}{2}}
+
+*********
+Summation
+*********
+
+Summation of a quantity defined at stream-wise cell faces:
+
+.. math::
+
+   \sumyf q
+   \equiv
+   \vat{q}{\frac{1}{2}}
+   +
+   \vat{q}{\frac{3}{2}}
+   +
+   \cdots
+   +
+   \vat{q}{\ny - \frac{3}{2}}
+   +
+   \vat{q}{\ny - \frac{1}{2}}
+
+Summation of a quantity defined at stream-wise cell centers:
+
+.. math::
+
+   \sumyc q
+   \equiv
+   \vat{q}{1}
+   +
+   \vat{q}{2}
+   +
+   \cdots
+   +
+   \vat{q}{\ny - 1}
+   +
+   \vat{q}{\ny}
+
diff --git a/docs/source/discretisation/spatial/symbol/z.rst b/docs/source/discretisation/spatial/symbol/z.rst
new file mode 100644
index 0000000..c3eb2a9
--- /dev/null
+++ b/docs/source/discretisation/spatial/symbol/z.rst
@@ -0,0 +1,118 @@
+##################
+Span-wise operator
+##################
+
+*******
+Average
+*******
+
+Average at span-wise cell faces:
+
+.. math::
+
+   \vat{
+      \ave{
+         q
+      }{
+         \gz
+      }
+   }{
+      k + \frac{1}{2}
+   }
+   =
+   \frac{1}{2} \vat{q}{k} + \frac{1}{2} \vat{q}{k + 1}
+
+Average at span-wise cell centers:
+
+.. math::
+
+   \vat{
+      \ave{
+         q
+      }{
+         \gz
+      }
+   }{
+      k
+   }
+   =
+   \frac{1}{2} \vat{q}{k - \frac{1}{2}} + \frac{1}{2} \vat{q}{k + \frac{1}{2}}
+
+***************
+Differentiation
+***************
+
+Differentiation at span-wise cell faces:
+
+.. math::
+
+   \vat{
+      \dif{
+         q
+      }{
+         \gz
+      }
+   }{
+      k + \frac{1}{2}
+   }
+   =
+   -
+   \vat{q}{k}
+   +
+   \vat{q}{k + 1}
+
+Differentiation at span-wise cell centers:
+
+.. math::
+
+   \vat{
+      \dif{
+         q
+      }{
+         \gz
+      }
+   }{
+      k
+   }
+   =
+   -
+   \vat{q}{k - \frac{1}{2}}
+   +
+   \vat{q}{k + \frac{1}{2}}
+
+*********
+Summation
+*********
+
+Summation of a quantity defined at span-wise cell faces:
+
+.. math::
+
+   \sumzf q
+   \equiv
+   \vat{q}{\frac{1}{2}}
+   +
+   \vat{q}{\frac{3}{2}}
+   +
+   \cdots
+   +
+   \vat{q}{\nz - \frac{3}{2}}
+   +
+   \vat{q}{\nz - \frac{1}{2}}
+
+Summation of a quantity defined at span-wise cell centers:
+
+.. math::
+
+   \sumzc q
+   \equiv
+   \vat{q}{1}
+   +
+   \vat{q}{2}
+   +
+   \cdots
+   +
+   \vat{q}{\nz - 1}
+   +
+   \vat{q}{\nz}
+
diff --git a/docs/source/discretisation/temporal/main.rst b/docs/source/discretisation/temporal/main.rst
new file mode 100644
index 0000000..aaf7264
--- /dev/null
+++ b/docs/source/discretisation/temporal/main.rst
@@ -0,0 +1,209 @@
+
+.. include:: /reference/reference.txt
+
+#######################
+Temporal discretisation
+#######################
+
+:ref:`The mass conservation <discrete_mass>` is solved first by means of the :ref:`THINC method <numerical_method>`, which is followed by integrating :ref:`the momentum balance <discrete_momentum>`.
+
+To enforce :ref:`the incompressibility constraint <discrete_incompressibility>`, the updated velocity field is corrected:
+
+.. math::
+
+   \newcommand{\bef}{n-\frac{1}{2}}
+   \newcommand{\aft}{n+\frac{1}{2}}
+   \frac{
+      u_i^{n+1}
+      -
+      u_i^*
+   }{\Delta t}
+   =
+   -
+   \frac{1}{\rho^{n+1}}
+   \frac{1}{h_{\gcs^i}}
+   \pder{\psi^{\aft}}{\gcs^i}
+
+where :math:`\psi` is a scalar potential obtained by solving the variable-coefficient Poisson equation:
+
+.. math::
+
+   \frac{1}{J}
+   \pder{}{\gcs^i}
+   \left(
+      \frac{J}{h_{\gcs^i}}
+      \frac{1}{\rho^{n+1}}
+      \frac{1}{h_{\gcs^i}}
+      \pder{\psi^{\aft}}{\gcs^i}
+   \right)
+   =
+   \frac{1}{\Delta t}
+   \frac{1}{J}
+   \pder{}{\gcs^i}
+   \left(
+      \frac{J}{h_{\gcs^i}}
+      u_i^*
+   \right),
+
+or discretely:
+
+.. math::
+
+   \frac{1}{J}
+   \dif{}{\gcs^i}
+   \left(
+      \frac{J}{h_{\gcs^i}}
+      \frac{1}{\rho^{n+1}}
+      \frac{1}{h_{\gcs^i}}
+      \dif{\psi^{\aft}}{\gcs^i}
+   \right)
+   =
+   \frac{1}{\Delta t}
+   \frac{1}{J}
+   \dif{}{\gcs^i}
+   \left(
+      \frac{J}{h_{\gcs^i}}
+      u_i^*
+   \right).
+
+Since this is a variable-coefficient Poisson equation, I adopt the approach proposed by |DODD2014| to utilise the orthogonal decomposition.
+With this approximation and :math:`\rho_{ref} \equiv \min \left( 1, \hat{\rho} \right)`, the Poisson equation is modified as
+
+.. math::
+
+   \frac{1}{J}
+   \dif{}{\gcs^i}
+   \left(
+      \frac{J}{h_{\gcs^i}}
+      \frac{1}{h_{\gcs^i}}
+      \dif{\psi^{\aft}}{\gcs^i}
+   \right)
+   =
+   \frac{1}{J}
+   \dif{}{\gcs^i}
+   \left\{
+      \frac{J}{h_{\gcs^i}}
+      \left(
+         \frac{\rho_{ref}}{\rho^{n+1}}
+         -
+         1
+      \right)
+      \frac{1}{h_{\gcs^i}}
+      \dif{\psi^{\bef}}{\gcs^i}
+   \right\}
+   +
+   \frac{\rho_{ref}}{\Delta t}
+   \frac{1}{J}
+   \dif{}{\gcs^i}
+   \left(
+      \frac{J}{h_{\gcs^i}}
+      u_i^*
+   \right).
+
+.. myliteralinclude:: /../../src/fluid/compute_potential.c
+   :language: c
+   :tag: additional contribution
+
+The velocity corrections are modified as
+
+.. math::
+
+   \newcommand{\old}[1]{
+      +
+      \left(
+         \frac{1}{\rho^{n+1}}
+         -
+         \frac{1}{\rho_{ref}}
+      \right)
+      \frac{1}{h_{\gcs^#1}}
+      \pder{\psi^{\bef}}{\gcs^#1}
+   }
+   \newcommand{\new}[1]{
+      -
+      \frac{1}{\rho_{ref}}
+      \frac{1}{h_{\gcs^#1}}
+      \pder{\psi^{\aft}}{\gcs^#1}
+   }
+   \frac{
+      u_i^{n+1}
+      -
+      u_i^*
+   }{\Delta t}
+   =
+   \new{i}
+   \old{i}.
+
+There are two contributions, which are from the new scalar potential and the old scalar potential.
+
+The contributions of the new scalar potential:
+
+.. math::
+
+   \new{\vx}
+
+.. myliteralinclude:: /../../src/fluid/correct_velocity/ux.c
+   :language: c
+   :tag: new scalar potential contribution
+
+.. math::
+
+   \new{\vy}
+
+.. myliteralinclude:: /../../src/fluid/correct_velocity/uy.c
+   :language: c
+   :tag: new scalar potential contribution
+
+.. math::
+
+   \new{\vz}
+
+.. myliteralinclude:: /../../src/fluid/correct_velocity/uz.c
+   :language: c
+   :tag: new scalar potential contribution
+
+The contributions of the old scalar potential:
+
+.. math::
+
+   \old{\vx}
+
+.. myliteralinclude:: /../../src/fluid/correct_velocity/ux.c
+   :language: c
+   :tag: old scalar potential contribution
+
+.. math::
+
+   \old{\vy}
+
+.. myliteralinclude:: /../../src/fluid/correct_velocity/uy.c
+   :language: c
+   :tag: old scalar potential contribution
+
+.. math::
+
+   \old{\vz}
+
+.. myliteralinclude:: /../../src/fluid/correct_velocity/uz.c
+   :language: c
+   :tag: old scalar potential contribution
+
+Since I treat the diffusive terms fully-explicitly, the scalar potential and the pressure are simply related by
+
+.. math::
+
+   p^{n+1}
+   -
+   p^n
+   =
+   \psi^{\aft}.
+
+.. myliteralinclude:: /../../src/fluid/update_pressure.c
+   :language: c
+   :tag: explicit contribution
+
+This is followed by updating :math:`\psi`:
+
+.. myliteralinclude:: /../../src/fluid/update_pressure.c
+   :language: c
+   :tag: update psi
+
diff --git a/docs/source/equation/incompressibility.rst b/docs/source/equation/incompressibility.rst
new file mode 100644
index 0000000..ab8f621
--- /dev/null
+++ b/docs/source/equation/incompressibility.rst
@@ -0,0 +1,22 @@
+.. math::
+
+   \frac{1}{J}
+   \pder{}{\gx}
+   \left(
+      \jhx \ux
+   \right)
+   +
+   \frac{1}{J}
+   \pder{}{\gy}
+   \left(
+      \jhy \uy
+   \right)
+   +
+   \frac{1}{J}
+   \pder{}{\gz}
+   \left(
+      \jhz \uz
+   \right)
+   =
+   0.
+
diff --git a/docs/source/equation/main.rst b/docs/source/equation/main.rst
new file mode 100644
index 0000000..967373c
--- /dev/null
+++ b/docs/source/equation/main.rst
@@ -0,0 +1,275 @@
+
+.. _equation:
+
+########
+Equation
+########
+
+******************
+Boundary condition
+******************
+
+Note that the following set of boundary conditions is assumed throughout the project.
+
+* The domain is wall-bounded in the :math:`x` direction, while periodic in the other directions.
+* The walls are no-slip and impermeable.
+* The walls may move in the :math:`y` direction with constant speeds over time, while there is no :math:`z` motion.
+
+******
+Metric
+******
+
+For simplicity and generality, the governing equations are written in a general rectilinear coordinate system :math:`\gcs^i` with normalised Cartesian components :math:`u_i`.
+:math:`h_{\gcs^i}` denote the scale factors, and their product is the Jacobian determinant :math:`J` due to the orthogonality.
+
+See `e.g. this <https://naokihori.github.io/OrthogonalNS/index.html>`_ for the derivations.
+
+**********************
+Non-dimensionalisation
+**********************
+
+To normalise the equations, I use the density :math:`\rho` and the dynamic viscosity :math:`\mu` of the primary liquid as reference values.
+Namely, the equations described below all assume that the density and the dynamic viscosity of the primary liquid are unity.
+In particular, through the phase indicator function :math:`H` (defined later), the local and the instantaneous density and dynamic viscosity are given by
+
+.. math::
+
+   \rho \left( x_i, t \right) & = 1 + \left( \hat{\rho} - 1 \right) H \left( x_i, t \right),
+
+   \frac{1}{\mu \left( x_i, t \right)} & = 1 + \left( \frac{1}{\hat{\mu}} - 1 \right) H \left( x_i, t \right).
+
+Additionally, by using the reference length / velocity scales, we have three non-dimensional numbers: :math:`Re, We, Fr`.
+
+************************
+Velocity-gradient tensor
+************************
+
+.. include:: velocity_gradient_tensor.rst
+
+*******************
+Shear-stress tensor
+*******************
+
+The shear-stress tensor for Newtonian liquids is defined as
+
+.. math::
+
+   \tau_{ij}
+   \equiv
+   2 \mu s_{ij},
+
+where :math:`s_{ij}` is the strain-rate tensor:
+
+.. math::
+
+   s_{ij}
+   \equiv
+   \frac{1}{2}
+   l_{ij}
+   +
+   \frac{1}{2}
+   l_{ji}.
+
+Thus
+
+.. math::
+
+   \tau_{ij}
+   =
+   \mu
+   l_{ij}
+   +
+   \mu
+   l_{ji}.
+
+****************************
+Incompressibility constraint
+****************************
+
+.. include:: incompressibility.rst
+
+*****************
+Mass conservation
+*****************
+
+.. include:: mass.rst
+
+****************
+Momentum balance
+****************
+
+.. include:: mom.rst
+
+******************
+Quadratic quantity
+******************
+
+I consider the quadratic quantities
+
+.. math::
+
+   k_i
+   \equiv
+   \frac{1}{2}
+   \rho
+   u_i u_i \,\, \text{(No summation)},
+
+which satisfy the following relations.
+
+.. include:: quad.rst
+
+By volume-integrating these three relations inside the whole domain and summing them up, I obtain the relation of the global kinetic energy:
+
+.. math::
+
+   \pder{}{t}
+   \int_V
+   \left(
+      \kx
+      +
+      \ky
+      +
+      \kz
+   \right)
+   J
+   d\gx
+   d\gy
+   d\gz
+   =
+   \left( \text{transport} \right)
+   +
+   \left( \text{dissipation} \right).
+
+Note that the body force contributions :math:`f_i u_i` are omitted.
+
+Here the *transport* is the net kinetic energy going through the walls, which is attributed to the first diffusive term in the stream-wise momentum equation:
+
+.. math::
+
+   -
+   \int_{\gz}
+   \int_{\gy}
+   \vat{
+      \left(
+         \jhx
+         \uy
+         \tau_{\vx \vy}
+      \right)
+   }{x = 0}
+   d\gy
+   d\gz
+   +
+   \int_{\gz}
+   \int_{\gy}
+   \vat{
+      \left(
+         \jhx
+         \uy
+         \tau_{\vx \vy}
+      \right)
+   }{x = 1}
+   d\gy
+   d\gz
+   =
+   -
+   \int_{S, x = 0}
+   \uy
+   \tau_{\vx \vy}
+   dS
+   +
+   \int_{S, x = 1}
+   \uy
+   \tau_{\vx \vy}
+   dS,
+
+while the *dissipation* is handled by the other terms
+
+.. math::
+
+   -
+   \int_V
+   \left(
+      \begin{aligned}
+         &
+         +
+         \frac{1}{\hx}
+         \pder{\ux}{\gx}
+         \tau_{\vx \vx}
+         +
+         \frac{1}{\hy}
+         \pder{\ux}{\gy}
+         \tau_{\vy \vx}
+         +
+         \frac{1}{\hz}
+         \pder{\ux}{\gz}
+         \tau_{\vz \vx} \\
+         &
+         +
+         \frac{1}{\hx}
+         \pder{\uy}{\gx}
+         \tau_{\vx \vy}
+         +
+         \frac{1}{\hy}
+         \pder{\uy}{\gy}
+         \tau_{\vy \vy}
+         +
+         \frac{1}{\hz}
+         \pder{\uy}{\gz}
+         \tau_{\vz \vy} \\
+         &
+         +
+         \frac{1}{\hx}
+         \pder{\uz}{\gx}
+         \tau_{\vx \vz}
+         +
+         \frac{1}{\hy}
+         \pder{\uz}{\gy}
+         \tau_{\vy \vz}
+         +
+         \frac{1}{\hz}
+         \pder{\uz}{\gz}
+         \tau_{\vz \vz}
+      \end{aligned}
+   \right)
+   dV
+   =
+   -
+   \int_V
+   \left(
+      \begin{aligned}
+         &
+         +
+         l_{\vx \vx}
+         \tau_{\vx \vx}
+         +
+         l_{\vy \vx}
+         \tau_{\vy \vx}
+         +
+         l_{\vz \vx}
+         \tau_{\vz \vx} \\
+         &
+         +
+         l_{\vx \vy}
+         \tau_{\vx \vy}
+         +
+         l_{\vy \vy}
+         \tau_{\vy \vy}
+         +
+         l_{\vz \vy}
+         \tau_{\vz \vy} \\
+         &
+         +
+         l_{\vx \vz}
+         \tau_{\vx \vz}
+         +
+         l_{\vy \vz}
+         \tau_{\vy \vz}
+         +
+         l_{\vz \vz}
+         \tau_{\vz \vz}
+      \end{aligned}
+   \right)
+   dV.
+
+Note that the advective contributions on the global energy balance vanish due to the prescribed boundary conditions.
+
diff --git a/docs/source/equation/mass.rst b/docs/source/equation/mass.rst
new file mode 100644
index 0000000..f534329
--- /dev/null
+++ b/docs/source/equation/mass.rst
@@ -0,0 +1,30 @@
+.. math::
+
+   \pder{\rho}{t}
+   +
+   \frac{1}{J}
+   \pder{}{\gx}
+   \left(
+      \jhx
+      \rho
+      \ux
+   \right)
+   +
+   \frac{1}{J}
+   \pder{}{\gy}
+   \left(
+      \jhy
+      \rho
+      \uy
+   \right)
+   +
+   \frac{1}{J}
+   \pder{}{\gz}
+   \left(
+      \jhz
+      \rho
+      \uz
+   \right)
+   =
+   0.
+
diff --git a/docs/source/equation/mom.rst b/docs/source/equation/mom.rst
new file mode 100644
index 0000000..9c6deb3
--- /dev/null
+++ b/docs/source/equation/mom.rst
@@ -0,0 +1,8 @@
+.. math::
+
+   & \mom{\vx}
+
+   & \mom{\vy}
+
+   & \mom{\vz}
+
diff --git a/docs/source/equation/quad.rst b/docs/source/equation/quad.rst
new file mode 100644
index 0000000..823adc1
--- /dev/null
+++ b/docs/source/equation/quad.rst
@@ -0,0 +1,8 @@
+.. math::
+
+   & \ene{x}
+
+   & \ene{y}
+
+   & \ene{z}
+
diff --git a/docs/source/equation/velocity_gradient_tensor.rst b/docs/source/equation/velocity_gradient_tensor.rst
new file mode 100644
index 0000000..5406007
--- /dev/null
+++ b/docs/source/equation/velocity_gradient_tensor.rst
@@ -0,0 +1,48 @@
+The velocity-gradient tensor is defined as
+
+.. math::
+
+   \vec{e}_i
+   \otimes
+   \vec{e}_j
+   l_{ij},
+
+where the components are given as
+
+.. math::
+
+   \begin{pmatrix}
+      l_{\vx \vx} & l_{\vx \vy} & l_{\vx \vz} \\
+      l_{\vy \vx} & l_{\vy \vy} & l_{\vy \vz} \\
+      l_{\vz \vx} & l_{\vz \vy} & l_{\vz \vz} \\
+   \end{pmatrix}
+   =
+   \begin{pmatrix}
+      \frac{1}{\hx}
+      \pder{\ux}{\gx}
+      &
+      \frac{1}{\hx}
+      \pder{\uy}{\gx}
+      &
+      \frac{1}{\hx}
+      \pder{\uz}{\gx}
+      \\
+      \frac{1}{\hy}
+      \pder{\ux}{\gy}
+      &
+      \frac{1}{\hy}
+      \pder{\uy}{\gy}
+      &
+      \frac{1}{\hy}
+      \pder{\uz}{\gy}
+      \\
+      \frac{1}{\hz}
+      \pder{\ux}{\gz}
+      &
+      \frac{1}{\hz}
+      \pder{\uy}{\gz}
+      &
+      \frac{1}{\hz}
+      \pder{\uz}{\gz}
+   \end{pmatrix}.
+
diff --git a/docs/source/ext/myliteralinclude.py b/docs/source/ext/myliteralinclude.py
new file mode 100644
index 0000000..8db0414
--- /dev/null
+++ b/docs/source/ext/myliteralinclude.py
@@ -0,0 +1,167 @@
+import os
+import re
+from typing import List
+from docutils import nodes
+from docutils.nodes import Node
+from docutils.parsers.rst import directives
+from docutils.statemachine import StringList
+from sphinx.directives import optional_int
+from sphinx.util.docutils import SphinxDirective
+from sphinx.util.typing import OptionSpec
+from sphinx.directives.code import LiteralIncludeReader
+
+
+def get_lines(filename, tag):
+    # load all file contents
+    with open(filename, "r") as f:
+        lines = f.readlines()
+    # check number of tags in the document
+    n_tags = 0
+    for line in lines:
+        if tag in line:
+            n_tags += 1
+    # there should be at least 1
+    if n_tags == 0:
+        return
+    # check pairs of "start lineno" and "end lineno"
+    ss = list()
+    es = list()
+    for cnt, line in enumerate(lines):
+        if f" {tag} " in line:
+            if not "|" in line:
+                msg = "delimiter | is not found"
+                raise RuntimeError(msg)
+            if not "//" in line and not ("/*" in line and "*/" in line):
+                msg = "// nor /* */ are found"
+                raise RuntimeError(msg)
+            num = line.split("|")[1]
+            num = re.sub(r"[^0-9]", "", num)
+            num = int(num)
+            s = cnt + 1
+            e = s + num - 1
+            s_string = lines[s].strip()
+            e_string = lines[e].strip()
+            if s_string == "{":
+                if e_string == "}":
+                    s += 1
+                    e -= 1
+                else:
+                    # starting from "{", but does not end with "}"
+                    # maybe making a mistake in "num" setting
+                    assert(0 == 1)
+            # python index starts from 0, while line number starts from 1
+            s += 1
+            e += 1
+            ss.append(s)
+            es.append(e)
+    # check number of "{" and "}" are same
+    for s, e in zip(ss, es):
+        part_of_lines = lines[s-1:e]
+        num_curly_s = 0
+        num_curly_e = 0
+        for l in part_of_lines:
+            num_curly_s += l.count("{")
+            num_curly_e += l.count("}")
+        assert(num_curly_s == num_curly_e)
+    # pack result and return
+    retval = list()
+    for s, e in zip(ss, es):
+        retval.append((f"{s}-{e}", s))
+    return retval
+
+def remove_head_spaces(lines):
+    lines = lines.split("\n")[:-1]
+    nspaces = 0
+    for lcnt, line in enumerate(lines):
+        for ccnt, char in enumerate(line):
+            if char != " ":
+                if lcnt == 0:
+                    nspaces = ccnt
+                else:
+                    nspaces = min(nspaces, ccnt)
+                break
+    newlines = list()
+    for line in lines:
+        newlines.append(line[nspaces:])
+    newlines = "\n".join(newlines)
+    return newlines
+
+def container_wrapper(directive: SphinxDirective, literal_node: Node, caption: str) -> nodes.container:  # NOQA
+    container_node = nodes.container('', literal_block=True,
+                                     classes=['literal-block-wrapper'])
+    parsed = nodes.Element()
+    directive.state.nested_parse(StringList([caption], source=''),
+                                 directive.content_offset, parsed)
+    if isinstance(parsed[0], nodes.system_message):
+        msg = __('Invalid caption: %s' % parsed[0].astext())
+        raise ValueError(msg)
+    elif isinstance(parsed[0], nodes.Element):
+        caption_node = nodes.caption(parsed[0].rawsource, '',
+                                     *parsed[0].children)
+        caption_node.source = literal_node.source
+        caption_node.line = literal_node.line
+        container_node += caption_node
+        container_node += literal_node
+        return container_node
+    else:
+        raise RuntimeError  # never reached
+
+class MyLiteralInclude(SphinxDirective):
+    """
+    Customised LiteralInclude class
+    """
+    has_content = False
+    required_arguments = 1
+    optional_arguments = 0
+    final_argument_whitespace = True
+    option_spec: OptionSpec = {
+        'language': directives.unchanged_required,
+        'tag': directives.unchanged_required,
+        'caption': directives.unchanged,
+    }
+    def run(self) -> List[Node]:
+        document = self.state.document
+        if not document.settings.file_insertion_enabled:
+            return [document.reporter.warning('File insertion disabled',
+                                              line=self.lineno)]
+        try:
+            location = self.state_machine.get_source_and_line(self.lineno)
+            rel_filename, filename = self.env.relfn2path(self.arguments[0])
+            self.env.note_dependency(rel_filename)
+            pairs = get_lines(filename, self.options['tag'])
+            retnodes = list()
+            for pair in pairs:
+                self.options['lines'], self.options['lineno-start'] = pair[0], pair[1]
+                reader = LiteralIncludeReader(filename, self.options, self.config)
+                text, lines = reader.read(location=location)
+                text = remove_head_spaces(text)
+                retnode: Element = nodes.literal_block(text, text, source=filename)
+                retnode['force'] = 'force' in self.options
+                self.set_source_info(retnode)
+                retnode['language'] = self.options['language']
+                retnode['linenos'] = True
+                extra_args = retnode['highlight_args'] = {}
+                extra_args['linenostart'] = reader.lineno_start
+                ## caption, always show filename
+                # /../../src/<filename>.c -> src/<filename>.c
+                if "src/" in self.arguments[0]:
+                    delimiter = "src/"
+                elif "include/" in self.arguments[0]:
+                    delimiter = "include/"
+                caption = delimiter + self.arguments[0].split(delimiter)[1]
+                retnode = container_wrapper(self, retnode, caption)
+                ##
+                self.add_name(retnode)
+                retnodes.append(retnode)
+            return retnodes
+        except Exception as exc:
+            return [document.reporter.warning(exc, line=self.lineno)]
+
+def setup(app):
+    app.add_directive("myliteralinclude", MyLiteralInclude)
+
+    return {
+        'version': '0.1',
+        'parallel_read_safe': True,
+        'parallel_write_safe': True,
+    }
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..4ade161
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,15 @@
+#########################
+Simple Bubbly Flow Solver
+#########################
+
+.. image:: ./thumbnail.jpg
+   :width: 80%
+
+.. toctree::
+   :hidden:
+
+   equation/main
+   numerical_method
+   discretisation/main
+   reference/main
+
diff --git a/docs/source/numerical_method.rst b/docs/source/numerical_method.rst
new file mode 100644
index 0000000..f4a83c5
--- /dev/null
+++ b/docs/source/numerical_method.rst
@@ -0,0 +1,154 @@
+
+.. _numerical_method:
+
+.. include:: /reference/reference.txt
+
+################
+Numerical method
+################
+
+*******************
+Interface capturing
+*******************
+
+To begin with, I consider an indicator function :math:`H`, which works as a marker taking :math:`0` or :math:`1` when the infinitesimal control volume is occupied with the primary or the secondary liquids, respectively.
+By this definition, as an equation governing the evolution of :math:`H`, I have
+
+.. math::
+
+   \pder{H}{t}
+   +
+   \frac{u_j}{h_{\gcs^j}}
+   \pder{H}{\gcs^j}
+   =
+   0.
+
+Since I assume the liquids are incompressible, this advection equation is equivalent to
+
+.. math::
+
+   \pder{H}{t}
+   +
+   \frac{1}{J}
+   \pder{}{\gcs^j}
+   \left(
+      \frac{J}{h_{\gcs^j}}
+      u_j
+      H
+   \right)
+   =
+   0,
+
+whose volume-integrated form leads to
+
+.. math::
+
+   J
+   \pder{\phi}{t}
+   +
+   \int_{\partial V}
+   \frac{J}{h_{\gcs^j}}
+   u_j
+   H
+   n_j
+   dS
+   =
+   0,
+
+where :math:`\phi` is the volume fraction of the secondary phase (where :math:`H = 1`) inside the control volume, also known as the `volume-of-fluid`.
+Note that I assume the coordinate systems do not change in time.
+To integrate this equation, I employ the THINC/QQ method (|XIE2017|), whose numerical treatment is extensively discussed in `the other project <https://naokihori.github.io/SimpleVOFSolver/numerical_method/main.html>`_.
+
+***********
+Consistency
+***********
+
+Note that the following idea is widely known in the phase-field community (e.g. |MIRJALILI2021|), which is slightly customised and applied here.
+
+As discussed above, the evolution of the volume fraction is fully determined by the volume-of-fluid method.
+Since :math:`H` and :math:`\rho` are related by
+
+.. math::
+
+   \rho = 1 + \left( \hat{\rho} - 1 \right) H,
+
+the evolution of the density (i.e. mass per unit volume) is also governed by the VOF method.
+Since the momentum is tightly linked to the flux of the mass, it is natural to apply this information (mass flux) to update the momentum field.
+To this end, I focus on the evolution of :math:`\phi` again:
+
+.. math::
+
+   J
+   \pder{\phi}{t}
+   +
+   \pder{}{\gcs^j}
+   \left(
+      \frac{J}{h_{\gcs^j}}
+      H
+      u_j
+   \right)
+   =
+   0.
+
+As the flux of the indicator function :math:`H u_j` is already known, the density flux :math:`\rho u_j` is obtained by
+
+.. math::
+
+   \rho u_j
+   =
+   u_j
+   +
+   \left(
+      \hat{\rho}
+      -
+      1
+   \right)
+   H
+   u_j.
+
+.. myliteralinclude:: /../../src/interface/mass_flux.c
+   :language: c
+   :tag: convert x vof flux to x mass flux
+
+.. myliteralinclude:: /../../src/interface/mass_flux.c
+   :language: c
+   :tag: convert y vof flux to y mass flux
+
+.. myliteralinclude:: /../../src/interface/mass_flux.c
+   :language: c
+   :tag: convert z vof flux to z mass flux
+
+*********************
+Surface tension force
+*********************
+
+For simplicity, the continuum surface force model (|BRACKBILL1992|) is adopted to model the interfacial tension:
+
+.. math::
+
+   f_i
+   \approx
+   \frac{2 \rho}{1 + \hat{\rho}}
+   \sigma
+   \kappa
+   \frac{1}{h_{\gcs^i}}
+   \pder{\phi}{\gcs^i}.
+
+.. myliteralinclude:: /../../src/interface/force.c
+   :language: c
+   :tag: compute surface tension force in x direction
+
+.. myliteralinclude:: /../../src/interface/force.c
+   :language: c
+   :tag: compute surface tension force in y direction
+
+.. myliteralinclude:: /../../src/interface/force.c
+   :language: c
+   :tag: compute surface tension force in z direction
+
+The pre-factor is computed here:
+
+.. myliteralinclude:: /../../src/interface/force.c
+   :language: c
+   :tag: compute density factor
+
diff --git a/docs/source/reference/main.rst b/docs/source/reference/main.rst
new file mode 100644
index 0000000..334ff81
--- /dev/null
+++ b/docs/source/reference/main.rst
@@ -0,0 +1,11 @@
+#########
+Reference
+#########
+
+.. include:: ./reference.txt
+
+* |BRACKBILL1992|
+* |DODD2014|
+* |XIE2017|
+* |MIRJALILI2021|
+
diff --git a/docs/source/reference/reference.txt b/docs/source/reference/reference.txt
new file mode 100644
index 0000000..6ed06ea
--- /dev/null
+++ b/docs/source/reference/reference.txt
@@ -0,0 +1,4 @@
+.. |BRACKBILL1992| replace:: Brackbill et al., *J. Comput. Phys.* (**100**), 1992
+.. |DODD2014|      replace:: Dodd and Ferrante, *J. Comput. Phys.* (**273**), 2014
+.. |XIE2017|       replace:: Xie and Xiao, *J. Comput. Phys.* (**349**), 2017
+.. |MIRJALILI2021| replace:: Mirjalili and Mani, *J. Comput. Phys.* (**426**), 2021
diff --git a/docs/source/sample.jpg b/docs/source/sample.jpg
new file mode 100644
index 0000000..12d7982
Binary files /dev/null and b/docs/source/sample.jpg differ
diff --git a/docs/source/thumbnail.jpg b/docs/source/thumbnail.jpg
new file mode 100644
index 0000000..4b16e8a
Binary files /dev/null and b/docs/source/thumbnail.jpg differ
diff --git a/exec/main.sh b/exec/main.sh
new file mode 100644
index 0000000..acfdb6f
--- /dev/null
+++ b/exec/main.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Launcher: exports the run-time parameters as environment variables
+#   and starts the solver (./a.out) on 2 MPI processes
+
+## temporal information
+# maximum duration (in free-fall time)
+export timemax=5.0e+0
+# maximum duration (in wall time [s])
+export wtimemax=6.0e+2
+# logging rate (in free-fall time)
+export log_rate=1.0e-1
+# save rate (in free-fall time)
+export save_rate=5.0e-1
+# save after (in free-fall time)
+export save_after=0.0e+0
+# statistics collection rate (in free-fall time)
+export stat_rate=1.0e-1
+# statistics collection after (in free-fall time)
+# NOTE(review): this exceeds timemax, so presumably no statistics are
+#   collected with the current setting - confirm that this is intended
+export stat_after=1.0e+2
+
+## safety factors to decide time step size
+## for advective and diffusive terms
+# NOTE(review): coef_dt_int is presumably the factor for the interfacial
+#   (surface tension) restriction - confirm against the solver
+export coef_dt_adv=0.20
+export coef_dt_dif=0.50
+export coef_dt_int=0.95
+
+## physical parameters
+export Re=35.
+export We=10.
+export Fr=1.
+export denr=0.0013
+export visr=0.018
+
+# give name of the directory in which the initial conditions
+#   (incl. domain size etc.) are stored as an argument
+dirname_ic=initial_condition/output
+# dirname_ic=$(find output/save -type d | sort | tail -n 1)
+
+# quoted, so that a directory name containing spaces or glob characters
+#   is passed to the solver as one single argument
+mpirun -n 2 --oversubscribe ./a.out "${dirname_ic}"
diff --git a/include/array.h b/include/array.h
new file mode 100644
index 0000000..9db0e07
--- /dev/null
+++ b/include/array.h
@@ -0,0 +1,57 @@
+#if !defined(ARRAY_H)
+#define ARRAY_H
+
+// struct and methods for multi-dimensional arrays
+//   which are distributed among multiple processes
+
+#include <stddef.h>
+#include "domain.h"
+
+typedef struct {
+  // size of each element (NOTE(review): presumably in bytes, e.g. sizeof(double) - confirm)
+  size_t size;
+  // number of additional cells at the two ends of each dimension, w.r.t. the no-halo array,
+  //   where the no-halo array spans 0th dimension: [1 : mysizes[0]]
+  //                                 1st dimension: [1 : mysizes[1]]
+  //                                 2nd dimension: [1 : mysizes[2]]
+  //   (one pair of int per dimension; see the *_NADDS macros under include/array_macros)
+  int (* nadds)[2];
+  // total size of local array (i.e. product of mysizes); NOTE(review): whether nadds / size are included is not visible here
+  size_t datasize;
+  // pointer to the raw local array (untyped; element type is decided by the user)
+  void * data;
+} array_t;
+
+typedef struct {
+  // allocate array and store its size information (nadds, size) in *array
+  int (* const prepare)(
+      const domain_t * domain,
+      const int nadds[NDIMS][2],
+      const size_t size,
+      array_t * array
+  );
+  // release the local memory which stores the array
+  int (* const destroy)(
+      array_t * array
+  );
+  // load array from NPY file, identified by directory name, dataset name and NPY dtype string
+  int (* const load)(
+      const domain_t * domain,
+      const char dirname[],
+      const char dsetname[],
+      const char dtype[],
+      array_t * array
+  );
+  // save array to NPY file, identified by directory name, dataset name and NPY dtype string
+  int (* const dump)(
+      const domain_t * domain,
+      const char dirname[],
+      const char dsetname[],
+      const char dtype[],
+      const array_t * array
+  );
+} array_method_t;
+
+extern const array_method_t array;
+
+#endif // ARRAY_H
diff --git a/include/array_macros/domain/hxxc.h b/include/array_macros/domain/hxxc.h
new file mode 100644
index 0000000..0ec589e
--- /dev/null
+++ b/include/array_macros/domain/hxxc.h
@@ -0,0 +1,10 @@
+#if !defined(INCLUDE_ARRAY_MACROS_DOMAIN_HXXC_H)
+#define INCLUDE_ARRAY_MACROS_DOMAIN_HXXC_H
+
+// This file is generated by tools/define_arrays.py
+
+// valid range: I in [1 : isize+0]; the argument is parenthesised so that any expression expands safely
+#define HXXC(I) (hxxc[((I)-1)])
+#define HXXC_NADDS (int [2]){0, 0}
+
+#endif // INCLUDE_ARRAY_MACROS_DOMAIN_HXXC_H
diff --git a/include/array_macros/domain/hxxf.h b/include/array_macros/domain/hxxf.h
new file mode 100644
index 0000000..5be5c33
--- /dev/null
+++ b/include/array_macros/domain/hxxf.h
@@ -0,0 +1,10 @@
+#if !defined(INCLUDE_ARRAY_MACROS_DOMAIN_HXXF_H)
+#define INCLUDE_ARRAY_MACROS_DOMAIN_HXXF_H
+
+// This file is generated by tools/define_arrays.py
+
+// valid range: I in [1 : isize+1]; the argument is parenthesised so that any expression expands safely
+#define HXXF(I) (hxxf[((I)-1)])
+#define HXXF_NADDS (int [2]){0, 1}
+
+#endif // INCLUDE_ARRAY_MACROS_DOMAIN_HXXF_H
diff --git a/include/array_macros/domain/jdxc.h b/include/array_macros/domain/jdxc.h
new file mode 100644
index 0000000..5c567d2
--- /dev/null
+++ b/include/array_macros/domain/jdxc.h
@@ -0,0 +1,10 @@
+#if !defined(INCLUDE_ARRAY_MACROS_DOMAIN_JDXC_H)
+#define INCLUDE_ARRAY_MACROS_DOMAIN_JDXC_H
+
+// This file is generated by tools/define_arrays.py
+
+// valid range: I in [1 : isize+0]; the argument is parenthesised so that any expression expands safely
+#define JDXC(I) (jdxc[((I)-1)])
+#define JDXC_NADDS (int [2]){0, 0}
+
+#endif // INCLUDE_ARRAY_MACROS_DOMAIN_JDXC_H
diff --git a/include/array_macros/domain/jdxf.h b/include/array_macros/domain/jdxf.h
new file mode 100644
index 0000000..24123a4
--- /dev/null
+++ b/include/array_macros/domain/jdxf.h
@@ -0,0 +1,10 @@
+#if !defined(INCLUDE_ARRAY_MACROS_DOMAIN_JDXF_H)
+#define INCLUDE_ARRAY_MACROS_DOMAIN_JDXF_H
+
+// This file is generated by tools/define_arrays.py
+
+// valid range: I in [1 : isize+1]; the argument is parenthesised so that any expression expands safely
+#define JDXF(I) (jdxf[((I)-1)])
+#define JDXF_NADDS (int [2]){0, 1}
+
+#endif // INCLUDE_ARRAY_MACROS_DOMAIN_JDXF_H
diff --git a/include/array_macros/domain/xc.h b/include/array_macros/domain/xc.h
new file mode 100644
index 0000000..d7b88ec
--- /dev/null
+++ b/include/array_macros/domain/xc.h
@@ -0,0 +1,10 @@
+#if !defined(INCLUDE_ARRAY_MACROS_DOMAIN_XC_H)
+#define INCLUDE_ARRAY_MACROS_DOMAIN_XC_H
+
+// This file is generated by tools/define_arrays.py
+
+// element access with I in [0 : isize+1]; the index is used without any shift
+#define XC(I) (xc[(I)])
+#define XC_NADDS (int [2]){ 1, 1 }
+
+#endif // INCLUDE_ARRAY_MACROS_DOMAIN_XC_H
diff --git a/include/array_macros/domain/xf.h b/include/array_macros/domain/xf.h
new file mode 100644
index 0000000..e2a0db6
--- /dev/null
+++ b/include/array_macros/domain/xf.h
@@ -0,0 +1,10 @@
+#if !defined(INCLUDE_ARRAY_MACROS_DOMAIN_XF_H)
+#define INCLUDE_ARRAY_MACROS_DOMAIN_XF_H
+
+// This file is generated by tools/define_arrays.py
+
+// valid range: I in [1 : isize+1]; the argument is parenthesised so that any expression expands safely
+#define XF(I) (xf[((I)-1)])
+#define XF_NADDS (int [2]){0, 1}
+
+#endif // INCLUDE_ARRAY_MACROS_DOMAIN_XF_H
diff --git a/include/array_macros/fluid/den.h b/include/array_macros/fluid/den.h
new file mode 100644
index 0000000..7ddaaed
--- /dev/null
+++ b/include/array_macros/fluid/den.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_DEN_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_DEN_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// element access with I in [0 : isize+1], J in [0 : jsize+1]; I is the contiguous index
+#define DEN(I, J) (den[(I) + (isize + 2) * (J)])
+#define DEN_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1} }
+#endif
+
+#if NDIMS == 3
+// element access with I in [0 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; I is the contiguous index
+#define DEN(I, J, K) (den[(I) + (isize + 2) * ((J) + (jsize + 2) * (K))])
+#define DEN_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1}, {1, 1} }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_DEN_H
diff --git a/include/array_macros/fluid/p.h b/include/array_macros/fluid/p.h
new file mode 100644
index 0000000..4af25f4
--- /dev/null
+++ b/include/array_macros/fluid/p.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_P_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_P_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// element access with I in [0 : isize+1], J in [0 : jsize+1]; I is the contiguous index
+#define P(I, J) (p[(I) + (isize + 2) * (J)])
+#define P_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1} }
+#endif
+
+#if NDIMS == 3
+// element access with I in [0 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; I is the contiguous index
+#define P(I, J, K) (p[(I) + (isize + 2) * ((J) + (jsize + 2) * (K))])
+#define P_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1}, {1, 1} }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_P_H
diff --git a/include/array_macros/fluid/psi.h b/include/array_macros/fluid/psi.h
new file mode 100644
index 0000000..5108e70
--- /dev/null
+++ b/include/array_macros/fluid/psi.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_PSI_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_PSI_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// element access with I in [0 : isize+1], J in [0 : jsize+1]; I is the contiguous index
+#define PSI(I, J) (psi[(I) + (isize + 2) * (J)])
+#define PSI_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1} }
+#endif
+
+#if NDIMS == 3
+// element access with I in [0 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; I is the contiguous index
+#define PSI(I, J, K) (psi[(I) + (isize + 2) * ((J) + (jsize + 2) * (K))])
+#define PSI_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1}, {1, 1} }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_PSI_H
diff --git a/include/array_macros/fluid/srcux.h b/include/array_macros/fluid/srcux.h
new file mode 100644
index 0000000..907fd49
--- /dev/null
+++ b/include/array_macros/fluid/srcux.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_SRCUX_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_SRCUX_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [2 : isize+0], J in [1 : jsize+0]; arguments are parenthesised so that any expression expands safely
+#define SRCUX(I, J) (srcux[((I)-2) + (isize-1) * ((J)-1)])
+#define SRCUX_NADDS (int [NDIMS][2]){ {-1, 0}, {0, 0}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [2 : isize+0], J in [1 : jsize+0], K in [1 : ksize+0]; arguments are parenthesised so that any expression expands safely
+#define SRCUX(I, J, K) (srcux[((I)-2) + (isize-1) * (((J)-1) + (jsize+0) * ((K)-1))])
+#define SRCUX_NADDS (int [NDIMS][2]){ {-1, 0}, {0, 0}, {0, 0}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_SRCUX_H
diff --git a/include/array_macros/fluid/srcuy.h b/include/array_macros/fluid/srcuy.h
new file mode 100644
index 0000000..f148359
--- /dev/null
+++ b/include/array_macros/fluid/srcuy.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_SRCUY_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_SRCUY_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [1 : isize+0], J in [1 : jsize+0]; arguments are parenthesised so that any expression expands safely
+#define SRCUY(I, J) (srcuy[((I)-1) + (isize+0) * ((J)-1)])
+#define SRCUY_NADDS (int [NDIMS][2]){ {0, 0}, {0, 0}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+0], J in [1 : jsize+0], K in [1 : ksize+0]; arguments are parenthesised so that any expression expands safely
+#define SRCUY(I, J, K) (srcuy[((I)-1) + (isize+0) * (((J)-1) + (jsize+0) * ((K)-1))])
+#define SRCUY_NADDS (int [NDIMS][2]){ {0, 0}, {0, 0}, {0, 0}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_SRCUY_H
diff --git a/include/array_macros/fluid/srcuz.h b/include/array_macros/fluid/srcuz.h
new file mode 100644
index 0000000..d38afee
--- /dev/null
+++ b/include/array_macros/fluid/srcuz.h
@@ -0,0 +1,12 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_SRCUZ_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_SRCUZ_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+0], J in [1 : jsize+0], K in [1 : ksize+0]; arguments are parenthesised so that any expression expands safely
+#define SRCUZ(I, J, K) (srcuz[((I)-1) + (isize+0) * (((J)-1) + (jsize+0) * ((K)-1))])
+#define SRCUZ_NADDS (int [NDIMS][2]){ {0, 0}, {0, 0}, {0, 0}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_SRCUZ_H
diff --git a/include/array_macros/fluid/txx.h b/include/array_macros/fluid/txx.h
new file mode 100644
index 0000000..ecf2b8b
--- /dev/null
+++ b/include/array_macros/fluid/txx.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_TXX_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_TXX_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [1 : isize+0], J in [0 : jsize+1]; arguments are parenthesised so that any expression expands safely
+#define TXX(I, J) (txx[((I)-1) + (isize+0) * (J)])
+#define TXX_NADDS (int [NDIMS][2]){ {0, 0}, {1, 1}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+0], J in [0 : jsize+1], K in [0 : ksize+1]; arguments are parenthesised so that any expression expands safely
+#define TXX(I, J, K) (txx[((I)-1) + (isize+0) * ((J) + (jsize+2) * (K))])
+#define TXX_NADDS (int [NDIMS][2]){ {0, 0}, {1, 1}, {1, 1}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_TXX_H
diff --git a/include/array_macros/fluid/txy.h b/include/array_macros/fluid/txy.h
new file mode 100644
index 0000000..1b5663d
--- /dev/null
+++ b/include/array_macros/fluid/txy.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_TXY_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_TXY_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [1 : isize+1], J in [0 : jsize+1]; arguments are parenthesised so that any expression expands safely
+#define TXY(I, J) (txy[((I)-1) + (isize+1) * (J)])
+#define TXY_NADDS (int [NDIMS][2]){ {0, 1}, {1, 1}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; arguments are parenthesised so that any expression expands safely
+#define TXY(I, J, K) (txy[((I)-1) + (isize+1) * ((J) + (jsize+2) * (K))])
+#define TXY_NADDS (int [NDIMS][2]){ {0, 1}, {1, 1}, {1, 1}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_TXY_H
diff --git a/include/array_macros/fluid/txz.h b/include/array_macros/fluid/txz.h
new file mode 100644
index 0000000..d887eb1
--- /dev/null
+++ b/include/array_macros/fluid/txz.h
@@ -0,0 +1,12 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_TXZ_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_TXZ_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; arguments are parenthesised so that any expression expands safely
+#define TXZ(I, J, K) (txz[((I)-1) + (isize+1) * ((J) + (jsize+2) * (K))])
+#define TXZ_NADDS (int [NDIMS][2]){ {0, 1}, {1, 1}, {1, 1}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_TXZ_H
diff --git a/include/array_macros/fluid/tyy.h b/include/array_macros/fluid/tyy.h
new file mode 100644
index 0000000..8870323
--- /dev/null
+++ b/include/array_macros/fluid/tyy.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_TYY_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_TYY_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [1 : isize+0], J in [0 : jsize+1]; arguments are parenthesised so that any expression expands safely
+#define TYY(I, J) (tyy[((I)-1) + (isize+0) * (J)])
+#define TYY_NADDS (int [NDIMS][2]){ {0, 0}, {1, 1}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+0], J in [0 : jsize+1], K in [0 : ksize+1]; arguments are parenthesised so that any expression expands safely
+#define TYY(I, J, K) (tyy[((I)-1) + (isize+0) * ((J) + (jsize+2) * (K))])
+#define TYY_NADDS (int [NDIMS][2]){ {0, 0}, {1, 1}, {1, 1}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_TYY_H
diff --git a/include/array_macros/fluid/tyz.h b/include/array_macros/fluid/tyz.h
new file mode 100644
index 0000000..4513da0
--- /dev/null
+++ b/include/array_macros/fluid/tyz.h
@@ -0,0 +1,12 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_TYZ_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_TYZ_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+0], J in [0 : jsize+1], K in [0 : ksize+1]; arguments are parenthesised so that any expression expands safely
+#define TYZ(I, J, K) (tyz[((I)-1) + (isize+0) * ((J) + (jsize+2) * (K))])
+#define TYZ_NADDS (int [NDIMS][2]){ {0, 0}, {1, 1}, {1, 1}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_TYZ_H
diff --git a/include/array_macros/fluid/tzz.h b/include/array_macros/fluid/tzz.h
new file mode 100644
index 0000000..e8639fa
--- /dev/null
+++ b/include/array_macros/fluid/tzz.h
@@ -0,0 +1,12 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_TZZ_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_TZZ_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+0], J in [0 : jsize+1], K in [0 : ksize+1]; arguments are parenthesised so that any expression expands safely
+#define TZZ(I, J, K) (tzz[((I)-1) + (isize+0) * ((J) + (jsize+2) * (K))])
+#define TZZ_NADDS (int [NDIMS][2]){ {0, 0}, {1, 1}, {1, 1}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_TZZ_H
diff --git a/include/array_macros/fluid/ux.h b/include/array_macros/fluid/ux.h
new file mode 100644
index 0000000..c9ef585
--- /dev/null
+++ b/include/array_macros/fluid/ux.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_UX_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_UX_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [1 : isize+1], J in [0 : jsize+1]; arguments are parenthesised so that any expression expands safely
+#define UX(I, J) (ux[((I)-1) + (isize+1) * (J)])
+#define UX_NADDS (int [NDIMS][2]){ {0, 1}, {1, 1}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; arguments are parenthesised so that any expression expands safely
+#define UX(I, J, K) (ux[((I)-1) + (isize+1) * ((J) + (jsize+2) * (K))])
+#define UX_NADDS (int [NDIMS][2]){ {0, 1}, {1, 1}, {1, 1}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_UX_H
diff --git a/include/array_macros/fluid/uy.h b/include/array_macros/fluid/uy.h
new file mode 100644
index 0000000..0c57698
--- /dev/null
+++ b/include/array_macros/fluid/uy.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_UY_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_UY_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// element access with I in [0 : isize+1], J in [0 : jsize+1]; I is the contiguous index
+#define UY(I, J) (uy[(I) + (isize + 2) * (J)])
+#define UY_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1} }
+#endif
+
+#if NDIMS == 3
+// element access with I in [0 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; I is the contiguous index
+#define UY(I, J, K) (uy[(I) + (isize + 2) * ((J) + (jsize + 2) * (K))])
+#define UY_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1}, {1, 1} }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_UY_H
diff --git a/include/array_macros/fluid/uz.h b/include/array_macros/fluid/uz.h
new file mode 100644
index 0000000..a5e67fc
--- /dev/null
+++ b/include/array_macros/fluid/uz.h
@@ -0,0 +1,12 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_UZ_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_UZ_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 3
+// element access with I in [0 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; I is the contiguous index
+#define UZ(I, J, K) (uz[(I) + (isize + 2) * ((J) + (jsize + 2) * (K))])
+#define UZ_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1}, {1, 1} }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_UZ_H
diff --git a/include/array_macros/fluid/visux.h b/include/array_macros/fluid/visux.h
new file mode 100644
index 0000000..e47714f
--- /dev/null
+++ b/include/array_macros/fluid/visux.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_VISUX_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_VISUX_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [1 : isize+1], J in [0 : jsize+1]; arguments are parenthesised so that any expression expands safely
+#define VISUX(I, J) (visux[((I)-1) + (isize+1) * (J)])
+#define VISUX_NADDS (int [NDIMS][2]){ {0, 1}, {1, 1}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; arguments are parenthesised so that any expression expands safely
+#define VISUX(I, J, K) (visux[((I)-1) + (isize+1) * ((J) + (jsize+2) * (K))])
+#define VISUX_NADDS (int [NDIMS][2]){ {0, 1}, {1, 1}, {1, 1}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_VISUX_H
diff --git a/include/array_macros/fluid/visuy.h b/include/array_macros/fluid/visuy.h
new file mode 100644
index 0000000..633f8e9
--- /dev/null
+++ b/include/array_macros/fluid/visuy.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_VISUY_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_VISUY_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// element access with I in [0 : isize+1], J in [0 : jsize+1]; I is the contiguous index
+#define VISUY(I, J) (visuy[(I) + (isize + 2) * (J)])
+#define VISUY_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1} }
+#endif
+
+#if NDIMS == 3
+// element access with I in [0 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; I is the contiguous index
+#define VISUY(I, J, K) (visuy[(I) + (isize + 2) * ((J) + (jsize + 2) * (K))])
+#define VISUY_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1}, {1, 1} }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_VISUY_H
diff --git a/include/array_macros/fluid/visuz.h b/include/array_macros/fluid/visuz.h
new file mode 100644
index 0000000..69dc896
--- /dev/null
+++ b/include/array_macros/fluid/visuz.h
@@ -0,0 +1,12 @@
+#if !defined(INCLUDE_ARRAY_MACROS_FLUID_VISUZ_H)
+#define INCLUDE_ARRAY_MACROS_FLUID_VISUZ_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 3
+// element access with I in [0 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; I is the contiguous index
+#define VISUZ(I, J, K) (visuz[(I) + (isize + 2) * ((J) + (jsize + 2) * (K))])
+#define VISUZ_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1}, {1, 1} }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_FLUID_VISUZ_H
diff --git a/include/array_macros/interface/curv.h b/include/array_macros/interface/curv.h
new file mode 100644
index 0000000..1364563
--- /dev/null
+++ b/include/array_macros/interface/curv.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_INTERFACE_CURV_H)
+#define INCLUDE_ARRAY_MACROS_INTERFACE_CURV_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// element access with I in [0 : isize+1], J in [0 : jsize+1]; I is the contiguous index
+#define CURV(I, J) (curv[(I) + (isize + 2) * (J)])
+#define CURV_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1} }
+#endif
+
+#if NDIMS == 3
+// element access with I in [0 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; I is the contiguous index
+#define CURV(I, J, K) (curv[(I) + (isize + 2) * ((J) + (jsize + 2) * (K))])
+#define CURV_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1}, {1, 1} }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_INTERFACE_CURV_H
diff --git a/include/array_macros/interface/dvof.h b/include/array_macros/interface/dvof.h
new file mode 100644
index 0000000..14fa080
--- /dev/null
+++ b/include/array_macros/interface/dvof.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_INTERFACE_DVOF_H)
+#define INCLUDE_ARRAY_MACROS_INTERFACE_DVOF_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [1 : isize+1], J in [0 : jsize+2]; arguments are parenthesised so that any expression expands safely
+#define DVOF(I, J) (dvof[((I)-1) + (isize+1) * (J)])
+#define DVOF_NADDS (int [NDIMS][2]){ {0, 1}, {1, 2}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+1], J in [0 : jsize+2], K in [0 : ksize+2]; arguments are parenthesised so that any expression expands safely
+#define DVOF(I, J, K) (dvof[((I)-1) + (isize+1) * ((J) + (jsize+3) * (K))])
+#define DVOF_NADDS (int [NDIMS][2]){ {0, 1}, {1, 2}, {1, 2}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_INTERFACE_DVOF_H
diff --git a/include/array_macros/interface/fluxx.h b/include/array_macros/interface/fluxx.h
new file mode 100644
index 0000000..99ec591
--- /dev/null
+++ b/include/array_macros/interface/fluxx.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_INTERFACE_FLUXX_H)
+#define INCLUDE_ARRAY_MACROS_INTERFACE_FLUXX_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [1 : isize+1], J in [0 : jsize+1]; arguments are parenthesised so that any expression expands safely
+#define FLUXX(I, J) (fluxx[((I)-1) + (isize+1) * (J)])
+#define FLUXX_NADDS (int [NDIMS][2]){ {0, 1}, {1, 1}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; arguments are parenthesised so that any expression expands safely
+#define FLUXX(I, J, K) (fluxx[((I)-1) + (isize+1) * ((J) + (jsize+2) * (K))])
+#define FLUXX_NADDS (int [NDIMS][2]){ {0, 1}, {1, 1}, {1, 1}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_INTERFACE_FLUXX_H
diff --git a/include/array_macros/interface/fluxy.h b/include/array_macros/interface/fluxy.h
new file mode 100644
index 0000000..fabdf5a
--- /dev/null
+++ b/include/array_macros/interface/fluxy.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_INTERFACE_FLUXY_H)
+#define INCLUDE_ARRAY_MACROS_INTERFACE_FLUXY_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// element access with I in [0 : isize+1], J in [0 : jsize+1]; I is the contiguous index
+#define FLUXY(I, J) (fluxy[(I) + (isize + 2) * (J)])
+#define FLUXY_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1} }
+#endif
+
+#if NDIMS == 3
+// element access with I in [0 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; I is the contiguous index
+#define FLUXY(I, J, K) (fluxy[(I) + (isize + 2) * ((J) + (jsize + 2) * (K))])
+#define FLUXY_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1}, {1, 1} }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_INTERFACE_FLUXY_H
diff --git a/include/array_macros/interface/fluxz.h b/include/array_macros/interface/fluxz.h
new file mode 100644
index 0000000..587a72b
--- /dev/null
+++ b/include/array_macros/interface/fluxz.h
@@ -0,0 +1,12 @@
+#if !defined(INCLUDE_ARRAY_MACROS_INTERFACE_FLUXZ_H)
+#define INCLUDE_ARRAY_MACROS_INTERFACE_FLUXZ_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 3
+// element access with I in [0 : isize+1], J in [0 : jsize+1], K in [0 : ksize+1]; I is the contiguous index
+#define FLUXZ(I, J, K) (fluxz[(I) + (isize + 2) * ((J) + (jsize + 2) * (K))])
+#define FLUXZ_NADDS (int [NDIMS][2]){ {1, 1}, {1, 1}, {1, 1} }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_INTERFACE_FLUXZ_H
diff --git a/include/array_macros/interface/ifrcx.h b/include/array_macros/interface/ifrcx.h
new file mode 100644
index 0000000..2288cdd
--- /dev/null
+++ b/include/array_macros/interface/ifrcx.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_INTERFACE_IFRCX_H)
+#define INCLUDE_ARRAY_MACROS_INTERFACE_IFRCX_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [2 : isize+0], J in [1 : jsize+0]; arguments are parenthesised so that any expression expands safely
+#define IFRCX(I, J) (ifrcx[((I)-2) + (isize-1) * ((J)-1)])
+#define IFRCX_NADDS (int [NDIMS][2]){ {-1, 0}, {0, 0}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [2 : isize+0], J in [1 : jsize+0], K in [1 : ksize+0]; arguments are parenthesised so that any expression expands safely
+#define IFRCX(I, J, K) (ifrcx[((I)-2) + (isize-1) * (((J)-1) + (jsize+0) * ((K)-1))])
+#define IFRCX_NADDS (int [NDIMS][2]){ {-1, 0}, {0, 0}, {0, 0}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_INTERFACE_IFRCX_H
diff --git a/include/array_macros/interface/ifrcy.h b/include/array_macros/interface/ifrcy.h
new file mode 100644
index 0000000..6a15f27
--- /dev/null
+++ b/include/array_macros/interface/ifrcy.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_INTERFACE_IFRCY_H)
+#define INCLUDE_ARRAY_MACROS_INTERFACE_IFRCY_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [1 : isize+0], J in [1 : jsize+0]; arguments are parenthesised so that any expression expands safely
+#define IFRCY(I, J) (ifrcy[((I)-1) + (isize+0) * ((J)-1)])
+#define IFRCY_NADDS (int [NDIMS][2]){ {0, 0}, {0, 0}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+0], J in [1 : jsize+0], K in [1 : ksize+0]; arguments are parenthesised so that any expression expands safely
+#define IFRCY(I, J, K) (ifrcy[((I)-1) + (isize+0) * (((J)-1) + (jsize+0) * ((K)-1))])
+#define IFRCY_NADDS (int [NDIMS][2]){ {0, 0}, {0, 0}, {0, 0}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_INTERFACE_IFRCY_H
diff --git a/include/array_macros/interface/ifrcz.h b/include/array_macros/interface/ifrcz.h
new file mode 100644
index 0000000..d74bf29
--- /dev/null
+++ b/include/array_macros/interface/ifrcz.h
@@ -0,0 +1,12 @@
+#if !defined(INCLUDE_ARRAY_MACROS_INTERFACE_IFRCZ_H)
+#define INCLUDE_ARRAY_MACROS_INTERFACE_IFRCZ_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+0], J in [1 : jsize+0], K in [1 : ksize+0]; arguments are parenthesised so that any expression expands safely
+#define IFRCZ(I, J, K) (ifrcz[((I)-1) + (isize+0) * (((J)-1) + (jsize+0) * ((K)-1))])
+#define IFRCZ_NADDS (int [NDIMS][2]){ {0, 0}, {0, 0}, {0, 0}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_INTERFACE_IFRCZ_H
diff --git a/include/array_macros/interface/normal.h b/include/array_macros/interface/normal.h
new file mode 100644
index 0000000..bee4eb8
--- /dev/null
+++ b/include/array_macros/interface/normal.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_INTERFACE_NORMAL_H)
+#define INCLUDE_ARRAY_MACROS_INTERFACE_NORMAL_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [1 : isize+0], J in [0 : jsize+1]; arguments are parenthesised so that any expression expands safely
+#define NORMAL(I, J) (normal[((I)-1) + (isize+0) * (J)])
+#define NORMAL_NADDS (int [NDIMS][2]){ {0, 0}, {1, 1}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+0], J in [0 : jsize+1], K in [0 : ksize+1]; arguments are parenthesised so that any expression expands safely
+#define NORMAL(I, J, K) (normal[((I)-1) + (isize+0) * ((J) + (jsize+2) * (K))])
+#define NORMAL_NADDS (int [NDIMS][2]){ {0, 0}, {1, 1}, {1, 1}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_INTERFACE_NORMAL_H
diff --git a/include/array_macros/interface/src.h b/include/array_macros/interface/src.h
new file mode 100644
index 0000000..4b36841
--- /dev/null
+++ b/include/array_macros/interface/src.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_INTERFACE_SRC_H)
+#define INCLUDE_ARRAY_MACROS_INTERFACE_SRC_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [1 : isize+0], J in [1 : jsize+0]; arguments are parenthesised so that any expression expands safely
+#define SRC(I, J) (src[((I)-1) + (isize+0) * ((J)-1)])
+#define SRC_NADDS (int [NDIMS][2]){ {0, 0}, {0, 0}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+0], J in [1 : jsize+0], K in [1 : ksize+0]; arguments are parenthesised so that any expression expands safely
+#define SRC(I, J, K) (src[((I)-1) + (isize+0) * (((J)-1) + (jsize+0) * ((K)-1))])
+#define SRC_NADDS (int [NDIMS][2]){ {0, 0}, {0, 0}, {0, 0}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_INTERFACE_SRC_H
diff --git a/include/array_macros/interface/vof.h b/include/array_macros/interface/vof.h
new file mode 100644
index 0000000..1eb44a2
--- /dev/null
+++ b/include/array_macros/interface/vof.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_INTERFACE_VOF_H)
+#define INCLUDE_ARRAY_MACROS_INTERFACE_VOF_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [0 : isize+1], J in [-1 : jsize+2]; arguments are parenthesised so that any expression expands safely
+#define VOF(I, J) (vof[(I) + (isize+2) * ((J)+1)])
+#define VOF_NADDS (int [NDIMS][2]){ {1, 1}, {2, 2}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [0 : isize+1], J in [-1 : jsize+2], K in [-1 : ksize+2]; arguments are parenthesised so that any expression expands safely
+#define VOF(I, J, K) (vof[(I) + (isize+2) * (((J)+1) + (jsize+4) * ((K)+1))])
+#define VOF_NADDS (int [NDIMS][2]){ {1, 1}, {2, 2}, {2, 2}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_INTERFACE_VOF_H
diff --git a/include/array_macros/statistics/ux1.h b/include/array_macros/statistics/ux1.h
new file mode 100644
index 0000000..04e58b7
--- /dev/null
+++ b/include/array_macros/statistics/ux1.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_STATISTICS_UX1_H)
+#define INCLUDE_ARRAY_MACROS_STATISTICS_UX1_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [1 : isize+1], J in [1 : jsize+0]; arguments are parenthesised so that any expression expands safely
+#define UX1(I, J) (ux1[((I)-1) + (isize+1) * ((J)-1)])
+#define UX1_NADDS (int [NDIMS][2]){ {0, 1}, {0, 0}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [1 : isize+1], J in [1 : jsize+0], K in [1 : ksize+0]; arguments are parenthesised so that any expression expands safely
+#define UX1(I, J, K) (ux1[((I)-1) + (isize+1) * (((J)-1) + (jsize+0) * ((K)-1))])
+#define UX1_NADDS (int [NDIMS][2]){ {0, 1}, {0, 0}, {0, 0}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_STATISTICS_UX1_H
diff --git a/include/array_macros/statistics/uy1.h b/include/array_macros/statistics/uy1.h
new file mode 100644
index 0000000..4eed5fe
--- /dev/null
+++ b/include/array_macros/statistics/uy1.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_STATISTICS_UY1_H)
+#define INCLUDE_ARRAY_MACROS_STATISTICS_UY1_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [0 : isize+1], J in [1 : jsize+0]; arguments are parenthesised so that any expression expands safely
+#define UY1(I, J) (uy1[(I) + (isize+2) * ((J)-1)])
+#define UY1_NADDS (int [NDIMS][2]){ {1, 1}, {0, 0}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [0 : isize+1], J in [1 : jsize+0], K in [1 : ksize+0]; arguments are parenthesised so that any expression expands safely
+#define UY1(I, J, K) (uy1[(I) + (isize+2) * (((J)-1) + (jsize+0) * ((K)-1))])
+#define UY1_NADDS (int [NDIMS][2]){ {1, 1}, {0, 0}, {0, 0}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_STATISTICS_UY1_H
diff --git a/include/array_macros/statistics/uz1.h b/include/array_macros/statistics/uz1.h
new file mode 100644
index 0000000..396c670
--- /dev/null
+++ b/include/array_macros/statistics/uz1.h
@@ -0,0 +1,12 @@
+#if !defined(INCLUDE_ARRAY_MACROS_STATISTICS_UZ1_H)
+#define INCLUDE_ARRAY_MACROS_STATISTICS_UZ1_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 3
+// valid ranges: I in [0 : isize+1], J in [1 : jsize+0], K in [1 : ksize+0]; arguments are parenthesised so that any expression expands safely
+#define UZ1(I, J, K) (uz1[(I) + (isize+2) * (((J)-1) + (jsize+0) * ((K)-1))])
+#define UZ1_NADDS (int [NDIMS][2]){ {1, 1}, {0, 0}, {0, 0}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_STATISTICS_UZ1_H
diff --git a/include/array_macros/statistics/vof1.h b/include/array_macros/statistics/vof1.h
new file mode 100644
index 0000000..e6c670d
--- /dev/null
+++ b/include/array_macros/statistics/vof1.h
@@ -0,0 +1,18 @@
+#if !defined(INCLUDE_ARRAY_MACROS_STATISTICS_VOF1_H)
+#define INCLUDE_ARRAY_MACROS_STATISTICS_VOF1_H
+
+// This file is generated by tools/define_arrays.py
+
+#if NDIMS == 2
+// valid ranges: I in [0 : isize+1], J in [1 : jsize+0]; arguments are parenthesised so that any expression expands safely
+#define VOF1(I, J) (vof1[(I) + (isize+2) * ((J)-1)])
+#define VOF1_NADDS (int [NDIMS][2]){ {1, 1}, {0, 0}, }
+#endif
+
+#if NDIMS == 3
+// valid ranges: I in [0 : isize+1], J in [1 : jsize+0], K in [1 : ksize+0]; arguments are parenthesised so that any expression expands safely
+#define VOF1(I, J, K) (vof1[(I) + (isize+2) * (((J)-1) + (jsize+0) * ((K)-1))])
+#define VOF1_NADDS (int [NDIMS][2]){ {1, 1}, {0, 0}, {0, 0}, }
+#endif
+
+#endif // INCLUDE_ARRAY_MACROS_STATISTICS_VOF1_H
diff --git a/include/config.h b/include/config.h
new file mode 100644
index 0000000..fdec867
--- /dev/null
+++ b/include/config.h
@@ -0,0 +1,14 @@
+#if !defined(CONFIG_H)
+#define CONFIG_H
+
+typedef struct {
+  // getter for a double-precision value named dsetname, returned through value (NOTE(review): source of the value and meaning of the int result are not visible here - presumably an environment variable and an error code; confirm)
+  int (* const get_double)(
+      const char dsetname[],
+      double * value
+  );
+} config_t;
+// the single method table, defined elsewhere
+extern const config_t config;
+
+#endif // CONFIG_H
diff --git a/include/decide_dt.h b/include/decide_dt.h
new file mode 100644
index 0000000..d2ad58e
--- /dev/null
+++ b/include/decide_dt.h
@@ -0,0 +1,12 @@
+#if !defined(DECIDE_DT_H)
+#define DECIDE_DT_H
+
+// decide next time step size, returned through dt; NOTE(review): this header includes nothing, so domain_t, fluid_t and interface_t must already be declared by the includer
+extern int decide_dt(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    const interface_t * interface,
+    double * dt
+);
+
+#endif // DECIDE_DT_H
diff --git a/include/domain.h b/include/domain.h
new file mode 100644
index 0000000..2b758fb
--- /dev/null
+++ b/include/domain.h
@@ -0,0 +1,47 @@
+#if !defined(DOMAIN_H)
+#define DOMAIN_H
+
+#include "sdecomp.h"
+
+// definition of a structure domain_t (NOTE(review): size_t is assumed to be provided via sdecomp.h - confirm)
+/**
+ * @struct domain_t
+ * @brief struct storing parameters relevant to spatial domain
+ * @var info       : MPI domain decomposition (sdecomp_info_t, see sdecomp.h)
+ * @var glsizes    : global     number of grid points in each direction
+ * @var mysizes    : local (my) number of grid points in each direction
+ * @var offsets    : offsets to my starting index in each direction
+ * @var lengths    : domain size in each direction
+ * @var xf, xc     : cell-face and cell-center locations in x direction
+ * @var jdxf, jdxc : Jacobian determinants at x cell faces and centers
+ * @var hxxf, hxxc : scale factors at x cell faces and centers
+ * @var hy, hz     : y and z scale factors (hz exists only when NDIMS == 3)
+ */
+typedef struct {
+  sdecomp_info_t * info;
+  size_t glsizes[NDIMS];
+  size_t mysizes[NDIMS];
+  size_t offsets[NDIMS];
+  double lengths[NDIMS];
+  double * restrict xf, * restrict xc;
+  double * restrict jdxf, * restrict jdxc;
+  double * restrict hxxf, * restrict hxxc;
+  double hy;
+#if NDIMS == 3
+  double hz;
+#endif
+} domain_t;
+
+// constructor: initialises *domain (NOTE(review): presumably from the files stored in the directory dirname_ic - confirm)
+extern int domain_init(
+    const char dirname_ic[],
+    domain_t * domain
+);
+
+// save members which are necessary to restart (NOTE(review): presumably written under the directory dirname - confirm)
+extern int domain_save(
+    const char dirname[],
+    const domain_t * domain
+);
+
+#endif // DOMAIN_H
diff --git a/include/fileio.h b/include/fileio.h
new file mode 100644
index 0000000..33d0122
--- /dev/null
+++ b/include/fileio.h
@@ -0,0 +1,84 @@
+#if !defined(FILEIO_H)
+#define FILEIO_H
+
+#include <stdio.h> // FILE, size_t
+#include <mpi.h>   // MPI_Datatype
+
+typedef struct {
+  // NPY datatypes, which are embedded in NPY files ("dtype" argument)
+  // they are declared here and defined in src/fileio.c
+  // NOTE: size (x-byte) may be wrong, depending on the architecture
+  // 4-byte little-endian integer
+  const char * npy_int;
+  // 8-byte little-endian unsigned integer
+  const char * npy_size_t;
+  // 8-byte little-endian floating point
+  const char * npy_double;
+  // initialiser
+  int (* const init)(
+      void
+  );
+  // general-purpose file opener
+  FILE * (* const fopen)(
+      const char * path,
+      const char * mode
+  );
+  // general-purpose file closer
+  int (* const fclose)(
+      FILE * stream
+  );
+  // prepare directory to be stored
+  int (* const mkdir)(
+      const char dirname[]
+  );
+  // NPY serial read (called by one process)
+  int (* const r_serial)(
+      const char dirname[],
+      const char dsetname[],
+      const size_t ndims,
+      const size_t * shape,
+      const char dtype[],
+      const size_t size,
+      void * data
+  );
+  // NPY serial write (called by one process)
+  int (* const w_serial)(
+      const char dirname[],
+      const char dsetname[],
+      const size_t ndims,
+      const size_t * shape,
+      const char dtype[],
+      const size_t size,
+      const void * data
+  );
+  // NPY parallel read of N-dimensional array (called by all processes)
+  int (* const r_nd_parallel)(
+      const MPI_Comm comm,
+      const char dirname[],
+      const char dsetname[],
+      const size_t ndims,
+      const int * array_of_sizes,
+      const int * array_of_subsizes,
+      const int * array_of_starts,
+      const char dtype[],
+      const size_t size,
+      void * data
+  );
+  // NPY parallel write of N-dimensional array (called by all processes)
+  int (* const w_nd_parallel)(
+      const MPI_Comm comm,
+      const char dirname[],
+      const char dsetname[],
+      const size_t ndims,
+      const int * array_of_sizes,
+      const int * array_of_subsizes,
+      const int * array_of_starts,
+      const char dtype[],
+      const size_t size,
+      const void * data
+  );
+} fileio_t;
+
+extern const fileio_t fileio;
+
+#endif // FILEIO_H
diff --git a/include/fluid.h b/include/fluid.h
new file mode 100644
index 0000000..cf7aaed
--- /dev/null
+++ b/include/fluid.h
@@ -0,0 +1,63 @@
+#if !defined(FLUID_H)
+#define FLUID_H
+
+#include "array.h"
+#include "domain.h"
+
+// definition of a structure fluid_t
+/**
+ * @struct fluid_t
+ * @brief struct storing fluid-related variables
+ * @var ux, uy, uz : velocity in each direction
+ * @var p, psi     : pressure, scalar potentials
+ * @var den        : density
+ * @var visux      : dynamic viscosity at ux
+ * @var visuy      : dynamic viscosity at uy
+ * @var visuz      : dynamic viscosity at uz
+ * @var srcux      : Runge-Kutta source terms for ux
+ * @var srcuy      : Runge-Kutta source terms for uy
+ * @var srcuz      : Runge-Kutta source terms for uz
+ * @var t[x-z][x-z]: shear-stress tensor
+ * @var Re         : Reynolds number
+ * @var Fr         : Froude   number
+ * @var denr       : density ratio
+ * @var visr       : viscosity ratio
+ * @var refden     : reference density
+ */
+typedef struct {
+  array_t ux;
+  array_t uy;
+#if NDIMS == 3
+  array_t uz;
+#endif
+  array_t p;
+  array_t psi[2];
+  array_t den[2];
+  array_t visux;
+  array_t visuy;
+#if NDIMS == 3
+  array_t visuz;
+#endif
+  array_t txx;
+  array_t txy;
+#if NDIMS == 3
+  array_t txz;
+#endif
+  array_t tyy;
+#if NDIMS == 3
+  array_t tyz;
+  array_t tzz;
+#endif
+  array_t srcux[3];
+  array_t srcuy[3];
+#if NDIMS == 3
+  array_t srcuz[3];
+#endif
+  double Re;
+  double Fr;
+  double denr;
+  double visr;
+  double refden;
+} fluid_t;
+
+#endif // FLUID_H
diff --git a/include/fluid_solver.h b/include/fluid_solver.h
new file mode 100644
index 0000000..ad22ca6
--- /dev/null
+++ b/include/fluid_solver.h
@@ -0,0 +1,98 @@
+#if !defined(FLUID_SOLVER_H)
+#define FLUID_SOLVER_H
+
+#include "array.h"
+#include "domain.h"
+#include "fluid.h"
+#include "interface.h"
+
+// initialiser of fluid_t
+extern int fluid_init(
+    const char dirname_ic[],
+    const domain_t * domain,
+    fluid_t * fluid
+);
+
+// save flow field
+extern int fluid_save(
+    const char dirname[],
+    const domain_t * domain,
+    const fluid_t * fluid
+);
+
+// compute density field from vof field
+extern int fluid_compute_density(
+    const domain_t * restrict domain,
+    fluid_t * restrict fluid,
+    const interface_t * restrict interface,
+    const size_t index
+);
+
+// compute viscosity field from vof field
+extern int fluid_compute_viscosity(
+    const domain_t * restrict domain,
+    fluid_t * restrict fluid,
+    const interface_t * restrict interface
+);
+
+// predict the new velocity field
+extern int fluid_predict_field(
+    const domain_t * domain,
+    const size_t rkstep,
+    const double dt,
+    fluid_t * fluid,
+    const interface_t * interface
+);
+
+// compute scalar potential by solving Poisson equation
+extern int fluid_compute_potential(
+    const domain_t * domain,
+    const double dt_old,
+    const double dt_new,
+    fluid_t * fluid
+);
+
+// correct velocity field using scalar potential
+extern int fluid_correct_velocity(
+    const domain_t * domain,
+    const double dt_old,
+    const double dt_new,
+    fluid_t * fluid
+);
+
+// update pressure
+extern int fluid_update_pressure(
+    const domain_t * domain,
+    fluid_t * fluid
+);
+
+// exchange halos and impose boundary conditions
+
+extern int fluid_update_boundaries_ux(
+    const domain_t * domain,
+    array_t * array
+);
+
+extern int fluid_update_boundaries_uy(
+    const domain_t * domain,
+    array_t * array
+);
+
+#if NDIMS == 3
+extern int fluid_update_boundaries_uz(
+    const domain_t * domain,
+    array_t * array
+);
+#endif
+
+extern int fluid_update_boundaries_p(
+    const domain_t * domain,
+    array_t * array
+);
+
+extern int fluid_update_boundaries_psi(
+    const domain_t * domain,
+    array_t * array
+);
+
+#endif // FLUID_SOLVER_H
diff --git a/include/halo.h b/include/halo.h
new file mode 100644
index 0000000..177cec9
--- /dev/null
+++ b/include/halo.h
@@ -0,0 +1,18 @@
+#if !defined(HALO_H)
+#define HALO_H
+
+int halo_communicate_in_y(
+    const domain_t * domain,
+    MPI_Datatype * dtype,
+    array_t * array
+);
+
+#if NDIMS == 3
+int halo_communicate_in_z(
+    const domain_t * domain,
+    MPI_Datatype * dtype,
+    array_t * array
+);
+#endif
+
+#endif // HALO_H
diff --git a/include/integrate.h b/include/integrate.h
new file mode 100644
index 0000000..e35ff7e
--- /dev/null
+++ b/include/integrate.h
@@ -0,0 +1,12 @@
+#if !defined(INTEGRATE_H)
+#define INTEGRATE_H
+
+// main integrator
+extern int integrate(
+    const domain_t * domain,
+    fluid_t * fluid,
+    interface_t * interface,
+    double * dt
+);
+
+#endif // INTEGRATE_H
diff --git a/include/interface.h b/include/interface.h
new file mode 100644
index 0000000..08f9b83
--- /dev/null
+++ b/include/interface.h
@@ -0,0 +1,32 @@
+#if !defined(INTERFACE_H)
+#define INTERFACE_H
+
+#include "array.h"
+
+// data type for N-dimensional vectors
+typedef double vector_t[NDIMS];
+// data type for (N+1)-dimensional vectors:
+//   the first N elements form a vector, and
+//   one additional element (segment) is stored last
+typedef double normal_t[NDIMS + 1];
+
+typedef struct {
+  array_t vof;
+  array_t ifrcx;
+  array_t ifrcy;
+#if NDIMS == 3
+  array_t ifrcz;
+#endif
+  array_t dvof;
+  array_t normal;
+  array_t curv;
+  array_t fluxx;
+  array_t fluxy;
+#if NDIMS == 3
+  array_t fluxz;
+#endif
+  array_t src[2];
+  double We;
+} interface_t;
+
+#endif // INTERFACE_H
diff --git a/include/interface_solver.h b/include/interface_solver.h
new file mode 100644
index 0000000..94094f8
--- /dev/null
+++ b/include/interface_solver.h
@@ -0,0 +1,50 @@
+#if !defined(INTERFACE_SOLVER_H)
+#define INTERFACE_SOLVER_H
+
+#include "domain.h"
+#include "fluid.h"
+#include "interface.h"
+
+extern int interface_init(
+    const char dirname_ic[],
+    const domain_t * domain,
+    interface_t * interface
+);
+
+extern int interface_save(
+    const char dirname[],
+    const domain_t * domain,
+    const interface_t * interface
+);
+
+extern int interface_compute_curvature_tensor(
+    const domain_t * domain,
+    interface_t * interface
+);
+
+extern int interface_compute_force(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+);
+
+extern int interface_compute_mass_flux(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+);
+
+extern int interface_update_vof(
+    const domain_t * domain,
+    const size_t rkstep,
+    const double dt,
+    const fluid_t * fluid,
+    interface_t * interface
+);
+
+extern int interface_update_boundaries_vof(
+    const domain_t * domain,
+    array_t * vof
+);
+
+#endif // INTERFACE_SOLVER_H
diff --git a/include/linear_system.h b/include/linear_system.h
new file mode 100644
index 0000000..fa217b8
--- /dev/null
+++ b/include/linear_system.h
@@ -0,0 +1,61 @@
+#if !defined(LINEAR_SYSTEM_H)
+#define LINEAR_SYSTEM_H
+
+#include <stdbool.h>
+#include "sdecomp.h"
+#include "domain.h"
+#include "tdm.h"
+
+/**
+ * @struct linear_system_t
+ * @brief structure storing buffers and plans to solve tri-diagonal linear systems in each dimension A x = b
+ * @var is_initialised      : flag to check the variable is initialised
+ * @var implicit            : flags whether directions are treated implicitly,
+ *                              namely linear systems are to be solved
+ * @var x1pncl              : buffers to store x1-pencil
+ * @var y1pncl              : buffers to store y1-pencil
+ * @var z2pncl              : buffers to store z2-pencil
+ * @var x1pncl_mysizes      : size of (local) x1pencil
+ * @var y1pncl_mysizes      : size of (local) y1pencil
+ * @var z2pncl_mysizes      : size of (local) z2pencil
+ * @var tdm_[x-z]           : thomas algorithm solvers in all directions
+ * @var transposer_xx_to_xx : plans to transpose between two pencils
+ */
+typedef struct {
+  bool is_initialised;
+  bool implicit[NDIMS];
+  double * restrict x1pncl;
+  double * restrict y1pncl;
+#if NDIMS == 3
+  double * restrict z2pncl;
+#endif
+  size_t x1pncl_mysizes[NDIMS];
+  size_t y1pncl_mysizes[NDIMS];
+#if NDIMS == 3
+  size_t z2pncl_mysizes[NDIMS];
+#endif
+  tdm_info_t * tdm_x;
+  tdm_info_t * tdm_y;
+#if NDIMS == 3
+  tdm_info_t * tdm_z;
+#endif
+  sdecomp_transpose_plan_t * transposer_x1_to_y1;
+  sdecomp_transpose_plan_t * transposer_y1_to_x1;
+#if NDIMS == 3
+  sdecomp_transpose_plan_t * transposer_x1_to_z2;
+  sdecomp_transpose_plan_t * transposer_z2_to_x1;
+#endif
+} linear_system_t;
+
+extern int linear_system_init(
+    const sdecomp_info_t * info,
+    const bool implicit[NDIMS],
+    const size_t glsizes[NDIMS],
+    linear_system_t * linear_system
+);
+
+extern int linear_system_finalise(
+    linear_system_t * linear_system
+);
+
+#endif // LINEAR_SYSTEM_H
diff --git a/include/logging.h b/include/logging.h
new file mode 100644
index 0000000..cfe83d8
--- /dev/null
+++ b/include/logging.h
@@ -0,0 +1,32 @@
+#if !defined(LOGGING_H)
+#define LOGGING_H
+
+#include "domain.h"
+#include "fluid.h"
+#include "interface.h" // interface_t, taken by check_and_output
+
+typedef struct {
+  // constructor
+  int (* const init)(
+      const domain_t * domain,
+      const double time
+  );
+  // check quantities and dump to log files
+  void (* const check_and_output)(
+      const domain_t * domain,
+      const size_t step,
+      const double time,
+      const double dt,
+      const double wtime,
+      const fluid_t * fluid,
+      const interface_t * interface
+  );
+  // getter, next timing to call "check_and_output"
+  double (* const get_next_time)(
+      void
+  );
+} logging_t;
+
+extern const logging_t logging;
+
+#endif // LOGGING_H
diff --git a/include/memory.h b/include/memory.h
new file mode 100644
index 0000000..01fbf08
--- /dev/null
+++ b/include/memory.h
@@ -0,0 +1,17 @@
+#if !defined(MEMORY_H)
+#define MEMORY_H
+
+#include <stddef.h> // size_t
+
+// general-purpose memory allocator
+extern void * memory_calloc(
+    const size_t count,
+    const size_t size
+);
+
+// corresponding memory deallocator
+extern void memory_free(
+    void * ptr
+);
+
+#endif // MEMORY_H
diff --git a/include/param.h b/include/param.h
new file mode 100644
index 0000000..68c19ab
--- /dev/null
+++ b/include/param.h
@@ -0,0 +1,23 @@
+#if !defined(PARAM_H)
+#define PARAM_H
+
+// parameters which are usually left unchanged
+//   but can still be adjusted easily by the user are declared here
+// they are defined under src/param/xxx.c
+
+#include <stdbool.h>
+
+/* boundary-condition.c */
+// NOTE: impermeable walls and Neumann BC for the pressure are unchangeable
+// negative-x-wall velocity in y direction
+extern const double param_uy_xm;
+// positive-x-wall velocity in y direction
+extern const double param_uy_xp;
+#if NDIMS == 3
+// negative-x-wall velocity in z direction
+extern const double param_uz_xm;
+// positive-x-wall velocity in z direction
+extern const double param_uz_xp;
+#endif
+
+#endif // PARAM_H
diff --git a/include/runge_kutta.h b/include/runge_kutta.h
new file mode 100644
index 0000000..13bc5a8
--- /dev/null
+++ b/include/runge_kutta.h
@@ -0,0 +1,17 @@
+#if !defined(RUNGE_KUTTA_H)
+#define RUNGE_KUTTA_H
+
+#include <stdint.h>
+
+// Runge-Kutta configurations
+// indices
+extern const uint_fast8_t rk_a; // 0
+extern const uint_fast8_t rk_b; // 1
+extern const uint_fast8_t rk_g; // 2
+// coefficients of three-step RK scheme
+// NOTE: only three is allowed
+#define RKSTEPMAX 3
+typedef double rkcoef_t[RKSTEPMAX];
+extern const rkcoef_t rkcoefs[RKSTEPMAX];
+
+#endif // RUNGE_KUTTA_H
diff --git a/include/save.h b/include/save.h
new file mode 100644
index 0000000..d2e127d
--- /dev/null
+++ b/include/save.h
@@ -0,0 +1,27 @@
+#if !defined(SAVE_H)
+#define SAVE_H
+
+#include "domain.h"
+#include "fluid.h"
+
+typedef struct save_t_ {
+  // constructor
+  int (* const init)(
+      const domain_t * domain,
+      const double time
+  );
+  // make space to save flow fields, save some scalars
+  int (* const prepare)(
+      const domain_t * domain,
+      const int step,
+      char ** dirname
+  );
+  // getter, next timing to save flow fields
+  double (* const get_next_time)(
+      void
+  );
+} save_t;
+
+extern const save_t save;
+
+#endif // SAVE_H
diff --git a/include/statistics.h b/include/statistics.h
new file mode 100644
index 0000000..17d3789
--- /dev/null
+++ b/include/statistics.h
@@ -0,0 +1,33 @@
+#if !defined(STATISTICS_H)
+#define STATISTICS_H
+
+#include "domain.h"
+#include "fluid.h"
+#include "interface.h"
+
+typedef struct {
+  // constructor
+  int (* const init)(
+      const domain_t * domain,
+      const double time
+  );
+  // collecting statistics
+  int (* const collect)(
+      const domain_t * domain,
+      const fluid_t * fluid,
+      const interface_t * interface
+  );
+  // save statistics to files
+  int (* const output)(
+      const domain_t * domain,
+      const size_t step
+  );
+  // getter, next timing to call "collect"
+  double (* const get_next_time)(
+      void
+  );
+} statistics_t;
+
+extern const statistics_t statistics;
+
+#endif // STATISTICS_H
diff --git a/include/tdm.h b/include/tdm.h
new file mode 100644
index 0000000..06250a8
--- /dev/null
+++ b/include/tdm.h
@@ -0,0 +1,47 @@
+#if !defined(TDM_H)
+#define TDM_H
+
+#include <stdbool.h>
+
+typedef struct tdm_info_t_ tdm_info_t;
+
+typedef struct {
+  int (* const construct)(
+      const int size,
+      const int nrhs,
+      const bool is_periodic,
+      const bool is_complex,
+      tdm_info_t ** info
+  );
+  int (* const get_l)(
+      const tdm_info_t * info,
+      double * restrict * l
+  );
+  int (* const get_c)(
+      const tdm_info_t * info,
+      double * restrict * c
+  );
+  int (* const get_u)(
+      const tdm_info_t * info,
+      double * restrict * u
+  );
+  int (* const get_size)(
+      const tdm_info_t * info,
+      int * size
+  );
+  int (* const get_nrhs)(
+      const tdm_info_t * info,
+      int * nrhs
+  );
+  int (* const solve)(
+      const tdm_info_t * info,
+      void * restrict data
+  );
+  int (* const destruct)(
+      tdm_info_t * info
+  );
+} tdm_t;
+
+extern const tdm_t tdm;
+
+#endif // TDM_H
diff --git a/include/timer.h b/include/timer.h
new file mode 100644
index 0000000..9103330
--- /dev/null
+++ b/include/timer.h
@@ -0,0 +1,9 @@
+#if !defined(TIMER_H)
+#define TIMER_H
+
+// get current time
+extern double timer(
+    void
+);
+
+#endif // TIMER_H
diff --git a/initial_condition/Makefile b/initial_condition/Makefile
new file mode 100644
index 0000000..6072c6d
--- /dev/null
+++ b/initial_condition/Makefile
@@ -0,0 +1,15 @@
+help:
+	@echo "output  : make directory to store NPY files"
+	@echo "datadel : remove NPY files"
+	@echo "help    : show this message"
+
+output:
+	@if [ ! -e output ]; then \
+		mkdir output; \
+	fi
+
+datadel:
+	$(RM) output/*.npy
+
+.PHONY : help output datadel
+
diff --git a/initial_condition/README.rst b/initial_condition/README.rst
new file mode 100644
index 0000000..75d7b55
--- /dev/null
+++ b/initial_condition/README.rst
@@ -0,0 +1,30 @@
+#################
+initial_condition
+#################
+
+********
+Overview
+********
+
+This directory contains a Python script to initialise the domain and the flow field, which will be simulated by the main solver.
+Note that this initialiser is not parallelised for simplicity.
+
+*************
+Configuration
+*************
+
+See ``main.sh`` (domain lengths and resolutions) and ``main.py`` (initial fields).
+
+*****
+Usage
+*****
+
+.. code-block:: console
+
+   make output
+   bash main.sh
+
+giving several ``NPY`` files under the specified directory (``output`` by default).
+
+These files will be loaded by the main simulator.
+
diff --git a/initial_condition/main.py b/initial_condition/main.py
new file mode 100644
index 0000000..c660cae
--- /dev/null
+++ b/initial_condition/main.py
@@ -0,0 +1,83 @@
+import os
+import sys
+import numpy as np
+
+
+def init_time(dest):
+    # iterator and time
+    step = np.array(0, dtype=np.uint64)
+    time = np.array(0, dtype=np.float64)
+    np.save(f"{dest}/step.npy", step)
+    np.save(f"{dest}/time.npy", time)
+    return
+
+
+def init_domain(lengths, glsizes, dest):
+    # x cell faces: uniformly spaced, both ends of the domain included
+    xf = np.linspace(0., lengths[0], glsizes[0] + 1, endpoint=True)
+    # x cell centres: face mid-points, padded by the two end positions
+    xc = np.concatenate(([0.], 0.5 * xf[:-1] + 0.5 * xf[1:], [lengths[0]]))
+    np.save(f"{dest}/xf.npy", np.array(xf, dtype=np.float64))
+    np.save(f"{dest}/xc.npy", np.array(xc, dtype=np.float64))
+    np.save(f"{dest}/glsizes.npy", np.array(glsizes, dtype=np.uint64))
+    np.save(f"{dest}/lengths.npy", np.array(lengths, dtype=np.float64))
+    return xc
+
+
+def init_interface(lengths, glsizes, xc, dest):
+    # circular bubble of diameter d centred at (cx, cy), vof -> 1 inside
+    d = 1.0
+    r = 0.5 * d
+    cx = d
+    cy = d
+    ly = lengths[1]
+    nx = glsizes[0]
+    ny = glsizes[1]
+    dy = ly / ny
+    yc = np.linspace(0.5 * dy, ly - 0.5 * dy, ny)
+    xc, yc = np.meshgrid(xc, yc)
+    # radius minus distance from the centre: positive inside the bubble
+    eta = r - np.sqrt(np.power(xc - cx, 2.) + np.power(yc - cy, 2.))
+    vof = 0.5 * (1. + np.tanh(0.5 * nx * eta))
+    np.save(f"{dest}/vof.npy", vof)
+    return
+
+
+def init_fluid(lengths, glsizes, dest):
+    # quiescent initial state: all fields are zero
+    # NOTE: "lengths" is not needed here, kept for interface compatibility
+    # shape0: (ny, nx + 1), as many x points as xf (used by ux)
+    # shape1: (ny, nx + 2), as many x points as xc (used by uy, p, psi)
+    shape0 = (glsizes[1], glsizes[0] + 1)
+    shape1 = (glsizes[1], glsizes[0] + 2)
+    ux  = np.zeros(shape0, dtype=np.float64)
+    uy  = np.zeros(shape1, dtype=np.float64)
+    p   = np.zeros(shape1, dtype=np.float64)
+    psi = np.zeros(shape1, dtype=np.float64)
+    np.save(f"{dest}/ux.npy", ux)
+    np.save(f"{dest}/uy.npy", uy)
+    np.save(f"{dest}/p.npy", p)
+    np.save(f"{dest}/psi.npy", psi)
+    return
+
+
+def main():
+    lengths = list()
+    lengths.append(float(os.environ["lx"]))
+    lengths.append(float(os.environ["ly"]))
+    glsizes = list()
+    glsizes.append(int(os.environ["glisize"]))
+    glsizes.append(int(os.environ["gljsize"]))
+    dest = sys.argv[1]
+    # sanitise
+    ndims = len(lengths)
+    assert 2 == ndims
+    # init and save
+    init_time(dest)
+    xc = init_domain(lengths, glsizes, dest)
+    init_interface(lengths, glsizes, xc, dest)
+    init_fluid(lengths, glsizes, dest)
+    return
+
+
+main()
diff --git a/initial_condition/main.sh b/initial_condition/main.sh
new file mode 100644
index 0000000..4c32b3e
--- /dev/null
+++ b/initial_condition/main.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+## domain
+# domain lengths
+export lx=4.0e+0
+export ly=2.0e+0
+# number of cell centers
+export glisize=128
+export gljsize=64
+
+## where to write resulting NPY files (quoted below to survive spaces)
+dirname="output"
+
+python3 main.py "${dirname}"
diff --git a/src/array.c b/src/array.c
new file mode 100644
index 0000000..2ae63f6
--- /dev/null
+++ b/src/array.c
@@ -0,0 +1,296 @@
+#include <string.h>
+#include <limits.h>
+#include <mpi.h>
+#include "sdecomp.h"
+#include "memory.h"
+#include "domain.h"
+#include "array.h"
+#include "fileio.h"
+
+static int prepare(
+    const domain_t * domain,
+    const int nadds[NDIMS][2],
+    size_t size,
+    array_t * array
+){
+  // get total number of cells
+  const size_t * mysizes = domain->mysizes;
+  size_t nitems = 1;
+  for(size_t dim = 0; dim < NDIMS; dim++){
+    nitems *= mysizes[dim] + nadds[dim][0] + nadds[dim][1];
+  }
+  // for now local array size should be smaller than INT_MAX,
+  //   since (4-byte) integer counters are used to sweep arrays
+  //   to allow negative array indices
+  int retval = 0;
+  if(nitems > INT_MAX){
+    printf("local array size (%zu) exceeds INT_MAX (%d)\n", nitems, INT_MAX);
+    retval = 1;
+  }
+  MPI_Comm comm_cart = MPI_COMM_NULL;
+  sdecomp.get_comm_cart(domain->info, &comm_cart);
+  MPI_Allreduce(MPI_IN_PLACE, &retval, 1, MPI_INT, MPI_MAX, comm_cart);
+  if(0 != retval){
+    return retval;
+  }
+  // assign members
+  array->size = size;
+  array->nadds = memory_calloc(NDIMS, 2 * sizeof(int));
+  for(size_t dim = 0; dim < NDIMS; dim++){
+    array->nadds[dim][0] = nadds[dim][0];
+    array->nadds[dim][1] = nadds[dim][1];
+  }
+  array->datasize = nitems * size;
+  array->data = memory_calloc(nitems, size);
+  return 0;
+}
+
+static int destroy(
+    array_t * array
+){
+  memory_free(array->nadds);
+  memory_free(array->data);
+  return 0;
+}
+
+// array->data, including additional (boundary & halo) cells
+//   [1 - nadds[0][0] : mysizes[0] + nadds[0][1]]
+//   x
+//   [1 - nadds[1][0] : mysizes[1] + nadds[1][1]]
+//   x
+//   [1 - nadds[2][0] : mysizes[2] + nadds[2][1]]
+// buf, holding only data to be written / loaded
+//   [1 - nadds[0][0] : mysizes[0] + nadds[0][1]]
+//   x
+//   [1               : mysizes[1]              ]
+//   x
+//   [1               : mysizes[2]              ]
+
+static int get_index(
+    const int mysizes[NDIMS],
+    const int nadds[NDIMS][2],
+    const int indices[NDIMS]
+){
+  // flattened position inside array->data: x varies fastest and
+  //   "indices[0]" is used as it is, whereas the other directions
+  //   are shifted by the number of their lower additional cells
+  const int isize = mysizes[0] + nadds[0][0] + nadds[0][1];
+  const int joffs = (indices[1] + nadds[1][0]) * isize;
+#if NDIMS == 2
+  const int koffs = 0;
+#else
+  const int jsize = mysizes[1] + nadds[1][0] + nadds[1][1];
+  const int koffs = (indices[2] + nadds[2][0]) * isize * jsize;
+#endif
+  return indices[0] + joffs + koffs;
+}
+
+static int load(
+    const domain_t * domain,
+    const char dirname[],
+    const char dsetname[],
+    const char dtype[],
+    array_t * array
+){
+  const size_t * glsizes = domain->glsizes;
+  const size_t * mysizes = domain->mysizes;
+  const size_t * offsets = domain->offsets;
+  const int nadds[NDIMS][2] = {
+    {array->nadds[0][0], array->nadds[0][1]},
+    {array->nadds[1][0], array->nadds[1][1]},
+#if NDIMS == 3
+    {array->nadds[2][0], array->nadds[2][1]},
+#endif
+  };
+  const size_t size = array->size;
+  char * data = array->data;
+  // prepare a buffer
+#if NDIMS == 2
+  char * buf = memory_calloc(
+      (mysizes[0] + nadds[0][0] + nadds[0][1]) * mysizes[1],
+      size
+  );
+#else
+  char * buf = memory_calloc(
+      (mysizes[0] + nadds[0][0] + nadds[0][1]) * mysizes[1] * mysizes[2],
+      size
+  );
+#endif
+  // read
+  MPI_Comm comm_cart = MPI_COMM_NULL;
+  sdecomp.get_comm_cart(domain->info, &comm_cart);
+  const int retval = fileio.r_nd_parallel(
+      comm_cart,
+      dirname,
+      dsetname,
+      NDIMS,
+      (int [NDIMS]){
+#if NDIMS == 3
+        glsizes[2],
+#endif
+        glsizes[1],
+        glsizes[0] + nadds[0][0] + nadds[0][1],
+      },
+      (int [NDIMS]){
+#if NDIMS == 3
+        mysizes[2],
+#endif
+        mysizes[1],
+        mysizes[0] + nadds[0][0] + nadds[0][1],
+      },
+      (int [NDIMS]){
+#if NDIMS == 3
+        offsets[2],
+#endif
+        offsets[1],
+        offsets[0],
+      },
+      dtype,
+      size,
+      buf
+  );
+  if(0 != retval){
+    memory_free(buf);
+    return 1;
+  }
+  // copy
+#if NDIMS == 2
+  const int imax = mysizes[0] + nadds[0][0] + nadds[0][1];
+  const int jmax = mysizes[1];
+  for(int cnt = 0, j = 0; j < jmax; j++){
+    for(int i = 0; i < imax; i++, cnt++){
+      const int index = get_index(
+          (int [NDIMS]){mysizes[0], mysizes[1]},
+          nadds,
+          (int [NDIMS]){i, j}
+      );
+      memcpy(data + size * index, buf + size * cnt, size);
+    }
+  }
+#else
+  const int imax = mysizes[0] + nadds[0][0] + nadds[0][1];
+  const int jmax = mysizes[1];
+  const int kmax = mysizes[2];
+  for(int cnt = 0, k = 0; k < kmax; k++){
+    for(int j = 0; j < jmax; j++){
+      for(int i = 0; i < imax; i++, cnt++){
+        const int index = get_index(
+            (int [NDIMS]){mysizes[0], mysizes[1], mysizes[2]},
+            nadds,
+            (int [NDIMS]){i, j, k}
+        );
+        memcpy(data + size * index, buf + size * cnt, size);
+      }
+    }
+  }
+#endif
+  memory_free(buf);
+  return 0;
+}
+
+static int dump(
+    const domain_t * domain,
+    const char dirname[],
+    const char dsetname[],
+    const char dtype[],
+    const array_t * array
+){
+  const size_t * glsizes = domain->glsizes;
+  const size_t * mysizes = domain->mysizes;
+  const size_t * offsets = domain->offsets;
+  const int nadds[NDIMS][2] = {
+    {array->nadds[0][0], array->nadds[0][1]},
+    {array->nadds[1][0], array->nadds[1][1]},
+#if NDIMS == 3
+    {array->nadds[2][0], array->nadds[2][1]},
+#endif
+  };
+  const size_t size = array->size;
+  const char * data = array->data;
+  // prepare a buffer holding only the cells to be written (see above)
+#if NDIMS == 2
+  char * buf = memory_calloc(
+      (mysizes[0] + nadds[0][0] + nadds[0][1]) * mysizes[1],
+      size
+  );
+#else
+  char * buf = memory_calloc(
+      (mysizes[0] + nadds[0][0] + nadds[0][1]) * mysizes[1] * mysizes[2],
+      size
+  );
+#endif
+  // pack: array (with additional cells) -> contiguous buffer
+#if NDIMS == 2
+  const int imax = mysizes[0] + nadds[0][0] + nadds[0][1];
+  const int jmax = mysizes[1];
+  for(int cnt = 0, j = 0; j < jmax; j++){
+    for(int i = 0; i < imax; i++, cnt++){
+      const int index = get_index(
+          (int [NDIMS]){mysizes[0], mysizes[1]},
+          nadds,
+          (int [NDIMS]){i, j}
+      );
+      memcpy(buf + size * cnt, data + size * index, size);
+    }
+  }
+#else
+  const int imax = mysizes[0] + nadds[0][0] + nadds[0][1];
+  const int jmax = mysizes[1];
+  const int kmax = mysizes[2];
+  for(int cnt = 0, k = 0; k < kmax; k++){
+    for(int j = 0; j < jmax; j++){
+      for(int i = 0; i < imax; i++, cnt++){
+        const int index = get_index(
+            (int [NDIMS]){mysizes[0], mysizes[1], mysizes[2]},
+            nadds,
+            (int [NDIMS]){i, j, k}
+        );
+        memcpy(buf + size * cnt, data + size * index, size);
+      }
+    }
+  }
+#endif
+  // write, keeping the result to report a failure as "load" does
+  MPI_Comm comm_cart = MPI_COMM_NULL;
+  sdecomp.get_comm_cart(domain->info, &comm_cart);
+  const int retval = fileio.w_nd_parallel(
+      comm_cart,
+      dirname,
+      dsetname,
+      NDIMS,
+      (int [NDIMS]){
+#if NDIMS == 3
+        glsizes[2],
+#endif
+        glsizes[1],
+        glsizes[0] + nadds[0][0] + nadds[0][1],
+      },
+      (int [NDIMS]){
+#if NDIMS == 3
+        mysizes[2],
+#endif
+        mysizes[1],
+        mysizes[0] + nadds[0][0] + nadds[0][1],
+      },
+      (int [NDIMS]){
+#if NDIMS == 3
+        offsets[2],
+#endif
+        offsets[1],
+        offsets[0],
+      },
+      dtype,
+      size,
+      buf
+  );
+  memory_free(buf);
+  return 0 != retval ? 1 : 0;
+}
+
+const array_method_t array = {
+  .prepare = prepare,
+  .destroy = destroy,
+  .load    = load,
+  .dump    = dump,
+};
+
diff --git a/src/config.c b/src/config.c
new file mode 100644
index 0000000..d86b993
--- /dev/null
+++ b/src/config.c
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <mpi.h>
+#include "config.h"
+
+/**
+ * @brief load environment variable and interpret it as a double-precision value
+ * @param[in]  dsetname : name of the environment variable
+ * @param[out] value    : resulting value
+ * @return              : error code
+ */
+static int get_double(
+    const char dsetname[],
+    double * value
+){
+  int retval = 0; // error code, summed over all processes below
+  // try to load, may fail if the variable is not defined
+  char * string = getenv(dsetname);
+  if(NULL == string){
+    retval = 1;
+  }
+  MPI_Allreduce(MPI_IN_PLACE, &retval, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+  if(0 != retval){
+    printf("%s not found\n", dsetname);
+    return 1;
+  }
+  // convert; errno alone misses strings from which nothing can be parsed
+  errno = 0;
+  char * endptr = NULL;
+  *value = strtod(string, &endptr);
+  if(0 != errno || endptr == string){
+    retval = 1;
+  }
+  MPI_Allreduce(MPI_IN_PLACE, &retval, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+  if(0 != retval){
+    printf("%s: invalid value as double\n", dsetname);
+    return 1;
+  }
+  return 0;
+}
+
+const config_t config = {
+  .get_double = get_double,
+};
+
diff --git a/src/decide_dt.c b/src/decide_dt.c
new file mode 100644
index 0000000..385cfc7
--- /dev/null
+++ b/src/decide_dt.c
@@ -0,0 +1,230 @@
+#include <stdio.h>
+#include <stdbool.h>
+#include <math.h>
+#include <float.h>
+#include <mpi.h>
+#include "config.h"
+#include "array.h"
+#include "sdecomp.h"
+#include "domain.h"
+#include "fluid.h"
+#include "interface.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/uy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/uz.h"
+#endif
+#include "array_macros/domain/hxxf.h"
+
+static const double pi = 3.1415926535897932384626433832;
+
+// overridden later using environment variables
+static bool initialised = false;
+static double coef_dt_adv = 0.;
+static double coef_dt_dif = 0.;
+static double coef_dt_int = 0.;
+
+// max possible dt
+static const double dt_max = 1.;
+
+/**
+ * @brief decide time step size restricted by the advective terms
+ * @param[in]  domain : information about domain decomposition and size
+ * @param[in]  fluid  : velocity
+ * @param[out] dt     : time step size (single value):
+ *                      minimum over all processes of "grid size / (|velocity| + small)",
+ *                      capped at dt_max and multiplied by the safety factor coef_dt_adv
+ * @return            : error code (always 0 in the current implementation)
+ */
+static int decide_dt_adv(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    double * restrict dt
+){
+  // communicator used below to unify the result among all processes
+  // NOTE(review): the return value of get_comm_cart is not checked
+  MPI_Comm comm_cart = MPI_COMM_NULL;
+  sdecomp.get_comm_cart(domain->info, &comm_cart);
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf;
+  const double hy = domain->hy;
+#if NDIMS == 3
+  const double hz = domain->hz;
+#endif
+  const double * restrict ux = fluid->ux.data;
+  const double * restrict uy = fluid->uy.data;
+#if NDIMS == 3
+  const double * restrict uz = fluid->uz.data;
+#endif
+  // sufficiently small number to avoid zero division
+  const double small = 1.e-8;
+  // start from the largest admissible value, lowered by the loops below
+  *dt = dt_max;
+  // compute grid-size over velocity in x
+  // NOTE(review): the x loops start from i = 2, i.e. UX(1, ...) is not considered,
+  //   presumably because it sits on the wall - confirm
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 2; i <= isize; i++){
+      const double hx = HXXF(i  );
+      const double vel = fabs(UX(i, j)) + small;
+      *dt = fmin(*dt, hx / vel);
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 2; i <= isize; i++){
+        const double hx = HXXF(i  );
+        const double vel = fabs(UX(i, j, k)) + small;
+        *dt = fmin(*dt, hx / vel);
+      }
+    }
+  }
+#endif
+  // compute grid-size over velocity in y
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      const double vel = fabs(UY(i, j)) + small;
+      *dt = fmin(*dt, hy / vel);
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        const double vel = fabs(UY(i, j, k)) + small;
+        *dt = fmin(*dt, hy / vel);
+      }
+    }
+  }
+#endif
+  // compute grid-size over velocity in z
+#if NDIMS == 3
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        const double vel = fabs(UZ(i, j, k)) + small;
+        *dt = fmin(*dt, hz / vel);
+      }
+    }
+  }
+#endif
+  // unify result, multiply safety factor
+  MPI_Allreduce(MPI_IN_PLACE, dt, 1, MPI_DOUBLE, MPI_MIN, comm_cart);
+  *dt *= coef_dt_adv;
+  return 0;
+}
+
+/**
+ * @brief decide time step sizes restricted by the diffusive terms
+ * @param[in]  domain      : grid sizes (hxxf, hy, and hz when NDIMS is 3)
+ * @param[in]  denr        : numerator   of the ratio limiting the result (fluid->denr at the call site)
+ * @param[in]  visr        : denominator of the ratio limiting the result (fluid->visr at the call site)
+ * @param[in]  diffusivity : fluid / temperature diffusivity
+ * @param[out] dt          : time step sizes, one value for each of the NDIMS directions
+ * @return                 : error code (always 0 in the current implementation)
+ */
+static int decide_dt_dif(
+    const domain_t * domain,
+    const double denr,
+    const double visr,
+    const double diffusivity,
+    double * restrict dt
+){
+  const int isize = domain->mysizes[0];
+  const double * restrict hxxf = domain->hxxf;
+  // find minimum grid size in x direction
+  double hx_min = DBL_MAX;
+  for(int i = 2; i <= isize; i++){
+    hx_min = fmin(hx_min, HXXF(i  ));
+  }
+  // representative grid size of each direction
+  const double grid_sizes[NDIMS] = {
+    hx_min,
+    domain->hy,
+#if NDIMS == 3
+    domain->hz,
+#endif
+  };
+  // direction-independent part of the diffusive constraint
+  const double prefactor = fmin(1., denr / visr) / diffusivity * 0.5 / NDIMS;
+  for(int dim = 0; dim < NDIMS; dim++){
+    // diffusive constraint, followed by the safety factor
+    dt[dim] = prefactor * pow(grid_sizes[dim], 2.);
+    dt[dim] *= coef_dt_dif;
+  }
+  return 0;
+}
+
+/**
+ * @brief decide time step size restricted by the interfacial (surface tension) term
+ * @param[in]  domain  : grid sizes (hxxf, hy, and hz when NDIMS is 3)
+ * @param[in]  tension : surface tension coefficient (1 / We at the call site)
+ * @param[out] dt      : time step size (single value), capped at dt_max
+ *                       before the safety factor coef_dt_int is applied
+ * @return             : error code (always 0 in the current implementation)
+ */
+static int decide_dt_int(
+    const domain_t * domain,
+    const double tension,
+    double * restrict dt
+){
+  const int isize = domain->mysizes[0];
+  const double * restrict hxxf = domain->hxxf;
+  // find minimum grid size in x direction
+  double hx_min = DBL_MAX;
+  for(int i = 2; i <= isize; i++){
+    hx_min = fmin(hx_min, HXXF(i  ));
+  }
+  // representative grid size of each direction
+  const double grid_sizes[NDIMS] = {
+    hx_min,
+    domain->hy,
+#if NDIMS == 3
+    domain->hz,
+#endif
+  };
+  // compute interfacial constraints, take the most restrictive direction
+  const double prefactor = 1. / tension / 4. / pi;
+  double dt_min = dt_max;
+  for(int dim = 0; dim < NDIMS; dim++){
+    dt_min = fmin(dt_min, sqrt(prefactor * pow(grid_sizes[dim], 3.)));
+  }
+  // multiply safety factor
+  *dt = dt_min * coef_dt_int;
+  return 0;
+}
+
+/**
+ * @brief decide time step size which can integrate the equations stably
+ * @param[in]  domain    : information about domain decomposition and size
+ * @param[in]  fluid     : velocity and diffusivities
+ * @param[in]  interface : interfacial parameters (We is used here)
+ * @param[out] dt        : time step size,
+ *                         the smallest of the advective, diffusive, and interfacial constraints
+ * @return               : (success) 0
+ *                       : (failure) non-zero value
+ * @note on the first call the safety factors are loaded from the environment variables
+ *       coef_dt_adv, coef_dt_dif, and coef_dt_int
+ */
+int decide_dt(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    const interface_t * interface,
+    double * dt
+){
+  // load safety factors
+  if(!initialised){
+    if(0 != config.get_double("coef_dt_adv", &coef_dt_adv)) return 1;
+    if(0 != config.get_double("coef_dt_dif", &coef_dt_dif)) return 1;
+    if(0 != config.get_double("coef_dt_int", &coef_dt_int)) return 1;
+    initialised = true;
+  }
+  // compute advective, diffusive, and interfacial constraints,
+  //   do not ignore error codes of the helpers
+  double dt_adv[    1] = {0.};
+  double dt_dif[NDIMS] = {0.};
+  double dt_int[    1] = {0.};
+  if(0 != decide_dt_adv(domain, fluid, dt_adv)) return 1;
+  if(0 != decide_dt_dif(domain, fluid->denr, fluid->visr, 1. / fluid->Re, dt_dif)) return 1;
+  if(0 != decide_dt_int(domain, 1. / interface->We, dt_int)) return 1;
+  // choose smallest value as dt
+  // advection
+  *dt = dt_adv[0];
+  // diffusion, momentum (one constraint for each direction)
+  for(int dim = 0; dim < NDIMS; dim++){
+    *dt = fmin(*dt, dt_dif[dim]);
+  }
+  // surface tension
+  *dt = fmin(*dt, dt_int[0]);
+  return 0;
+}
+
diff --git a/src/domain.c b/src/domain.c
new file mode 100644
index 0000000..4bc592d
--- /dev/null
+++ b/src/domain.c
@@ -0,0 +1,450 @@
+#include <stdio.h>
+#include <stdbool.h>
+#include <math.h>
+#include <float.h>
+#include <mpi.h>
+#if NDIMS == 3
+#include <fftw3.h>
+#include "timer.h"
+#endif
+#include "sdecomp.h"
+#include "memory.h"
+#include "domain.h"
+#include "fileio.h"
+#include "array_macros/domain/xf.h"
+#include "array_macros/domain/xc.h"
+#include "array_macros/domain/jdxf.h"
+#include "array_macros/domain/jdxc.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/domain/hxxc.h"
+
+/**
+ * @brief load members in domain_t
+ * @param[in]  dirname : name of directory from which data is loaded
+ * @param[out] domain  : global domain sizes and resolutions
+ * @return             : 0 on success, 1 when one of the files cannot be loaded
+ * @note on success, domain->xf and domain->xc point to buffers allocated here;
+ *       on failure, buffers allocated here are released
+ *       and the corresponding pointers are reset to NULL
+ */
+static int domain_load(
+    const char dirname[],
+    domain_t * domain
+){
+  size_t * glsizes = domain->glsizes;
+  double * restrict lengths = domain->lengths;
+  double * restrict * restrict xf = &domain->xf;
+  double * restrict * restrict xc = &domain->xc;
+  if(0 != fileio.r_serial(dirname, "glsizes", 1, (size_t [1]){NDIMS}, fileio.npy_size_t, sizeof(size_t), glsizes)){
+    return 1;
+  }
+  if(0 != fileio.r_serial(dirname, "lengths", 1, (size_t [1]){NDIMS}, fileio.npy_double, sizeof(double), lengths)){
+    return 1;
+  }
+  // x cell-face positions: glsizes[0] + 1 items
+  *xf = memory_calloc(glsizes[0] + 1, sizeof(double));
+  if(0 != fileio.r_serial(dirname, "xf", 1, (size_t [1]){glsizes[0] + 1}, fileio.npy_double, sizeof(double), *xf)){
+    // do not leak the buffer which has just been allocated
+    memory_free(*xf);
+    *xf = NULL;
+    return 1;
+  }
+  // x cell-center positions: glsizes[0] + 2 items
+  *xc = memory_calloc(glsizes[0] + 2, sizeof(double));
+  if(0 != fileio.r_serial(dirname, "xc", 1, (size_t [1]){glsizes[0] + 2}, fileio.npy_double, sizeof(double), *xc)){
+    // release both buffers, since the domain is unusable as a whole
+    memory_free(*xc);
+    *xc = NULL;
+    memory_free(*xf);
+    *xf = NULL;
+    return 1;
+  }
+  return 0;
+}
+
+/**
+ * @brief save members in domain_t
+ * @param[in] dirname : name of directory to which data is saved
+ * @param[in] domain  : global domain sizes and resolutions
+ * @return            : error code (always 0 in the current implementation)
+ * @note only the root process writes files, the others return immediately
+ */
+int domain_save(
+    const char dirname[],
+    const domain_t * domain
+){
+  const int root = 0;
+  int myrank = root;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  // since this is a serial operation,
+  //   only the main process is involved
+  if(root == myrank){
+    // number of cells in x, which decides the lengths of xf and xc
+    const size_t nx = domain->glsizes[0];
+    fileio.w_serial(dirname, "glsizes", 1, (size_t [1]){NDIMS}, fileio.npy_size_t, sizeof(size_t), domain->glsizes);
+    fileio.w_serial(dirname, "lengths", 1, (size_t [1]){NDIMS}, fileio.npy_double, sizeof(double), domain->lengths);
+    fileio.w_serial(dirname, "xf", 1, (size_t [1]){nx + 1}, fileio.npy_double, sizeof(double), domain->xf);
+    fileio.w_serial(dirname, "xc", 1, (size_t [1]){nx + 2}, fileio.npy_double, sizeof(double), domain->xc);
+  }
+  return 0;
+}
+
+/**
+ * @brief check if the x grid is uniform
+ * @param[in]  domain            : glsizes[0] and hxxc are referred
+ * @param[out] x_grid_is_uniform : true when the difference between
+ *                                 the largest and the smallest hxxc
+ *                                 is below 1.e-15, false otherwise
+ * @return                       : error code (always 0 in the current implementation)
+ */
+static int check_x_grid_is_uniform(
+    const domain_t * domain,
+    bool * x_grid_is_uniform
+){
+  const size_t isize = domain->glsizes[0];
+  const double * hxxc = domain->hxxc;
+  // smallest and largest cell sizes found so far
+  double hmin = +1. * DBL_MAX;
+  double hmax = -1. * DBL_MAX;
+  for(size_t i = 0; i < isize; i++){
+    hmin = fmin(hmin, hxxc[i]);
+    hmax = fmax(hmax, hxxc[i]);
+  }
+  *x_grid_is_uniform = fabs(hmax - hmin) < 1.e-15;
+  return 0;
+}
+
+#if NDIMS == 3
+/**
+ * @brief count the number of decimal digits, just for pretty print
+ * @param[in] num : integer to be examined
+ * @return        : number of digits, e.g.
+ *                    num =    0 -> 1
+ *                    num =    3 -> 1
+ *                    num =   13 -> 2
+ *                    num = 1234 -> 4
+ *                  0 is returned for negative numbers
+ */
+static int get_ndigits(
+    int num
+){
+  if(num < 0){
+    return 0;
+  }
+  // strip the last digit until nothing remains,
+  //   at least one digit is always counted
+  int ndigits = 0;
+  do{
+    ndigits += 1;
+    num /= 10;
+  }while(0 != num);
+  return ndigits;
+}
+
+// optimise MPI domain decomposition,
+//   i.e. minimise all-to-all time
+// for every divisor ny of the number of processes,
+//   the decomposition [1, ny, nprocs / ny] is constructed tentatively,
+//   the six transposes listed below are executed "niter" times,
+//   and the decomposition with the smallest averaged wall time is adopted
+// glsizes      (in ): global number of grid points in each direction
+// info_optimum (out): sdecomp object constructed with the adopted decomposition
+// return value      : 0 on success, 1 when an sdecomp routine fails
+// NOTE(review): the early returns below do not release
+//   objects which have been created in the current iteration
+// NOTE(review): all processes adopt the same decomposition
+//   only if timer() gives the same value on all processes - confirm
+static int optimise_sdecomp_init(
+    const size_t * glsizes,
+    sdecomp_info_t ** info_optimum
+){
+  // periodicity in each dimension,
+  //   which are fixed in this project
+  //   (x: wall-bounded, otherwise periodic)
+  const bool periods[NDIMS] = {false, true, true};
+  // global array size, real
+  const size_t r_gl_sizes[NDIMS] = {glsizes[0], glsizes[1], glsizes[2]};
+  // global array size, complex (after-FFTed value in y)
+  const size_t c_gl_sizes[NDIMS] = {glsizes[0], glsizes[1] / 2 + 1, glsizes[2]};
+  // number of processes in each dimension,
+  //   which is to be optimised
+  // NOTE(review): stays [0, 0, 0] when no candidate is valid,
+  //   which is passed to sdecomp.construct as it is - confirm this is intended
+  size_t dims_optimum[NDIMS] = {0, 0, 0};
+  double wtime_optimum = DBL_MAX;
+  const int root = 0;
+  int nprocs = 0;
+  int myrank = root;
+  MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  // product of the number of processes in y and z
+  const size_t nynz = (size_t)nprocs;
+  // factorise and decide dims
+  for(size_t ny = 1; ny <= nynz; ny++){
+    if(0 != nynz % ny){
+      // ny is not a divisor of the number of processes
+      continue;
+    }
+    const size_t nz = nynz / ny;
+    const size_t dims[NDIMS] = {1, ny, nz};
+    // sanitise, for all dimensions, dims should not
+    //   exceed the number of grid points
+    // NOTE: refer to the complex array,
+    //   which is smaller in general
+    // NOTE: every entry of dims is compared with every global size
+    bool valid = true;
+    for(size_t dim1 = 0; dim1 < NDIMS; dim1++){
+      const size_t glsize = c_gl_sizes[dim1];
+      for(size_t dim0 = 0; dim0 < NDIMS; dim0++){
+        const size_t np = dims[dim0];
+        if(np > glsize){
+          // number of processes is
+          //   greater than number of grids
+          valid = false;
+        }
+      }
+    }
+    if(!valid){
+      continue;
+    }
+    // execute transposes which are used to solve Poisson equation
+    //   and check how long they take in total
+    // for the time being only transposes when solving Poisson equation are considered,
+    //   i.e. implicity and implicitz also request transposes, which are neglected
+    sdecomp_info_t * info = NULL;
+    if(0 != sdecomp.construct(
+          MPI_COMM_WORLD,
+          NDIMS,
+          (size_t [NDIMS]){dims[0], dims[1], dims[2]},
+          periods,
+          &info
+    )) return 1;
+    // initialise pencils and rotations
+    double       * r_x1pcnl = NULL;
+    double       * r_y1pcnl = NULL;
+    fftw_complex * c_y1pcnl = NULL;
+    fftw_complex * c_z1pcnl = NULL;
+    fftw_complex * c_x2pcnl = NULL;
+    sdecomp_transpose_plan_t * r_x1_to_y1 = NULL;
+    sdecomp_transpose_plan_t * r_y1_to_x1 = NULL;
+    sdecomp_transpose_plan_t * c_y1_to_z1 = NULL;
+    sdecomp_transpose_plan_t * c_z1_to_y1 = NULL;
+    sdecomp_transpose_plan_t * c_z1_to_x2 = NULL;
+    sdecomp_transpose_plan_t * c_x2_to_z1 = NULL;
+    if(0 != sdecomp.transpose.construct(info, SDECOMP_X1PENCIL, SDECOMP_Y1PENCIL, r_gl_sizes, sizeof(      double), &r_x1_to_y1)) return 1;
+    if(0 != sdecomp.transpose.construct(info, SDECOMP_Y1PENCIL, SDECOMP_X1PENCIL, r_gl_sizes, sizeof(      double), &r_y1_to_x1)) return 1;
+    if(0 != sdecomp.transpose.construct(info, SDECOMP_Y1PENCIL, SDECOMP_Z1PENCIL, c_gl_sizes, sizeof(fftw_complex), &c_y1_to_z1)) return 1;
+    if(0 != sdecomp.transpose.construct(info, SDECOMP_Z1PENCIL, SDECOMP_Y1PENCIL, c_gl_sizes, sizeof(fftw_complex), &c_z1_to_y1)) return 1;
+    if(0 != sdecomp.transpose.construct(info, SDECOMP_Z1PENCIL, SDECOMP_X2PENCIL, c_gl_sizes, sizeof(fftw_complex), &c_z1_to_x2)) return 1;
+    if(0 != sdecomp.transpose.construct(info, SDECOMP_X2PENCIL, SDECOMP_Z1PENCIL, c_gl_sizes, sizeof(fftw_complex), &c_x2_to_z1)) return 1;
+    // local sizes of the pencils, used to allocate the buffers to be transposed
+    size_t r_x1sizes[NDIMS] = {0};
+    size_t r_y1sizes[NDIMS] = {0};
+    size_t c_y1sizes[NDIMS] = {0};
+    size_t c_z1sizes[NDIMS] = {0};
+    size_t c_x2sizes[NDIMS] = {0};
+    for(sdecomp_dir_t dim = 0; dim < NDIMS; dim++){
+      if(0 != sdecomp.get_pencil_mysize(info, SDECOMP_X1PENCIL, dim, r_gl_sizes[dim], r_x1sizes + dim)) return 1;
+      if(0 != sdecomp.get_pencil_mysize(info, SDECOMP_Y1PENCIL, dim, r_gl_sizes[dim], r_y1sizes + dim)) return 1;
+      if(0 != sdecomp.get_pencil_mysize(info, SDECOMP_Y1PENCIL, dim, c_gl_sizes[dim], c_y1sizes + dim)) return 1;
+      if(0 != sdecomp.get_pencil_mysize(info, SDECOMP_Z1PENCIL, dim, c_gl_sizes[dim], c_z1sizes + dim)) return 1;
+      if(0 != sdecomp.get_pencil_mysize(info, SDECOMP_X2PENCIL, dim, c_gl_sizes[dim], c_x2sizes + dim)) return 1;
+    }
+    r_x1pcnl = memory_calloc(r_x1sizes[0] * r_x1sizes[1] * r_x1sizes[2], sizeof(      double));
+    r_y1pcnl = memory_calloc(r_y1sizes[0] * r_y1sizes[1] * r_y1sizes[2], sizeof(      double));
+    c_y1pcnl = memory_calloc(c_y1sizes[0] * c_y1sizes[1] * c_y1sizes[2], sizeof(fftw_complex));
+    c_z1pcnl = memory_calloc(c_z1sizes[0] * c_z1sizes[1] * c_z1sizes[2], sizeof(fftw_complex));
+    c_x2pcnl = memory_calloc(c_x2sizes[0] * c_x2sizes[1] * c_x2sizes[2], sizeof(fftw_complex));
+    // execute transpose, repeat for "niter" times
+    const size_t niter = 4;
+    const double tic = timer();
+    for(size_t iter = 0; iter < niter; iter++){
+      sdecomp.transpose.execute(r_x1_to_y1, r_x1pcnl, r_y1pcnl);
+      sdecomp.transpose.execute(c_y1_to_z1, c_y1pcnl, c_z1pcnl);
+      sdecomp.transpose.execute(c_z1_to_x2, c_z1pcnl, c_x2pcnl);
+      sdecomp.transpose.execute(c_x2_to_z1, c_x2pcnl, c_z1pcnl);
+      sdecomp.transpose.execute(c_z1_to_y1, c_z1pcnl, c_y1pcnl);
+      sdecomp.transpose.execute(r_y1_to_x1, r_y1pcnl, r_x1pcnl);
+    }
+    const double toc = timer();
+    // clean-up tentative transpose plans and buffers
+    sdecomp.transpose.destruct(r_x1_to_y1);
+    sdecomp.transpose.destruct(c_y1_to_z1);
+    sdecomp.transpose.destruct(c_z1_to_x2);
+    sdecomp.transpose.destruct(c_x2_to_z1);
+    sdecomp.transpose.destruct(c_z1_to_y1);
+    sdecomp.transpose.destruct(r_y1_to_x1);
+    memory_free(r_x1pcnl);
+    memory_free(r_y1pcnl);
+    memory_free(c_y1pcnl);
+    memory_free(c_z1pcnl);
+    memory_free(c_x2pcnl);
+    // clean-up current sdecomp config
+    sdecomp.destruct(info);
+    // check time, averaged over the iterations
+    const double wtime = (toc - tic) / niter;
+    if(wtime < wtime_optimum){
+      // this is the best option for now,
+      //   update candidate
+      for(size_t dim = 0; dim < NDIMS; dim++){
+        dims_optimum[dim] = dims[dim];
+      }
+      wtime_optimum = wtime;
+    }
+    // report the candidate and its timing
+    if(root == myrank){
+      const int nd = get_ndigits(nprocs);
+      printf("dims: [%*zu, %*zu, %*zu]: % .7e [sec]\n", nd, dims[0], nd, dims[1], nd, dims[2], wtime);
+    }
+  }
+  // create sdecomp which will be used in the main run
+  if(0 != sdecomp.construct(
+        MPI_COMM_WORLD,
+        NDIMS,
+        (size_t [NDIMS]){dims_optimum[0], dims_optimum[1], dims_optimum[2]},
+        periods,
+        info_optimum
+  )) return 1;
+  if(root == myrank){
+    const int nd = get_ndigits(nprocs);
+    printf("Conclusive domain decomposition: [%*zu, %*zu, %*zu]\n", nd, dims_optimum[0], nd, dims_optimum[1], nd, dims_optimum[2]);
+  }
+  return 0;
+}
+#endif
+
+// allocate isize + 1 items by memory_calloc and assign
+//   HXXF(i) = XC(i) - XC(i-1) for i = 1, 2, ..., isize + 1,
+//   the buffer is returned to the caller and is not released here
+// NOTE(review): XC(0) and XC(isize + 1) are referred,
+//   i.e. xc is assumed to hold isize + 2 items - confirm with the XC macro
+// scale factor in x, defined at x cell faces | 10
+static double * allocate_and_init_hxxf(
+    const int isize,
+    const double * xc
+){
+  double * hxxf = memory_calloc(isize + 1, sizeof(double));
+  for(int i = 1; i <= isize + 1; i++){
+    HXXF(i  ) = XC(i  ) - XC(i-1);
+  }
+  return hxxf;
+}
+
+// allocate isize items by memory_calloc and assign
+//   HXXC(i) = XF(i+1) - XF(i) for i = 1, 2, ..., isize,
+//   the buffer is returned to the caller and is not released here
+// NOTE(review): XF(isize + 1) is referred,
+//   i.e. xf is assumed to hold isize + 1 items - confirm with the XF macro
+// scale factor in x, defined at x cell centers | 10
+static double * allocate_and_init_hxxc(
+    const int isize,
+    const double * xf
+){
+  double * hxxc = memory_calloc(isize, sizeof(double));
+  for(int i = 1; i <= isize; i++){
+    HXXC(i  ) = XF(i+1) - XF(i  );
+  }
+  return hxxc;
+}
+
+// allocate isize + 1 items by memory_calloc and assign
+//   the product of all scale factors for i = 1, 2, ..., isize + 1:
+//   HXXF(i) * hy when NDIMS is 2, HXXF(i) * hy * hz otherwise,
+//   the buffer is returned to the caller and is not released here
+#if NDIMS == 2
+// Jacobian determinant, defined at x cell faces | 11
+static double * allocate_and_init_jdxf(
+    const int isize,
+    const double * hxxf,
+    const double hy
+){
+  double * jdxf = memory_calloc(isize + 1, sizeof(double));
+  for(int i = 1; i <= isize + 1; i++){
+    JDXF(i  ) = HXXF(i  ) * hy;
+  }
+  return jdxf;
+}
+#else
+// Jacobian determinant, defined at x cell faces | 12
+static double * allocate_and_init_jdxf(
+    const int isize,
+    const double * hxxf,
+    const double hy,
+    const double hz
+){
+  double * jdxf = memory_calloc(isize + 1, sizeof(double));
+  for(int i = 1; i <= isize + 1; i++){
+    JDXF(i  ) = HXXF(i  ) * hy * hz;
+  }
+  return jdxf;
+}
+#endif
+
+// allocate isize items by memory_calloc and assign
+//   the product of all scale factors for i = 1, 2, ..., isize:
+//   HXXC(i) * hy when NDIMS is 2, HXXC(i) * hy * hz otherwise,
+//   the buffer is returned to the caller and is not released here
+#if NDIMS == 2
+// Jacobian determinant, defined at x cell centers | 11
+static double * allocate_and_init_jdxc(
+    const int isize,
+    const double * hxxc,
+    const double hy
+){
+  double * jdxc = memory_calloc(isize, sizeof(double));
+  for(int i = 1; i <= isize; i++){
+    JDXC(i  ) = HXXC(i  ) * hy;
+  }
+  return jdxc;
+}
+#else
+// Jacobian determinant, defined at x cell centers | 12
+static double * allocate_and_init_jdxc(
+    const int isize,
+    const double * hxxc,
+    const double hy,
+    const double hz
+){
+  double * jdxc = memory_calloc(isize, sizeof(double));
+  for(int i = 1; i <= isize; i++){
+    JDXC(i  ) = HXXC(i  ) * hy * hz;
+  }
+  return jdxc;
+}
+#endif
+
+/**
+ * @brief print global sizes and lengths of the domain to stdout
+ * @param[in] domain : glsizes and lengths are displayed
+ * @note only the root process prints, the others do nothing
+ */
+static void report(
+    const domain_t * domain
+){
+  const int root = 0;
+  int myrank = root;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  if(root != myrank){
+    return;
+  }
+  fputs("DOMAIN\n", stdout);
+  // number of grid points, for each direction
+  for(sdecomp_dir_t dim = 0; dim < NDIMS; dim++){
+    const size_t glsize = domain->glsizes[dim];
+    printf("\tglsizes[%u]: %zu\n", dim, glsize);
+  }
+  // domain lengths, for each direction
+  for(sdecomp_dir_t dim = 0; dim < NDIMS; dim++){
+    const double length = domain->lengths[dim];
+    printf("\tlengths[%u]: % .7e\n", dim, length);
+  }
+  fflush(stdout);
+}
+
+/**
+ * @brief constructor of the structure
+ * @param[in]  dirname_ic : name of directory in which initial conditions are stored
+ * @param[out] domain     : structure being allocated and initialised
+ * @return                : (success) 0
+ *                          (failure) non-zero value
+ * @note fails when loading files fails, when sdecomp reports an error,
+ *       or when the x grid is not uniform
+ */
+int domain_init(
+    const char dirname_ic[],
+    domain_t * domain
+){
+  sdecomp_info_t ** info    = &domain->info;
+  size_t * restrict glsizes =  domain->glsizes;
+  size_t * restrict mysizes =  domain->mysizes;
+  size_t * restrict offsets =  domain->offsets;
+  double * restrict lengths =  domain->lengths;
+  double * restrict * xf    = &domain->xf;
+  double * restrict * xc    = &domain->xc;
+  double * restrict * jdxf  = &domain->jdxf;
+  double * restrict * jdxc  = &domain->jdxc;
+  double * restrict * hxxf   = &domain->hxxf;
+  double * restrict * hxxc   = &domain->hxxc;
+  double * restrict   hy    = &domain->hy;
+#if NDIMS == 3
+  double * restrict   hz    = &domain->hz;
+#endif
+  // load spatial information
+  if(0 != domain_load(dirname_ic, domain)){
+    return 1;
+  }
+  // scale factors
+  *hxxf = allocate_and_init_hxxf(glsizes[0], *xc);
+  *hxxc = allocate_and_init_hxxc(glsizes[0], *xf);
+  *hy = lengths[1] / glsizes[1];
+#if NDIMS == 3
+  *hz = lengths[2] / glsizes[2];
+#endif
+  // Jacobian determinants at cell faces and centers
+#if NDIMS == 2
+  *jdxf = allocate_and_init_jdxf(glsizes[0], *hxxf, *hy);
+  *jdxc = allocate_and_init_jdxc(glsizes[0], *hxxc, *hy);
+#else
+  *jdxf = allocate_and_init_jdxf(glsizes[0], *hxxf, *hy, *hz);
+  *jdxc = allocate_and_init_jdxc(glsizes[0], *hxxc, *hy, *hz);
+#endif
+  // initialise sdecomp to distribute the domain
+#if NDIMS == 2
+  if(0 != sdecomp.construct(
+        MPI_COMM_WORLD,
+        NDIMS,
+        (size_t [NDIMS]){0, 0},
+        (bool [NDIMS]){false, true},
+        info
+  )) return 1;
+#else
+  if(0 != optimise_sdecomp_init(
+        glsizes,
+        info
+  )) return 1;
+#endif
+  // local array sizes and offsets,
+  //   error codes are checked as is done for the other sdecomp calls in this file
+  for(size_t dim = 0; dim < NDIMS; dim++){
+    if(0 != sdecomp.get_pencil_mysize(*info, SDECOMP_X1PENCIL, dim, glsizes[dim], mysizes + dim)) return 1;
+    if(0 != sdecomp.get_pencil_offset(*info, SDECOMP_X1PENCIL, dim, glsizes[dim], offsets + dim)) return 1;
+  }
+  // in this project, only uniform grids are allowed
+  bool is_uniform = false;
+  check_x_grid_is_uniform(domain, &is_uniform);
+  if (!is_uniform) {
+    // terminate the message by a newline, so that it is not merged with the following output
+    fprintf(stderr, "x grid is not uniform, which is not allowed\n");
+    return 1;
+  }
+  report(domain);
+  return 0;
+}
+
diff --git a/src/fileio.c b/src/fileio.c
new file mode 100644
index 0000000..3b18a9b
--- /dev/null
+++ b/src/fileio.c
@@ -0,0 +1,484 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <mpi.h>
+#include "snpyio.h"
+#include "memory.h"
+#include "fileio.h"
+
+// allocate and initialise string having a NPY file name: "<dirname>/<dsetname>.npy"
+// NULL is returned (with a message to stderr) when one of the arguments is NULL,
+//   the returned string is allocated by memory_calloc and is not released here
+static char * create_npyfname(
+    const char dirname[],
+    const char dsetname[]
+){
+  if(NULL == dirname){
+    fprintf(stderr, "dirname is NULL\n");
+    return NULL;
+  }
+  if(NULL == dsetname){
+    fprintf(stderr, "dsetname is NULL\n");
+    return NULL;
+  }
+  const char slash[] = {"/"};
+  const char suffix[] = {".npy"};
+  // number of characters, without NUL
+  size_t nchars = 0;
+  nchars += strlen( dirname);
+  nchars += strlen(   slash);
+  nchars += strlen(dsetname);
+  nchars += strlen(  suffix);
+  // one for NUL, and another one to spare
+  char * fname = memory_calloc(nchars + 2, sizeof(char));
+  snprintf(fname, nchars + 1, "%s%s%s%s", dirname, slash, dsetname, suffix);
+  fname[nchars + 1] = '\0';
+  return fname;
+}
+
+static int mpi_file_open(
+    const MPI_Comm comm,
+    char * fname,
+    int amode,
+    MPI_File * fh
+){
+  const int mpi_error_code = MPI_File_open(comm, fname, amode, MPI_INFO_NULL, fh);
+  if(MPI_SUCCESS != mpi_error_code){
+    char string[MPI_MAX_ERROR_STRING] = {'\0'};
+    int resultlen = 0;
+    MPI_Error_string(mpi_error_code, string, &resultlen);
+    fprintf(stderr, "%s: %s\n", fname, string);
+    memory_free(fname);
+    return 1;
+  }
+  return 0;
+}
+
+// number of local elements, i.e. product of the first "ndims" items of "mysizes",
+//   1 is returned when ndims is 0
+static int get_count(
+    const size_t ndims,
+    const int * mysizes
+){
+  int product = 1;
+  size_t dim = 0;
+  while(dim < ndims){
+    product = product * mysizes[dim];
+    dim += 1;
+  }
+  return product;
+}
+
+// create a sub-array data type (C order) describing the local portion
+//   of the global array, commit it, and set it as the file view,
+//   where the first "header_size" bytes of the file are skipped
+// "filetype" should be released later (see destroy_view)
+// always return 0, error codes of the MPI routines are not checked
+static int prepare_view(
+    const size_t ndims,
+    const int * glsizes,
+    const int * mysizes,
+    const int * offsets,
+    MPI_File fh,
+    const size_t header_size,
+    const MPI_Datatype basetype,
+    MPI_Datatype * filetype
+){
+  // convert to the types which MPI expects
+  const int ndims_ = (int)ndims;
+  const MPI_Offset disp = (MPI_Offset)header_size;
+  // data type
+  MPI_Type_create_subarray(ndims_, glsizes, mysizes, offsets, MPI_ORDER_C, basetype, filetype);
+  MPI_Type_commit(filetype);
+  // file view
+  MPI_File_set_view(fh, disp, basetype, *filetype, "native", MPI_INFO_NULL);
+  return 0;
+}
+
+// counterpart of prepare_view: release the data type created there
+// always return 0, the error code of MPI_Type_free is not checked
+static int destroy_view(
+    MPI_Datatype * filetype
+){
+  // clean-up datatype
+  MPI_Type_free(filetype);
+  return 0;
+}
+
+// check if the sizes of size_t and double on this machine agree with
+//   the NPY data types which are assumed in this project
+//   ('<u8': 8-byte unsigned integer, '<f8': 8-byte floating-point number)
+// return 0 when they do, 1 otherwise (message is displayed by the main process)
+static int init(
+    void
+){
+  const int root = 0;
+  int myrank = root;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  const size_t sizeof_size_t = sizeof(size_t);
+  const size_t sizeof_double = sizeof(double);
+  FILE * stream = stderr;
+  if(8 != sizeof_size_t){
+    if(root == myrank){
+      fprintf(stream, "NPY data type '<u8' and sizeof(size_t): %zu mismatch\n", sizeof_size_t);
+      fflush(stream);
+    }
+    return 1;
+  }
+  if(8 != sizeof_double){
+    if(root == myrank){
+      // NOTE: 8-byte double corresponds to '<f8', not to '<i4'
+      fprintf(stream, "NPY data type '<f8' and sizeof(double): %zu mismatch\n", sizeof_double);
+      fflush(stream);
+    }
+    return 1;
+  }
+  return 0;
+}
+
+static FILE * fopen_(
+    const char * path,
+    const char * mode
+){
+  errno = 0;
+  FILE * stream = fopen(path, mode);
+  if(NULL == stream){
+    perror(path);
+  }
+  return stream;
+}
+
+// wrapper function of fclose which accepts NULL:
+//   return 1 when the stream is NULL (nothing is done),
+//   otherwise close it and return 0 (result of fclose is not checked)
+static int fclose_(
+    FILE * stream
+){
+  if(NULL != stream){
+    fclose(stream);
+    return 0;
+  }
+  return 1;
+}
+
+// create directory
+// return 0 when created, otherwise report the reason via perror and return 1
+static int mkdir_(
+    const char dirname[]
+){
+  // NOTE: call this function ONLY from the main process
+  // NOTE: callers may want to continue even if failed,
+  //   since we want to override previous data (errorcode: EEXIST)
+  // RWX masks for user, group, and others (0o777, ref. "man 2 chmod")
+  const mode_t mode = S_IRWXU | S_IRWXG | S_IRWXO;
+  const int status = mkdir(dirname, mode);
+  if(0 == status){
+    return 0;
+  }
+  perror(dirname);
+  return 1;
+}
+
+// wrapper function of snpyio_r_header with error handling
+// the header of the NPY file "fname" is loaded and compared with
+//   the expected values (ndims, shape, dtype, is_fortran_order)
+// return the header size (in bytes) when everything agrees,
+//   otherwise report the reason to stderr and return 0
+static size_t r_npy_header(
+    const char fname[],
+    const size_t ndims,
+    const size_t * shape,
+    const char * dtype,
+    const bool is_fortran_order
+){
+  const char msg[] = {"NPY header read failed"};
+  FILE * fp = fopen_(fname, "r");
+  if(NULL == fp){
+    return 0;
+  }
+  // load header, return header size when succeeded, return 0 otherwise
+  size_t ndims_ = 0;
+  size_t * shape_ = NULL;
+  char * dtype_ = NULL;
+  bool is_fortran_order_ = false;
+  size_t header_size = snpyio_r_header(&ndims_, &shape_, &dtype_, &is_fortran_order_, fp);
+  fclose_(fp);
+  // the header itself cannot be loaded:
+  //   shape_ and dtype_ should not be referred in this case
+  if(0 == header_size){
+    fprintf(stderr, "%s(%s)\n", msg, fname);
+    goto err_hndl;
+  }
+  // check arguments, return loaded header size when all OK, return 0 otherwise
+  // ndims
+  if(ndims != ndims_){
+    fprintf(stderr, "%s(%s), ndims: %zu expected, %zu obtained\n", msg, fname, ndims, ndims_);
+    header_size = 0;
+    goto err_hndl;
+  }
+  // shape (for each dimension)
+  for(size_t n = 0; n < ndims; n++){
+    if(shape[n] != shape_[n]){
+      fprintf(stderr, "%s(%s), shape[%zu]: %zu expected, %zu obtained\n", msg, fname, n, shape[n], shape_[n]);
+      header_size = 0;
+      goto err_hndl;
+    }
+  }
+  // dtype
+  if(0 != strcmp(dtype, dtype_)){
+    fprintf(stderr, "%s(%s), dtype: %s expected, %s obtained\n", msg, fname, dtype, dtype_);
+    header_size = 0;
+    goto err_hndl;
+  }
+  // is_fortran_order
+  if(is_fortran_order != is_fortran_order_){
+    fprintf(stderr, "%s(%s), is_fortran_order: %s expected, %s obtained\n", msg, fname, is_fortran_order ? "true" : "false", is_fortran_order_ ? "true" : "false");
+    header_size = 0;
+    goto err_hndl;
+  }
+err_hndl:
+  // buffers holding the loaded header are no longer needed
+  memory_free(shape_);
+  memory_free(dtype_);
+  return header_size;
+}
+
+// wrapper function of snpyio_w_header with error handling
+// the file "fname" is opened with mode "w" and the NPY header is written
+// return the header size (in bytes) which is given by snpyio_w_header,
+//   or 0 when the file cannot be opened or the header cannot be written
+static size_t w_npy_header(
+    const char fname[],
+    const size_t ndims,
+    const size_t * shape,
+    const char dtype[],
+    const bool is_fortran_order
+){
+  const char msg[] = {"NPY header write failed"};
+  size_t header_size = 0;
+  FILE * const stream = fopen_(fname, "w");
+  if(NULL != stream){
+    header_size = snpyio_w_header(ndims, shape, dtype, is_fortran_order, stream);
+    if(0 == header_size){
+      fprintf(stderr, "%s(%s)\n", msg, fname);
+    }
+    fclose_(stream);
+  }
+  return header_size;
+}
+
+/**
+ * @brief read data from a npy file, by one process
+ * @param[in]  dirname  : name of directory in which a target npy file is contained
+ * @param[in]  dsetname : name of dataset
+ * @param[in]  ndims    : number of dimensions of dataset
+ * @param[in]  shape    : shape of dataset
+ * @param[in]  dtype    : datatype, e.g. '<f8'
+ * @param[in]  size     : size of each element
+ * @param[out] data     : pointer to the data to be loaded
+ * @return              : 0 on success, 1 otherwise (reason is reported to stderr)
+ */
+static int r_serial(
+    const char dirname[],
+    const char dsetname[],
+    const size_t ndims,
+    const size_t * shape,
+    const char dtype[],
+    const size_t size,
+    void * data
+){
+  char * fname = create_npyfname(dirname, dsetname);
+  if(NULL == fname){
+    // reason has been reported by create_npyfname,
+    //   do not pass NULL to the following functions
+    return 1;
+  }
+  // check header, and find where the data starts
+  const size_t header_size = r_npy_header(fname, ndims, shape, dtype, false);
+  if(0 == header_size){
+    fprintf(stderr, "%s: NPY header load failed\n", fname);
+    memory_free(fname);
+    return 1;
+  }
+  FILE * fp = fopen_(fname, "r");
+  if(NULL == fp){
+    memory_free(fname);
+    return 1;
+  }
+  // skip header
+  if(0 != fseek(fp, (long)header_size, SEEK_SET)){
+    fprintf(stderr, "%s: fseek failed\n", fname);
+    fclose_(fp);
+    memory_free(fname);
+    return 1;
+  }
+  // total number of items is the product of the shape
+  size_t nitems = 1;
+  for(size_t dim = 0; dim < ndims; dim++){
+    nitems *= shape[dim];
+  }
+  const size_t nitems_ = fread(data, size, nitems, fp);
+  if(nitems_ != nitems){
+    fprintf(stderr, "%s: fread failed (%zu vs %zu)\n", fname, nitems_, nitems);
+    fclose_(fp);
+    memory_free(fname);
+    return 1;
+  }
+  fclose_(fp);
+  memory_free(fname);
+  return 0;
+}
+
+/**
+ * @brief write data to a npy file, by one process
+ * @param[in] dirname  : name of directory in which a target npy file is contained
+ * @param[in] dsetname : name of dataset
+ * @param[in] ndims    : number of dimensions of dataset
+ * @param[in] shape    : shape of dataset
+ * @param[in] dtype    : datatype, e.g. '<f8'
+ * @param[in] size     : size of each element
+ * @param[in] data     : pointer to the data to be written
+ * @return             : 0 on success, 1 otherwise (reason is reported to stderr)
+ */
+static int w_serial(
+    const char dirname[],
+    const char dsetname[],
+    const size_t ndims,
+    const size_t * shape,
+    const char dtype[],
+    const size_t size,
+    const void * data
+){
+  char * fname = create_npyfname(dirname, dsetname);
+  if(NULL == fname){
+    // reason has been reported by create_npyfname,
+    //   do not pass NULL to the following functions
+    return 1;
+  }
+  // create (or truncate) the file and write header
+  const size_t header_size = w_npy_header(fname, ndims, shape, dtype, false);
+  if(0 == header_size){
+    memory_free(fname);
+    return 1;
+  }
+  // re-open to append the data after the header
+  FILE * fp = fopen_(fname, "a");
+  if(NULL == fp){
+    memory_free(fname);
+    return 1;
+  }
+  // NOTE: with the append mode, data is written to the end of the file
+  //   irrespective to the position, which is right after the header here
+  if(0 != fseek(fp, (long)header_size, SEEK_SET)){
+    fprintf(stderr, "%s: fseek failed\n", fname);
+    fclose_(fp);
+    memory_free(fname);
+    return 1;
+  }
+  // total number of items is the product of the shape
+  size_t nitems = 1;
+  for(size_t dim = 0; dim < ndims; dim++){
+    nitems *= shape[dim];
+  }
+  const size_t nitems_ = fwrite(data, size, nitems, fp);
+  if(nitems_ != nitems){
+    fprintf(stderr, "%s: fwrite failed\n", fname);
+    fclose_(fp);
+    memory_free(fname);
+    return 1;
+  }
+  fclose_(fp);
+  memory_free(fname);
+  return 0;
+}
+
+/**
+ * @brief read N-dimensional data from a npy file, by all processes
+ * @param[in]  comm     : communicator to which all processes calling this function belong
+ * @param[in]  dirname  : name of directory in which a target npy file is contained
+ * @param[in]  dsetname : name of dataset
+ * @param[in]  ndims    : number of dimensions of the array
+ * @param[in]  glsizes  : global sizes   of the dataset
+ * @param[in]  mysizes  : local  sizes   of the dataset
+ * @param[in]  offsets  : local  offsets of the dataset
+ * @param[in]  dtype    : NPY data type
+ * @param[in]  size     : size of each element
+ * @param[out] data     : pointer to the data to be loaded
+ * @return              : 0 on success, 1 when the file name cannot be created,
+ *                        the header is invalid, or the file cannot be opened
+ */
+static int r_nd_parallel(
+    const MPI_Comm comm,
+    const char dirname[],
+    const char dsetname[],
+    const size_t ndims,
+    const int * glsizes,
+    const int * mysizes,
+    const int * offsets,
+    const char dtype[],
+    const size_t size,
+    void * data
+){
+  const int root = 0;
+  int myrank = root;
+  MPI_Comm_rank(comm, &myrank);
+  char * fname = create_npyfname(dirname, dsetname);
+  if(NULL == fname){
+    // reason has been reported by create_npyfname,
+    //   do not pass NULL to the following functions
+    return 1;
+  }
+  // check header by main process
+  size_t header_size = 0;
+  if(root == myrank){
+    // set values which are expected to be in NPY file
+    size_t * shape = memory_calloc(ndims, sizeof(size_t));
+    for(size_t dim = 0; dim < ndims; dim++){
+      shape[dim] = (size_t)glsizes[dim];
+    }
+    header_size = r_npy_header(fname, ndims, shape, dtype, false);
+    memory_free(shape);
+  }
+  // share result
+  MPI_Bcast(&header_size, sizeof(size_t), MPI_BYTE, root, comm);
+  if(0 == header_size){
+    memory_free(fname);
+    return 1;
+  }
+  // open file
+  // NOTE: MPI_File is an opaque handle, whose null value is MPI_FILE_NULL
+  // NOTE: fname is released by mpi_file_open when failed
+  MPI_File fh = MPI_FILE_NULL;
+  if(0 != mpi_file_open(comm, fname, MPI_MODE_RDONLY, &fh)) return 1;
+  // prepare file view,
+  //   one element is treated as "size" contiguous bytes
+  MPI_Datatype basetype = MPI_DATATYPE_NULL;
+  MPI_Datatype filetype = MPI_DATATYPE_NULL;
+  MPI_Type_contiguous((int)size, MPI_BYTE, &basetype);
+  MPI_Type_commit(&basetype);
+  prepare_view(ndims, glsizes, mysizes, offsets, fh, header_size, basetype, &filetype);
+  // get number of elements which are locally read
+  const int count = get_count(ndims, mysizes);
+  // read
+  MPI_File_read_all(fh, data, count, basetype, MPI_STATUS_IGNORE);
+  // clean-up file view
+  MPI_Type_free(&basetype);
+  destroy_view(&filetype);
+  // close file
+  MPI_File_close(&fh);
+  memory_free(fname);
+  return 0;
+}
+
+/**
+ * @brief write N-dimensional data to a npy file, by all processes
+ * @param[in] comm     : communicator to which all processes calling this function belong
+ * @param[in] dirname  : name of directory in which a target npy file is contained
+ * @param[in] dsetname : name of dataset
+ * @param[in] ndims    : number of dimensions of the array
+ * @param[in] glsizes  : global sizes   of the dataset
+ * @param[in] mysizes  : local  sizes   of the dataset
+ * @param[in] offsets  : local  offsets of the dataset
+ * @param[in] dtype    : NPY data type
+ * @param[in] size     : size of each element
+ * @param[in] data     : pointer to the data to be written
+ * @return             : 0 on success, 1 on failure (header, open, or write)
+ */
+static int w_nd_parallel(
+    const MPI_Comm comm,
+    const char dirname[],
+    const char dsetname[],
+    const size_t ndims,
+    const int * glsizes,
+    const int * mysizes,
+    const int * offsets,
+    const char dtype[],
+    const size_t size,
+    const void * data
+){
+  const int root = 0;
+  int myrank = root;
+  MPI_Comm_rank(comm, &myrank);
+  char * fname = create_npyfname(dirname, dsetname);
+  // header is handled by main process
+  size_t header_size = 0;
+  if(root == myrank){
+    // set values which are to be described in NPY header
+    size_t * shape = memory_calloc(ndims, sizeof(size_t));
+    for(size_t dim = 0; dim < ndims; dim++){
+      shape[dim] = (size_t)glsizes[dim];
+    }
+    header_size = w_npy_header(fname, ndims, shape, dtype, false);
+    memory_free(shape);
+  }
+  // share result, so that all processes take the same branch below
+  MPI_Bcast(&header_size, sizeof(size_t), MPI_BYTE, root, comm);
+  // open file unless the header step failed; fname is released on both failures
+  MPI_File fh = NULL;
+  if(0 == header_size || 0 != mpi_file_open(comm, fname, MPI_MODE_CREATE | MPI_MODE_RDWR, &fh)){
+    memory_free(fname);
+    return 1;
+  }
+  // prepare file view, one element being "size" contiguous bytes
+  MPI_Datatype basetype = MPI_BYTE;
+  MPI_Datatype filetype = MPI_DATATYPE_NULL;
+  MPI_Type_contiguous(size, basetype, &basetype);
+  MPI_Type_commit(&basetype);
+  prepare_view((int)ndims, glsizes, mysizes, offsets, fh, header_size, basetype, &filetype);
+  // get number of elements which are locally written
+  const int count = get_count(ndims, mysizes);
+  // write, keeping the result since the resources are to be released in any case
+  const int retval = MPI_File_write_all(fh, data, count, basetype, MPI_STATUS_IGNORE);
+  // clean-up file view
+  MPI_Type_free(&basetype);
+  destroy_view(&filetype);
+  // close file
+  MPI_File_close(&fh);
+  memory_free(fname);
+  return MPI_SUCCESS == retval ? 0 : 1;
+}
+
+const fileio_t fileio = {
+  .init          = init,          // set-up of this module
+  .mkdir         = mkdir_,        // directory / stream helpers
+  .fopen         = fopen_,
+  .fclose        = fclose_,
+  .r_serial      = r_serial,      // npy reader / writer, by one process
+  .w_serial      = w_serial,
+  .r_nd_parallel = r_nd_parallel, // npy reader / writer, by all processes
+  .w_nd_parallel = w_nd_parallel,
+  .npy_size_t    = "'<u8'",       // dtype strings (quotes included)
+  .npy_double    = "'<f8'",       //   to be given as "dtype" arguments
+};
+
diff --git a/src/fluid/boundary/p.c b/src/fluid/boundary/p.c
new file mode 100644
index 0000000..0ccbc81
--- /dev/null
+++ b/src/fluid/boundary/p.c
@@ -0,0 +1,63 @@
+#include <assert.h>
+#include <mpi.h>
+#include "array.h"
+#include "domain.h"
+#include "halo.h"
+#include "fluid_solver.h"
+#include "array_macros/fluid/p.h"
+
+/**
+ * @brief impose the boundary condition in x on the pressure, then communicate halos
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in,out] array  : pressure, whose boundary cells in x are overwritten
+ * @return               : 0 on success, 1 when a halo communication returns non-zero
+ */
+int fluid_update_boundaries_p(
+    const domain_t * domain,
+    array_t * array
+){
+  // the numbers of additional cells should agree with P_NADDS
+  for(size_t dim = 0; dim < NDIMS; dim++){
+    assert(P_NADDS[dim][0] == array->nadds[dim][0]);
+    assert(P_NADDS[dim][1] == array->nadds[dim][1]);
+  }
+  // NOTE: the names isize, jsize and p are kept as they are,
+  //   since the P macro presumably refers to them implicitly (to be confirmed)
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  double * p = array->data;
+  // x direction, Neumann: cells at i = 0 and i = isize + 1
+  //   take the values of their neighbours at i = 1 and i = isize
+#if NDIMS == 2
+  for(int jj = 1; jj <= jsize; jj++){
+    P(      0, jj) = P(    1, jj);
+    P(isize+1, jj) = P(isize, jj);
+  }
+#else
+  for(int kk = 1; kk <= ksize; kk++){
+    for(int jj = 1; jj <= jsize; jj++){
+      P(      0, jj, kk) = P(    1, jj, kk);
+      P(isize+1, jj, kk) = P(isize, jj, kk);
+    }
+  }
+#endif
+  // other directions: halo communications, one datatype handle for each
+  // NOTE(review): "static" suggests that the handles are reused across calls
+  static MPI_Datatype dtypes[NDIMS - 1] = {
+    MPI_DOUBLE,
+#if NDIMS == 3
+    MPI_DOUBLE,
+#endif
+  };
+  // y direction
+  if(0 != halo_communicate_in_y(domain, &dtypes[0], array)) return 1;
+#if NDIMS == 3
+  // z direction
+  if(0 != halo_communicate_in_z(domain, &dtypes[1], array)) return 1;
+#endif
+  return 0;
+}
+
diff --git a/src/fluid/boundary/psi.c b/src/fluid/boundary/psi.c
new file mode 100644
index 0000000..bd2a0c2
--- /dev/null
+++ b/src/fluid/boundary/psi.c
@@ -0,0 +1,63 @@
+#include <assert.h>
+#include <mpi.h>
+#include "array.h"
+#include "domain.h"
+#include "halo.h"
+#include "fluid_solver.h"
+#include "array_macros/fluid/psi.h"
+
+/**
+ * @brief impose the boundary condition in x on the scalar potential, then communicate halos
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in,out] array  : scalar potential, whose boundary cells in x are overwritten
+ * @return               : 0 on success, 1 when a halo communication returns non-zero
+ */
+int fluid_update_boundaries_psi(
+    const domain_t * domain,
+    array_t * array
+){
+  // the numbers of additional cells should agree with PSI_NADDS
+  for(size_t dim = 0; dim < NDIMS; dim++){
+    assert(PSI_NADDS[dim][0] == array->nadds[dim][0]);
+    assert(PSI_NADDS[dim][1] == array->nadds[dim][1]);
+  }
+  // NOTE: the names isize, jsize and psi are kept as they are,
+  //   since the PSI macro presumably refers to them implicitly (to be confirmed)
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  double * psi = array->data;
+  // x direction, Neumann: cells at i = 0 and i = isize + 1
+  //   take the values of their neighbours at i = 1 and i = isize
+#if NDIMS == 2
+  for(int jj = 1; jj <= jsize; jj++){
+    PSI(      0, jj) = PSI(    1, jj);
+    PSI(isize+1, jj) = PSI(isize, jj);
+  }
+#else
+  for(int kk = 1; kk <= ksize; kk++){
+    for(int jj = 1; jj <= jsize; jj++){
+      PSI(      0, jj, kk) = PSI(    1, jj, kk);
+      PSI(isize+1, jj, kk) = PSI(isize, jj, kk);
+    }
+  }
+#endif
+  // other directions: halo communications, one datatype handle for each
+  // NOTE(review): "static" suggests that the handles are reused across calls
+  static MPI_Datatype dtypes[NDIMS - 1] = {
+    MPI_DOUBLE,
+#if NDIMS == 3
+    MPI_DOUBLE,
+#endif
+  };
+  // y direction
+  if(0 != halo_communicate_in_y(domain, &dtypes[0], array)) return 1;
+#if NDIMS == 3
+  // z direction
+  if(0 != halo_communicate_in_z(domain, &dtypes[1], array)) return 1;
+#endif
+  return 0;
+}
+
diff --git a/src/fluid/boundary/ux.c b/src/fluid/boundary/ux.c
new file mode 100644
index 0000000..adae4b4
--- /dev/null
+++ b/src/fluid/boundary/ux.c
@@ -0,0 +1,63 @@
+#include <assert.h>
+#include <mpi.h>
+#include "array.h"
+#include "domain.h"
+#include "halo.h"
+#include "fluid_solver.h"
+#include "array_macros/fluid/ux.h"
+
+/**
+ * @brief impose the boundary condition in x on x velocity, then communicate halos
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in,out] array  : x velocity, whose boundary values in x are overwritten
+ * @return               : 0 on success, 1 when a halo communication returns non-zero
+ */
+int fluid_update_boundaries_ux(
+    const domain_t * domain,
+    array_t * array
+){
+  // the numbers of additional cells should agree with UX_NADDS
+  for(size_t dim = 0; dim < NDIMS; dim++){
+    assert(UX_NADDS[dim][0] == array->nadds[dim][0]);
+    assert(UX_NADDS[dim][1] == array->nadds[dim][1]);
+  }
+  // NOTE: the names isize, jsize and ux are kept as they are,
+  //   since the UX macro presumably refers to them implicitly (to be confirmed)
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  double * ux = array->data;
+  // x direction, impermeable:
+  //   the values at i = 1 and i = isize + 1 are fixed to zero
+#if NDIMS == 2
+  for(int jj = 1; jj <= jsize; jj++){
+    UX(      1, jj) = 0.;
+    UX(isize+1, jj) = 0.;
+  }
+#else
+  for(int kk = 1; kk <= ksize; kk++){
+    for(int jj = 1; jj <= jsize; jj++){
+      UX(      1, jj, kk) = 0.;
+      UX(isize+1, jj, kk) = 0.;
+    }
+  }
+#endif
+  // other directions: halo communications, one datatype handle for each
+  // NOTE(review): "static" suggests that the handles are reused across calls
+  static MPI_Datatype dtypes[NDIMS - 1] = {
+    MPI_DOUBLE,
+#if NDIMS == 3
+    MPI_DOUBLE,
+#endif
+  };
+  // y direction
+  if(0 != halo_communicate_in_y(domain, &dtypes[0], array)) return 1;
+#if NDIMS == 3
+  // z direction
+  if(0 != halo_communicate_in_z(domain, &dtypes[1], array)) return 1;
+#endif
+  return 0;
+}
+
diff --git a/src/fluid/boundary/uy.c b/src/fluid/boundary/uy.c
new file mode 100644
index 0000000..2803020
--- /dev/null
+++ b/src/fluid/boundary/uy.c
@@ -0,0 +1,64 @@
+#include <assert.h>
+#include <mpi.h>
+#include "param.h"
+#include "array.h"
+#include "domain.h"
+#include "halo.h"
+#include "fluid_solver.h"
+#include "array_macros/fluid/uy.h"
+
+/**
+ * @brief impose the boundary condition in x on y velocity, then communicate halos
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in,out] array  : y velocity, whose boundary cells in x are overwritten
+ * @return               : 0 on success, 1 when a halo communication returns non-zero
+ */
+int fluid_update_boundaries_uy(
+    const domain_t * domain,
+    array_t * array
+){
+  // the numbers of additional cells should agree with UY_NADDS
+  for(size_t dim = 0; dim < NDIMS; dim++){
+    assert(UY_NADDS[dim][0] == array->nadds[dim][0]);
+    assert(UY_NADDS[dim][1] == array->nadds[dim][1]);
+  }
+  // NOTE: the names isize, jsize and uy are kept as they are,
+  //   since the UY macro presumably refers to them implicitly (to be confirmed)
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  double * uy = array->data;
+  // x direction, prescribed values: param_uy_xm is assigned at i = 0,
+  //   while param_uy_xp is assigned at i = isize + 1
+#if NDIMS == 2
+  for(int jj = 1; jj <= jsize; jj++){
+    UY(      0, jj) = param_uy_xm;
+    UY(isize+1, jj) = param_uy_xp;
+  }
+#else
+  for(int kk = 1; kk <= ksize; kk++){
+    for(int jj = 1; jj <= jsize; jj++){
+      UY(      0, jj, kk) = param_uy_xm;
+      UY(isize+1, jj, kk) = param_uy_xp;
+    }
+  }
+#endif
+  // other directions: halo communications, one datatype handle for each
+  // NOTE(review): "static" suggests that the handles are reused across calls
+  static MPI_Datatype dtypes[NDIMS - 1] = {
+    MPI_DOUBLE,
+#if NDIMS == 3
+    MPI_DOUBLE,
+#endif
+  };
+  // y direction
+  if(0 != halo_communicate_in_y(domain, &dtypes[0], array)) return 1;
+#if NDIMS == 3
+  // z direction
+  if(0 != halo_communicate_in_z(domain, &dtypes[1], array)) return 1;
+#endif
+  return 0;
+}
+
diff --git a/src/fluid/boundary/uz.c b/src/fluid/boundary/uz.c
new file mode 100644
index 0000000..f60fd42
--- /dev/null
+++ b/src/fluid/boundary/uz.c
@@ -0,0 +1,52 @@
+#if NDIMS == 3
+#include <assert.h>
+#include <mpi.h>
+#include "param.h"
+#include "array.h"
+#include "domain.h"
+#include "halo.h"
+#include "fluid_solver.h"
+#include "array_macros/fluid/uz.h"
+
+/**
+ * @brief impose the boundary condition in x on z velocity, then communicate halos
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in,out] array  : z velocity, whose boundary cells in x are overwritten
+ * @return               : 0 on success, 1 when a halo communication returns non-zero
+ */
+int fluid_update_boundaries_uz(
+    const domain_t * domain,
+    array_t * array
+){
+  // the numbers of additional cells should agree with UZ_NADDS
+  for(size_t dim = 0; dim < NDIMS; dim++){
+    assert(UZ_NADDS[dim][0] == array->nadds[dim][0]);
+    assert(UZ_NADDS[dim][1] == array->nadds[dim][1]);
+  }
+  // NOTE: the names isize, jsize and uz are kept as they are,
+  //   since the UZ macro presumably refers to them implicitly (to be confirmed)
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  double * uz = array->data;
+  // x direction, prescribed values: param_uz_xm is assigned at i = 0,
+  //   while param_uz_xp is assigned at i = isize + 1
+  for(int kk = 1; kk <= ksize; kk++){
+    for(int jj = 1; jj <= jsize; jj++){
+      UZ(      0, jj, kk) = param_uz_xm;
+      UZ(isize+1, jj, kk) = param_uz_xp;
+    }
+  }
+  // other directions: halo communications, one datatype handle for each
+  // NOTE(review): "static" suggests that the handles are reused across calls
+  static MPI_Datatype dtypes[NDIMS - 1] = {
+    MPI_DOUBLE,
+    MPI_DOUBLE,
+  };
+  // y direction
+  if(0 != halo_communicate_in_y(domain, &dtypes[0], array)) return 1;
+  // z direction
+  if(0 != halo_communicate_in_z(domain, &dtypes[1], array)) return 1;
+  return 0;
+}
+#endif
diff --git a/src/fluid/compute_potential.c b/src/fluid/compute_potential.c
new file mode 100644
index 0000000..38c5e2a
--- /dev/null
+++ b/src/fluid/compute_potential.c
@@ -0,0 +1,743 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <math.h>
+#include <complex.h>
+#include <fftw3.h>
+#include "sdecomp.h"
+#include "memory.h"
+#include "domain.h"
+#include "tdm.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/domain/jdxf.h"
+#include "array_macros/domain/jdxc.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/uy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/uz.h"
+#endif
+#include "array_macros/fluid/den.h"
+#include "array_macros/fluid/psi.h"
+
+static const double g_pi = 3.14159265358979324; // pi, used to compute the eigenvalues
+
+// structure only used to solve Poisson equation
+// NOTE: this is shared among normal and efficient solvers
+//   thus some variables may not be used
+typedef struct {
+  bool is_initialised; // set to true at the end of init_poisson_solver
+  void * restrict buf0; // work buffers obtained by fftw_malloc,
+  void * restrict buf1; //   see allocate_buffers for how they are sized
+  fftw_plan fftw_plan_x[2]; // 0: forward (REDFT10), 1: backward (REDFT01)
+#if NDIMS == 3
+  fftw_plan fftw_plan_y[2]; // 0: forward (r2c), 1: backward (c2r)
+  fftw_plan fftw_plan_z[2]; // NOTE(review): not created by init_ffts
+#endif
+  size_t tdm_sizes[2]; // 0: size of each system (N), 1: number of systems (M)
+  tdm_info_t * tdm_info; // created by init_tri_diagonal_solver
+  double * evals; // eigenvalues, allocated and computed by init_eigenvalues
+  sdecomp_transpose_plan_t * r_transposer_x1_to_y1; // rotations of pencils
+  sdecomp_transpose_plan_t * r_transposer_y1_to_x1; //   holding real numbers
+#if NDIMS == 3
+  sdecomp_transpose_plan_t * c_transposer_y1_to_z1; // rotations of pencils
+  sdecomp_transpose_plan_t * c_transposer_z1_to_y1; //   holding complex numbers
+#endif
+} poisson_solver_t;
+
+/* initialise Poisson solver */
+// several pencils for different data types are treated
+//   and thus this source is very complicated
+// used prefixes are as follows:
+//   r_: real    (double)       type
+//   c_: complex (fftw_complex) type
+//
+//   gl_    : global array size (not pencils)
+//   x1pncl_: each pencil (x1, y1, ...)
+
+/* size of domain and pencils, all assigned by compute_pencil_sizes */
+// NOTE: define globally to reduce the number of arguments of static functions
+// global domain size in real space
+static size_t r_gl_sizes[NDIMS] = {0};
+#if NDIMS == 3
+// global domain size in complex space (y is shortened to glsizes[1] / 2 + 1)
+static size_t c_gl_sizes[NDIMS] = {0};
+#endif
+// local domain size (x1 pencil) in real space
+static size_t r_x1pncl_sizes[NDIMS] = {0};
+// local domain size (y1 pencil) in real space
+static size_t r_y1pncl_sizes[NDIMS] = {0};
+#if NDIMS == 3
+// local domain size (y1 pencil) in complex space
+static size_t c_y1pncl_sizes[NDIMS] = {0};
+// local domain size (z1 pencil) in complex space
+static size_t c_z1pncl_sizes[NDIMS] = {0};
+#endif
+
+static size_t prod(
+    const size_t sizes[NDIMS]
+){
+  // number of items: sizes[0] * sizes[1] * ... * sizes[NDIMS - 1]
+  size_t product = sizes[0];
+  for(size_t n = 1; n < NDIMS; n++){
+    product *= sizes[n];
+  }
+  return product;
+}
+
+static int report_failure(
+    const char type[]
+){
+  // dump what failed and some hints to stderr; this only reports and returns 0
+  fprintf(stderr,
+      "Poisson solver, initialisation failed: %s\n"
+      "  FFTW:    A possible reason is you link Intel-MKL lib\n"
+      "           Make sure you use FFTW3 directly,\n"
+      "           NOT its crazy wrapper offered by MKL\n"
+      "  SDECOMP: Check sdecomp.log and check arguments\n"
+      "           If they are all correct, PLEASE CONTACT ME\n", type);
+  fflush(stderr);
+  return 0;
+}
+
+static size_t max(
+    const size_t val0,
+    const size_t val1
+){
+  // larger of the two arguments (val1 when they are equal)
+  size_t larger = val1;
+  if(val0 > val1){
+    larger = val0;
+  }
+  return larger;
+}
+
+static int compute_pencil_sizes(
+    const domain_t * domain
+){
+  // the results are stored in the file-scope arrays declared above,
+  //   so that the other static functions take fewer arguments
+  const sdecomp_info_t * info = domain->info;
+  // global sizes, real space: identical to the ones of the domain
+  for(size_t n = 0; n < NDIMS; n++){
+    r_gl_sizes[n] = domain->glsizes[n];
+  }
+#if NDIMS == 3
+  // global sizes, complex space: only y is shortened (Hermite symmetry)
+  c_gl_sizes[0] = domain->glsizes[0];
+  c_gl_sizes[1] = domain->glsizes[1] / 2 + 1;
+  c_gl_sizes[2] = domain->glsizes[2];
+#endif
+  // local sizes of each pencil in each direction,
+  //   1 is returned as soon as one of the queries fails
+  for(sdecomp_dir_t dir = 0; dir < NDIMS; dir++){
+    const size_t r_gl_size = r_gl_sizes[dir];
+    if(0 != sdecomp.get_pencil_mysize(info, SDECOMP_X1PENCIL, dir, r_gl_size, &r_x1pncl_sizes[dir])) return 1;
+    if(0 != sdecomp.get_pencil_mysize(info, SDECOMP_Y1PENCIL, dir, r_gl_size, &r_y1pncl_sizes[dir])) return 1;
+#if NDIMS == 3
+    const size_t c_gl_size = c_gl_sizes[dir];
+    if(0 != sdecomp.get_pencil_mysize(info, SDECOMP_Y1PENCIL, dir, c_gl_size, &c_y1pncl_sizes[dir])) return 1;
+    if(0 != sdecomp.get_pencil_mysize(info, SDECOMP_Z1PENCIL, dir, c_gl_size, &c_z1pncl_sizes[dir])) return 1;
+#endif
+  }
+  return 0;
+}
+
+static int allocate_buffers(
+    poisson_solver_t * poisson_solver
+){
+  // although there are bunch of pencils involved,
+  //   two buffers are enough to do the job,
+  //   which are allocated here (0 is returned on success, 1 otherwise)
+  const size_t r_dsize = sizeof(double);
+#if NDIMS == 3
+  const size_t c_dsize = sizeof(fftw_complex);
+#endif
+  size_t buf0_bytes = 0;
+  size_t buf1_bytes = 0;
+#if NDIMS == 2
+  // r_x1pncl -> FFT -> r_x1pncl -> rotate -> r_y1pncl
+  // buffer0            buffer1               buffer0
+  buf0_bytes = max(buf0_bytes, r_dsize * prod(r_x1pncl_sizes));
+  buf0_bytes = max(buf0_bytes, r_dsize * prod(r_y1pncl_sizes));
+  buf1_bytes = max(buf1_bytes, r_dsize * prod(r_x1pncl_sizes));
+#else
+  // r_x1pncl -> FFT -> r_x1pncl -> rotate -> r_y1pncl -> FFT -> c_y1pncl -> rotate -> c_z1pncl
+  // buffer0            buffer1               buffer0            buffer1               buffer0
+  buf0_bytes = max(buf0_bytes, r_dsize * prod(r_x1pncl_sizes));
+  buf0_bytes = max(buf0_bytes, r_dsize * prod(r_y1pncl_sizes));
+  buf0_bytes = max(buf0_bytes, c_dsize * prod(c_z1pncl_sizes));
+  buf1_bytes = max(buf1_bytes, r_dsize * prod(r_x1pncl_sizes));
+  buf1_bytes = max(buf1_bytes, c_dsize * prod(c_y1pncl_sizes));
+#endif
+  // allocate them using fftw_malloc so that they are suitably aligned for SIMD
+  poisson_solver->buf0 = fftw_malloc(buf0_bytes);
+  if(NULL == poisson_solver->buf0){
+    fprintf(stderr, "FATAL: fftw_malloc failed (requested %zu bytes)\n", buf0_bytes);
+    fflush(stderr);
+    return 1;
+  }
+  poisson_solver->buf1 = fftw_malloc(buf1_bytes);
+  if(NULL == poisson_solver->buf1){
+    fprintf(stderr, "FATAL: fftw_malloc failed (requested %zu bytes)\n", buf1_bytes);
+    fflush(stderr);
+    fftw_free(poisson_solver->buf0); // do not leak the first buffer
+    poisson_solver->buf0 = NULL;
+    return 1;
+  }
+  return 0;
+}
+
+static int init_tri_diagonal_solver(
+    const domain_t * domain,
+    poisson_solver_t * poisson_solver
+){
+  // tri-diagonal systems of size N x N are solved M times, where
+  //   tdm_sizes[0] = N and tdm_sizes[1] = M
+  // the lower- and upper-diagonal components are
+  //   independent of the other directions and of time,
+  //   and thus they are assigned here once and re-used
+  // the center-diagonal components are, on the other hand,
+  //   dependent on time and thus left to the solver,
+  //   which needs to compute them every time
+  // 0 is returned on success, 1 when the solver cannot be created
+  size_t * restrict tdm_sizes = poisson_solver->tdm_sizes;
+  tdm_info_t ** tdm_info = &poisson_solver->tdm_info;
+  // what differs between 2D and 3D: sizes, data type, and grid size
+#if NDIMS == 2
+  // real-valued systems in y, sizes are taken from the y1 pencil
+  tdm_sizes[0] = r_y1pncl_sizes[1];
+  tdm_sizes[1] = r_y1pncl_sizes[0];
+  const bool is_complex = false;
+  const double gridsize = domain->hy;
+#else
+  // complex-valued systems in z, sizes are taken from the z1 pencil
+  tdm_sizes[0] = c_z1pncl_sizes[2];
+  tdm_sizes[1] = c_z1pncl_sizes[0] * c_z1pncl_sizes[1];
+  const bool is_complex = true;
+  const double gridsize = domain->hz;
+#endif
+  // what is shared: create the solver first
+  if(0 != tdm.construct(
+    /* size of system */ tdm_sizes[0],
+    /* number of rhs  */ 1,
+    /* is periodic    */ true,
+    /* is complex     */ is_complex,
+    /* output         */ tdm_info
+  )){
+    return 1;
+  }
+  // then obtain the off-diagonal arrays from the solver
+  double * lower = NULL;
+  double * upper = NULL;
+  tdm.get_l(*tdm_info, &lower);
+  tdm.get_u(*tdm_info, &upper);
+  // and fill them with the same value 1 / h^2,
+  //   which is evaluated as in "1. / h / h" to keep the rounding
+  const double offdiag = 1. / gridsize / gridsize;
+  // NOTE: tdm_sizes[0] items are assumed to be available
+  //   in both arrays given by the solver
+  for(size_t n = 0; n < tdm_sizes[0]; n++){
+    // lower-diagonal
+    lower[n] = offdiag;
+    // upper-diagonal
+    upper[n] = offdiag;
+  }
+  // center-diagonal components are not touched here
+  return 0;
+}
+
+static int init_pencil_rotations(
+    const domain_t * domain,
+    poisson_solver_t * poisson_solver
+){
+  // create plans to rotate pencils, for both directions of each pair
+  // 0 is returned on success, 1 (after reporting) as soon as one fails
+  const sdecomp_info_t * sdecomp_info = domain->info;
+  // pencils holding real numbers (double): x1 <-> y1
+  if(0 != sdecomp.transpose.construct(sdecomp_info, SDECOMP_X1PENCIL, SDECOMP_Y1PENCIL, r_gl_sizes, sizeof(double), &poisson_solver->r_transposer_x1_to_y1)){
+    report_failure("SDECOMP x1 to y1 for real");
+    return 1;
+  }
+  if(0 != sdecomp.transpose.construct(sdecomp_info, SDECOMP_Y1PENCIL, SDECOMP_X1PENCIL, r_gl_sizes, sizeof(double), &poisson_solver->r_transposer_y1_to_x1)){
+    report_failure("SDECOMP y1 to x1 for real");
+    return 1;
+  }
+#if NDIMS == 3
+  // pencils holding complex numbers (fftw_complex): y1 <-> z1
+  if(0 != sdecomp.transpose.construct(sdecomp_info, SDECOMP_Y1PENCIL, SDECOMP_Z1PENCIL, c_gl_sizes, sizeof(fftw_complex), &poisson_solver->c_transposer_y1_to_z1)){
+    report_failure("SDECOMP y1 to z1 for complex");
+    return 1;
+  }
+  if(0 != sdecomp.transpose.construct(sdecomp_info, SDECOMP_Z1PENCIL, SDECOMP_Y1PENCIL, c_gl_sizes, sizeof(fftw_complex), &poisson_solver->c_transposer_z1_to_y1)){
+    report_failure("SDECOMP z1 to y1 for complex");
+    return 1;
+  }
+#endif
+  return 0;
+}
+
+static int init_ffts(
+    poisson_solver_t * poisson_solver
+){
+  const unsigned flags = FFTW_PATIENT | FFTW_DESTROY_INPUT;
+  // create FFTW plans between buf0 and buf1 (see "allocate_buffers" above),
+  //   returns 0 on success, 1 (after reporting) when a plan is NULL
+  // x, real to real: forward goes from buf0 to buf1, backward from buf1 to buf0
+  {
+    const int signal_length = r_x1pncl_sizes[SDECOMP_XDIR]; // items per transform
+#if NDIMS == 2
+    const int repeat_for = r_x1pncl_sizes[SDECOMP_YDIR]; // number of transforms
+#else
+    const int repeat_for = r_x1pncl_sizes[SDECOMP_YDIR] * r_x1pncl_sizes[SDECOMP_ZDIR]; // number of transforms
+#endif
+    fftw_plan * fplan = &poisson_solver->fftw_plan_x[0];
+    fftw_plan * bplan = &poisson_solver->fftw_plan_x[1];
+    *fplan = fftw_plan_many_r2r(
+        1, &signal_length, repeat_for,
+        poisson_solver->buf0, NULL, 1, signal_length,
+        poisson_solver->buf1, NULL, 1, signal_length,
+        (fftw_r2r_kind [1]){FFTW_REDFT10}, flags
+    );
+    *bplan = fftw_plan_many_r2r(
+        1, &signal_length, repeat_for,
+        poisson_solver->buf1, NULL, 1, signal_length,
+        poisson_solver->buf0, NULL, 1, signal_length,
+        (fftw_r2r_kind [1]){FFTW_REDFT01}, flags
+    );
+    if(NULL == *fplan){
+      report_failure("FFTW x-forward");
+      return 1;
+    }
+    if(NULL == *bplan){
+      report_failure("FFTW x-backward");
+      return 1;
+    }
+  }
+#if NDIMS == 3
+  // y, real to complex (forward, buf0 to buf1) and complex to real (backward, buf1 to buf0)
+  {
+    fftw_plan * fplan = &poisson_solver->fftw_plan_y[0];
+    fftw_plan * bplan = &poisson_solver->fftw_plan_y[1];
+    const int r_signal_length = r_y1pncl_sizes[SDECOMP_YDIR]; // real    items per transform
+    const int c_signal_length = c_y1pncl_sizes[SDECOMP_YDIR]; // complex items per transform
+    const int repeat_for = r_y1pncl_sizes[SDECOMP_ZDIR] * r_y1pncl_sizes[SDECOMP_XDIR]; // number of transforms
+    *fplan = fftw_plan_many_dft_r2c(
+        1, &r_signal_length, repeat_for,
+        poisson_solver->buf0, NULL, 1, r_signal_length,
+        poisson_solver->buf1, NULL, 1, c_signal_length,
+        flags
+    );
+    *bplan = fftw_plan_many_dft_c2r(
+        1, &r_signal_length, repeat_for,
+        poisson_solver->buf1, NULL, 1, c_signal_length,
+        poisson_solver->buf0, NULL, 1, r_signal_length,
+        flags
+    );
+    if(NULL == *fplan){
+      report_failure("FFTW y-forward");
+      return 1;
+    }
+    if(NULL == *bplan){
+      report_failure("FFTW y-backward");
+      return 1;
+    }
+  }
+#endif
+  return 0;
+}
+
+static int init_eigenvalues(
+    const domain_t * domain,
+    poisson_solver_t * poisson_solver
+){
+  // allocate and compute evals; returns 0 on success, 1 if a pencil query fails
+  const sdecomp_info_t * info = domain->info;
+#if NDIMS == 2
+  // y1 pencil, DCT in x
+  const sdecomp_pencil_t pencil = SDECOMP_Y1PENCIL;
+  const double signal_lengths[NDIMS - 1] = {
+    2. * r_gl_sizes[SDECOMP_XDIR],
+  };
+  size_t mysizes[NDIMS - 1] = {0};
+  if(0 != sdecomp.get_pencil_mysize(info, pencil, SDECOMP_XDIR, r_gl_sizes[SDECOMP_XDIR], mysizes)) return 1;
+  size_t offsets[NDIMS - 1] = {0};
+  if(0 != sdecomp.get_pencil_offset(info, pencil, SDECOMP_XDIR, r_gl_sizes[SDECOMP_XDIR], offsets)) return 1;
+  const double gridsizes[NDIMS - 1] = {
+    domain->lengths[SDECOMP_XDIR] / r_gl_sizes[SDECOMP_XDIR],
+  };
+  // initialise eigenvalues in homogeneous directions, i being the global index
+  poisson_solver->evals = memory_calloc(mysizes[0], sizeof(double));
+  for(size_t cnt = 0, i = offsets[0]; i < mysizes[0] + offsets[0]; i++, cnt++){
+    poisson_solver->evals[cnt] =
+      - 4. / pow(gridsizes[0], 2.) * pow(
+        sin( g_pi * i / signal_lengths[0] ),
+        2.
+      );
+  }
+#else
+  // z1 pencil, DCT in x and DFT in y
+  const sdecomp_pencil_t pencil = SDECOMP_Z1PENCIL;
+  const double signal_lengths[NDIMS - 1] = {
+    2. * r_gl_sizes[SDECOMP_XDIR],
+    1. * r_gl_sizes[SDECOMP_YDIR],
+  };
+  size_t mysizes[NDIMS - 1] = {0};
+  if(0 != sdecomp.get_pencil_mysize(info, pencil, SDECOMP_XDIR, c_gl_sizes[SDECOMP_XDIR], mysizes + 0)) return 1;
+  if(0 != sdecomp.get_pencil_mysize(info, pencil, SDECOMP_YDIR, c_gl_sizes[SDECOMP_YDIR], mysizes + 1)) return 1;
+  size_t offsets[NDIMS - 1] = {0};
+  if(0 != sdecomp.get_pencil_offset(info, pencil, SDECOMP_XDIR, c_gl_sizes[SDECOMP_XDIR], offsets + 0)) return 1;
+  if(0 != sdecomp.get_pencil_offset(info, pencil, SDECOMP_YDIR, c_gl_sizes[SDECOMP_YDIR], offsets + 1)) return 1;
+  const double gridsizes[NDIMS - 1] = {
+    domain->lengths[SDECOMP_XDIR] / r_gl_sizes[SDECOMP_XDIR],
+    domain->lengths[SDECOMP_YDIR] / r_gl_sizes[SDECOMP_YDIR],
+  };
+  // initialise eigenvalues in homogeneous directions, i and j being global indices
+  poisson_solver->evals = memory_calloc(mysizes[0] * mysizes[1], sizeof(double));
+  for(size_t cnt = 0, j = offsets[1]; j < mysizes[1] + offsets[1]; j++){
+    for(size_t i = offsets[0]; i < mysizes[0] + offsets[0]; i++, cnt++){
+      poisson_solver->evals[cnt] =
+        - 4. / pow(gridsizes[0], 2.) * pow(
+          sin( g_pi * i / signal_lengths[0] ),
+          2.
+        )
+        - 4. / pow(gridsizes[1], 2.) * pow(
+          sin( g_pi * j / signal_lengths[1] ),
+          2.
+        );
+    }
+  }
+#endif
+  return 0;
+}
+
+static int init_poisson_solver(
+    const domain_t * domain,
+    poisson_solver_t * poisson_solver
+){
+  // pencil sizes first (later steps refer to them), stop at the first failure
+  if(
+       0 != compute_pencil_sizes(domain)
+    || 0 != allocate_buffers(poisson_solver)
+    || 0 != init_tri_diagonal_solver(domain, poisson_solver)
+    || 0 != init_pencil_rotations(domain, poisson_solver)
+    || 0 != init_ffts(poisson_solver)
+    || 0 != init_eigenvalues(domain, poisson_solver)
+  ) return 1;
+  poisson_solver->is_initialised = true;
+  int myrank = 0;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  if(0 == myrank){ // only the process whose rank is 0 reports
+    printf("DCT-based solver is used\n");
+  }
+  return 0;
+}
+
+static int assign_input(
+    const domain_t * domain,
+    const double dt_old,
+    const double dt_new,
+    const fluid_t * fluid,
+    double * restrict rhs
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf;
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+  const double hy = domain->hy;
+#if NDIMS == 3
+  const double hz = domain->hz;
+#endif
+  const double * restrict ux = fluid->ux.data;
+  const double * restrict uy = fluid->uy.data;
+#if NDIMS == 3
+  const double * restrict uz = fluid->uz.data;
+#endif
+  const double * restrict psi = fluid->psi[0].data;
+  const double * restrict den = fluid->den[1].data;
+  // normalise FFT beforehand
+#if NDIMS == 2
+  const double norm = 2. * domain->glsizes[0];
+#else
+  const double norm = 2. * domain->glsizes[0] * domain->glsizes[1];
+#endif
+  const double refden = fluid->refden;
+  // coefficients in front of each contribution,
+  //   0: divergence
+  //   1: potential
+  // and FFT normalisation
+  const double coefs[2] = {
+    // rho_ref / gamma dt
+    1. / norm / dt_new * refden,
+    // c_{n-1}
+    1. / norm * -1. / dt_new * dt_old,
+  };
+#if NDIMS == 2
+  for(int cnt = 0, j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++, cnt++){
+      // compute local divergence | 13
+      const double hx_xm = HXXF(i  );
+      const double hx_xp = HXXF(i+1);
+      const double jd_xm = JDXF(i  );
+      const double jd_x0 = JDXC(i  );
+      const double jd_xp = JDXF(i+1);
+      const double ux_xm = UX(i  , j  );
+      const double ux_xp = UX(i+1, j  );
+      const double uy_ym = UY(i  , j  );
+      const double uy_yp = UY(i  , j+1);
+      const double div = 1. / jd_x0 * (
+          - jd_xm / hx_xm * ux_xm + jd_xp / hx_xp * ux_xp
+          - jd_x0 / hy    * uy_ym + jd_x0 / hy    * uy_yp
+      );
+      // additional contribution | 24
+      const double den_xm = + 0.5 * DEN(i-1, j  )
+                            + 0.5 * DEN(i  , j  );
+      const double den_xp = + 0.5 * DEN(i  , j  )
+                            + 0.5 * DEN(i+1, j  );
+      const double den_ym = + 0.5 * DEN(i  , j-1)
+                            + 0.5 * DEN(i  , j  );
+      const double den_yp = + 0.5 * DEN(i  , j  )
+                            + 0.5 * DEN(i  , j+1);
+      const double dpsi_xm = - PSI(i-1, j  )
+                             + PSI(i  , j  );
+      const double dpsi_xp = - PSI(i  , j  )
+                             + PSI(i+1, j  );
+      const double dpsi_ym = - PSI(i  , j-1)
+                             + PSI(i  , j  );
+      const double dpsi_yp = - PSI(i  , j  )
+                             + PSI(i  , j+1);
+      const double gp_xm = (refden / den_xm - 1.) * dpsi_xm / hx_xm;
+      const double gp_xp = (refden / den_xp - 1.) * dpsi_xp / hx_xp;
+      const double gp_ym = (refden / den_ym - 1.) * dpsi_ym / hy;
+      const double gp_yp = (refden / den_yp - 1.) * dpsi_yp / hy;
+      const double add = 1. / jd_x0 * (
+          - jd_xm / hx_xm * gp_xm + jd_xp / hx_xp * gp_xp
+          - jd_x0 / hy    * gp_ym + jd_x0 / hy    * gp_yp
+      );
+      rhs[cnt] = (
+        + coefs[0] * div
+        + coefs[1] * add
+      );
+    }
+  }
+#else
+  for(int cnt = 0, k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++, cnt++){
+        // compute local divergence | 16
+        const double hx_xm = HXXF(i  );
+        const double hx_xp = HXXF(i+1);
+        const double jd_xm = JDXF(i  );
+        const double jd_x0 = JDXC(i  );
+        const double jd_xp = JDXF(i+1);
+        const double ux_xm = UX(i  , j  , k  );
+        const double ux_xp = UX(i+1, j  , k  );
+        const double uy_ym = UY(i  , j  , k  );
+        const double uy_yp = UY(i  , j+1, k  );
+        const double uz_zm = UZ(i  , j  , k  );
+        const double uz_zp = UZ(i  , j  , k+1);
+        const double div = 1. / jd_x0 * (
+            - jd_xm / hx_xm * ux_xm + jd_xp / hx_xp * ux_xp
+            - jd_x0 / hy    * uy_ym + jd_x0 / hy    * uy_yp
+            - jd_x0 / hz    * uz_zm + jd_x0 / hz    * uz_zp
+        );
+        // additional contribution | 35
+        const double den_xm = + 0.5 * DEN(i-1, j  , k  )
+                              + 0.5 * DEN(i  , j  , k  );
+        const double den_xp = + 0.5 * DEN(i  , j  , k  )
+                              + 0.5 * DEN(i+1, j  , k  );
+        const double den_ym = + 0.5 * DEN(i  , j-1, k  )
+                              + 0.5 * DEN(i  , j  , k  );
+        const double den_yp = + 0.5 * DEN(i  , j  , k  )
+                              + 0.5 * DEN(i  , j+1, k  );
+        const double den_zm = + 0.5 * DEN(i  , j  , k-1)
+                              + 0.5 * DEN(i  , j  , k  );
+        const double den_zp = + 0.5 * DEN(i  , j  , k  )
+                              + 0.5 * DEN(i  , j  , k+1);
+        const double dpsi_xm = - PSI(i-1, j  , k  )
+                               + PSI(i  , j  , k  );
+        const double dpsi_xp = - PSI(i  , j  , k  )
+                               + PSI(i+1, j  , k  );
+        const double dpsi_ym = - PSI(i  , j-1, k  )
+                               + PSI(i  , j  , k  );
+        const double dpsi_yp = - PSI(i  , j  , k  )
+                               + PSI(i  , j+1, k  );
+        const double dpsi_zm = - PSI(i  , j  , k-1)
+                               + PSI(i  , j  , k  );
+        const double dpsi_zp = - PSI(i  , j  , k  )
+                               + PSI(i  , j  , k+1);
+        const double gp_xm = (refden / den_xm - 1.) * dpsi_xm / hx_xm;
+        const double gp_xp = (refden / den_xp - 1.) * dpsi_xp / hx_xp;
+        const double gp_ym = (refden / den_ym - 1.) * dpsi_ym / hy;
+        const double gp_yp = (refden / den_yp - 1.) * dpsi_yp / hy;
+        const double gp_zm = (refden / den_zm - 1.) * dpsi_zm / hz;
+        const double gp_zp = (refden / den_zp - 1.) * dpsi_zp / hz;
+        const double add = 1. / jd_x0 * (
+            - jd_xm / hx_xm * gp_xm + jd_xp / hx_xp * gp_xp
+            - jd_x0 / hy    * gp_ym + jd_x0 / hy    * gp_yp
+            - jd_x0 / hz    * gp_zm + jd_x0 / hz    * gp_zp
+        );
+        rhs[cnt] = (
+          + coefs[0] * div
+          + coefs[1] * add
+        );
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
+static int extract_output(
+    const domain_t * domain,
+    const double * restrict rhs,
+    fluid_t * fluid
+){
+  // copy the solver buffer (contiguous in x) into psi[1], then update its boundaries
+  const int isize = domain->mysizes[0], jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  double * restrict psi = fluid->psi[1].data;
+#if NDIMS == 2
+  for(int j = 1, offset = 0; j <= jsize; j++, offset += isize){
+    for(int i = 1; i <= isize; i++){
+      PSI(i, j) = rhs[offset + i - 1];
+    }
+  }
+#else
+  for(int k = 1, offset = 0; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++, offset += isize){
+      for(int i = 1; i <= isize; i++){
+        PSI(i, j, k) = rhs[offset + i - 1];
+      }
+    }
+  }
+#endif
+  fluid_update_boundaries_psi(domain, &fluid->psi[1]);
+  return 0;
+}
+
+static int solve_linear_systems(
+    poisson_solver_t * poisson_solver
+){
+  // nrows   : length of each tri-diagonal system
+  // nsystems: number of systems, i.e. degrees of freedom in the wavespace
+  // NOTE: nrows agrees with what tdm.get_size gives, whereas nsystems
+  //   differs from what tdm.get_nrhs returns (=1)
+  const size_t nrows    = poisson_solver->tdm_sizes[0];
+  const size_t nsystems = poisson_solver->tdm_sizes[1];
+  // fetch the three diagonals held by the tri-diagonal solver
+  tdm_info_t * tdm_info = poisson_solver->tdm_info;
+  double * restrict lower  = NULL;
+  double * restrict upper  = NULL;
+  double * restrict center = NULL;
+  tdm.get_l(tdm_info, &lower);
+  tdm.get_u(tdm_info, &upper);
+  tdm.get_c(tdm_info, &center);
+  // one eigenvalue (coming from the Fourier projection) per system
+  const double * restrict evals = poisson_solver->evals;
+#if NDIMS == 2
+  double * restrict rhs = poisson_solver->buf0;
+#else
+  fftw_complex * restrict rhs = poisson_solver->buf0;
+#endif
+  for(size_t system = 0; system < nsystems; system++){
+    // rebuild the center diagonal for this eigenvalue, then solve
+    for(size_t row = 0; row < nrows; row++){
+      center[row] = - lower[row] - upper[row] + evals[system];
+    }
+    tdm.solve(tdm_info, rhs + system * nrows);
+  }
+  return 0;
+}
+
+/**
+ * @brief compute scalar potential psi to correct velocity
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in]     dt_old : previous time step size
+ * @param[in]     dt_new : current  time step size
+ * @param[in,out] fluid  : velocity (in), scalar potential psi (out)
+ * @return               : (success) 0
+ *                       : (failure) 1 (Poisson solver could not be initialised)
+ */
+int fluid_compute_potential(
+    const domain_t * domain,
+    const double dt_old,
+    const double dt_new,
+    fluid_t * fluid
+){
+  static poisson_solver_t poisson_solver = {
+    .is_initialised = false,
+  };
+  // initialise Poisson solver on first use (the static object persists across calls)
+  if(!poisson_solver.is_initialised){ // NOTE(review): flag presumably set by init_poisson_solver - confirm
+    if(0 != init_poisson_solver(domain, &poisson_solver)){
+      // failed to initialise Poisson solver
+      return 1;
+    }
+  }
+  // compute right-hand side of Poisson equation
+  // assigned to buf0
+  assign_input(domain, dt_old, dt_new, fluid, poisson_solver.buf0);
+  // solve the equation
+  // project x to wave space
+  // f(x, y)    -> f(k_x, y)
+  // f(x, y, z) -> f(k_x, y, z)
+  // from buf0 to buf1
+  fftw_execute(poisson_solver.fftw_plan_x[0]);
+  // transpose real x1pencil to y1pencil
+  // from buf1 to buf0
+  sdecomp.transpose.execute(
+      poisson_solver.r_transposer_x1_to_y1,
+      poisson_solver.buf1,
+      poisson_solver.buf0
+  );
+#if NDIMS == 3
+  // project y to wave space
+  // f(k_x, y, z) -> f(k_x, k_y, z)
+  // from buf0 to buf1
+  fftw_execute(poisson_solver.fftw_plan_y[0]);
+  // transpose complex y1pencil to z1pencil
+  // from buf1 to buf0
+  sdecomp.transpose.execute(
+      poisson_solver.c_transposer_y1_to_z1,
+      poisson_solver.buf1,
+      poisson_solver.buf0
+  );
+#endif
+  // solve linear systems
+  solve_linear_systems(&poisson_solver);
+#if NDIMS == 3
+  // transpose complex z1pencil to y1pencil
+  // from buf0 to buf1
+  sdecomp.transpose.execute(
+      poisson_solver.c_transposer_z1_to_y1,
+      poisson_solver.buf0,
+      poisson_solver.buf1
+  );
+  // project y to physical space
+  // f(k_x, k_y, z) -> f(k_x, y, z)
+  // from buf1 to buf0
+  fftw_execute(poisson_solver.fftw_plan_y[1]);
+#endif
+  // transpose real y1pencil to x1pencil
+  // from buf0 to buf1
+  sdecomp.transpose.execute(
+      poisson_solver.r_transposer_y1_to_x1,
+      poisson_solver.buf0,
+      poisson_solver.buf1
+  );
+  // project x to physical space
+  // f(k_x, y)    -> f(x, y)
+  // f(k_x, y, z) -> f(x, y, z)
+  // from buf1 to buf0
+  fftw_execute(poisson_solver.fftw_plan_x[1]);
+  extract_output(domain, poisson_solver.buf0, fluid); // store buf0 in psi[1] and update its boundaries
+  return 0;
+}
+
diff --git a/src/fluid/correct_velocity/internal.h b/src/fluid/correct_velocity/internal.h
new file mode 100644
index 0000000..1ed49c0
--- /dev/null
+++ b/src/fluid/correct_velocity/internal.h
@@ -0,0 +1,27 @@
+#if !defined(FLUID_CORRECT_VELOCITY_INTERNAL_H)
+#define FLUID_CORRECT_VELOCITY_INTERNAL_H
+
+extern int fluid_correct_velocity_ux(
+    const domain_t * domain,
+    const double dt_old,
+    const double dt_new,
+    fluid_t * fluid
+);
+
+extern int fluid_correct_velocity_uy(
+    const domain_t * domain,
+    const double dt_old,
+    const double dt_new,
+    fluid_t * fluid
+);
+
+#if NDIMS == 3
+extern int fluid_correct_velocity_uz(
+    const domain_t * domain,
+    const double dt_old,
+    const double dt_new,
+    fluid_t * fluid
+);
+#endif
+
+#endif // FLUID_CORRECT_VELOCITY_INTERNAL_H
diff --git a/src/fluid/correct_velocity/main.c b/src/fluid/correct_velocity/main.c
new file mode 100644
index 0000000..4e6dcdd
--- /dev/null
+++ b/src/fluid/correct_velocity/main.c
@@ -0,0 +1,28 @@
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "internal.h"
+
+/**
+ * @brief correct non-solenoidal velocity using scalar potential psi
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in]     dt_old : previous time step size
+ * @param[in]     dt_new : current  time step size
+ * @param[in,out] fluid  : scalar potential (in), velocity (out)
+ * @return               : (success) 0, (failure) 1
+ */
+int fluid_correct_velocity(
+    const domain_t * domain,
+    const double dt_old,
+    const double dt_new,
+    fluid_t * fluid
+){
+  // correct each velocity component in turn, stopping at the first failure
+  if(0 != fluid_correct_velocity_ux(domain, dt_old, dt_new, fluid)) return 1;
+  if(0 != fluid_correct_velocity_uy(domain, dt_old, dt_new, fluid)) return 1;
+#if NDIMS == 3
+  if(0 != fluid_correct_velocity_uz(domain, dt_old, dt_new, fluid)) return 1;
+#endif
+  return 0;
+}
+
diff --git a/src/fluid/correct_velocity/ux.c b/src/fluid/correct_velocity/ux.c
new file mode 100644
index 0000000..5798b1c
--- /dev/null
+++ b/src/fluid/correct_velocity/ux.c
@@ -0,0 +1,106 @@
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "internal.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/psi.h"
+#include "array_macros/fluid/den.h"
+
+/**
+ * @brief correct ux using scalar potential psi
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in]     dt_old : previous time step size
+ * @param[in]     dt_new : current  time step size
+ * @param[in,out] fluid  : scalar potential psi (in), ux (out)
+ * @return               : error code (0 is returned unconditionally as written)
+ */
+int fluid_correct_velocity_ux(
+    const domain_t * domain,
+    const double dt_old,
+    const double dt_new,
+    fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf;
+  double * restrict ux = fluid->ux.data;
+  const double refden = fluid->refden;
+  { // first pass: x-difference of psi[1], scaled by dt_new / refden
+    const double * restrict psi = fluid->psi[1].data;
+#if NDIMS == 2
+    // new scalar potential contribution | 11
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 2; i <= isize; i++){
+        const double hx = HXXF(i  );
+        const double psi_xm = PSI(i-1, j  );
+        const double psi_xp = PSI(i  , j  );
+        UX(i, j) -= dt_new / refden / hx * (
+            - psi_xm
+            + psi_xp
+        );
+      }
+    }
+#else
+    // new scalar potential contribution | 13
+    for(int k = 1; k <= ksize; k++){
+      for(int j = 1; j <= jsize; j++){
+        for(int i = 2; i <= isize; i++){
+          const double hx = HXXF(i  );
+          const double psi_xm = PSI(i-1, j  , k  );
+          const double psi_xp = PSI(i  , j  , k  );
+          UX(i, j, k) -= dt_new / refden / hx * (
+              - psi_xm
+              + psi_xp
+          );
+        }
+      }
+    }
+#endif
+  }
+  { // second pass: x-difference of psi[0], weighted by (1 / den - 1 / refden) using den[1]
+    const double * restrict psi = fluid->psi[0].data;
+    const double * restrict den = fluid->den[1].data;
+    const double coef = -1. / dt_new * dt_old; // dt_new * coef below equals -dt_old up to rounding
+#if NDIMS == 2
+    // old scalar potential contribution | 13
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 2; i <= isize; i++){
+        const double hx = HXXF(i  );
+        const double psi_xm = PSI(i-1, j  );
+        const double psi_xp = PSI(i  , j  );
+        const double den_x0 = + 0.5 * DEN(i-1, j  )
+                              + 0.5 * DEN(i  , j  );
+        UX(i, j) += dt_new * coef * (1. / den_x0 - 1. / refden) / hx * (
+            - psi_xm
+            + psi_xp
+        );
+      }
+    }
+#else
+    // old scalar potential contribution | 15
+    for(int k = 1; k <= ksize; k++){
+      for(int j = 1; j <= jsize; j++){
+        for(int i = 2; i <= isize; i++){
+          const double hx = HXXF(i  );
+          const double psi_xm = PSI(i-1, j  , k  );
+          const double psi_xp = PSI(i  , j  , k  );
+          const double den_x0 = + 0.5 * DEN(i-1, j  , k  )
+                                + 0.5 * DEN(i  , j  , k  );
+          UX(i, j, k) += dt_new * coef * (1. / den_x0 - 1. / refden) / hx * (
+              - psi_xm
+              + psi_xp
+          );
+        }
+      }
+    }
+#endif
+  }
+  // update boundary and halo cells (the loops above start at i = 2 and never modify i = 1)
+  fluid_update_boundaries_ux(domain, &fluid->ux);
+  return 0;
+}
+
diff --git a/src/fluid/correct_velocity/uy.c b/src/fluid/correct_velocity/uy.c
new file mode 100644
index 0000000..f818a2e
--- /dev/null
+++ b/src/fluid/correct_velocity/uy.c
@@ -0,0 +1,102 @@
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "internal.h"
+#include "array_macros/fluid/uy.h"
+#include "array_macros/fluid/psi.h"
+#include "array_macros/fluid/den.h"
+
+/**
+ * @brief correct uy using scalar potential psi
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in]     dt_old : previous time step size
+ * @param[in]     dt_new : current  time step size
+ * @param[in,out] fluid  : scalar potential psi (in), uy (out)
+ * @return               : error code (0 is returned unconditionally as written)
+ */
+int fluid_correct_velocity_uy(
+    const domain_t * domain,
+    const double dt_old,
+    const double dt_new,
+    fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double hy = domain->hy;
+  double * restrict uy = fluid->uy.data;
+  const double refden = fluid->refden;
+  // first pass: y-difference of the new scalar potential psi[1], scaled by dt_new / refden
+  {
+    const double * restrict psi = fluid->psi[1].data;
+#if NDIMS == 2
+    // new scalar potential contribution | 10
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        const double psi_ym = PSI(i  , j-1);
+        const double psi_yp = PSI(i  , j  );
+        UY(i, j) -= dt_new / refden / hy * (
+            - psi_ym
+            + psi_yp
+        );
+      }
+    }
+#else
+    // new scalar potential contribution | 12
+    for(int k = 1; k <= ksize; k++){
+      for(int j = 1; j <= jsize; j++){
+        for(int i = 1; i <= isize; i++){
+          const double psi_ym = PSI(i  , j-1, k  );
+          const double psi_yp = PSI(i  , j  , k  );
+          UY(i, j, k) -= dt_new / refden / hy * (
+              - psi_ym
+              + psi_yp
+          );
+        }
+      }
+    }
+#endif
+  }
+  { // second pass: y-difference of psi[0], weighted by (1 / den - 1 / refden) using den[1]
+    const double * restrict psi = fluid->psi[0].data;
+    const double * restrict den = fluid->den[1].data;
+    const double coef = -1. / dt_new * dt_old; // dt_new * coef below equals -dt_old up to rounding
+#if NDIMS == 2
+    // old scalar potential contribution | 12
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        const double psi_ym = PSI(i  , j-1);
+        const double psi_yp = PSI(i  , j  );
+        const double den_y0 = + 0.5 * DEN(i  , j-1)
+                              + 0.5 * DEN(i  , j  );
+        UY(i, j) += dt_new * coef * (1. / den_y0 - 1. / refden) / hy * (
+            - psi_ym
+            + psi_yp
+        );
+      }
+    }
+#else
+    // old scalar potential contribution | 14
+    for(int k = 1; k <= ksize; k++){
+      for(int j = 1; j <= jsize; j++){
+        for(int i = 1; i <= isize; i++){
+          const double psi_ym = PSI(i  , j-1, k  );
+          const double psi_yp = PSI(i  , j  , k  );
+          const double den_y0 = + 0.5 * DEN(i  , j-1, k  )
+                                + 0.5 * DEN(i  , j  , k  );
+          UY(i, j, k) += dt_new * coef * (1. / den_y0 - 1. / refden) / hy * (
+              - psi_ym
+              + psi_yp
+          );
+        }
+      }
+    }
+#endif
+  }
+  // update boundary and halo cells
+  fluid_update_boundaries_uy(domain, &fluid->uy);
+  return 0;
+}
+
diff --git a/src/fluid/correct_velocity/uz.c b/src/fluid/correct_velocity/uz.c
new file mode 100644
index 0000000..46bf733
--- /dev/null
+++ b/src/fluid/correct_velocity/uz.c
@@ -0,0 +1,70 @@
+#if NDIMS == 3
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "internal.h"
+#include "array_macros/fluid/uz.h"
+#include "array_macros/fluid/psi.h"
+#include "array_macros/fluid/den.h"
+
+/**
+ * @brief correct uz using scalar potential psi
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in]     dt_old : previous time step size
+ * @param[in]     dt_new : current  time step size
+ * @param[in,out] fluid  : scalar potential psi (in), uz (out)
+ * @return               : error code (0 is returned unconditionally as written)
+ */
+int fluid_correct_velocity_uz(
+    const domain_t * domain,
+    const double dt_old,
+    const double dt_new,
+    fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double hz = domain->hz;
+  double * restrict uz = fluid->uz.data;
+  const double refden = fluid->refden;
+  { // first pass: z-difference of psi[1], scaled by dt_new / refden
+    const double * restrict psi = fluid->psi[1].data;
+    // new scalar potential contribution | 12
+    for(int k = 1; k <= ksize; k++){
+      for(int j = 1; j <= jsize; j++){
+        for(int i = 1; i <= isize; i++){
+          const double psi_zm = PSI(i  , j  , k-1);
+          const double psi_zp = PSI(i  , j  , k  );
+          UZ(i, j, k) -= dt_new / refden / hz * (
+              - psi_zm
+              + psi_zp
+          );
+        }
+      }
+    }
+  }
+  { // second pass: z-difference of psi[0], weighted by (1 / den - 1 / refden) using den[1]
+    const double * restrict psi = fluid->psi[0].data;
+    const double * restrict den = fluid->den[1].data;
+    const double coef = -1. / dt_new * dt_old; // dt_new * coef below equals -dt_old up to rounding
+    // old scalar potential contribution | 14
+    for(int k = 1; k <= ksize; k++){
+      for(int j = 1; j <= jsize; j++){
+        for(int i = 1; i <= isize; i++){
+          const double psi_zm = PSI(i  , j  , k-1);
+          const double psi_zp = PSI(i  , j  , k  );
+          const double den_z0 = + 0.5 * DEN(i  , j  , k-1)
+                                + 0.5 * DEN(i  , j  , k  );
+          UZ(i, j, k) += dt_new * coef * (1. / den_z0 - 1. / refden) / hz * (
+              - psi_zm
+              + psi_zp
+          );
+        }
+      }
+    }
+  }
+  // update boundary and halo cells
+  fluid_update_boundaries_uz(domain, &fluid->uz);
+  return 0;
+}
+#endif
diff --git a/src/fluid/density.c b/src/fluid/density.c
new file mode 100644
index 0000000..cb58426
--- /dev/null
+++ b/src/fluid/density.c
@@ -0,0 +1,48 @@
+#include <math.h>
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "interface.h"
+#include "array_macros/fluid/den.h"
+#include "array_macros/interface/vof.h"
+
+int fluid_compute_density(
+    const domain_t * domain,
+    fluid_t * fluid,
+    const interface_t * interface,
+    const size_t index
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict vof = interface->vof.data;
+  double * restrict den = fluid->den[index].data;
+  const double denr = fluid->denr;
+  // clamp bounds: the result is kept between the two values 1 and denr
+  const double den_lo = fmin(1., denr), den_hi = fmax(1., denr);
+#if NDIMS == 2
+  for(int j = 0; j <= jsize + 1; j++){
+    for(int i = 0; i <= isize + 1; i++){
+      // 1 where vof is 0, denr where vof is 1, then clamped
+      const double raw = 1. + (denr - 1.) * VOF(i, j);
+      const double floored = fmax(den_lo, raw);
+      DEN(i, j) = fmin(den_hi, floored);
+    }
+  }
+#else
+  for(int k = 0; k <= ksize + 1; k++){
+    for(int j = 0; j <= jsize + 1; j++){
+      for(int i = 0; i <= isize + 1; i++){
+        // 1 where vof is 0, denr where vof is 1, then clamped
+        const double raw = 1. + (denr - 1.) * VOF(i, j, k);
+        const double floored = fmax(den_lo, raw);
+        DEN(i, j, k) = fmin(den_hi, floored);
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
diff --git a/src/fluid/init.c b/src/fluid/init.c
new file mode 100644
index 0000000..cac15d1
--- /dev/null
+++ b/src/fluid/init.c
@@ -0,0 +1,145 @@
+#include <math.h>
+#include "memory.h"
+#include "config.h"
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "fileio.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/uy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/uz.h"
+#endif
+#include "array_macros/fluid/p.h"
+#include "array_macros/fluid/den.h"
+#include "array_macros/fluid/visux.h"
+#include "array_macros/fluid/visuy.h"
+#include "array_macros/fluid/visuz.h"
+#include "array_macros/fluid/txx.h"
+#include "array_macros/fluid/txy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/txz.h"
+#endif
+#include "array_macros/fluid/tyy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/tyz.h"
+#include "array_macros/fluid/tzz.h"
+#endif
+#include "array_macros/fluid/psi.h"
+#include "array_macros/fluid/srcux.h"
+#include "array_macros/fluid/srcuy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/srcuz.h"
+#endif
+
+/**
+ * @brief allocate members
+ * @param[in]  domain : information about domain decomposition and size
+ * @param[out] fluid  : structure storing flow fields and auxiliary buffers
+ * @return            : 0 on success, 1 as soon as one array.prepare call fails
+ */
+static int allocate(
+    const domain_t * domain,
+    fluid_t * fluid
+){
+  // velocity
+  if(0 != array.prepare(domain, UX_NADDS, sizeof(double), &fluid->ux )) return 1;
+  if(0 != array.prepare(domain, UY_NADDS, sizeof(double), &fluid->uy )) return 1;
+#if NDIMS == 3
+  if(0 != array.prepare(domain, UZ_NADDS, sizeof(double), &fluid->uz )) return 1;
+#endif
+  // pressure and scalar potentials (two psi buffers: psi[0] and psi[1])
+  if(0 != array.prepare(domain, P_NADDS,   sizeof(double), &fluid->p  )) return 1;
+  for(size_t n = 0; n < 2; n++){
+    if(0 != array.prepare(domain, PSI_NADDS, sizeof(double), &fluid->psi[n])) return 1;
+  }
+  // density (two buffers: den[0] and den[1])
+  for(size_t n = 0; n < 2; n++){
+    if(0 != array.prepare(domain, DEN_NADDS, sizeof(double), &fluid->den[n])) return 1;
+  }
+  // viscosity
+  if(0 != array.prepare(domain, VISUX_NADDS, sizeof(double), &fluid->visux)) return 1;
+  if(0 != array.prepare(domain, VISUY_NADDS, sizeof(double), &fluid->visuy)) return 1;
+#if NDIMS == 3
+  if(0 != array.prepare(domain, VISUZ_NADDS, sizeof(double), &fluid->visuz)) return 1;
+#endif
+  // stress tensor
+  if(0 != array.prepare(domain, TXX_NADDS, sizeof(double), &fluid->txx)) return 1;
+  if(0 != array.prepare(domain, TXY_NADDS, sizeof(double), &fluid->txy)) return 1;
+#if NDIMS == 3
+  if(0 != array.prepare(domain, TXZ_NADDS, sizeof(double), &fluid->txz)) return 1;
+#endif
+  if(0 != array.prepare(domain, TYY_NADDS, sizeof(double), &fluid->tyy)) return 1;
+#if NDIMS == 3
+  if(0 != array.prepare(domain, TYZ_NADDS, sizeof(double), &fluid->tyz)) return 1;
+  if(0 != array.prepare(domain, TZZ_NADDS, sizeof(double), &fluid->tzz)) return 1;
+#endif
+  // Runge-Kutta source terms (three buffers per velocity component)
+  for(size_t n = 0; n < 3; n++){
+    if(0 != array.prepare(domain, SRCUX_NADDS, sizeof(double), &fluid->srcux[n])) return 1;
+    if(0 != array.prepare(domain, SRCUY_NADDS, sizeof(double), &fluid->srcuy[n])) return 1;
+#if NDIMS == 3
+    if(0 != array.prepare(domain, SRCUZ_NADDS, sizeof(double), &fluid->srcuz[n])) return 1;
+#endif
+  }
+  return 0;
+}
+
+static void report(
+    const sdecomp_info_t * info,
+    const fluid_t * fluid
+){
+  // print the fluid parameters once, from the root process only
+  const int root = 0;
+  int myrank = root;
+  sdecomp.get_comm_rank(info, &myrank);
+  if(root != myrank) return;
+  printf("FLUID\n");
+  printf("\tRe: % .7e\n", fluid->Re);
+  printf("\tFr: % .7e\n", fluid->Fr);
+  printf("\tDensity ratio: % .7e\n", fluid->denr);
+  printf("\tViscosity ratio: % .7e\n", fluid->visr);
+  fflush(stdout);
+}
+
+/**
+ * @brief constructor of the structure
+ * @param[in]  dirname_ic : name of directory in which initial flow fields are stored
+ * @param[in]  domain     : information about domain decomposition and size
+ * @param[out] fluid      : structure being allocated and initialised
+ * @return                : (success) 0
+ *                          (failure) non-zero value
+ */
+int fluid_init(
+    const char dirname_ic[],
+    const domain_t * domain,
+    fluid_t * fluid
+){
+  // allocate arrays
+  if(0 != allocate(domain, fluid)) return 1;
+  // load flow fields
+  if(0 != array.load(domain, dirname_ic,  "ux", fileio.npy_double, &fluid->    ux)) return 1;
+  if(0 != array.load(domain, dirname_ic,  "uy", fileio.npy_double, &fluid->    uy)) return 1;
+#if NDIMS == 3
+  if(0 != array.load(domain, dirname_ic,  "uz", fileio.npy_double, &fluid->    uz)) return 1;
+#endif
+  if(0 != array.load(domain, dirname_ic,   "p", fileio.npy_double, &fluid->     p)) return 1;
+  if(0 != array.load(domain, dirname_ic, "psi", fileio.npy_double, &fluid->psi[0])) return 1;
+  // impose boundary conditions and communicate halo cells
+  fluid_update_boundaries_ux (domain, &fluid->    ux);
+  fluid_update_boundaries_uy (domain, &fluid->    uy);
+#if NDIMS == 3
+  fluid_update_boundaries_uz (domain, &fluid->    uz);
+#endif
+  fluid_update_boundaries_p  (domain, &fluid->     p);
+  fluid_update_boundaries_psi(domain, &fluid->psi[0]);
+  // read the parameters Re, Fr, denr and visr from the configuration
+  if(0 != config.get_double("Re", &fluid->Re)) return 1;
+  if(0 != config.get_double("Fr", &fluid->Fr)) return 1;
+  if(0 != config.get_double("denr", &fluid->denr)) return 1;
+  if(0 != config.get_double("visr", &fluid->visr)) return 1;
+  fluid->refden = fmin(1., fluid->denr); // reference density: the smaller of 1 and denr
+  report(domain->info, fluid);
+  return 0;
+}
+
diff --git a/src/fluid/predict/internal.h b/src/fluid/predict/internal.h
new file mode 100644
index 0000000..2b560c7
--- /dev/null
+++ b/src/fluid/predict/internal.h
@@ -0,0 +1,85 @@
+#if !defined(FLUID_COMPUTE_RHS_INTERNAL)
+#define FLUID_COMPUTE_RHS_INTERNAL
+
+#include "interface.h"
+
+extern int compute_txx(
+    const domain_t * domain,
+    fluid_t * fluid
+);
+
+extern int compute_txy(
+    const domain_t * domain,
+    fluid_t * fluid
+);
+
+#if NDIMS == 3
+extern int compute_txz(
+    const domain_t * domain,
+    fluid_t * fluid
+);
+#endif
+
+extern int compute_tyy(
+    const domain_t * domain,
+    fluid_t * fluid
+);
+
+#if NDIMS == 3
+extern int compute_tyz(
+    const domain_t * domain,
+    fluid_t * fluid
+);
+#endif
+
+#if NDIMS == 3
+extern int compute_tzz(
+    const domain_t * domain,
+    fluid_t * fluid
+);
+#endif
+
+extern int compute_rhs_ux(
+    const domain_t * domain,
+    fluid_t * fluid,
+    const interface_t * interface
+);
+
+extern int compute_rhs_uy(
+    const domain_t * domain,
+    fluid_t * fluid,
+    const interface_t * interface
+);
+
+#if NDIMS == 3
+extern int compute_rhs_uz(
+    const domain_t * domain,
+    fluid_t * fluid,
+    const interface_t * interface
+);
+#endif
+
+extern int predict_ux(
+    const domain_t * domain,
+    const size_t rkstep,
+    const double dt,
+    fluid_t * fluid
+);
+
+extern int predict_uy(
+    const domain_t * domain,
+    const size_t rkstep,
+    const double dt,
+    fluid_t * fluid
+);
+
+#if NDIMS == 3
+extern int predict_uz(
+    const domain_t * domain,
+    const size_t rkstep,
+    const double dt,
+    fluid_t * fluid
+);
+#endif
+
+#endif // FLUID_COMPUTE_RHS_INTERNAL
diff --git a/src/fluid/predict/main.c b/src/fluid/predict/main.c
new file mode 100644
index 0000000..87d35f2
--- /dev/null
+++ b/src/fluid/predict/main.c
@@ -0,0 +1,97 @@
+#include <string.h>
+#include "runge_kutta.h"
+#include "array.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "interface.h"
+#include "internal.h"
+
+static int reset_srcs(
+    const size_t rkstep,
+    array_t * restrict srca,
+    array_t * restrict srcb,
+    array_t * restrict srcg
+){
+  // keep the previous RK source term by exchanging
+  //   the "data" pointers of srca and srcb
+  //   instead of copying the buffer
+  // NOTE: skipped at the first step (rkstep == 0),
+  //   since "beta" is 0 there
+  if(rkstep > 0){
+    double * const previous = srca->data;
+    srca->data = srcb->data;
+    srcb->data = previous;
+  }
+  // zero-clear current RK source terms (exp/imp)
+  memset(srcg->data, 0, srcg->datasize);
+  memset(srca->data, 0, srca->datasize);
+  return 0;
+}
+
+static int compute_rhs(
+    const domain_t * domain,
+    fluid_t * fluid,
+    const interface_t * interface
+){ // evaluate the right-hand side of every velocity component; returns 1 at the first failure
+  if(0 != compute_rhs_ux(domain, fluid, interface)) return 1;
+  if(0 != compute_rhs_uy(domain, fluid, interface)) return 1;
+#if NDIMS == 3
+  if(0 != compute_rhs_uz(domain, fluid, interface)) return 1;
+#endif
+  return 0;
+}
+
+static int predict(
+    const domain_t * domain,
+    const size_t rkstep,
+    const double dt,
+    fluid_t * fluid
+){ // advance every velocity component for this RK step; returns 1 at the first failure
+  if(0 != predict_ux(domain, rkstep, dt, fluid)) return 1;
+  if(0 != predict_uy(domain, rkstep, dt, fluid)) return 1;
+#if NDIMS == 3
+  if(0 != predict_uz(domain, rkstep, dt, fluid)) return 1;
+#endif
+  return 0;
+}
+
+/**
+ * @brief predict the new velocity field
+ * @param[in]     domain    : information related to MPI domain decomposition
+ * @param[in]     rkstep    : Runge-Kutta step
+ * @param[in]     dt        : time step size
+ * @param[in,out] fluid     : flow field (in), RK source terms (in,out)
+ * @param[in]     interface : interface data forwarded to the right-hand-side evaluation
+ * @return                  : (success) 0, (failure) 1
+ */
+int fluid_predict_field(
+    const domain_t * domain,
+    const size_t rkstep,
+    const double dt,
+    fluid_t * fluid,
+    const interface_t * interface
+){
+  // copy previous k-step source term and reset
+  reset_srcs(rkstep, fluid->srcux + rk_a, fluid->srcux + rk_b, fluid->srcux + rk_g);
+  reset_srcs(rkstep, fluid->srcuy + rk_a, fluid->srcuy + rk_b, fluid->srcuy + rk_g);
+#if NDIMS == 3
+  reset_srcs(rkstep, fluid->srcuz + rk_a, fluid->srcuz + rk_b, fluid->srcuz + rk_g);
+#endif
+  // compute shear-stress tensor; a component reports failure with a non-zero code
+  if(0 != compute_txx(domain, fluid)) return 1;
+  if(0 != compute_txy(domain, fluid)) return 1;
+#if NDIMS == 3
+  if(0 != compute_txz(domain, fluid)) return 1;
+#endif
+  if(0 != compute_tyy(domain, fluid)) return 1;
+#if NDIMS == 3
+  if(0 != compute_tyz(domain, fluid)) return 1;
+  if(0 != compute_tzz(domain, fluid)) return 1;
+#endif
+  // compute right-hand-side terms of the Runge-Kutta scheme
+  if(0 != compute_rhs(domain, fluid, interface)) return 1;
+  // update the velocity field, which is still a prediction at this point
+  if(0 != predict(domain, rkstep, dt, fluid)) return 1;
+  return 0;
+}
+
diff --git a/src/fluid/predict/txx.c b/src/fluid/predict/txx.c
new file mode 100644
index 0000000..6ec34b1
--- /dev/null
+++ b/src/fluid/predict/txx.c
@@ -0,0 +1,69 @@
+#include <mpi.h>
+#include "domain.h"
+#include "fluid.h"
+#include "halo.h"
+#include "./internal.h"
+#include "array_macros/domain/hxxc.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/visux.h"
+#include "array_macros/fluid/txx.h"
+
+int compute_txx(
+    const domain_t * domain,
+    fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  double * restrict txx = fluid->txx.data;
+  const double * restrict ux = fluid->ux.data;
+  const double * restrict visux = fluid->visux.data;
+  const double * restrict hxxc = domain->hxxc;
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      // txx at cell center | 7
+      const double du = UX(i+1, j  ) - UX(i  , j  );
+      const double dudx = 1. / HXXC(i  ) * du;
+      const double mu = 0.5 * VISUX(i  , j  )
+                      + 0.5 * VISUX(i+1, j  );
+      // the same product is added twice,
+      //   i.e. twice the viscosity times dux / hx
+      TXX(i, j) = mu * dudx + mu * dudx;
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // txx at cell center | 7
+        const double du = UX(i+1, j  , k  ) - UX(i  , j  , k  );
+        const double dudx = 1. / HXXC(i  ) * du;
+        const double mu = 0.5 * VISUX(i  , j  , k  )
+                        + 0.5 * VISUX(i+1, j  , k  );
+        // the same product is added twice,
+        //   i.e. twice the viscosity times dux / hx
+        TXX(i, j, k) = mu * dudx + mu * dudx;
+      }
+    }
+  }
+#endif
+  // communicate txx halo cells in y (and in z for 3D),
+  //   using one datatype slot per direction
+  static MPI_Datatype dtypes[NDIMS - 1] = {
+    MPI_DOUBLE,
+#if NDIMS == 3
+    MPI_DOUBLE,
+#endif
+  };
+  const int failed_y = halo_communicate_in_y(domain, dtypes + 0, &fluid->txx);
+  if(0 != failed_y) return 1;
+#if NDIMS == 3
+  const int failed_z = halo_communicate_in_z(domain, dtypes + 1, &fluid->txx);
+  if(0 != failed_z) return 1;
+#endif
+  return 0;
+}
+
diff --git a/src/fluid/predict/txy.c b/src/fluid/predict/txy.c
new file mode 100644
index 0000000..4ed2702
--- /dev/null
+++ b/src/fluid/predict/txy.c
@@ -0,0 +1,84 @@
+#include <mpi.h>
+#include "domain.h"
+#include "fluid.h"
+#include "halo.h"
+#include "./internal.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/uy.h"
+#include "array_macros/fluid/visux.h"
+#include "array_macros/fluid/visuy.h"
+#include "array_macros/fluid/txy.h"
+
+/**
+ * @brief compute txy (labelled "txy at cell corner" below) and exchange its halo
+ *
+ * For i in [1, isize + 1], j in [1, jsize] (and k in [1, ksize] when NDIMS == 3):
+ *   txy = vis_ux * (ux(j) - ux(j-1)) / hy
+ *       + vis_uy * (uy(i) - uy(i-1)) / hxxf(i),
+ *   where vis_ux is the mean of visux at j-1 and j,
+ *   and   vis_uy is the mean of visuy at i-1 and i.
+ * @param[in]     domain : local sizes (mysizes), the hxxf array and hy
+ * @param[in,out] fluid  : ux, uy, visux, visuy (in), txy (out)
+ * @return               : 0 on success, 1 if a halo exchange returns non-zero
+ */
+int compute_txy(
+    const domain_t * domain,
+    fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf;
+  const double hy = domain->hy;
+  const double * restrict ux = fluid->ux.data;
+  const double * restrict uy = fluid->uy.data;
+  const double * restrict visux = fluid->visux.data;
+  const double * restrict visuy = fluid->visuy.data;
+  double * restrict txy = fluid->txy.data;
+  // the x range is [1, isize + 1]; indices i-1 = 0 and j-1 = 0 are read,
+  //   so the input arrays must hold valid values there
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize + 1; i++){
+      // txy at cell corner | 12
+      const double vis_ux = + 0.5 * VISUX(i  , j-1)
+                            + 0.5 * VISUX(i  , j  );
+      const double vis_uy = + 0.5 * VISUY(i-1, j  )
+                            + 0.5 * VISUY(i  , j  );
+      const double hx = HXXF(i  );
+      const double dux = - UX(i  , j-1)
+                         + UX(i  , j  );
+      const double duy = - UY(i-1, j  )
+                         + UY(i  , j  );
+      const double lxy = 1. / hy * dux;
+      const double lyx = 1. / hx * duy;
+      TXY(i, j) = vis_ux * lxy + vis_uy * lyx;
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize + 1; i++){
+        // txy at cell corner | 12
+        const double vis_ux = + 0.5 * VISUX(i  , j-1, k  )
+                              + 0.5 * VISUX(i  , j  , k  );
+        const double vis_uy = + 0.5 * VISUY(i-1, j  , k  )
+                              + 0.5 * VISUY(i  , j  , k  );
+        const double hx = HXXF(i  );
+        const double dux = - UX(i  , j-1, k  )
+                           + UX(i  , j  , k  );
+        const double duy = - UY(i-1, j  , k  )
+                           + UY(i  , j  , k  );
+        const double lxy = 1. / hy * dux;
+        const double lyx = 1. / hx * duy;
+        TXY(i, j, k) = vis_ux * lxy + vis_uy * lyx;
+      }
+    }
+  }
+#endif
+  // one datatype slot per halo direction (y, and z in 3D); being static, the
+  //   array persists across calls
+  // NOTE(review): presumably the halo routines overwrite these entries with
+  //   derived datatypes on first use - confirm in halo.h
+  static MPI_Datatype dtypes[NDIMS - 1] = {
+    MPI_DOUBLE,
+#if NDIMS == 3
+    MPI_DOUBLE,
+#endif
+  };
+  if(0 != halo_communicate_in_y(domain, dtypes + 0, &fluid->txy)){
+    return 1;
+  }
+#if NDIMS == 3
+  if(0 != halo_communicate_in_z(domain, dtypes + 1, &fluid->txy)){
+    return 1;
+  }
+#endif
+  return 0;
+}
+
diff --git a/src/fluid/predict/txz.c b/src/fluid/predict/txz.c
new file mode 100644
index 0000000..2ecdaf5
--- /dev/null
+++ b/src/fluid/predict/txz.c
@@ -0,0 +1,59 @@
+#if NDIMS == 3
+#include <mpi.h>
+#include "domain.h"
+#include "fluid.h"
+#include "halo.h"
+#include "./internal.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/uz.h"
+#include "array_macros/fluid/visux.h"
+#include "array_macros/fluid/visuz.h"
+#include "array_macros/fluid/txz.h"
+
+/**
+ * @brief compute txz (labelled "txz at cell corner" below) and exchange its halo
+ *
+ * Only compiled when NDIMS == 3.
+ * For i in [1, isize + 1], j in [1, jsize], k in [1, ksize]:
+ *   txz = vis_ux * (ux(k) - ux(k-1)) / hz
+ *       + vis_uz * (uz(i) - uz(i-1)) / hxxf(i),
+ *   where vis_ux is the mean of visux at k-1 and k,
+ *   and   vis_uz is the mean of visuz at i-1 and i.
+ * @param[in]     domain : local sizes (mysizes), the hxxf array and hz
+ * @param[in,out] fluid  : ux, uz, visux, visuz (in), txz (out)
+ * @return               : 0 on success, 1 if a halo exchange returns non-zero
+ */
+int compute_txz(
+    const domain_t * domain,
+    fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double * restrict hxxf = domain->hxxf;
+  const double hz = domain->hz;
+  const double * restrict ux = fluid->ux.data;
+  const double * restrict uz = fluid->uz.data;
+  const double * restrict visux = fluid->visux.data;
+  const double * restrict visuz = fluid->visuz.data;
+  double * restrict txz = fluid->txz.data;
+  // the x range is [1, isize + 1]; indices i-1 = 0 and k-1 = 0 are read,
+  //   so the input arrays must hold valid values there
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize + 1; i++){
+        // txz at cell corner | 12
+        const double vis_ux = + 0.5 * VISUX(i  , j  , k-1)
+                              + 0.5 * VISUX(i  , j  , k  );
+        const double vis_uz = + 0.5 * VISUZ(i-1, j  , k  )
+                              + 0.5 * VISUZ(i  , j  , k  );
+        const double hx = HXXF(i  );
+        const double dux = - UX(i  , j  , k-1)
+                           + UX(i  , j  , k  );
+        const double duz = - UZ(i-1, j  , k  )
+                           + UZ(i  , j  , k  );
+        const double lxz = 1. / hz * dux;
+        const double lzx = 1. / hx * duz;
+        TXZ(i, j, k) = vis_ux * lxz + vis_uz * lzx;
+      }
+    }
+  }
+  // one datatype slot per halo direction (y and z); being static, the array
+  //   persists across calls
+  // NOTE(review): presumably the halo routines overwrite these entries with
+  //   derived datatypes on first use - confirm in halo.h
+  static MPI_Datatype dtypes[NDIMS - 1] = {
+    MPI_DOUBLE,
+    MPI_DOUBLE,
+  };
+  if(0 != halo_communicate_in_y(domain, dtypes + 0, &fluid->txz)){
+    return 1;
+  }
+  if(0 != halo_communicate_in_z(domain, dtypes + 1, &fluid->txz)){
+    return 1;
+  }
+  return 0;
+}
+#endif
diff --git a/src/fluid/predict/tyy.c b/src/fluid/predict/tyy.c
new file mode 100644
index 0000000..4cfbaca
--- /dev/null
+++ b/src/fluid/predict/tyy.c
@@ -0,0 +1,66 @@
+#include <mpi.h>
+#include "domain.h"
+#include "fluid.h"
+#include "halo.h"
+#include "./internal.h"
+#include "array_macros/fluid/uy.h"
+#include "array_macros/fluid/visuy.h"
+#include "array_macros/fluid/tyy.h"
+
+/**
+ * @brief compute tyy (labelled "tyy at cell center" below) and exchange its halo
+ *
+ * For i in [1, isize], j in [1, jsize] (and k in [1, ksize] when NDIMS == 3):
+ *   tyy = 2 * vis * (uy(j+1) - uy(j)) / hy,
+ *   where vis is the arithmetic mean of visuy at j and j+1
+ *   (the factor 2 is written as the sum of two identical products).
+ * @param[in]     domain : local sizes (mysizes) and hy
+ * @param[in,out] fluid  : uy, visuy (in), tyy (out)
+ * @return               : 0 on success, 1 if a halo exchange returns non-zero
+ */
+int compute_tyy(
+    const domain_t * domain,
+    fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double hy = domain->hy;
+  const double * restrict uy = fluid->uy.data;
+  const double * restrict visuy = fluid->visuy.data;
+  double * restrict tyy = fluid->tyy.data;
+  // index j+1 reaches jsize + 1, so uy and visuy must be valid there
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      // tyy at cell center | 6
+      const double vis_uy = + 0.5 * VISUY(i  , j  )
+                            + 0.5 * VISUY(i  , j+1);
+      const double duy = - UY(i  , j  )
+                         + UY(i  , j+1);
+      const double lyy = 1. / hy * duy;
+      TYY(i, j) = vis_uy * lyy + vis_uy * lyy;
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // tyy at cell center | 6
+        const double vis_uy = + 0.5 * VISUY(i  , j  , k  )
+                              + 0.5 * VISUY(i  , j+1, k  );
+        const double duy = - UY(i  , j  , k  )
+                           + UY(i  , j+1, k  );
+        const double lyy = 1. / hy * duy;
+        TYY(i, j, k) = vis_uy * lyy + vis_uy * lyy;
+      }
+    }
+  }
+#endif
+  // one datatype slot per halo direction (y, and z in 3D); being static, the
+  //   array persists across calls
+  // NOTE(review): presumably the halo routines overwrite these entries with
+  //   derived datatypes on first use - confirm in halo.h
+  static MPI_Datatype dtypes[NDIMS - 1] = {
+    MPI_DOUBLE,
+#if NDIMS == 3
+    MPI_DOUBLE,
+#endif
+  };
+  if(0 != halo_communicate_in_y(domain, dtypes + 0, &fluid->tyy)){
+    return 1;
+  }
+#if NDIMS == 3
+  if(0 != halo_communicate_in_z(domain, dtypes + 1, &fluid->tyy)){
+    return 1;
+  }
+#endif
+  return 0;
+}
+
diff --git a/src/fluid/predict/tyz.c b/src/fluid/predict/tyz.c
new file mode 100644
index 0000000..ac7746f
--- /dev/null
+++ b/src/fluid/predict/tyz.c
@@ -0,0 +1,57 @@
+#if NDIMS == 3
+#include <mpi.h>
+#include "domain.h"
+#include "fluid.h"
+#include "halo.h"
+#include "./internal.h"
+#include "array_macros/fluid/uy.h"
+#include "array_macros/fluid/uz.h"
+#include "array_macros/fluid/visuy.h"
+#include "array_macros/fluid/visuz.h"
+#include "array_macros/fluid/tyz.h"
+
+/**
+ * @brief compute tyz (labelled "tyz at cell corner" below) and exchange its halo
+ *
+ * Only compiled when NDIMS == 3.
+ * For i in [1, isize], j in [1, jsize], k in [1, ksize]:
+ *   tyz = vis_uy * (uy(k) - uy(k-1)) / hz
+ *       + vis_uz * (uz(j) - uz(j-1)) / hy,
+ *   where vis_uy is the mean of visuy at k-1 and k,
+ *   and   vis_uz is the mean of visuz at j-1 and j.
+ * @param[in]     domain : local sizes (mysizes), hy and hz
+ * @param[in,out] fluid  : uy, uz, visuy, visuz (in), tyz (out)
+ * @return               : 0 on success, 1 if a halo exchange returns non-zero
+ */
+int compute_tyz(
+    const domain_t * domain,
+    fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double hy = domain->hy;
+  const double hz = domain->hz;
+  const double * restrict uy = fluid->uy.data;
+  const double * restrict uz = fluid->uz.data;
+  const double * restrict visuy = fluid->visuy.data;
+  const double * restrict visuz = fluid->visuz.data;
+  double * restrict tyz = fluid->tyz.data;
+  // indices j-1 = 0 and k-1 = 0 are read, so the input arrays must hold
+  //   valid values there
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // tyz at cell corner | 11
+        const double vis_uy = + 0.5 * VISUY(i  , j  , k-1)
+                              + 0.5 * VISUY(i  , j  , k  );
+        const double vis_uz = + 0.5 * VISUZ(i  , j-1, k  )
+                              + 0.5 * VISUZ(i  , j  , k  );
+        const double duy = - UY(i  , j  , k-1)
+                           + UY(i  , j  , k  );
+        const double duz = - UZ(i  , j-1, k  )
+                           + UZ(i  , j  , k  );
+        const double lyz = 1. / hz * duy;
+        const double lzy = 1. / hy * duz;
+        TYZ(i, j, k) = vis_uy * lyz + vis_uz * lzy;
+      }
+    }
+  }
+  // one datatype slot per halo direction (y and z); being static, the array
+  //   persists across calls
+  // NOTE(review): presumably the halo routines overwrite these entries with
+  //   derived datatypes on first use - confirm in halo.h
+  static MPI_Datatype dtypes[NDIMS - 1] = {
+    MPI_DOUBLE,
+    MPI_DOUBLE,
+  };
+  if(0 != halo_communicate_in_y(domain, dtypes + 0, &fluid->tyz)){
+    return 1;
+  }
+  if(0 != halo_communicate_in_z(domain, dtypes + 1, &fluid->tyz)){
+    return 1;
+  }
+  return 0;
+}
+#endif
diff --git a/src/fluid/predict/tzz.c b/src/fluid/predict/tzz.c
new file mode 100644
index 0000000..84efe3c
--- /dev/null
+++ b/src/fluid/predict/tzz.c
@@ -0,0 +1,47 @@
+#if NDIMS == 3
+#include <mpi.h>
+#include "domain.h"
+#include "fluid.h"
+#include "halo.h"
+#include "./internal.h"
+#include "array_macros/fluid/uz.h"
+#include "array_macros/fluid/visuz.h"
+#include "array_macros/fluid/tzz.h"
+
+/**
+ * @brief compute tzz (labelled "tzz at cell center" below) and exchange its halo
+ *
+ * Only compiled when NDIMS == 3.
+ * For i in [1, isize], j in [1, jsize], k in [1, ksize]:
+ *   tzz = 2 * vis * (uz(k+1) - uz(k)) / hz,
+ *   where vis is the arithmetic mean of visuz at k and k+1
+ *   (the factor 2 is written as the sum of two identical products).
+ * @param[in]     domain : local sizes (mysizes) and hz
+ * @param[in,out] fluid  : uz, visuz (in), tzz (out)
+ * @return               : 0 on success, 1 if a halo exchange returns non-zero
+ */
+int compute_tzz(
+    const domain_t * domain,
+    fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double hz = domain->hz;
+  const double * restrict uz = fluid->uz.data;
+  const double * restrict visuz = fluid->visuz.data;
+  double * restrict tzz = fluid->tzz.data;
+  // index k+1 reaches ksize + 1, so uz and visuz must be valid there
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // tzz at cell center | 6
+        const double vis_uz = + 0.5 * VISUZ(i  , j  , k  )
+                              + 0.5 * VISUZ(i  , j  , k+1);
+        const double duz = - UZ(i  , j  , k  )
+                           + UZ(i  , j  , k+1);
+        const double lzz = 1. / hz * duz;
+        TZZ(i, j, k) = vis_uz * lzz + vis_uz * lzz;
+      }
+    }
+  }
+  // one datatype slot per halo direction (y and z); being static, the array
+  //   persists across calls
+  // NOTE(review): presumably the halo routines overwrite these entries with
+  //   derived datatypes on first use - confirm in halo.h
+  static MPI_Datatype dtypes[NDIMS - 1] = {
+    MPI_DOUBLE,
+    MPI_DOUBLE,
+  };
+  if(0 != halo_communicate_in_y(domain, dtypes + 0, &fluid->tzz)){
+    return 1;
+  }
+  if(0 != halo_communicate_in_z(domain, dtypes + 1, &fluid->tzz)){
+    return 1;
+  }
+  return 0;
+}
+#endif
diff --git a/src/fluid/predict/ux.c b/src/fluid/predict/ux.c
new file mode 100644
index 0000000..4259916
--- /dev/null
+++ b/src/fluid/predict/ux.c
@@ -0,0 +1,508 @@
+#include "memory.h"
+#include "runge_kutta.h"
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "interface.h"
+#include "interface_solver.h"
+#include "internal.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/domain/hxxc.h"
+#include "array_macros/domain/jdxf.h"
+#include "array_macros/domain/jdxc.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/p.h"
+#include "array_macros/fluid/den.h"
+#include "array_macros/fluid/txx.h"
+#include "array_macros/fluid/txy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/txz.h"
+#endif
+#include "array_macros/interface/fluxx.h"
+#include "array_macros/interface/fluxy.h"
+#if NDIMS == 3
+#include "array_macros/interface/fluxz.h"
+#endif
+#include "array_macros/interface/ifrcx.h"
+
+// Loop helpers used by the kernels below: BEGIN opens the nested loops over
+//   k (NDIMS == 3 only), j and i, and END closes them.
+// Ranges: i in [2, isize], j in [1, jsize], k in [1, ksize].
+// cnt starts at 0 and is incremented once per innermost iteration, so it
+//   indexes a buffer packed in loop order (i fastest).
+// The enclosing function must define isize, jsize (and ksize in 3D).
+// NOTE(review): i = 1 is skipped, presumably because ux at that x position
+//   is fixed by the boundary condition - confirm
+#if NDIMS == 2
+#define BEGIN \
+  for(int cnt = 0, j = 1; j <= jsize; j++){ \
+    for(int i = 2; i <= isize; i++, cnt++){
+#define END \
+    } \
+  }
+#else
+#define BEGIN \
+  for(int cnt = 0, k = 1; k <= ksize; k++){ \
+    for(int j = 1; j <= jsize; j++){ \
+      for(int i = 2; i <= isize; i++, cnt++){
+#define END \
+      } \
+    } \
+  }
+#endif
+
+/**
+ * @brief subtract the x-direction advective term of ux from src
+ *
+ * src[cnt] -= (mux_xp * ux_xp - mux_xm * ux_xm) / jdxf(i), where
+ *   mux_xm / mux_xp average jdxf / hxxf * fluxx over (i-1, i) / (i, i+1),
+ *   ux_xm  / ux_xp  average ux over the same index pairs.
+ * @param[in]     domain : local sizes (mysizes), hxxf and jdxf arrays
+ * @param[in]     ux     : field accessed through the UX macro
+ * @param[in]     fluxx  : field accessed through the FLUXX macro
+ * @param[in,out] src    : buffer updated in place, indexed by cnt (BEGIN/END)
+ * @return               : always 0
+ */
+static int advection_x(
+    const domain_t * domain,
+    const double * restrict ux,
+    const double * restrict fluxx,
+    double * restrict src
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf;
+  const double * restrict jdxf = domain->jdxf;
+  // NOTE(review): the "| N" suffix of the comments inside the loops equals the
+  //   number of lines of the snippet that follows; presumably it is read by
+  //   external tooling, so keep it in sync when editing - confirm
+#if NDIMS == 2
+  BEGIN
+    // ux is advected in x | 18
+    const double hx_xm = HXXF(i-1);
+    const double hx_x0 = HXXF(i  );
+    const double hx_xp = HXXF(i+1);
+    const double jd_xm = JDXF(i-1);
+    const double jd_x0 = JDXF(i  );
+    const double jd_xp = JDXF(i+1);
+    const double mux_xm = + 0.5 * jd_xm / hx_xm * FLUXX(i-1, j  )
+                          + 0.5 * jd_x0 / hx_x0 * FLUXX(i  , j  );
+    const double mux_xp = + 0.5 * jd_x0 / hx_x0 * FLUXX(i  , j  )
+                          + 0.5 * jd_xp / hx_xp * FLUXX(i+1, j  );
+    const double ux_xm = + 0.5 * UX(i-1, j  )
+                         + 0.5 * UX(i  , j  );
+    const double ux_xp = + 0.5 * UX(i  , j  )
+                         + 0.5 * UX(i+1, j  );
+    src[cnt] -= 1. / jd_x0 * (
+        - mux_xm * ux_xm
+        + mux_xp * ux_xp
+    );
+  END
+#else
+  BEGIN
+    // ux is advected in x | 18
+    const double hx_xm = HXXF(i-1);
+    const double hx_x0 = HXXF(i  );
+    const double hx_xp = HXXF(i+1);
+    const double jd_xm = JDXF(i-1);
+    const double jd_x0 = JDXF(i  );
+    const double jd_xp = JDXF(i+1);
+    const double mux_xm = + 0.5 * jd_xm / hx_xm * FLUXX(i-1, j  , k  )
+                          + 0.5 * jd_x0 / hx_x0 * FLUXX(i  , j  , k  );
+    const double mux_xp = + 0.5 * jd_x0 / hx_x0 * FLUXX(i  , j  , k  )
+                          + 0.5 * jd_xp / hx_xp * FLUXX(i+1, j  , k  );
+    const double ux_xm = + 0.5 * UX(i-1, j  , k  )
+                         + 0.5 * UX(i  , j  , k  );
+    const double ux_xp = + 0.5 * UX(i  , j  , k  )
+                         + 0.5 * UX(i+1, j  , k  );
+    src[cnt] -= 1. / jd_x0 * (
+        - mux_xm * ux_xm
+        + mux_xp * ux_xp
+    );
+  END
+#endif
+  return 0;
+}
+
+/**
+ * @brief subtract the y-direction advective term of ux from src
+ *
+ * src[cnt] -= (muy_yp * ux_yp - muy_ym * ux_ym) / jdxf(i), where
+ *   muy_ym / muy_yp average jdxc / hy * fluxy over i-1 and i, at j / j+1,
+ *   ux_ym  / ux_yp  average ux over (j-1, j) / (j, j+1).
+ * @param[in]     domain : local sizes (mysizes), jdxf and jdxc arrays, hy
+ * @param[in]     ux     : field accessed through the UX macro
+ * @param[in]     fluxy  : field accessed through the FLUXY macro
+ * @param[in,out] src    : buffer updated in place, indexed by cnt (BEGIN/END)
+ * @return               : always 0
+ */
+static int advection_y(
+    const domain_t * domain,
+    const double * restrict ux,
+    const double * restrict fluxy,
+    double * restrict src
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+  const double hy = domain->hy;
+  // note that jd_xm and jd_xp are taken from jdxc (at i-1 and i), whereas
+  //   jd_x0 is taken from jdxf (at i)
+#if NDIMS == 2
+  BEGIN
+    // ux is advected in y | 15
+    const double jd_xm = JDXC(i-1);
+    const double jd_x0 = JDXF(i  );
+    const double jd_xp = JDXC(i  );
+    const double muy_ym = + 0.5 * jd_xm / hy * FLUXY(i-1, j  )
+                          + 0.5 * jd_xp / hy * FLUXY(i  , j  );
+    const double muy_yp = + 0.5 * jd_xm / hy * FLUXY(i-1, j+1)
+                          + 0.5 * jd_xp / hy * FLUXY(i  , j+1);
+    const double ux_ym = + 0.5 * UX(i  , j-1)
+                         + 0.5 * UX(i  , j  );
+    const double ux_yp = + 0.5 * UX(i  , j  )
+                         + 0.5 * UX(i  , j+1);
+    src[cnt] -= 1. / jd_x0 * (
+        - muy_ym * ux_ym
+        + muy_yp * ux_yp
+    );
+  END
+#else
+  BEGIN
+    // ux is advected in y | 15
+    const double jd_xm = JDXC(i-1);
+    const double jd_x0 = JDXF(i  );
+    const double jd_xp = JDXC(i  );
+    const double muy_ym = + 0.5 * jd_xm / hy * FLUXY(i-1, j  , k  )
+                          + 0.5 * jd_xp / hy * FLUXY(i  , j  , k  );
+    const double muy_yp = + 0.5 * jd_xm / hy * FLUXY(i-1, j+1, k  )
+                          + 0.5 * jd_xp / hy * FLUXY(i  , j+1, k  );
+    const double ux_ym = + 0.5 * UX(i  , j-1, k  )
+                         + 0.5 * UX(i  , j  , k  );
+    const double ux_yp = + 0.5 * UX(i  , j  , k  )
+                         + 0.5 * UX(i  , j+1, k  );
+    src[cnt] -= 1. / jd_x0 * (
+        - muy_ym * ux_ym
+        + muy_yp * ux_yp
+    );
+  END
+#endif
+  return 0;
+}
+
+#if NDIMS == 3
+/**
+ * @brief subtract the z-direction advective term of ux from src (3D only)
+ *
+ * src[cnt] -= (muz_zp * ux_zp - muz_zm * ux_zm) / jdxf(i), where
+ *   muz_zm / muz_zp average jdxc / hz * fluxz over i-1 and i, at k / k+1,
+ *   ux_zm  / ux_zp  average ux over (k-1, k) / (k, k+1).
+ * @param[in]     domain : local sizes (mysizes), jdxf and jdxc arrays, hz
+ * @param[in]     ux     : field accessed through the UX macro
+ * @param[in]     fluxz  : field accessed through the FLUXZ macro
+ * @param[in,out] src    : buffer updated in place, indexed by cnt (BEGIN/END)
+ * @return               : always 0
+ */
+static int advection_z(
+    const domain_t * domain,
+    const double * restrict ux,
+    const double * restrict fluxz,
+    double * restrict src
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+  const double hz = domain->hz;
+  // note that jd_xm and jd_xp are taken from jdxc (at i-1 and i), whereas
+  //   jd_x0 is taken from jdxf (at i)
+  BEGIN
+    // ux is advected in z | 15
+    const double jd_xm = JDXC(i-1);
+    const double jd_x0 = JDXF(i  );
+    const double jd_xp = JDXC(i  );
+    const double muz_zm = + 0.5 * jd_xm / hz * FLUXZ(i-1, j  , k  )
+                          + 0.5 * jd_xp / hz * FLUXZ(i  , j  , k  );
+    const double muz_zp = + 0.5 * jd_xm / hz * FLUXZ(i-1, j  , k+1)
+                          + 0.5 * jd_xp / hz * FLUXZ(i  , j  , k+1);
+    const double ux_zm = + 0.5 * UX(i  , j  , k-1)
+                         + 0.5 * UX(i  , j  , k  );
+    const double ux_zp = + 0.5 * UX(i  , j  , k  )
+                         + 0.5 * UX(i  , j  , k+1);
+    src[cnt] -= 1. / jd_x0 * (
+        - muz_zm * ux_zm
+        + muz_zp * ux_zp
+    );
+  END
+  return 0;
+}
+#endif
+
+/**
+ * @brief add the x-direction diffusive term (built from txx) to src
+ *
+ * src[cnt] += diffusivity / jdxf(i) * (
+ *     jdxc(i) / hxxc(i) * txx(i) - jdxc(i-1) / hxxc(i-1) * txx(i-1)
+ * ).
+ * @param[in]     domain      : local sizes (mysizes), hxxc, jdxf, jdxc arrays
+ * @param[in]     diffusivity : scalar pre-factor of the whole term
+ * @param[in]     txx         : field accessed through the TXX macro
+ * @param[in,out] src         : buffer updated in place, indexed by cnt
+ * @return                    : always 0
+ */
+static int diffusion_x(
+    const domain_t * domain,
+    const double diffusivity,
+    const double * restrict txx,
+    double * restrict src
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxc = domain->hxxc;
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+  // "xm" quantities are evaluated at i-1 and "xp" quantities at i
+#if NDIMS == 2
+  BEGIN
+    // ux is diffused in x | 11
+    const double hx_xm = HXXC(i-1);
+    const double hx_xp = HXXC(i  );
+    const double jd_xm = JDXC(i-1);
+    const double jd_x0 = JDXF(i  );
+    const double jd_xp = JDXC(i  );
+    const double txx_xm = TXX(i-1, j  );
+    const double txx_xp = TXX(i  , j  );
+    src[cnt] += diffusivity / jd_x0 * (
+        - jd_xm / hx_xm * txx_xm
+        + jd_xp / hx_xp * txx_xp
+    );
+  END
+#else
+  BEGIN
+    // ux is diffused in x | 11
+    const double hx_xm = HXXC(i-1);
+    const double hx_xp = HXXC(i  );
+    const double jd_xm = JDXC(i-1);
+    const double jd_x0 = JDXF(i  );
+    const double jd_xp = JDXC(i  );
+    const double txx_xm = TXX(i-1, j  , k  );
+    const double txx_xp = TXX(i  , j  , k  );
+    src[cnt] += diffusivity / jd_x0 * (
+        - jd_xm / hx_xm * txx_xm
+        + jd_xp / hx_xp * txx_xp
+    );
+  END
+#endif
+  return 0;
+}
+
+/**
+ * @brief add the y-direction diffusive term (built from txy) to src
+ *
+ * src[cnt] += diffusivity / jd * (jd / hy * txy(j+1) - jd / hy * txy(j)),
+ *   with jd = jdxf(i); the factor jd appears both in the numerator and in
+ *   the denominator, so it cancels algebraically.
+ * @param[in]     domain      : local sizes (mysizes), hy and the jdxf array
+ * @param[in]     diffusivity : scalar pre-factor of the whole term
+ * @param[in]     txy         : field accessed through the TXY macro
+ * @param[in,out] src         : buffer updated in place, indexed by cnt
+ * @return                    : always 0
+ */
+static int diffusion_y(
+    const domain_t * domain,
+    const double diffusivity,
+    const double * restrict txy,
+    double * restrict src
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double hy = domain->hy;
+  const double * restrict jdxf = domain->jdxf;
+  // index j+1 reaches jsize + 1, so txy must be valid there
+#if NDIMS == 2
+  BEGIN
+    // ux is diffused in y | 7
+    const double jd = JDXF(i  );
+    const double txy_ym = TXY(i  , j  );
+    const double txy_yp = TXY(i  , j+1);
+    src[cnt] += diffusivity / jd * (
+        - jd / hy * txy_ym
+        + jd / hy * txy_yp
+    );
+  END
+#else
+  BEGIN
+    // ux is diffused in y | 7
+    const double jd = JDXF(i  );
+    const double txy_ym = TXY(i  , j  , k  );
+    const double txy_yp = TXY(i  , j+1, k  );
+    src[cnt] += diffusivity / jd * (
+        - jd / hy * txy_ym
+        + jd / hy * txy_yp
+    );
+  END
+#endif
+  return 0;
+}
+
+#if NDIMS == 3
+/**
+ * @brief add the z-direction diffusive term (built from txz) to src (3D only)
+ *
+ * src[cnt] += diffusivity / jd * (jd / hz * txz(k+1) - jd / hz * txz(k)),
+ *   with jd = jdxf(i); the factor jd appears both in the numerator and in
+ *   the denominator, so it cancels algebraically.
+ * @param[in]     domain      : local sizes (mysizes), hz and the jdxf array
+ * @param[in]     diffusivity : scalar pre-factor of the whole term
+ * @param[in]     txz         : field accessed through the TXZ macro
+ * @param[in,out] src         : buffer updated in place, indexed by cnt
+ * @return                    : always 0
+ */
+static int diffusion_z(
+    const domain_t * domain,
+    const double diffusivity,
+    const double * restrict txz,
+    double * restrict src
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double hz = domain->hz;
+  const double * restrict jdxf = domain->jdxf;
+  // index k+1 reaches ksize + 1, so txz must be valid there
+  BEGIN
+    // ux is diffused in z | 7
+    const double jd = JDXF(i  );
+    const double txz_zm = TXZ(i  , j  , k  );
+    const double txz_zp = TXZ(i  , j  , k+1);
+    src[cnt] += diffusivity / jd * (
+        - jd / hz * txz_zm
+        + jd / hz * txz_zp
+    );
+  END
+  return 0;
+}
+#endif
+
+/**
+ * @brief subtract the x pressure-gradient term from src
+ *
+ * src[cnt] -= (p(i) - p(i-1)) / hxxf(i).
+ * @param[in]     domain : local sizes (mysizes) and the hxxf array
+ * @param[in]     p      : field accessed through the P macro
+ * @param[in,out] src    : buffer updated in place, indexed by cnt (BEGIN/END)
+ * @return               : always 0
+ */
+static int pressure(
+    const domain_t * domain,
+    const double * restrict p,
+    double * restrict src
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf;
+#if NDIMS == 2
+  BEGIN
+    // pressure-gradient contribution | 4
+    src[cnt] -= 1. / HXXF(i  ) * (
+        - P(i-1, j  )
+        + P(i  , j  )
+    );
+  END
+#else
+  BEGIN
+    // pressure-gradient contribution | 4
+    src[cnt] -= 1. / HXXF(i  ) * (
+        - P(i-1, j  , k  )
+        + P(i  , j  , k  )
+    );
+  END
+#endif
+  return 0;
+}
+
+/**
+ * @brief add ifrcx to the ux source buffer
+ * @param[in]     domain : local sizes (mysizes)
+ * @param[in]     ifrcx  : field accessed through the IFRCX macro
+ * @param[in,out] src    : buffer incremented in place, packed in loop order
+ * @return               : always 0
+ */
+static int surface(
+    const domain_t * domain,
+    const double * restrict ifrcx,
+    double * restrict src
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  // running index into src; the traversal below is the one spelled by the
+  //   BEGIN / END macros of this file (i in [2, isize], i fastest)
+  int cnt = 0;
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 2; i <= isize; i++){
+      src[cnt] += IFRCX(i, j);
+      cnt += 1;
+    }
+  }
+#else
+  const int ksize = domain->mysizes[2];
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 2; i <= isize; i++){
+        src[cnt] += IFRCX(i, j, k);
+        cnt += 1;
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
+/**
+ * @brief add g times the x-averaged den to the ux source buffer
+ * @param[in]     domain : local sizes (mysizes)
+ * @param[in]     g      : scalar multiplying the averaged den
+ * @param[in]     den    : field accessed through the DEN macro
+ * @param[in,out] src    : buffer incremented in place, packed in loop order
+ * @return               : always 0
+ */
+static int gravity(
+    const domain_t * domain,
+    const double g,
+    const double * den,
+    double * restrict src
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  // running index into src; the traversal below is the one spelled by the
+  //   BEGIN / END macros of this file (i in [2, isize], i fastest)
+  int cnt = 0;
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 2; i <= isize; i++){
+      // arithmetic mean of den at i-1 and i
+      const double den_avg = + 0.5 * DEN(i-1, j  )
+                             + 0.5 * DEN(i  , j  );
+      src[cnt] += g * den_avg;
+      cnt += 1;
+    }
+  }
+#else
+  const int ksize = domain->mysizes[2];
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 2; i <= isize; i++){
+        // arithmetic mean of den at i-1 and i
+        const double den_avg = + 0.5 * DEN(i-1, j  , k  )
+                               + 0.5 * DEN(i  , j  , k  );
+        src[cnt] += g * den_avg;
+        cnt += 1;
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
+/**
+ * @brief compute right-hand-side of Runge-Kutta scheme of ux
+ *
+ * Advective, diffusive, surface (ifrcx) and gravity terms are accumulated
+ *   into srcux[rk_a]; the pressure-gradient term goes into srcux[rk_g].
+ *   Both buffers are only incremented / decremented here, never reset.
+ * @param[in]     domain    : information related to MPI domain decomposition
+ * @param[in,out] fluid     : n-step flow field (in), RK source terms (inout)
+ * @param[in]     interface : fluxx, fluxy (fluxz in 3D) and ifrcx
+ * @return                  : error code (0 on success, 1 if any term fails)
+ */
+int compute_rhs_ux(
+    const domain_t * domain,
+    fluid_t * fluid,
+    const interface_t * interface
+){
+  const double * restrict  ux = fluid-> ux.data;
+  const double * restrict   p = fluid->  p.data;
+  const double * restrict txx = fluid->txx.data;
+  const double * restrict txy = fluid->txy.data;
+#if NDIMS == 3
+  const double * restrict txz = fluid->txz.data;
+#endif
+  const double * restrict fluxx = interface->fluxx.data;
+  const double * restrict fluxy = interface->fluxy.data;
+#if NDIMS == 3
+  const double * restrict fluxz = interface->fluxz.data;
+#endif
+  double * restrict srca = fluid->srcux[rk_a].data;
+  double * restrict srcg = fluid->srcux[rk_g].data;
+  const double diffusivity = 1. / fluid->Re;
+  // - 1 / Fr^2, multiplied by the x-averaged den inside gravity()
+  const double acceleration = -1. / fluid->Fr / fluid->Fr;
+  // the status of each term is checked instead of being discarded,
+  //   so that a failure is reported to the caller
+  // advective contributions
+  if(0 != advection_x(domain, ux, fluxx, srca)){
+    return 1;
+  }
+  if(0 != advection_y(domain, ux, fluxy, srca)){
+    return 1;
+  }
+#if NDIMS == 3
+  if(0 != advection_z(domain, ux, fluxz, srca)){
+    return 1;
+  }
+#endif
+  // diffusive contributions
+  if(0 != diffusion_x(domain, diffusivity, txx, srca)){
+    return 1;
+  }
+  if(0 != diffusion_y(domain, diffusivity, txy, srca)){
+    return 1;
+  }
+#if NDIMS == 3
+  if(0 != diffusion_z(domain, diffusivity, txz, srca)){
+    return 1;
+  }
+#endif
+  // pressure-gradient contribution
+  if(0 != pressure(domain, p, srcg)){
+    return 1;
+  }
+  // ifrcx contribution
+  if(0 != surface(domain, interface->ifrcx.data, srca)){
+    return 1;
+  }
+  // gravity contribution, using den[0]
+  if(0 != gravity(domain, acceleration, fluid->den[0].data, srca)){
+    return 1;
+  }
+  return 0;
+}
+
+/**
+ * @brief predict ux
+ *
+ * Two passes over the BEGIN / END range:
+ *   1. ux <- (x-average of den[0]) * ux
+ *            + dt * (coef_a * srcux[rk_a] + coef_b * srcux[rk_b]
+ *                  + coef_g * srcux[rk_g]),
+ *   2. ux <- ux / (x-average of den[1]),
+ *   followed by fluid_update_boundaries_ux.
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in]     rkstep : Runge-Kutta step
+ * @param[in]     dt     : time step size
+ * @param[in,out] fluid  : Runge-Kutta source terms (in), velocity (out)
+ * @return               : error code (always 0 in the current implementation)
+ */
+int predict_ux(
+    const domain_t * domain,
+    const size_t rkstep,
+    const double dt,
+    fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double coef_a = rkcoefs[rkstep][rk_a];
+  const double coef_b = rkcoefs[rkstep][rk_b];
+  const double coef_g = rkcoefs[rkstep][rk_g];
+  const double * restrict srcuxa = fluid->srcux[rk_a].data;
+  const double * restrict srcuxb = fluid->srcux[rk_b].data;
+  const double * restrict srcuxg = fluid->srcux[rk_g].data;
+  double * restrict ux = fluid->ux.data;
+  // pass 1: multiply ux by the x-averaged den[0] and add the three
+  //   Runge-Kutta source contributions
+  // each pass lives in its own scope so that "den" (the name used by the
+  //   DEN accesses below) can point to a different array in each pass
+  // NOTE(review): den[0] / den[1] presumably hold the density before / after
+  //   the interface update - confirm
+  {
+    const double * restrict den = fluid->den[0].data;
+#if NDIMS == 2
+    BEGIN
+      const double lden = + 0.5 * DEN(i-1, j  )
+                          + 0.5 * DEN(i  , j  );
+      UX(i, j) =
+        + lden * UX(i, j)
+        + coef_a * dt * srcuxa[cnt]
+        + coef_b * dt * srcuxb[cnt]
+        + coef_g * dt * srcuxg[cnt];
+    END
+#else
+    BEGIN
+      const double lden = + 0.5 * DEN(i-1, j  , k  )
+                          + 0.5 * DEN(i  , j  , k  );
+      UX(i, j, k) =
+        + lden * UX(i, j, k)
+        + coef_a * dt * srcuxa[cnt]
+        + coef_b * dt * srcuxb[cnt]
+        + coef_g * dt * srcuxg[cnt];
+    END
+#endif
+  }
+  // pass 2: divide the result of pass 1 by the x-averaged den[1]
+  {
+    const double * restrict den = fluid->den[1].data;
+#if NDIMS == 2
+    BEGIN
+      const double lden = + 0.5 * DEN(i-1, j  )
+                          + 0.5 * DEN(i  , j  );
+      UX(i, j) /= lden;
+    END
+#else
+    BEGIN
+      const double lden = + 0.5 * DEN(i-1, j  , k  )
+                          + 0.5 * DEN(i  , j  , k  );
+      UX(i, j, k) /= lden;
+    END
+#endif
+  }
+  // NOTE(review): presumably refreshes the boundary / halo values of ux;
+  //   whatever this call returns is not checked here - confirm it cannot fail
+  fluid_update_boundaries_ux(domain, &fluid->ux);
+  return 0;
+}
+
diff --git a/src/fluid/predict/uy.c b/src/fluid/predict/uy.c
new file mode 100644
index 0000000..3c4d105
--- /dev/null
+++ b/src/fluid/predict/uy.c
@@ -0,0 +1,465 @@
+#include "memory.h"
+#include "runge_kutta.h"
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "interface.h"
+#include "interface_solver.h"
+#include "internal.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/domain/jdxf.h"
+#include "array_macros/domain/jdxc.h"
+#include "array_macros/fluid/uy.h"
+#include "array_macros/fluid/p.h"
+#include "array_macros/fluid/den.h"
+#include "array_macros/fluid/txy.h"
+#include "array_macros/fluid/tyy.h"
+#include "array_macros/fluid/tyz.h"
+#include "array_macros/interface/fluxx.h"
+#include "array_macros/interface/fluxy.h"
+#if NDIMS == 3
+#include "array_macros/interface/fluxz.h"
+#endif
+#include "array_macros/interface/ifrcy.h"
+
+#if NDIMS == 2 // 2D: BEGIN opens j-i loops (i innermost) and declares cnt, j, i
+#define BEGIN \
+  for(int cnt = 0, j = 1; j <= jsize; j++){ \
+    for(int i = 1; i <= isize; i++, cnt++){
+#define END \
+    } \
+  }
+#else // 3D: adds an outer k loop; cnt still advances once per innermost pass
+#define BEGIN \
+  for(int cnt = 0, k = 1; k <= ksize; k++){ \
+    for(int j = 1; j <= jsize; j++){ \
+      for(int i = 1; i <= isize; i++, cnt++){
+#define END \
+      } \
+    } \
+  }
+#endif // END closes every loop BEGIN opened; users must define isize, jsize (and ksize)
+
+static int advection_x( // subtracts the x-advection of uy from src, one entry per BEGIN/END pass
+    const domain_t * domain,
+    const double * restrict uy, // not named below: presumably what UY() expands to - confirm
+    const double * restrict fluxx, // likewise, presumably read through FLUXX()
+    double * restrict src // [in,out] updated in place with -=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf; // these three are presumably read via HXXF / JDXF / JDXC
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+#if NDIMS == 2
+  BEGIN
+    // uy is advected in x | 17
+    const double hx_xm = HXXF(i  );
+    const double hx_xp = HXXF(i+1);
+    const double jd_xm = JDXF(i  );
+    const double jd_x0 = JDXC(i  );
+    const double jd_xp = JDXF(i+1);
+    const double mux_xm = + 0.5 * jd_xm / hx_xm * FLUXX(i  , j-1) // FLUXX averaged over j-1 and j at i
+                          + 0.5 * jd_xm / hx_xm * FLUXX(i  , j  );
+    const double mux_xp = + 0.5 * jd_xp / hx_xp * FLUXX(i+1, j-1) // same at i+1
+                          + 0.5 * jd_xp / hx_xp * FLUXX(i+1, j  );
+    const double uy_xm = + 0.5 * UY(i-1, j  ) // uy averaged over i-1 and i
+                         + 0.5 * UY(i  , j  );
+    const double uy_xp = + 0.5 * UY(i  , j  ) // uy averaged over i and i+1
+                         + 0.5 * UY(i+1, j  );
+    src[cnt] -= 1. / jd_x0 * ( // src -= (mux_xp * uy_xp - mux_xm * uy_xm) / jd_x0
+        - mux_xm * uy_xm
+        + mux_xp * uy_xp
+    );
+  END
+#else // 3D: identical stencil, every access carries the extra k index
+  BEGIN
+    // uy is advected in x | 17
+    const double hx_xm = HXXF(i  );
+    const double hx_xp = HXXF(i+1);
+    const double jd_xm = JDXF(i  );
+    const double jd_x0 = JDXC(i  );
+    const double jd_xp = JDXF(i+1);
+    const double mux_xm = + 0.5 * jd_xm / hx_xm * FLUXX(i  , j-1, k  )
+                          + 0.5 * jd_xm / hx_xm * FLUXX(i  , j  , k  );
+    const double mux_xp = + 0.5 * jd_xp / hx_xp * FLUXX(i+1, j-1, k  )
+                          + 0.5 * jd_xp / hx_xp * FLUXX(i+1, j  , k  );
+    const double uy_xm = + 0.5 * UY(i-1, j  , k  )
+                         + 0.5 * UY(i  , j  , k  );
+    const double uy_xp = + 0.5 * UY(i  , j  , k  )
+                         + 0.5 * UY(i+1, j  , k  );
+    src[cnt] -= 1. / jd_x0 * (
+        - mux_xm * uy_xm
+        + mux_xp * uy_xp
+    );
+  END
+#endif
+  return 0; // no failure path: always 0
+}
+
+static int advection_y( // subtracts the y-advection of uy from src
+    const domain_t * domain,
+    const double * restrict uy, // presumably read through UY() - confirm
+    const double * restrict fluxy, // presumably read through FLUXY() - confirm
+    double * restrict src // [in,out] updated in place with -=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double hy = domain->hy; // a single scalar, unlike the per-i arrays used in x
+  const double * restrict jdxc = domain->jdxc;
+#if NDIMS == 2
+  BEGIN
+    // uy is advected in y | 13
+    const double jd = JDXC(i  );
+    const double muy_ym = + 0.5 * jd / hy * FLUXY(i  , j-1) // FLUXY averaged over j-1 and j
+                          + 0.5 * jd / hy * FLUXY(i  , j  );
+    const double muy_yp = + 0.5 * jd / hy * FLUXY(i  , j  ) // FLUXY averaged over j and j+1
+                          + 0.5 * jd / hy * FLUXY(i  , j+1);
+    const double uy_ym = + 0.5 * UY(i  , j-1) // uy averaged over j-1 and j
+                         + 0.5 * UY(i  , j  );
+    const double uy_yp = + 0.5 * UY(i  , j  ) // uy averaged over j and j+1
+                         + 0.5 * UY(i  , j+1);
+    src[cnt] -= 1. / jd * ( // src -= (muy_yp * uy_yp - muy_ym * uy_ym) / jd
+        - muy_ym * uy_ym
+        + muy_yp * uy_yp
+    );
+  END
+#else // 3D: identical stencil, every access carries the extra k index
+  BEGIN
+    // uy is advected in y | 13
+    const double jd = JDXC(i  );
+    const double muy_ym = + 0.5 * jd / hy * FLUXY(i  , j-1, k  )
+                          + 0.5 * jd / hy * FLUXY(i  , j  , k  );
+    const double muy_yp = + 0.5 * jd / hy * FLUXY(i  , j  , k  )
+                          + 0.5 * jd / hy * FLUXY(i  , j+1, k  );
+    const double uy_ym = + 0.5 * UY(i  , j-1, k  )
+                         + 0.5 * UY(i  , j  , k  );
+    const double uy_yp = + 0.5 * UY(i  , j  , k  )
+                         + 0.5 * UY(i  , j+1, k  );
+    src[cnt] -= 1. / jd * (
+        - muy_ym * uy_ym
+        + muy_yp * uy_yp
+    );
+  END
+#endif
+  return 0; // no failure path: always 0
+}
+
+#if NDIMS == 3 // the z-direction term exists only in the 3D build
+static int advection_z( // subtracts the z-advection of uy from src
+    const domain_t * domain,
+    const double * restrict uy, // presumably read through UY() - confirm
+    const double * restrict fluxz, // presumably read through FLUXZ() - confirm
+    double * restrict src // [in,out] updated in place with -=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double hz = domain->hz;
+  const double * restrict jdxc = domain->jdxc;
+  BEGIN
+    // uy is advected in z | 13
+    const double jd = JDXC(i  );
+    const double muz_zm = + 0.5 * jd / hz * FLUXZ(i  , j-1, k  ) // FLUXZ averaged over j-1 and j at k
+                          + 0.5 * jd / hz * FLUXZ(i  , j  , k  );
+    const double muz_zp = + 0.5 * jd / hz * FLUXZ(i  , j-1, k+1) // same at k+1
+                          + 0.5 * jd / hz * FLUXZ(i  , j  , k+1);
+    const double uy_zm = + 0.5 * UY(i  , j  , k-1) // uy averaged over k-1 and k
+                         + 0.5 * UY(i  , j  , k  );
+    const double uy_zp = + 0.5 * UY(i  , j  , k  ) // uy averaged over k and k+1
+                         + 0.5 * UY(i  , j  , k+1);
+    src[cnt] -= 1. / jd * ( // src -= (muz_zp * uy_zp - muz_zm * uy_zm) / jd
+        - muz_zm * uy_zm
+        + muz_zp * uy_zp
+    );
+  END
+  return 0; // no failure path: always 0
+}
+#endif
+
+static int diffusion_x( // adds the x-difference of txy, scaled by diffusivity, to src
+    const domain_t * domain,
+    const double diffusivity, // common prefactor of the whole term
+    const double * restrict txy, // presumably read through TXY() - confirm
+    double * restrict src // [in,out] updated in place with +=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf; // these three are presumably read via HXXF / JDXF / JDXC
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+#if NDIMS == 2
+  BEGIN
+    // uy is diffused in x | 11
+    const double hx_xm = HXXF(i  );
+    const double hx_xp = HXXF(i+1);
+    const double jd_xm = JDXF(i  );
+    const double jd_x0 = JDXC(i  );
+    const double jd_xp = JDXF(i+1);
+    const double txy_xm = TXY(i  , j  ); // txy is sampled at i and i+1, same j
+    const double txy_xp = TXY(i+1, j  );
+    src[cnt] += diffusivity / jd_x0 * ( // each side is weighted by its own jd / hx
+        - jd_xm / hx_xm * txy_xm
+        + jd_xp / hx_xp * txy_xp
+    );
+  END
+#else // 3D: identical stencil, every access carries the extra k index
+  BEGIN
+    // uy is diffused in x | 11
+    const double hx_xm = HXXF(i  );
+    const double hx_xp = HXXF(i+1);
+    const double jd_xm = JDXF(i  );
+    const double jd_x0 = JDXC(i  );
+    const double jd_xp = JDXF(i+1);
+    const double txy_xm = TXY(i  , j  , k  );
+    const double txy_xp = TXY(i+1, j  , k  );
+    src[cnt] += diffusivity / jd_x0 * (
+        - jd_xm / hx_xm * txy_xm
+        + jd_xp / hx_xp * txy_xp
+    );
+  END
+#endif
+  return 0; // no failure path: always 0
+}
+
+static int diffusion_y( // adds the y-difference of tyy, scaled by diffusivity, to src
+    const domain_t * domain,
+    const double diffusivity, // common prefactor of the whole term
+    const double * restrict tyy, // presumably read through TYY() - confirm
+    double * restrict src // [in,out] updated in place with +=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double hy = domain->hy;
+  const double * restrict jdxc = domain->jdxc;
+#if NDIMS == 2
+  BEGIN
+    // uy is diffused in y | 7
+    const double jd = JDXC(i  );
+    const double tyy_ym = TYY(i  , j-1); // tyy is sampled at j-1 and j, same i
+    const double tyy_yp = TYY(i  , j  );
+    src[cnt] += diffusivity / jd * ( // the same jd multiplies both sides and divides the sum
+        - jd / hy * tyy_ym
+        + jd / hy * tyy_yp
+    );
+  END
+#else // 3D: identical stencil, every access carries the extra k index
+  BEGIN
+    // uy is diffused in y | 7
+    const double jd = JDXC(i  );
+    const double tyy_ym = TYY(i  , j-1, k  );
+    const double tyy_yp = TYY(i  , j  , k  );
+    src[cnt] += diffusivity / jd * (
+        - jd / hy * tyy_ym
+        + jd / hy * tyy_yp
+    );
+  END
+#endif
+  return 0; // no failure path: always 0
+}
+
+#if NDIMS == 3 // the z-direction term exists only in the 3D build
+static int diffusion_z( // adds the z-difference of tyz, scaled by diffusivity, to src
+    const domain_t * domain,
+    const double diffusivity, // common prefactor of the whole term
+    const double * restrict tyz, // presumably read through TYZ() - confirm
+    double * restrict src // [in,out] updated in place with +=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double hz = domain->hz;
+  const double * restrict jdxc = domain->jdxc;
+  BEGIN
+    // uy is diffused in z | 7
+    const double jd = JDXC(i  );
+    const double tyz_zm = TYZ(i  , j  , k  ); // tyz is sampled at k and k+1 (tyy in y uses j-1 and j)
+    const double tyz_zp = TYZ(i  , j  , k+1);
+    src[cnt] += diffusivity / jd * (
+        - jd / hz * tyz_zm
+        + jd / hz * tyz_zp
+    );
+  END
+  return 0; // no failure path: always 0
+}
+#endif
+
+static int pressure( // subtracts the y-difference of p, divided by hy, from src
+    const domain_t * domain,
+    const double * restrict p, // presumably read through P() - confirm
+    double * restrict src // [in,out] updated in place with -=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double hy = domain->hy;
+#if NDIMS == 2
+  BEGIN
+    // pressure-gradient contribution | 4
+    src[cnt] -= 1. / hy * ( // src -= (P(i, j) - P(i, j-1)) / hy
+        - P(i  , j-1)
+        + P(i  , j  )
+    );
+  END
+#else // 3D: same difference in j at fixed k
+  BEGIN
+    // pressure-gradient contribution | 4
+    src[cnt] -= 1. / hy * (
+        - P(i  , j-1, k  )
+        + P(i  , j  , k  )
+    );
+  END
+#endif
+  return 0; // no failure path: always 0
+}
+
+static int surface( // adds IFRCY, point by point, to src
+    const domain_t * domain,
+    const double * restrict ifrcy, // presumably read through IFRCY() - confirm
+    double * restrict src // [in,out] updated in place with +=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+#if NDIMS == 2
+  BEGIN
+    src[cnt] += IFRCY(i, j); // no scaling or averaging is applied
+  END
+#else
+  BEGIN
+    src[cnt] += IFRCY(i, j, k);
+  END
+#endif
+  return 0; // no failure path: always 0
+}
+
+/**
+ * @brief compute right-hand-side of Runge-Kutta scheme of uy
+ * @param[in]     domain : information related to MPI domain decomposition
+ * @param[in,out] fluid  : n-step flow field (in), RK source terms (inout)
+ * @return               : error code (always 0 in this implementation)
+ */
+int compute_rhs_uy(
+    const domain_t * domain,
+    fluid_t * fluid,
+    const interface_t * interface // [in] supplies fluxx, fluxy (fluxz in 3D) and ifrcy
+){
+  const double * restrict  uy = fluid-> uy.data;
+  const double * restrict   p = fluid->  p.data;
+  const double * restrict txy = fluid->txy.data;
+  const double * restrict tyy = fluid->tyy.data;
+#if NDIMS == 3
+  const double * restrict tyz = fluid->tyz.data;
+#endif
+  const double * restrict fluxx = interface->fluxx.data;
+  const double * restrict fluxy = interface->fluxy.data;
+#if NDIMS == 3
+  const double * restrict fluxz = interface->fluxz.data;
+#endif
+  double * restrict srca = fluid->srcuy[rk_a].data; // receives advection, diffusion and ifrcy terms
+  double * restrict srcg = fluid->srcuy[rk_g].data; // receives only the pressure term
+  const double diffusivity = 1. / fluid->Re; // no guard against Re == 0
+  // advective contributions
+  advection_x(domain, uy, fluxx, srca); // helpers only accumulate: buffers are presumably reset by the caller - confirm
+  advection_y(domain, uy, fluxy, srca); // call order fixes the floating-point summation order
+#if NDIMS == 3
+  advection_z(domain, uy, fluxz, srca);
+#endif
+  // diffusive contributions
+  diffusion_x(domain, diffusivity, txy, srca);
+  diffusion_y(domain, diffusivity, tyy, srca);
+#if NDIMS == 3
+  diffusion_z(domain, diffusivity, tyz, srca);
+#endif
+  // pressure-gradient contribution
+  pressure(domain, p, srcg);
+  surface(domain, interface->ifrcy.data, srca); // ifrcy goes to srca, not srcg
+  return 0; // helper results are ignored; each helper always returns 0
+}
+
+/**
+ * @brief predict uy
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in]     rkstep : Runge-Kutta step (row index into rkcoefs, unchecked)
+ * @param[in]     dt     : time step size
+ * @param[in,out] fluid  : Runge-Kutta source terms (in), velocity (out)
+ * @return               : error code (always 0 in this implementation)
+ */
+int predict_uy(
+    const domain_t * domain,
+    const size_t rkstep,
+    const double dt,
+    fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double coef_a = rkcoefs[rkstep][rk_a];
+  const double coef_b = rkcoefs[rkstep][rk_b];
+  const double coef_g = rkcoefs[rkstep][rk_g];
+  const double * restrict srcuya = fluid->srcuy[rk_a].data;
+  const double * restrict srcuyb = fluid->srcuy[rk_b].data;
+  const double * restrict srcuyg = fluid->srcuy[rk_g].data;
+  double * restrict uy = fluid->uy.data;
+  { // stage 1: uy <- lden * uy + dt * (a * srca + b * srcb + g * srcg), lden from den[0]
+    const double * restrict den = fluid->den[0].data;
+#if NDIMS == 2
+    BEGIN
+      const double lden = + 0.5 * DEN(i  , j-1) // mean of DEN over j-1 and j
+                          + 0.5 * DEN(i  , j  );
+      UY(i, j) =
+        + lden * UY(i, j)
+        + coef_a * dt * srcuya[cnt] // cnt follows the same loop order used to fill the buffers
+        + coef_b * dt * srcuyb[cnt]
+        + coef_g * dt * srcuyg[cnt];
+    END
+#else // 3D: same update, every access carries the extra k index
+    BEGIN
+      const double lden = + 0.5 * DEN(i  , j-1, k  )
+                          + 0.5 * DEN(i  , j  , k  );
+      UY(i, j, k) =
+        + lden * UY(i, j, k)
+        + coef_a * dt * srcuya[cnt]
+        + coef_b * dt * srcuyb[cnt]
+        + coef_g * dt * srcuyg[cnt];
+    END
+#endif
+  }
+  { // stage 2: divide by the same two-point mean, now taken from den[1]
+    const double * restrict den = fluid->den[1].data; // presumably the updated density - confirm
+#if NDIMS == 2
+    BEGIN
+      const double lden = + 0.5 * DEN(i  , j-1)
+                          + 0.5 * DEN(i  , j  );
+      UY(i, j) /= lden; // no guard against lden == 0
+    END
+#else
+    BEGIN
+      const double lden = + 0.5 * DEN(i  , j-1, k  )
+                          + 0.5 * DEN(i  , j  , k  );
+      UY(i, j, k) /= lden;
+    END
+#endif
+  }
+  fluid_update_boundaries_uy(domain, &fluid->uy); // its result, if any, is not inspected
+  return 0;
+}
+
diff --git a/src/fluid/predict/uz.c b/src/fluid/predict/uz.c
new file mode 100644
index 0000000..c1481cc
--- /dev/null
+++ b/src/fluid/predict/uz.c
@@ -0,0 +1,321 @@
+#if NDIMS == 3
+#include "memory.h"
+#include "runge_kutta.h"
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "interface.h"
+#include "interface_solver.h"
+#include "internal.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/domain/jdxf.h"
+#include "array_macros/domain/jdxc.h"
+#include "array_macros/fluid/uz.h"
+#include "array_macros/fluid/p.h"
+#include "array_macros/fluid/den.h"
+#include "array_macros/fluid/txz.h"
+#include "array_macros/fluid/tyz.h"
+#include "array_macros/fluid/tzz.h"
+#include "array_macros/interface/fluxx.h"
+#include "array_macros/interface/fluxy.h"
+#include "array_macros/interface/fluxz.h"
+#include "array_macros/interface/ifrcz.h"
+
+#define BEGIN /* opens k-j-i loops (i innermost); declares cnt, k, j, i */ \
+  for(int cnt = 0, k = 1; k <= ksize; k++){ \
+    for(int j = 1; j <= jsize; j++){ \
+      for(int i = 1; i <= isize; i++, cnt++){
+#define END /* closes the three loops opened by BEGIN */ \
+      } \
+    } \
+  }
+
+static int advection_x( // subtracts the x-advection of uz from src, one entry per BEGIN/END pass
+    const domain_t * domain,
+    const double * restrict uz, // not named below: presumably what UZ() expands to - confirm
+    const double * restrict fluxx, // likewise, presumably read through FLUXX()
+    double * restrict src // [in,out] updated in place with -=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double * restrict hxxf = domain->hxxf; // these three are presumably read via HXXF / JDXF / JDXC
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+  BEGIN
+    // uz is advected in x | 17
+    const double hx_xm = HXXF(i  );
+    const double hx_xp = HXXF(i+1);
+    const double jd_xm = JDXF(i  );
+    const double jd_x0 = JDXC(i  );
+    const double jd_xp = JDXF(i+1);
+    const double mux_xm = + 0.5 * jd_xm / hx_xm * FLUXX(i  , j  , k-1) // FLUXX averaged over k-1 and k at i
+                          + 0.5 * jd_xm / hx_xm * FLUXX(i  , j  , k  );
+    const double mux_xp = + 0.5 * jd_xp / hx_xp * FLUXX(i+1, j  , k-1) // same at i+1
+                          + 0.5 * jd_xp / hx_xp * FLUXX(i+1, j  , k  );
+    const double uz_xm = + 0.5 * UZ(i-1, j  , k  ) // uz averaged over i-1 and i
+                         + 0.5 * UZ(i  , j  , k  );
+    const double uz_xp = + 0.5 * UZ(i  , j  , k  ) // uz averaged over i and i+1
+                         + 0.5 * UZ(i+1, j  , k  );
+    src[cnt] -= 1. / jd_x0 * ( // src -= (mux_xp * uz_xp - mux_xm * uz_xm) / jd_x0
+        - mux_xm * uz_xm
+        + mux_xp * uz_xp
+    );
+  END
+  return 0; // no failure path: always 0
+}
+
+static int advection_y( // subtracts the y-advection of uz from src
+    const domain_t * domain,
+    const double * restrict uz, // presumably read through UZ() - confirm
+    const double * restrict fluxy, // presumably read through FLUXY() - confirm
+    double * restrict src // [in,out] updated in place with -=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double hy = domain->hy;
+  const double * restrict jdxc = domain->jdxc;
+  BEGIN
+    // uz is advected in y | 13
+    const double jd = JDXC(i  );
+    const double muy_ym = + 0.5 * jd / hy * FLUXY(i  , j  , k-1) // FLUXY averaged over k-1 and k at j
+                          + 0.5 * jd / hy * FLUXY(i  , j  , k  );
+    const double muy_yp = + 0.5 * jd / hy * FLUXY(i  , j+1, k-1) // same at j+1
+                          + 0.5 * jd / hy * FLUXY(i  , j+1, k  );
+    const double uz_ym = + 0.5 * UZ(i  , j-1, k  ) // uz averaged over j-1 and j
+                         + 0.5 * UZ(i  , j  , k  );
+    const double uz_yp = + 0.5 * UZ(i  , j  , k  ) // uz averaged over j and j+1
+                         + 0.5 * UZ(i  , j+1, k  );
+    src[cnt] -= 1. / jd * ( // src -= (muy_yp * uz_yp - muy_ym * uz_ym) / jd
+        - muy_ym * uz_ym
+        + muy_yp * uz_yp
+    );
+  END
+  return 0; // no failure path: always 0
+}
+
+static int advection_z( // subtracts the z-advection of uz from src
+    const domain_t * domain,
+    const double * restrict uz, // presumably read through UZ() - confirm
+    const double * restrict fluxz, // presumably read through FLUXZ() - confirm
+    double * restrict src // [in,out] updated in place with -=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double hz = domain->hz;
+  const double * restrict jdxc = domain->jdxc;
+  BEGIN
+    // uz is advected in z | 13
+    const double jd = JDXC(i  );
+    const double muz_zm = + 0.5 * jd / hz * FLUXZ(i  , j  , k-1) // FLUXZ averaged over k-1 and k
+                          + 0.5 * jd / hz * FLUXZ(i  , j  , k  );
+    const double muz_zp = + 0.5 * jd / hz * FLUXZ(i  , j  , k  ) // FLUXZ averaged over k and k+1
+                          + 0.5 * jd / hz * FLUXZ(i  , j  , k+1);
+    const double uz_zm = + 0.5 * UZ(i  , j  , k-1) // uz averaged over k-1 and k
+                         + 0.5 * UZ(i  , j  , k  );
+    const double uz_zp = + 0.5 * UZ(i  , j  , k  ) // uz averaged over k and k+1
+                         + 0.5 * UZ(i  , j  , k+1);
+    src[cnt] -= 1. / jd * ( // src -= (muz_zp * uz_zp - muz_zm * uz_zm) / jd
+        - muz_zm * uz_zm
+        + muz_zp * uz_zp
+    );
+  END
+  return 0; // no failure path: always 0
+}
+
+static int diffusion_x( // adds the x-difference of txz, scaled by diffusivity, to src
+    const domain_t * domain,
+    const double diffusivity, // common prefactor of the whole term
+    const double * restrict txz, // presumably read through TXZ() - confirm
+    double * restrict src // [in,out] updated in place with +=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double * restrict hxxf = domain->hxxf; // these three are presumably read via HXXF / JDXF / JDXC
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+  BEGIN
+    // uz is diffused in x | 11
+    const double hx_xm = HXXF(i  );
+    const double hx_xp = HXXF(i+1);
+    const double jd_xm = JDXF(i  );
+    const double jd_x0 = JDXC(i  );
+    const double jd_xp = JDXF(i+1);
+    const double txz_xm = TXZ(i  , j  , k  ); // txz is sampled at i and i+1
+    const double txz_xp = TXZ(i+1, j  , k  );
+    src[cnt] += diffusivity / jd_x0 * ( // each side is weighted by its own jd / hx
+        - jd_xm / hx_xm * txz_xm
+        + jd_xp / hx_xp * txz_xp
+    );
+  END
+  return 0; // no failure path: always 0
+}
+
+static int diffusion_y( // adds the y-difference of tyz, scaled by diffusivity, to src
+    const domain_t * domain,
+    const double diffusivity, // common prefactor of the whole term
+    const double * restrict tyz, // presumably read through TYZ() - confirm
+    double * restrict src // [in,out] updated in place with +=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double hy = domain->hy;
+  const double * restrict jdxc = domain->jdxc;
+  BEGIN
+    // uz is diffused in y | 7
+    const double jd = JDXC(i  );
+    const double tyz_ym = TYZ(i  , j  , k  ); // tyz is sampled at j and j+1
+    const double tyz_yp = TYZ(i  , j+1, k  );
+    src[cnt] += diffusivity / jd * ( // the same jd multiplies both sides and divides the sum
+        - jd / hy * tyz_ym
+        + jd / hy * tyz_yp
+    );
+  END
+  return 0; // no failure path: always 0
+}
+
+static int diffusion_z( // adds the z-difference of tzz, scaled by diffusivity, to src
+    const domain_t * domain,
+    const double diffusivity, // common prefactor of the whole term
+    const double * restrict tzz, // presumably read through TZZ() - confirm
+    double * restrict src // [in,out] updated in place with +=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double hz = domain->hz;
+  const double * restrict jdxc = domain->jdxc;
+  BEGIN
+    // uz is diffused in z | 7
+    const double jd = JDXC(i  );
+    const double tzz_zm = TZZ(i  , j  , k-1); // tzz is sampled at k-1 and k (tyz in y uses j and j+1)
+    const double tzz_zp = TZZ(i  , j  , k  );
+    src[cnt] += diffusivity / jd * (
+        - jd / hz * tzz_zm
+        + jd / hz * tzz_zp
+    );
+  END
+  return 0; // no failure path: always 0
+}
+
+static int pressure( // subtracts the z-difference of p, divided by hz, from src
+    const domain_t * domain,
+    const double * restrict p, // presumably read through P() - confirm
+    double * restrict src // [in,out] updated in place with -=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double hz = domain->hz;
+  BEGIN
+    // pressure-gradient contribution | 4
+    src[cnt] -= 1. / hz * ( // src -= (P(i, j, k) - P(i, j, k-1)) / hz
+        - P(i  , j  , k-1)
+        + P(i  , j  , k  )
+    );
+  END
+  return 0; // no failure path: always 0
+}
+
+static int surface( // adds IFRCZ, point by point, to src
+    const domain_t * domain,
+    const double * restrict ifrcz, // presumably read through IFRCZ() - confirm
+    double * restrict src // [in,out] updated in place with +=, indexed by cnt
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  BEGIN
+    src[cnt] += IFRCZ(i, j, k); // no scaling or averaging is applied
+  END
+  return 0; // no failure path: always 0
+}
+
+/**
+ * @brief compute right-hand-side of Runge-Kutta scheme of uz
+ * @param[in]     domain : information related to MPI domain decomposition
+ * @param[in,out] fluid  : n-step flow field (in), RK source terms (inout)
+ * @return               : error code (always 0 in this implementation)
+ */
+int compute_rhs_uz(
+    const domain_t * domain,
+    fluid_t * fluid,
+    const interface_t * interface // [in] supplies fluxx, fluxy, fluxz and ifrcz
+){
+  const double * restrict  uz = fluid-> uz.data;
+  const double * restrict   p = fluid->  p.data;
+  const double * restrict txz = fluid->txz.data;
+  const double * restrict tyz = fluid->tyz.data;
+  const double * restrict tzz = fluid->tzz.data;
+  const double * restrict fluxx = interface->fluxx.data;
+  const double * restrict fluxy = interface->fluxy.data;
+  const double * restrict fluxz = interface->fluxz.data;
+  double * restrict srca = fluid->srcuz[rk_a].data; // receives advection, diffusion and ifrcz terms
+  double * restrict srcg = fluid->srcuz[rk_g].data; // receives only the pressure term
+  const double diffusivity = 1. / fluid->Re; // no guard against Re == 0
+  // advective contributions
+  advection_x(domain, uz, fluxx, srca); // helpers only accumulate: buffers are presumably reset by the caller - confirm
+  advection_y(domain, uz, fluxy, srca); // call order fixes the floating-point summation order
+  advection_z(domain, uz, fluxz, srca);
+  // diffusive contributions
+  diffusion_x(domain, diffusivity, txz, srca);
+  diffusion_y(domain, diffusivity, tyz, srca);
+  diffusion_z(domain, diffusivity, tzz, srca);
+  // pressure-gradient contribution
+  pressure(domain, p, srcg);
+  surface(domain, interface->ifrcz.data, srca); // ifrcz goes to srca, not srcg
+  return 0; // helper results are ignored; each helper always returns 0
+}
+
+/**
+ * @brief predict uz
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in]     rkstep : Runge-Kutta step (row index into rkcoefs, unchecked)
+ * @param[in]     dt     : time step size
+ * @param[in,out] fluid  : Runge-Kutta source terms (in), velocity (out)
+ * @return               : error code (always 0 in this implementation)
+ */
+int predict_uz(
+    const domain_t * domain,
+    const size_t rkstep,
+    const double dt,
+    fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double coef_a = rkcoefs[rkstep][rk_a];
+  const double coef_b = rkcoefs[rkstep][rk_b];
+  const double coef_g = rkcoefs[rkstep][rk_g];
+  const double * restrict srcuza = fluid->srcuz[rk_a].data;
+  const double * restrict srcuzb = fluid->srcuz[rk_b].data;
+  const double * restrict srcuzg = fluid->srcuz[rk_g].data;
+  double * restrict uz = fluid->uz.data;
+  { // stage 1: uz <- lden * uz + dt * (a * srca + b * srcb + g * srcg), lden from den[0]
+    const double * restrict den = fluid->den[0].data;
+    BEGIN
+      const double lden = + 0.5 * DEN(i  , j  , k-1) // mean of DEN over k-1 and k
+                          + 0.5 * DEN(i  , j  , k  );
+      UZ(i, j, k) =
+        + lden * UZ(i, j, k)
+        + coef_a * dt * srcuza[cnt] // cnt follows the same loop order used to fill the buffers
+        + coef_b * dt * srcuzb[cnt]
+        + coef_g * dt * srcuzg[cnt];
+    END
+  }
+  { // stage 2: divide by the same two-point mean, now taken from den[1]
+    const double * restrict den = fluid->den[1].data; // presumably the updated density - confirm
+    BEGIN
+      const double lden = + 0.5 * DEN(i  , j  , k-1)
+                          + 0.5 * DEN(i  , j  , k  );
+      UZ(i, j, k) /= lden; // no guard against lden == 0
+    END
+  }
+  fluid_update_boundaries_uz(domain, &fluid->uz); // its result, if any, is not inspected
+  return 0;
+}
+#endif
diff --git a/src/fluid/save.c b/src/fluid/save.c
new file mode 100644
index 0000000..1bb85f3
--- /dev/null
+++ b/src/fluid/save.c
@@ -0,0 +1,31 @@
+#include "sdecomp.h"
+#include "array.h"
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "fileio.h"
+
+int fluid_save(
+    const char dirname[],
+    const domain_t * domain,
+    const fluid_t * fluid
+){
+  // scalar parameters: written by one rank only (the one whose rank is 0)
+  const int writer = 0;
+  int rank = writer;
+  sdecomp.get_comm_rank(domain->info, &rank);
+  if(rank == writer){
+    fileio.w_serial(dirname, "Re", 0, NULL, fileio.npy_double, sizeof(double), &fluid->Re);
+    fileio.w_serial(dirname, "Fr", 0, NULL, fileio.npy_double, sizeof(double), &fluid->Fr);
+  }
+  // field arrays: every rank calls array.dump, always in this order
+  array.dump(domain, dirname, "ux", fileio.npy_double, &fluid->ux);
+  array.dump(domain, dirname, "uy", fileio.npy_double, &fluid->uy);
+#if NDIMS == 3
+  array.dump(domain, dirname, "uz", fileio.npy_double, &fluid->uz);
+#endif
+  array.dump(domain, dirname, "p", fileio.npy_double, &fluid->p);
+  array.dump(domain, dirname, "psi", fileio.npy_double, &fluid->psi[0]);
+  return 0;
+}
+
diff --git a/src/fluid/update_pressure.c b/src/fluid/update_pressure.c
new file mode 100644
index 0000000..fbc8155
--- /dev/null
+++ b/src/fluid/update_pressure.c
@@ -0,0 +1,68 @@
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "array_macros/fluid/p.h"
+#include "array_macros/fluid/psi.h"
+
+static inline int add_explicit( // adds psi[1] to p at every point i, j (, k) in 1..size
+    const domain_t * domain,
+    const fluid_t * fluid // const struct, yet the buffer behind fluid->p.data is written
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict psi = fluid->psi[1].data; // index 1, not 0: must run before the buffers are swapped
+  double * restrict p = fluid->p.data;
+#if NDIMS == 2
+  // explicit contribution | 5
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      P(i, j) += PSI(i, j); // p and psi are presumably read via P() / PSI() - confirm
+    }
+  }
+#else
+  // explicit contribution | 7
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        P(i, j, k) += PSI(i, j, k);
+      }
+    }
+  }
+#endif
+  return 0; // no failure path: always 0
+}
+
+static int update_scalar_potential(
+    array_t * psi0,
+    array_t * psi1
+){
+  // update psi | 3
+  double * const held = psi1->data; // only the pointers move, no element is copied
+  psi1->data = psi0->data;
+  psi0->data = held; // psi0 now owns what psi1 held on entry, and vice versa
+  return 0;
+}
+
+/**
+ * @brief update pressure using scalar potential psi
+ * @param[in]     domain : information related to MPI domain decomposition
+ * @param[in,out] fluid  : scalar potential (in), pressure (out);
+ *                         psi[1] is added to p, after which the data
+ *                         buffers of psi[0] and psi[1] are exchanged
+ * @return               : error code (always 0 in this implementation)
+ */
+int fluid_update_pressure(
+    const domain_t * domain,
+    fluid_t * fluid
+){
+  // explicit contribution, always present
+  add_explicit(domain, fluid);
+  // impose boundary conditions and communicate halo cells
+  fluid_update_boundaries_p(domain, &fluid->p); // its result, if any, is not inspected
+  update_scalar_potential(&fluid->psi[0], &fluid->psi[1]); // must stay after add_explicit, which reads psi[1]
+  return 0;
+}
+
diff --git a/src/fluid/viscosity.c b/src/fluid/viscosity.c
new file mode 100644
index 0000000..eca5687
--- /dev/null
+++ b/src/fluid/viscosity.c
@@ -0,0 +1,214 @@
+#include <math.h>
+#include <mpi.h>
+#include "array.h"
+#include "domain.h"
+#include "halo.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "interface.h"
+#include "array_macros/fluid/visux.h"
+#include "array_macros/fluid/visuy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/visuz.h"
+#endif
+#include "array_macros/interface/vof.h"
+
+static inline double get(
+    const double visr,
+    const double min,
+    const double max,
+    const double vof
+){
+  // linear in vof: 1 at vof = 0, visr at vof = 1
+  const double blended = 1. + (visr - 1.) * vof;
+  // clamp into [min, max]: raise to min first, then cap at max
+  return fmin(max, fmax(min, blended));
+}
+
+static int compute_x( // fills VISUX for i = 1 .. isize + 1 from VOF, then exchanges halos
+    const domain_t * domain,
+    const array_t * restrict arr_vof,
+    const double visr, // forwarded unchanged to get()
+    const double min,
+    const double max,
+    array_t * restrict arr_visux
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict vof = arr_vof->data; // presumably read through VOF() - confirm
+  double * restrict visux = arr_visux->data; // presumably written through VISUX() - confirm
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    VISUX(1, j) = get( // i = 1 uses VOF(0, j) alone, not a two-point mean
+        visr, min, max,
+        VOF(0, j)
+    );
+    for(int i = 2; i <= isize; i++){
+      VISUX(i, j) = get( // elsewhere: mean of VOF over i-1 and i
+          visr, min, max,
+          + 0.5 * VOF(i-1, j  )
+          + 0.5 * VOF(i  , j  )
+      );
+    }
+    VISUX(isize + 1, j) = get( // i = isize + 1 uses VOF(isize + 1, j) alone
+        visr, min, max,
+        VOF(isize + 1, j)
+    );
+  }
+#else // 3D: same three cases inside an extra k loop
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      VISUX(1, j, k) = get(
+          visr, min, max,
+          VOF(0, j, k)
+      );
+      for(int i = 2; i <= isize; i++){
+        VISUX(i, j, k) = get(
+            visr, min, max,
+            + 0.5 * VOF(i-1, j  , k  )
+            + 0.5 * VOF(i  , j  , k  )
+        );
+      }
+      VISUX(isize + 1, j, k) = get(
+          visr, min, max,
+          VOF(isize + 1, j, k)
+      );
+    }
+  }
+#endif
+  static MPI_Datatype dtypes[NDIMS - 1] = { // static: halo_communicate_in_y swaps MPI_DOUBLE for a committed type, kept across calls
+    MPI_DOUBLE,
+#if NDIMS == 3
+    MPI_DOUBLE,
+#endif
+  };
+  if(0 != halo_communicate_in_y(domain, dtypes + 0, arr_visux)){
+    return 1; // failure of the exchange is passed on as 1
+  }
+#if NDIMS == 3
+  if(0 != halo_communicate_in_z(domain, dtypes + 1, arr_visux)){ // separate slot for z
+    return 1;
+  }
+#endif
+  return 0;
+}
+
+static int compute_y( // fills VISUY from the mean of VOF over j-1 and j, then exchanges halos
+    const domain_t * domain,
+    const array_t * restrict arr_vof,
+    const double visr, // forwarded unchanged to get()
+    const double min,
+    const double max,
+    array_t * restrict arr_visuy
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict vof = arr_vof->data; // presumably read through VOF() - confirm
+  double * restrict visuy = arr_visuy->data; // presumably written through VISUY() - confirm
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 0; i <= isize + 1; i++){ // i also covers 0 and isize + 1
+      VISUY(i, j) = get(
+          visr, min, max,
+          + 0.5 * VOF(i  , j-1)
+          + 0.5 * VOF(i  , j  )
+      );
+    }
+  }
+#else // 3D: same formula inside an extra k loop
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 0; i <= isize + 1; i++){
+        VISUY(i, j, k) = get(
+            visr, min, max,
+            + 0.5 * VOF(i  , j-1, k  )
+            + 0.5 * VOF(i  , j  , k  )
+        );
+      }
+    }
+  }
+#endif
+  static MPI_Datatype dtypes[NDIMS - 1] = { // static: halo_communicate_in_y swaps MPI_DOUBLE for a committed type, kept across calls
+    MPI_DOUBLE,
+#if NDIMS == 3
+    MPI_DOUBLE,
+#endif
+  };
+  if(0 != halo_communicate_in_y(domain, dtypes + 0, arr_visuy)){
+    return 1; // failure of the exchange is passed on as 1
+  }
+#if NDIMS == 3
+  if(0 != halo_communicate_in_z(domain, dtypes + 1, arr_visuy)){ // separate slot for z
+    return 1;
+  }
+#endif
+  return 0;
+}
+
+#if NDIMS == 3 // the z-direction array exists only in the 3D build
+static int compute_z( // fills VISUZ from the mean of VOF over k-1 and k, then exchanges halos
+    const domain_t * domain,
+    const array_t * restrict arr_vof,
+    const double visr, // forwarded unchanged to get()
+    const double min,
+    const double max,
+    array_t * restrict arr_visuz
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double * restrict vof = arr_vof->data; // presumably read through VOF() - confirm
+  double * restrict visuz = arr_visuz->data; // presumably written through VISUZ() - confirm
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 0; i <= isize + 1; i++){ // i also covers 0 and isize + 1
+        VISUZ(i, j, k) = get(
+            visr, min, max,
+            + 0.5 * VOF(i  , j  , k-1)
+            + 0.5 * VOF(i  , j  , k  )
+        );
+      }
+    }
+  }
+  static MPI_Datatype dtypes[NDIMS - 1] = { // static: halo_communicate_in_y swaps MPI_DOUBLE for a committed type, kept across calls
+    MPI_DOUBLE,
+    MPI_DOUBLE,
+  };
+  if(0 != halo_communicate_in_y(domain, dtypes + 0, arr_visuz)){
+    return 1; // failure of the exchange is passed on as 1
+  }
+  if(0 != halo_communicate_in_z(domain, dtypes + 1, arr_visuz)){ // separate slot for z
+    return 1;
+  }
+  return 0;
+}
+#endif
+
+int fluid_compute_viscosity(
+    const domain_t * domain,
+    fluid_t * fluid,
+    const interface_t * interface
+){
+  // every value is clamped between 1 and the ratio visr,
+  //   whichever of the two is smaller / larger
+  const double ratio = fluid->visr;
+  const double lower = fmin(1., ratio);
+  const double upper = fmax(1., ratio);
+  const array_t * vof = &interface->vof;
+  // stop at the first direction that reports a failure:
+  //   || short-circuits, so later directions are not evaluated
+  int failed = (0 != compute_x(domain, vof, ratio, lower, upper, &fluid->visux));
+  failed = failed || (0 != compute_y(domain, vof, ratio, lower, upper, &fluid->visuy));
+#if NDIMS == 3
+  failed = failed || (0 != compute_z(domain, vof, ratio, lower, upper, &fluid->visuz));
+#endif
+  // 1 when any direction failed, 0 otherwise
+  return failed ? 1 : 0;
+}
+
diff --git a/src/halo.c b/src/halo.c
new file mode 100644
index 0000000..b850ba4
--- /dev/null
+++ b/src/halo.c
@@ -0,0 +1,168 @@
+#include <stdio.h>
+#include <mpi.h>
+#include "array.h"
+#include "domain.h"
+#include "halo.h"
+
+// fixed parameters
+// each transfer sends / receives exactly one element of a derived datatype
+//   which already describes the whole halo region, so the count is 1
+static const int nitems = 1;
+// same tag can be used since I use blocking communication
+static const int tag = 0;
+
+// sentinel: a datatype handle which is still equal to MPI_DOUBLE is regarded
+//   as "derived type not created yet"; the communication routines below
+//   replace such a handle by a committed derived type on their first use
+static const MPI_Datatype dtype_uninitialised = MPI_DOUBLE;
+
+// communicate halo cells with the y-neighbour processes
+// NOTE: send boundary cells for simplicity
+//   domain : provides the decomposition info and the local sizes (mysizes)
+//   dtype  : in/out handle; when it is still equal to MPI_DOUBLE, a derived
+//            type describing the y-halo region is created, committed, and
+//            stored here (it is not freed in this function)
+//   array  : array whose halo cells are exchanged in place
+//   returns 0 on success, 1 when the numbers of halo cells on the
+//            negative and the positive y sides differ
+int halo_communicate_in_y(
+    const domain_t * domain,
+    MPI_Datatype * dtype,
+    array_t * array
+){
+  // extract communicator
+  const sdecomp_info_t * info = domain->info;
+  MPI_Comm comm_cart = MPI_COMM_NULL;
+  sdecomp.get_comm_cart(info, &comm_cart);
+  // check negative / positive neighbour ranks
+  // neighbours[0]: negative side, neighbours[1]: positive side
+  int neighbours[2] = {MPI_PROC_NULL, MPI_PROC_NULL};
+  sdecomp.get_neighbours(info, SDECOMP_X1PENCIL, SDECOMP_YDIR, neighbours);
+  // array size (with halo and boundary cells)
+  const int isize_ = domain->mysizes[0] + array->nadds[0][0] + array->nadds[0][1];
+  const int jsize_ = domain->mysizes[1] + array->nadds[1][0] + array->nadds[1][1];
+#if NDIMS == 3
+  const int ksize_ = domain->mysizes[2] + array->nadds[2][0] + array->nadds[2][1];
+#endif
+  // number of halo cells
+  // this function assumes same number of halo cells
+  //   in the negative / positive directions
+  if(array->nadds[1][0] != array->nadds[1][1]){
+    printf("%s: number of halo cells in y (%d and %d) mismatch\n",
+        __func__, array->nadds[1][0], array->nadds[1][1]);
+    return 1;
+  }
+  const int nhalos_y = array->nadds[1][0];
+  // define datatype in y
+  // done only once per handle: afterwards *dtype differs from the sentinel
+  if(dtype_uninitialised == *dtype){
+#if NDIMS == 2
+    // one contiguous chunk: nhalos_y rows of isize_ elements
+    MPI_Type_contiguous(
+        isize_ * nhalos_y,
+        *dtype,
+        dtype
+    );
+#else
+    // ksize_ chunks of (nhalos_y rows of isize_ elements),
+    //   whose starts are separated by one xy plane (isize_ * jsize_ elements)
+    MPI_Type_vector(
+        ksize_,
+        isize_ * nhalos_y,
+        isize_ * jsize_,
+        *dtype,
+        dtype
+    );
+#endif
+    MPI_Type_commit(dtype);
+  }
+  // send to positive, receive from negative
+  // sent: the last nhalos_y rows before the positive-side halo,
+  //   received into: the negative-side halo (rows starting from 0)
+  {
+#if NDIMS == 2
+    const int sindices[NDIMS] = {0, jsize_ - 2 * nhalos_y};
+    const int rindices[NDIMS] = {0,          0 * nhalos_y};
+    const size_t soffset = sindices[0] + isize_ * sindices[1];
+    const size_t roffset = rindices[0] + isize_ * rindices[1];
+#else
+    const int sindices[NDIMS] = {0, jsize_ - 2 * nhalos_y, 0};
+    const int rindices[NDIMS] = {0,          0 * nhalos_y, 0};
+    const size_t soffset = sindices[0] + isize_ * (sindices[1] + jsize_ * sindices[2]);
+    const size_t roffset = rindices[0] + isize_ * (rindices[1] + jsize_ * rindices[2]);
+#endif
+    // offsets are counted in elements and scaled by array->size
+    //   (presumably the size of one element in bytes) to address the buffer
+    MPI_Sendrecv(
+      (char *)array->data + array->size * soffset, nitems, *dtype, neighbours[1], tag,
+      (char *)array->data + array->size * roffset, nitems, *dtype, neighbours[0], tag,
+      comm_cart, MPI_STATUS_IGNORE
+    );
+  }
+  // send to negative, receive from positive
+  // sent: the first nhalos_y rows after the negative-side halo,
+  //   received into: the positive-side halo (the last nhalos_y rows)
+  {
+#if NDIMS == 2
+    const int sindices[NDIMS] = {0,          1 * nhalos_y};
+    const int rindices[NDIMS] = {0, jsize_ - 1 * nhalos_y};
+    const size_t soffset = sindices[0] + isize_ * sindices[1];
+    const size_t roffset = rindices[0] + isize_ * rindices[1];
+#else
+    const int sindices[NDIMS] = {0,          1 * nhalos_y, 0};
+    const int rindices[NDIMS] = {0, jsize_ - 1 * nhalos_y, 0};
+    const size_t soffset = sindices[0] + isize_ * (sindices[1] + jsize_ * sindices[2]);
+    const size_t roffset = rindices[0] + isize_ * (rindices[1] + jsize_ * rindices[2]);
+#endif
+    MPI_Sendrecv(
+      (char *)array->data + array->size * soffset, nitems, *dtype, neighbours[0], tag,
+      (char *)array->data + array->size * roffset, nitems, *dtype, neighbours[1], tag,
+      comm_cart, MPI_STATUS_IGNORE
+    );
+  }
+  return 0;
+}
+
+#if NDIMS == 3
+// communicate halo cells with the z-neighbour processes
+// NOTE: send boundary cells for simplicity
+//   domain : provides the decomposition info and the local sizes (mysizes)
+//   dtype  : in/out handle; when it is still equal to MPI_DOUBLE, a derived
+//            type describing the z-halo region is created, committed, and
+//            stored here (it is not freed in this function)
+//   array  : array whose halo cells are exchanged in place
+//   returns 0 on success, 1 when the numbers of halo cells on the
+//            negative and the positive z sides differ
+int halo_communicate_in_z(
+    const domain_t * domain,
+    MPI_Datatype * dtype,
+    array_t * array
+){
+  // extract communicator
+  const sdecomp_info_t * info = domain->info;
+  MPI_Comm comm_cart = MPI_COMM_NULL;
+  sdecomp.get_comm_cart(info, &comm_cart);
+  // check negative / positive neighbour ranks
+  // neighbours[0]: negative side, neighbours[1]: positive side
+  int neighbours[2] = {MPI_PROC_NULL, MPI_PROC_NULL};
+  sdecomp.get_neighbours(info, SDECOMP_X1PENCIL, SDECOMP_ZDIR, neighbours);
+  // array size (with halo and boundary cells)
+  const int isize_ = domain->mysizes[0] + array->nadds[0][0] + array->nadds[0][1];
+  const int jsize_ = domain->mysizes[1] + array->nadds[1][0] + array->nadds[1][1];
+  const int ksize_ = domain->mysizes[2] + array->nadds[2][0] + array->nadds[2][1];
+  // number of halo cells
+  // this function assumes same number of halo cells
+  //   in the negative / positive directions
+  if(array->nadds[2][0] != array->nadds[2][1]){
+    printf("%s: number of halo cells in z (%d and %d) mismatch\n",
+        __func__, array->nadds[2][0], array->nadds[2][1]);
+    return 1;
+  }
+  const int nhalos_z = array->nadds[2][0];
+  // define datatype in z
+  // done only once per handle: afterwards *dtype differs from the sentinel
+  // nhalos_z whole xy planes are adjacent in memory, so one contiguous chunk
+  if(dtype_uninitialised == *dtype){
+    MPI_Type_contiguous(
+        isize_ * jsize_ * nhalos_z,
+        *dtype,
+        dtype
+    );
+    MPI_Type_commit(dtype);
+  }
+  // send to positive, receive from negative
+  // sent: the last nhalos_z planes before the positive-side halo,
+  //   received into: the negative-side halo (planes starting from 0)
+  {
+    const int sindices[NDIMS] = {0, 0, ksize_ - 2 * nhalos_z};
+    const int rindices[NDIMS] = {0, 0,          0 * nhalos_z};
+    const size_t soffset = sindices[0] + isize_ * (sindices[1] + jsize_ * sindices[2]);
+    const size_t roffset = rindices[0] + isize_ * (rindices[1] + jsize_ * rindices[2]);
+    // offsets are counted in elements and scaled by array->size
+    //   (presumably the size of one element in bytes) to address the buffer
+    MPI_Sendrecv(
+      (char *)array->data + array->size * soffset, nitems, *dtype, neighbours[1], tag,
+      (char *)array->data + array->size * roffset, nitems, *dtype, neighbours[0], tag,
+      comm_cart, MPI_STATUS_IGNORE
+    );
+  }
+  // send to negative, receive from positive
+  // sent: the first nhalos_z planes after the negative-side halo,
+  //   received into: the positive-side halo (the last nhalos_z planes)
+  {
+    const int sindices[NDIMS] = {0, 0,          1 * nhalos_z};
+    const int rindices[NDIMS] = {0, 0, ksize_ - 1 * nhalos_z};
+    const size_t soffset = sindices[0] + isize_ * (sindices[1] + jsize_ * sindices[2]);
+    const size_t roffset = rindices[0] + isize_ * (rindices[1] + jsize_ * rindices[2]);
+    MPI_Sendrecv(
+      (char *)array->data + array->size * soffset, nitems, *dtype, neighbours[0], tag,
+      (char *)array->data + array->size * roffset, nitems, *dtype, neighbours[1], tag,
+      comm_cart, MPI_STATUS_IGNORE
+    );
+  }
+  return 0;
+}
+#endif
diff --git a/src/integrate.c b/src/integrate.c
new file mode 100644
index 0000000..f266d7f
--- /dev/null
+++ b/src/integrate.c
@@ -0,0 +1,71 @@
+#include "runge_kutta.h"
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "interface.h"
+#include "interface_solver.h"
+#include "integrate.h"
+#include "decide_dt.h"
+
+// advance the interface and the flow field by one time step
+//   domain    : domain information handed to every stage
+//   fluid     : flow field, handed (non-const) to the stages
+//   interface : interface field, handed (non-const) to the stages
+//   dt        : time step size, decided at the beginning by decide_dt
+//   returns 0 on success, 1 as soon as one of the stages reports a failure
+int integrate(
+    const domain_t * domain,
+    fluid_t * fluid,
+    interface_t * interface,
+    double * dt
+){
+  // the step size is decided first and shared by all sub-steps
+  if(decide_dt(domain, fluid, interface, dt) != 0) return 1;
+  const double dt_full = *dt;
+  // Runge-Kutta sub-steps, RKSTEPMAX is expected to be three
+  for(size_t rkstep = 0; rkstep < RKSTEPMAX; rkstep++){
+    // density at the k-step and viscosity
+    if(fluid_compute_density(domain, fluid, interface, 0) != 0) return 1;
+    if(fluid_compute_viscosity(domain, fluid, interface) != 0) return 1;
+    // interface: curvature, surface tension force, and vof update
+    if(interface_compute_curvature_tensor(domain, interface) != 0) return 1;
+    if(interface_compute_force(domain, fluid, interface) != 0) return 1;
+    if(interface_update_vof(domain, rkstep, dt_full, fluid, interface) != 0) return 1;
+    // density at the (k+1)-step
+    if(fluid_compute_density(domain, fluid, interface, 1) != 0) return 1;
+    // the vof flux is overridden by the mass flux
+    if(interface_compute_mass_flux(domain, fluid, interface) != 0) return 1;
+    // prediction of the flow field
+    if(fluid_predict_field(domain, rkstep, dt_full, fluid, interface) != 0) return 1;
+    // old / new local time step sizes (gamma dt), indices wrap around 3
+    const double gdt_old = rkcoefs[(rkstep + 2) % 3][rk_g] * dt_full;
+    const double gdt_new = rkcoefs[(rkstep + 0) % 3][rk_g] * dt_full;
+    // scalar potential, followed by the velocity correction
+    //   to satisfy mass conservation
+    if(fluid_compute_potential(domain, gdt_old, gdt_new, fluid) != 0) return 1;
+    if(fluid_correct_velocity(domain, gdt_old, gdt_new, fluid) != 0) return 1;
+    // pressure update closes the sub-step
+    if(fluid_update_pressure(domain, fluid) != 0) return 1;
+  }
+  return 0;
+}
+
diff --git a/src/interface/boundary.c b/src/interface/boundary.c
new file mode 100644
index 0000000..ca3c756
--- /dev/null
+++ b/src/interface/boundary.c
@@ -0,0 +1,63 @@
+#include <assert.h>
+#include <mpi.h>
+#include "array.h"
+#include "domain.h"
+#include "halo.h"
+#include "interface_solver.h"
+#include "array_macros/interface/vof.h"
+
+/**
+ * @brief impose the boundary values of the vof field in x
+ *        and exchange its halo cells in the other direction(s)
+ * @param[in]     domain : information about domain decomposition and size
+ * @param[in,out] array  : volume-of-fluid
+ * @return               : error code (non-zero if a halo exchange fails)
+ */
+int interface_update_boundaries_vof(
+    const domain_t * domain,
+    array_t * array
+){
+  // the given array should have the layout which the VOF accessor expects
+  for(size_t dim = 0; dim < NDIMS; dim++){
+    assert(VOF_NADDS[dim][0] == array->nadds[dim][0]);
+    assert(VOF_NADDS[dim][1] == array->nadds[dim][1]);
+  }
+  // NOTE: the names isize, jsize, and vof are kept,
+  //   since the VOF accessor presumably refers to them
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  double * vof = array->data;
+  // x indices of the two boundaries, where vof is set to zero
+  const int ibounds[2] = {0, isize + 1};
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int n = 0; n < 2; n++){
+      VOF(ibounds[n], j) = 0.;
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int n = 0; n < 2; n++){
+        VOF(ibounds[n], j, k) = 0.;
+      }
+    }
+  }
+#endif
+  // datatype handles for the halo exchanges, kept across calls
+  static MPI_Datatype dtypes[NDIMS - 1] = {
+    MPI_DOUBLE,
+#if NDIMS == 3
+    MPI_DOUBLE,
+#endif
+  };
+  if(halo_communicate_in_y(domain, dtypes + 0, array) != 0) return 1;
+#if NDIMS == 3
+  if(halo_communicate_in_z(domain, dtypes + 1, array) != 0) return 1;
+#endif
+  return 0;
+}
+
diff --git a/src/interface/common.c b/src/interface/common.c
new file mode 100644
index 0000000..4bf4efb
--- /dev/null
+++ b/src/interface/common.c
@@ -0,0 +1,15 @@
+#include "internal.h"
+
+// quadrature points: pm 1 / (2 sqrt(3)) = pm 0.2886...
+//   together with the weights below (0.5 each, summing up to 1),
+//   they match the two-point Gauss-Legendre rule on [-1/2, +1/2]
+const double gauss_ps[NGAUSS] = {
+  - 0.2886751345948129,
+  + 0.2886751345948129,
+};
+// quadrature weights, one per point in gauss_ps
+const double gauss_ws[NGAUSS] = {
+  + 0.5,
+  + 0.5,
+};
+
+// parameter appearing in the intercept of the reconstructed surface:
+//   - 0.5 / vofbeta * log(1 / vof - 1) (see curvature_tensor.c)
+const double vofbeta = 1.;
+// cells whose vof is below vofmin or above 1 - vofmin are regarded as
+//   (almost) single-phase, and no surface reconstruction is done there
+const double vofmin = 1.e-8;
+
diff --git a/src/interface/curvature_tensor.c b/src/interface/curvature_tensor.c
new file mode 100644
index 0000000..d0ee04e
--- /dev/null
+++ b/src/interface/curvature_tensor.c
@@ -0,0 +1,302 @@
+#include <math.h>
+#include <float.h>
+#include "param.h"
+#include "domain.h"
+#include "interface.h"
+#include "internal.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/domain/hxxc.h"
+#include "array_macros/domain/jdxf.h"
+#include "array_macros/domain/jdxc.h"
+#include "array_macros/interface/vof.h"
+#include "array_macros/interface/dvof.h"
+#include "array_macros/interface/normal.h"
+#include "array_macros/interface/curv.h"
+
+// Compute the normalised gradient of vof and store it in interface->dvof
+//   each DVOF entry is built from the 2^NDIMS vof values at
+//   (i-1 or i, j-1 or j[, k-1 or k]), i.e. it is a "corner" quantity
+//   index ranges: i in [1, isize + 1], j in [0, jsize + 2][, k in [0, ksize + 2]]
+//   always returns 0
+static int compute_gradient(
+    const domain_t * domain,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf;
+  const double hy = domain->hy;
+#if NDIMS == 3
+  const double hz = domain->hz;
+#endif
+  const double * restrict vof = interface->vof.data;
+  vector_t * restrict dvof = interface->dvof.data;
+#if NDIMS == 2
+  for(int j = 0; j <= jsize + 2; j++){
+    for(int i = 1; i <= isize + 1; i++){
+      // x gradient
+      // NOTE: differences are summed up without averaging factors,
+      //   the vector is normalised below anyway
+      const double gradx = 1. / HXXF(i  ) * (
+          - VOF(i-1, j-1) + VOF(i  , j-1)
+          - VOF(i-1, j  ) + VOF(i  , j  )
+      );
+      // y gradient
+      const double grady = 1. / hy * (
+          - VOF(i-1, j-1) - VOF(i  , j-1)
+          + VOF(i-1, j  ) + VOF(i  , j  )
+      );
+      // normalise and obtain corner normals
+      const double norm = sqrt(
+          + pow(gradx, 2.)
+          + pow(grady, 2.)
+      );
+      // fmax keeps the division finite where the gradient vanishes
+      const double norminv = 1. / fmax(norm, DBL_EPSILON);
+      DVOF(i, j)[0] = gradx * norminv;
+      DVOF(i, j)[1] = grady * norminv;
+    }
+  }
+#else
+  for(int k = 0; k <= ksize + 2; k++){
+    for(int j = 0; j <= jsize + 2; j++){
+      for(int i = 1; i <= isize + 1; i++){
+        // x gradient
+        // NOTE: differences are summed up without averaging factors,
+        //   the vector is normalised below anyway
+        const double gradx = 1. / HXXF(i  ) * (
+            - VOF(i-1, j-1, k-1) + VOF(i  , j-1, k-1)
+            - VOF(i-1, j  , k-1) + VOF(i  , j  , k-1)
+            - VOF(i-1, j-1, k  ) + VOF(i  , j-1, k  )
+            - VOF(i-1, j  , k  ) + VOF(i  , j  , k  )
+        );
+        // y gradient
+        const double grady = 1. / hy * (
+            - VOF(i-1, j-1, k-1) - VOF(i  , j-1, k-1)
+            + VOF(i-1, j  , k-1) + VOF(i  , j  , k-1)
+            - VOF(i-1, j-1, k  ) - VOF(i  , j-1, k  )
+            + VOF(i-1, j  , k  ) + VOF(i  , j  , k  )
+        );
+        // z gradient
+        const double gradz = 1. / hz * (
+            - VOF(i-1, j-1, k-1) - VOF(i  , j-1, k-1)
+            - VOF(i-1, j  , k-1) - VOF(i  , j  , k-1)
+            + VOF(i-1, j-1, k  ) + VOF(i  , j-1, k  )
+            + VOF(i-1, j  , k  ) + VOF(i  , j  , k  )
+        );
+        // normalise and obtain corner normals
+        const double norm = sqrt(
+            + pow(gradx, 2.)
+            + pow(grady, 2.)
+            + pow(gradz, 2.)
+        );
+        // fmax keeps the division finite where the gradient vanishes
+        const double norminv = 1. / fmax(norm, DBL_EPSILON);
+        DVOF(i, j, k)[0] = gradx * norminv;
+        DVOF(i, j, k)[1] = grady * norminv;
+        DVOF(i, j, k)[2] = gradz * norminv;
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
+// Compute the unit normal and the intercept of each cell and store them
+//   in interface->normal, using vof and the corner values in dvof
+//   NORMAL entry: [0 .. NDIMS-1] normal components, [NDIMS] intercept
+//   cells whose vof is below vofmin or above 1 - vofmin are skipped,
+//   i.e. their NORMAL entries are left untouched
+//   index ranges: i in [1, isize], j in [0, jsize + 1][, k in [0, ksize + 1]]
+//   always returns 0
+static int compute_normal(
+    const domain_t * domain,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxc = domain->hxxc;
+  const double hy = domain->hy;
+#if NDIMS == 3
+  const double hz = domain->hz;
+#endif
+  const double * restrict vof = interface->vof.data;
+  const vector_t * restrict dvof = interface->dvof.data;
+  normal_t * restrict normal = interface->normal.data;
+#if NDIMS == 2
+  for(int j = 0; j <= jsize + 1; j++){
+    for(int i = 1; i <= isize; i++){
+      const double lvof = VOF(i, j);
+      // for (almost) single-phase region,
+      //   surface reconstruction is not needed
+      if(lvof < vofmin || 1. - vofmin < lvof){
+        continue;
+      }
+      // average nx
+      // NOTE: sum over the four surrounding corners,
+      //   the missing factor is irrelevant after the normalisation below
+      double nx = (
+          + DVOF(i  , j  )[0] + DVOF(i+1, j  )[0]
+          + DVOF(i  , j+1)[0] + DVOF(i+1, j+1)[0]
+      );
+      // average ny
+      double ny = (
+          + DVOF(i  , j  )[1] + DVOF(i+1, j  )[1]
+          + DVOF(i  , j+1)[1] + DVOF(i+1, j+1)[1]
+      );
+      // normalise and obtain center normals
+      // components are first divided by hxxc / hy, then scaled to unit length
+      nx /= HXXC(i  );
+      ny /= hy;
+      const double norm = sqrt(
+          + pow(nx, 2.)
+          + pow(ny, 2.)
+      );
+      const double norminv = 1. / fmax(norm, DBL_EPSILON);
+      nx *= norminv;
+      ny *= norminv;
+      // store normal and intercept
+      // lvof is strictly between 0 and 1 here, so the logarithm is finite
+      NORMAL(i, j)[0] = nx;
+      NORMAL(i, j)[1] = ny;
+      NORMAL(i, j)[2] = - 0.5 / vofbeta * log(1. / lvof - 1.);
+    }
+  }
+#else
+  for(int k = 0; k <= ksize + 1; k++){
+    for(int j = 0; j <= jsize + 1; j++){
+      for(int i = 1; i <= isize; i++){
+        const double lvof = VOF(i, j, k);
+        // for (almost) single-phase region,
+        //   surface reconstruction is not needed
+        if(lvof < vofmin || 1. - vofmin < lvof){
+          continue;
+        }
+        // average nx
+        // NOTE: sum over the eight surrounding corners,
+        //   the missing factor is irrelevant after the normalisation below
+        double nx = (
+            + DVOF(i  , j  , k  )[0] + DVOF(i+1, j  , k  )[0]
+            + DVOF(i  , j+1, k  )[0] + DVOF(i+1, j+1, k  )[0]
+            + DVOF(i  , j  , k+1)[0] + DVOF(i+1, j  , k+1)[0]
+            + DVOF(i  , j+1, k+1)[0] + DVOF(i+1, j+1, k+1)[0]
+        );
+        // average ny
+        double ny = (
+            + DVOF(i  , j  , k  )[1] + DVOF(i+1, j  , k  )[1]
+            + DVOF(i  , j+1, k  )[1] + DVOF(i+1, j+1, k  )[1]
+            + DVOF(i  , j  , k+1)[1] + DVOF(i+1, j  , k+1)[1]
+            + DVOF(i  , j+1, k+1)[1] + DVOF(i+1, j+1, k+1)[1]
+        );
+        // average nz
+        double nz = (
+            + DVOF(i  , j  , k  )[2] + DVOF(i+1, j  , k  )[2]
+            + DVOF(i  , j+1, k  )[2] + DVOF(i+1, j+1, k  )[2]
+            + DVOF(i  , j  , k+1)[2] + DVOF(i+1, j  , k+1)[2]
+            + DVOF(i  , j+1, k+1)[2] + DVOF(i+1, j+1, k+1)[2]
+        );
+        // normalise and obtain center normals
+        // components are first divided by hxxc / hy / hz,
+        //   then scaled to unit length
+        nx = nx / HXXC(i  );
+        ny = ny / hy;
+        nz = nz / hz;
+        const double norm = sqrt(
+            + pow(nx, 2.)
+            + pow(ny, 2.)
+            + pow(nz, 2.)
+        );
+        const double norminv = 1. / fmax(norm, DBL_EPSILON);
+        nx *= norminv;
+        ny *= norminv;
+        nz *= norminv;
+        // store normal and intercept
+        // lvof is strictly between 0 and 1 here, so the logarithm is finite
+        NORMAL(i, j, k)[0] = nx;
+        NORMAL(i, j, k)[1] = ny;
+        NORMAL(i, j, k)[2] = nz;
+        NORMAL(i, j, k)[3] = - 0.5 / vofbeta * log(1. / lvof - 1.);
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
+// compute mean curvature from corner normals
+//   curv = - (discrete divergence of dvof), stored in interface->curv
+//   the corner values in dvof are first averaged onto the cell faces,
+//   then differenced in each direction
+//   index ranges: i in [1, isize], j in [0, jsize + 1][, k in [0, ksize + 1]]
+//   always returns 0
+static int compute_curvature(
+    const domain_t * domain,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf;
+  const double hy = domain->hy;
+#if NDIMS == 3
+  const double hz = domain->hz;
+#endif
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+  const vector_t * restrict dvof = interface->dvof.data;
+  double * restrict curv = interface->curv.data;
+#if NDIMS == 2
+  for(int j = 0; j <= jsize + 1; j++){
+    for(int i = 1; i <= isize; i++){
+      // x-dependent factors at the negative face (xm), the cell center (x0),
+      //   and the positive face (xp)
+      // NOTE(review): hx* / jd* are presumably scale factors and
+      //   Jacobian determinants of the x coordinate - confirm
+      const double hx_xm = HXXF(i  );
+      const double hx_xp = HXXF(i+1);
+      const double jd_xm = JDXF(i  );
+      const double jd_x0 = JDXC(i  );
+      const double jd_xp = JDXF(i+1);
+      // face values: mean of the two corners on each face
+      const double nx_xm = + 0.5 * DVOF(i  , j  )[0]
+                           + 0.5 * DVOF(i  , j+1)[0];
+      const double nx_xp = + 0.5 * DVOF(i+1, j  )[0]
+                           + 0.5 * DVOF(i+1, j+1)[0];
+      const double ny_ym = + 0.5 * DVOF(i  , j  )[1]
+                           + 0.5 * DVOF(i+1, j  )[1];
+      const double ny_yp = + 0.5 * DVOF(i  , j+1)[1]
+                           + 0.5 * DVOF(i+1, j+1)[1];
+      const double div = 1. / jd_x0 * (
+          - jd_xm / hx_xm * nx_xm + jd_xp / hx_xp * nx_xp
+          - jd_x0 / hy    * ny_ym + jd_x0 / hy    * ny_yp
+      );
+      CURV(i, j) = - 1. * div;
+    }
+  }
+#else
+  for(int k = 0; k <= ksize + 1; k++){
+    for(int j = 0; j <= jsize + 1; j++){
+      for(int i = 1; i <= isize; i++){
+        // x-dependent factors at the negative face (xm), the cell center (x0),
+        //   and the positive face (xp)
+        // NOTE(review): hx* / jd* are presumably scale factors and
+        //   Jacobian determinants of the x coordinate - confirm
+        const double hx_xm = HXXF(i  );
+        const double hx_xp = HXXF(i+1);
+        const double jd_xm = JDXF(i  );
+        const double jd_x0 = JDXC(i  );
+        const double jd_xp = JDXF(i+1);
+        // face values: mean of the four corners on each face
+        const double nx_xm = + 0.25 * DVOF(i  , j  , k  )[0]
+                             + 0.25 * DVOF(i  , j+1, k  )[0]
+                             + 0.25 * DVOF(i  , j  , k+1)[0]
+                             + 0.25 * DVOF(i  , j+1, k+1)[0];
+        const double nx_xp = + 0.25 * DVOF(i+1, j  , k  )[0]
+                             + 0.25 * DVOF(i+1, j+1, k  )[0]
+                             + 0.25 * DVOF(i+1, j  , k+1)[0]
+                             + 0.25 * DVOF(i+1, j+1, k+1)[0];
+        const double ny_ym = + 0.25 * DVOF(i  , j  , k  )[1]
+                             + 0.25 * DVOF(i+1, j  , k  )[1]
+                             + 0.25 * DVOF(i  , j  , k+1)[1]
+                             + 0.25 * DVOF(i+1, j  , k+1)[1];
+        const double ny_yp = + 0.25 * DVOF(i  , j+1, k  )[1]
+                             + 0.25 * DVOF(i+1, j+1, k  )[1]
+                             + 0.25 * DVOF(i  , j+1, k+1)[1]
+                             + 0.25 * DVOF(i+1, j+1, k+1)[1];
+        const double nz_zm = + 0.25 * DVOF(i  , j  , k  )[2]
+                             + 0.25 * DVOF(i+1, j  , k  )[2]
+                             + 0.25 * DVOF(i  , j+1, k  )[2]
+                             + 0.25 * DVOF(i+1, j+1, k  )[2];
+        const double nz_zp = + 0.25 * DVOF(i  , j  , k+1)[2]
+                             + 0.25 * DVOF(i+1, j  , k+1)[2]
+                             + 0.25 * DVOF(i  , j+1, k+1)[2]
+                             + 0.25 * DVOF(i+1, j+1, k+1)[2];
+        const double div = 1. / jd_x0 * (
+            - jd_xm / hx_xm * nx_xm + jd_xp / hx_xp * nx_xp
+            - jd_x0 / hy    * ny_ym + jd_x0 / hy    * ny_yp
+            - jd_x0 / hz    * nz_zm + jd_x0 / hz    * nz_zp
+        );
+        CURV(i, j, k) = - 1. * div;
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
+// Update the geometric quantities of the interface in order:
+//   1. corner gradients of vof (dvof)
+//   2. cell-center normals and intercepts (normal)
+//   3. mean curvature (curv)
+//   domain    : domain information handed to each stage
+//   interface : its dvof, normal, and curv arrays are overwritten
+//   returns 0 on success, 1 as soon as one of the stages reports a failure
+int interface_compute_curvature_tensor(
+    const domain_t * domain,
+    interface_t * interface
+){
+  // each stage uses the result of the previous one,
+  //   so stop immediately when a stage fails instead of ignoring its status
+  if(0 != compute_gradient(domain, interface)){
+    return 1;
+  }
+  if(0 != compute_normal(domain, interface)){
+    return 1;
+  }
+  if(0 != compute_curvature(domain, interface)){
+    return 1;
+  }
+  return 0;
+}
+
diff --git a/src/interface/force.c b/src/interface/force.c
new file mode 100644
index 0000000..01861eb
--- /dev/null
+++ b/src/interface/force.c
@@ -0,0 +1,194 @@
+#include "domain.h"
+#include "fluid.h"
+#include "interface.h"
+#include "internal.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/fluid/den.h"
+#include "array_macros/interface/vof.h"
+#include "array_macros/interface/ifrcx.h"
+#include "array_macros/interface/ifrcy.h"
+#include "array_macros/interface/ifrcz.h"
+#include "array_macros/interface/curv.h"
+
+// compute density factor | 5
+static double compute_refdeninv(const double denr){
+  // denominator: unity plus the density ratio
+  const double densum = 1. + denr;
+  return 2. / densum;
+}
+
+// Compute the x component of the surface tension force (interface->ifrcx)
+//   ifrcx = den * refdeninv * (1 / We) * curv * (difference of vof in x / hxxf),
+//   where den and curv are the means of the values at i-1 and i
+//   only i in [2, isize] are updated
+//   always returns 0
+// NOTE(review): the trailing "| N" in the comments below looks like the number
+//   of following lines picked up by an external tool - keep them in sync
+static int compute_force_x(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf;
+  // surface tension coefficient: inverse of We
+  const double tension = 1. / interface->We;
+  const double * restrict den = fluid->den[0].data;
+  const double refdeninv = compute_refdeninv(fluid->denr);
+  const double * restrict vof = interface->vof.data;
+  const double * restrict curv = interface->curv.data;
+  double * restrict ifrcx = interface->ifrcx.data;
+#if NDIMS == 2
+  // compute surface tension force in x direction | 17
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 2; i <= isize; i++){
+      const double den_x0 = (
+          + 0.5 * DEN(i-1, j  )
+          + 0.5 * DEN(i  , j  )
+      );
+      const double kappa = (
+          + 0.5 * CURV(i-1, j  )
+          + 0.5 * CURV(i  , j  )
+      );
+      const double delta = 1. / HXXF(i  ) * (
+          - VOF(i-1, j  )
+          + VOF(i  , j  )
+      );
+      IFRCX(i, j) = den_x0 * refdeninv * tension * kappa * delta;
+    }
+  }
+#else
+  // compute surface tension force in x direction | 19
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 2; i <= isize; i++){
+        const double den_x0 = (
+            + 0.5 * DEN(i-1, j  , k  )
+            + 0.5 * DEN(i  , j  , k  )
+        );
+        const double kappa = (
+            + 0.5 * CURV(i-1, j  , k  )
+            + 0.5 * CURV(i  , j  , k  )
+        );
+        const double delta = 1. / HXXF(i  ) * (
+            - VOF(i-1, j  , k  )
+            + VOF(i  , j  , k  )
+        );
+        IFRCX(i, j, k) = den_x0 * refdeninv * tension * kappa * delta;
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
+// Compute the y component of the surface tension force (interface->ifrcy)
+//   ifrcy = den * refdeninv * (1 / We) * curv * (difference of vof in y / hy),
+//   where den and curv are the means of the values at j-1 and j
+//   updated for i in [1, isize], j in [1, jsize][, k in [1, ksize]]
+//   always returns 0
+// NOTE(review): the trailing "| N" in the comments below looks like the number
+//   of following lines picked up by an external tool - keep them in sync
+static int compute_force_y(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double hy = domain->hy;
+  // surface tension coefficient: inverse of We
+  const double tension = 1. / interface->We;
+  const double * restrict den = fluid->den[0].data;
+  const double refdeninv = compute_refdeninv(fluid->denr);
+  const double * restrict vof = interface->vof.data;
+  const double * restrict curv = interface->curv.data;
+  double * restrict ifrcy = interface->ifrcy.data;
+#if NDIMS == 2
+  // compute surface tension force in y direction | 17
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      const double den_y0 = (
+          + 0.5 * DEN(i  , j-1)
+          + 0.5 * DEN(i  , j  )
+      );
+      const double kappa = (
+          + 0.5 * CURV(i  , j-1)
+          + 0.5 * CURV(i  , j  )
+      );
+      const double delta = 1. / hy * (
+          - VOF(i  , j-1)
+          + VOF(i  , j  )
+      );
+      IFRCY(i, j) = den_y0 * refdeninv * tension * kappa * delta;
+    }
+  }
+#else
+  // compute surface tension force in y direction | 19
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        const double den_y0 = (
+            + 0.5 * DEN(i  , j-1, k  )
+            + 0.5 * DEN(i  , j  , k  )
+        );
+        const double kappa = (
+            + 0.5 * CURV(i  , j-1, k  )
+            + 0.5 * CURV(i  , j  , k  )
+        );
+        const double delta = 1. / hy * (
+            - VOF(i  , j-1, k  )
+            + VOF(i  , j  , k  )
+        );
+        IFRCY(i, j, k) = den_y0 * refdeninv * tension * kappa * delta;
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
+#if NDIMS == 3
+// Compute the z component of the surface tension force (interface->ifrcz)
+//   ifrcz = den * refdeninv * (1 / We) * curv * (difference of vof in z / hz),
+//   where den and curv are the means of the values at k-1 and k
+//   updated for i in [1, isize], j in [1, jsize], k in [1, ksize]
+//   always returns 0
+// NOTE(review): the trailing "| N" in the comment below looks like the number
+//   of following lines picked up by an external tool - keep them in sync
+static int compute_force_z(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double hz = domain->hz;
+  // surface tension coefficient: inverse of We
+  const double tension = 1. / interface->We;
+  const double * restrict den = fluid->den[0].data;
+  const double refdeninv = compute_refdeninv(fluid->denr);
+  const double * restrict vof = interface->vof.data;
+  const double * restrict curv = interface->curv.data;
+  double * restrict ifrcz = interface->ifrcz.data;
+  // compute surface tension force in z direction | 19
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        const double den_z0 = (
+            + 0.5 * DEN(i  , j  , k-1)
+            + 0.5 * DEN(i  , j  , k  )
+        );
+        const double kappa = (
+            + 0.5 * CURV(i  , j  , k-1)
+            + 0.5 * CURV(i  , j  , k  )
+        );
+        const double delta = 1. / hz * (
+            - VOF(i  , j  , k-1)
+            + VOF(i  , j  , k  )
+        );
+        IFRCZ(i, j, k) = den_z0 * refdeninv * tension * kappa * delta;
+      }
+    }
+  }
+  return 0;
+}
+#endif
+
+// Compute all components of the surface tension force
+//   (ifrcx, ifrcy, and ifrcz in 3D) and store them in interface
+//   domain    : domain information handed to each direction kernel
+//   fluid     : provides the density field and the density ratio
+//   interface : provides vof, curv, and We; its ifrc* arrays are overwritten
+//   returns 0 on success, 1 as soon as one of the kernels reports a failure
+int interface_compute_force(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+){
+  // propagate the status of each kernel instead of ignoring it
+  if(0 != compute_force_x(domain, fluid, interface)){
+    return 1;
+  }
+  if(0 != compute_force_y(domain, fluid, interface)){
+    return 1;
+  }
+#if NDIMS == 3
+  if(0 != compute_force_z(domain, fluid, interface)){
+    return 1;
+  }
+#endif
+  return 0;
+}
+
diff --git a/src/interface/init.c b/src/interface/init.c
new file mode 100644
index 0000000..07e309d
--- /dev/null
+++ b/src/interface/init.c
@@ -0,0 +1,82 @@
+#include "config.h"
+#include "domain.h"
+#include "interface.h"
+#include "interface_solver.h"
+#include "fileio.h"
+#include "array_macros/interface/vof.h"
+#include "array_macros/interface/ifrcx.h"
+#include "array_macros/interface/ifrcy.h"
+#if NDIMS == 3
+#include "array_macros/interface/ifrcz.h"
+#endif
+#include "array_macros/interface/dvof.h"
+#include "array_macros/interface/normal.h"
+#include "array_macros/interface/curv.h"
+#include "array_macros/interface/fluxx.h"
+#include "array_macros/interface/fluxy.h"
+#if NDIMS == 3
+#include "array_macros/interface/fluxz.h"
+#endif
+#include "array_macros/interface/src.h"
+
+/**
+ * @brief prepare all arrays which belong to interface_t
+ * @param[in]  domain    : information about domain decomposition and size
+ * @param[out] interface : structure whose arrays are being prepared
+ * @return               : error code (non-zero if one of the arrays fails)
+ */
+static int allocate(
+    const domain_t * domain,
+    interface_t * interface
+){
+  // arrays are prepared one after another,
+  //   the short-circuit evaluation skips the rest once a failure is found
+  int failed = (0 != array.prepare(domain, VOF_NADDS, sizeof(double), &interface->vof));
+  // surface tension force
+  failed = failed || (0 != array.prepare(domain, IFRCX_NADDS, sizeof(double), &interface->ifrcx));
+  failed = failed || (0 != array.prepare(domain, IFRCY_NADDS, sizeof(double), &interface->ifrcy));
+#if NDIMS == 3
+  failed = failed || (0 != array.prepare(domain, IFRCZ_NADDS, sizeof(double), &interface->ifrcz));
+#endif
+  // gradient, normal, and curvature
+  failed = failed || (0 != array.prepare(domain, DVOF_NADDS, sizeof(vector_t), &interface->dvof));
+  failed = failed || (0 != array.prepare(domain, NORMAL_NADDS, sizeof(normal_t), &interface->normal));
+  failed = failed || (0 != array.prepare(domain, CURV_NADDS, sizeof(double), &interface->curv));
+  // fluxes
+  failed = failed || (0 != array.prepare(domain, FLUXX_NADDS, sizeof(double), &interface->fluxx));
+  failed = failed || (0 != array.prepare(domain, FLUXY_NADDS, sizeof(double), &interface->fluxy));
+#if NDIMS == 3
+  failed = failed || (0 != array.prepare(domain, FLUXZ_NADDS, sizeof(double), &interface->fluxz));
+#endif
+  // two source-term buffers
+  for(size_t n = 0; n < 2 && !failed; n++){
+    failed = (0 != array.prepare(domain, SRC_NADDS, sizeof(double), &interface->src[n]));
+  }
+  return failed ? 1 : 0;
+}
+
+/**
+ * @brief print the interface parameter (We) to stdout, from the root rank only
+ * @param[in] info      : decomposition information used to find my rank
+ * @param[in] interface : structure holding the parameter to be shown
+ */
+static void report(
+    const sdecomp_info_t * info,
+    const interface_t * interface
+){
+  const int root = 0;
+  int myrank = root;
+  sdecomp.get_comm_rank(info, &myrank);
+  // the other ranks have nothing to do
+  if(root != myrank){
+    return;
+  }
+  fprintf(stdout, "INTERFACE\n");
+  fprintf(stdout, "\tWe: % .7e\n", interface->We);
+  fflush(stdout);
+}
+
+/**
+ * @brief allocate interface_t and initialise it from files and the config
+ * @param[in]  dirname_ic : name of the directory from which "vof" is loaded
+ * @param[in]  domain     : information about domain decomposition and size
+ * @param[out] interface  : structure being initialised
+ * @return                : error code (non-zero if one of the steps fails)
+ */
+int interface_init(
+    const char dirname_ic[],
+    const domain_t * domain,
+    interface_t * interface
+){
+  if(0 != allocate(domain, interface)) return 1;
+  // load interface field
+  if(0 != array.load(domain, dirname_ic, "vof", fileio.npy_double, &interface->vof)) return 1;
+  // impose boundary conditions and communicate halo cells
+  // NOTE: this can fail (e.g. halo size mismatch), so the status is checked
+  //   instead of continuing with halo cells which are not exchanged
+  if(0 != interface_update_boundaries_vof(domain, &interface->vof)) return 1;
+  // compute surface tension coefficient
+  if(0 != config.get_double("We", &interface->We)) return 1;
+  report(domain->info, interface);
+  return 0;
+}
+
diff --git a/src/interface/internal.h b/src/interface/internal.h
new file mode 100644
index 0000000..cbb8991
--- /dev/null
+++ b/src/interface/internal.h
@@ -0,0 +1,11 @@
+#if !defined(INTERFACE_INTERNAL_H)
+#define INTERFACE_INTERNAL_H
+
+// constants shared among the interface source files, defined in common.c
+
+// number of quadrature points, i.e. length of gauss_ps and gauss_ws
+#define NGAUSS 2
+// quadrature points and the corresponding weights
+extern const double gauss_ps[NGAUSS];
+extern const double gauss_ws[NGAUSS];
+
+// parameter appearing in the intercept of the reconstructed surface:
+//   - 0.5 / vofbeta * log(1 / vof - 1)
+extern const double vofbeta;
+// cells whose vof is below vofmin or above 1 - vofmin
+//   are regarded as (almost) single-phase
+extern const double vofmin;
+
+#endif // INTERFACE_INTERNAL_H
diff --git a/src/interface/mass_flux.c b/src/interface/mass_flux.c
new file mode 100644
index 0000000..b213e91
--- /dev/null
+++ b/src/interface/mass_flux.c
@@ -0,0 +1,120 @@
+#include "domain.h"
+#include "fluid.h"
+#include "interface.h"
+#include "interface_solver.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/uy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/uz.h"
+#endif
+#include "array_macros/interface/fluxx.h"
+#include "array_macros/interface/fluxy.h"
+#if NDIMS == 3
+#include "array_macros/interface/fluxz.h"
+#endif
+
+// turn the x vof flux stored in interface->fluxx into an x mass flux, in place
+//   each entry becomes: ux + (denr - 1) * (vof flux)
+//   always returns 0
+static int convert_x(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict ux = fluid->ux.data;
+  double * restrict fluxx = interface->fluxx.data;
+  // NOTE(review): denr is presumably the density ratio of the two phases - confirm
+  const double denr = fluid->denr;
+  // the ranges in y (and z) start from 0 and end at size + 1,
+  //   i.e. they are one cell wider than the 1 .. size range on both sides
+#if NDIMS == 2
+  // convert x vof flux to x mass flux | 5
+  for(int j = 0; j <= jsize + 1; j++){
+    for(int i = 1; i <= isize + 1; i++){
+      FLUXX(i, j) = UX(i, j) + (denr - 1.) * FLUXX(i, j);
+    }
+  }
+#else
+  // convert x vof flux to x mass flux | 7
+  for(int k = 0; k <= ksize + 1; k++){
+    for(int j = 0; j <= jsize + 1; j++){
+      for(int i = 1; i <= isize + 1; i++){
+        FLUXX(i, j, k) = UX(i, j, k) + (denr - 1.) * FLUXX(i, j, k);
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
+// turn the y vof flux stored in interface->fluxy into a y mass flux, in place
+//   each entry becomes: uy + (denr - 1) * (vof flux)
+//   always returns 0
+static int convert_y(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict uy = fluid->uy.data;
+  double * restrict fluxy = interface->fluxy.data;
+  // NOTE(review): denr is presumably the density ratio of the two phases - confirm
+  const double denr = fluid->denr;
+  // all the ranges start from 0 and end at size + 1,
+  //   i.e. they are one cell wider than the 1 .. size range on both sides
+#if NDIMS == 2
+  // convert y vof flux to y mass flux | 5
+  for(int j = 0; j <= jsize + 1; j++){
+    for(int i = 0; i <= isize + 1; i++){
+      FLUXY(i, j) = UY(i, j) + (denr - 1.) * FLUXY(i, j);
+    }
+  }
+#else
+  // convert y vof flux to y mass flux | 7
+  for(int k = 0; k <= ksize + 1; k++){
+    for(int j = 0; j <= jsize + 1; j++){
+      for(int i = 0; i <= isize + 1; i++){
+        FLUXY(i, j, k) = UY(i, j, k) + (denr - 1.) * FLUXY(i, j, k);
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
+#if NDIMS == 3
+// turn the z vof flux stored in interface->fluxz into a z mass flux, in place
+//   each entry becomes: uz + (denr - 1) * (vof flux)
+//   always returns 0, only compiled for three-dimensional builds
+static int convert_z(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double * restrict uz = fluid->uz.data;
+  double * restrict fluxz = interface->fluxz.data;
+  // NOTE(review): denr is presumably the density ratio of the two phases - confirm
+  const double denr = fluid->denr;
+  // all the ranges start from 0 and end at size + 1,
+  //   i.e. they are one cell wider than the 1 .. size range on both sides
+  // convert z vof flux to z mass flux | 7
+  for(int k = 0; k <= ksize + 1; k++){
+    for(int j = 0; j <= jsize + 1; j++){
+      for(int i = 0; i <= isize + 1; i++){
+        FLUXZ(i, j, k) = UZ(i, j, k) + (denr - 1.) * FLUXZ(i, j, k);
+      }
+    }
+  }
+  return 0;
+}
+#endif
+
+/**
+ * @brief convert the vof fluxes stored in the interface object to mass fluxes
+ * @param[in]     domain    : domain information
+ * @param[in]     fluid     : velocity field and the parameter denr
+ * @param[in,out] interface : fluxes, which are overwritten in place
+ * @return                  : error code
+ */
+int interface_compute_mass_flux(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+){
+  // NOTE: no communication is performed here;
+  //         since all data is inside the process,
+  //         halo and boundary values are updated locally by each converter
+  if(0 != convert_x(domain, fluid, interface)){
+    return 1;
+  }
+  if(0 != convert_y(domain, fluid, interface)){
+    return 1;
+  }
+#if NDIMS == 3
+  if(0 != convert_z(domain, fluid, interface)){
+    return 1;
+  }
+#endif
+  return 0;
+}
+
diff --git a/src/interface/save.c b/src/interface/save.c
new file mode 100644
index 0000000..368e00b
--- /dev/null
+++ b/src/interface/save.c
@@ -0,0 +1,23 @@
+#include "sdecomp.h"
+#include "array.h"
+#include "domain.h"
+#include "interface.h"
+#include "fileio.h"
+
+/**
+ * @brief write the interface-related data to the given directory
+ * @param[in] dirname   : directory to which the data is written
+ * @param[in] domain    : domain information
+ * @param[in] interface : parameter "We" and the "vof" field to be saved
+ * @return              : always 0
+ */
+int interface_save(
+    const char dirname[],
+    const domain_t * domain,
+    const interface_t * interface
+){
+  int myrank = 0;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  const int is_root = (0 == myrank);
+  // scalar parameter: written by the root process alone
+  if(is_root){
+    fileio.w_serial(dirname, "We", 0, NULL, fileio.npy_double, sizeof(double), &interface->We);
+  }
+  // distributed field: every process has to take part
+  array.dump(domain, dirname, "vof", fileio.npy_double, &interface->vof);
+  return 0;
+}
+
diff --git a/src/interface/update/fluxx.c b/src/interface/update/fluxx.c
new file mode 100644
index 0000000..0dacec1
--- /dev/null
+++ b/src/interface/update/fluxx.c
@@ -0,0 +1,126 @@
+#include <mpi.h>
+#include "array.h"
+#include "domain.h"
+#include "halo.h"
+#include "fluid.h"
+#include "interface.h"
+#include "../internal.h"
+#include "internal.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/interface/vof.h"
+#include "array_macros/interface/normal.h"
+#include "array_macros/interface/fluxx.h"
+
+// impose the wall values on the x flux and exchange its halo cells
+//   returns 0 on success, 1 if one of the halo communications fails
+static int update_boundaries(
+    const domain_t * domain,
+    array_t * array
+){
+  {
+    const int isize = domain->mysizes[0];
+    const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+    const int ksize = domain->mysizes[2];
+#endif
+    double * fluxx = array->data;
+    // assume impermeable walls
+    //   i.e. the flux at i = 1 and i = isize + 1 is set to zero
+#if NDIMS == 2
+    for(int j = 1; j <= jsize; j++){
+      FLUXX(      1, j) = 0.;
+      FLUXX(isize+1, j) = 0.;
+    }
+#else
+    for(int k = 1; k <= ksize; k++){
+      for(int j = 1; j <= jsize; j++){
+        FLUXX(      1, j, k) = 0.;
+        FLUXX(isize+1, j, k) = 0.;
+      }
+    }
+#endif
+  }
+  {
+    // one datatype slot for each communicated direction (y, and z in 3D)
+    //   NOTE(review): static, presumably because halo_communicate_* caches
+    //     a derived datatype through the pointer - confirm in halo.h
+    static MPI_Datatype dtypes[NDIMS - 1] = {
+      MPI_DOUBLE,
+#if NDIMS == 3
+      MPI_DOUBLE,
+#endif
+    };
+    if(0 != halo_communicate_in_y(domain, dtypes + 0, array)){
+      return 1;
+    }
+#if NDIMS == 3
+    if(0 != halo_communicate_in_z(domain, dtypes + 1, array)){
+      return 1;
+    }
+#endif
+  }
+  return 0;
+}
+
+/**
+ * @brief compute the vof flux through the x cell faces
+ * @param[in]     domain    : domain information
+ * @param[in]     fluid     : velocity in the x direction
+ * @param[in,out] interface : vof and normal (in), fluxx (out)
+ * @return                  : 0 on success,
+ *                              1 if updating the boundary / halo cells fails
+ */
+int compute_flux_x(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict ux = fluid->ux.data;
+  const double * restrict vof = interface->vof.data;
+  const normal_t * restrict normal = interface->normal.data;
+  double * restrict fluxx = interface->fluxx.data;
+  // only the faces i = 2 .. isize are computed here,
+  //   the faces i = 1 and i = isize + 1 are assigned by update_boundaries
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 2; i <= isize; i++){
+      // use upwind information
+      //   ii: cell from which the face value is taken
+      //   x : position of the face in that cell (-0.5 or +0.5)
+      const double vel = UX(i, j);
+      const int    ii = vel < 0. ?    i : i - 1;
+      const double  x = vel < 0. ? -0.5 :  +0.5;
+      // evaluate flux
+      const double lvof = VOF(ii, j);
+      if(lvof < vofmin || 1. - vofmin < lvof){
+        // (almost) single-phase cell: use the cell value as it is
+        FLUXX(i, j) = vel * lvof;
+        continue;
+      }
+      // otherwise integrate the indicator function over the face
+      double flux = 0.;
+      for(int jj = 0; jj < NGAUSS; jj++){
+        const double w = gauss_ws[jj];
+        const double y = gauss_ps[jj];
+        flux += w * indicator(NORMAL(ii, j), (const double [NDIMS]){x, y});
+      }
+      FLUXX(i, j) = vel * flux;
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 2; i <= isize; i++){
+        // use upwind information
+        //   ii: cell from which the face value is taken
+        //   x : position of the face in that cell (-0.5 or +0.5)
+        const double vel = UX(i, j, k);
+        const int    ii = vel < 0. ?    i : i - 1;
+        const double  x = vel < 0. ? -0.5 :  +0.5;
+        // evaluate flux
+        const double lvof = VOF(ii, j, k);
+        if(lvof < vofmin || 1. - vofmin < lvof){
+          // (almost) single-phase cell: use the cell value as it is
+          FLUXX(i, j, k) = vel * lvof;
+          continue;
+        }
+        // otherwise integrate the indicator function over the face
+        double flux = 0.;
+        for(int kk = 0; kk < NGAUSS; kk++){
+          for(int jj = 0; jj < NGAUSS; jj++){
+            const double w = gauss_ws[jj] * gauss_ws[kk];
+            const double y = gauss_ps[jj];
+            const double z = gauss_ps[kk];
+            flux += w * indicator(NORMAL(ii, j, k), (const double [NDIMS]){x, y, z});
+          }
+        }
+        FLUXX(i, j, k) = vel * flux;
+      }
+    }
+  }
+#endif
+  // report a failed halo exchange to the caller instead of discarding it
+  if(0 != update_boundaries(domain, &interface->fluxx)){
+    return 1;
+  }
+  return 0;
+}
+
diff --git a/src/interface/update/fluxy.c b/src/interface/update/fluxy.c
new file mode 100644
index 0000000..35e4890
--- /dev/null
+++ b/src/interface/update/fluxy.c
@@ -0,0 +1,126 @@
+#include <mpi.h>
+#include "array.h"
+#include "domain.h"
+#include "halo.h"
+#include "fluid.h"
+#include "interface.h"
+#include "../internal.h"
+#include "internal.h"
+#include "array_macros/fluid/uy.h"
+#include "array_macros/interface/vof.h"
+#include "array_macros/interface/normal.h"
+#include "array_macros/interface/fluxy.h"
+
+// assign the x-edge values of the y flux and exchange its halo cells
+//   returns 0 on success, 1 if one of the halo communications fails
+static int update_boundaries(
+    const domain_t * domain,
+    array_t * array
+){
+  {
+    const int isize = domain->mysizes[0];
+    const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+    const int ksize = domain->mysizes[2];
+#endif
+    double * fluxy = array->data;
+    // dummy
+    //   the entries at i = 0 and i = isize + 1 are simply zeroed
+    //   NOTE(review): presumably just to keep them initialised - confirm
+#if NDIMS == 2
+    for(int j = 1; j <= jsize; j++){
+      FLUXY(      0, j) = 0.;
+      FLUXY(isize+1, j) = 0.;
+    }
+#else
+    for(int k = 1; k <= ksize; k++){
+      for(int j = 1; j <= jsize; j++){
+        FLUXY(      0, j, k) = 0.;
+        FLUXY(isize+1, j, k) = 0.;
+      }
+    }
+#endif
+  }
+  {
+    // one datatype slot for each communicated direction (y, and z in 3D)
+    //   NOTE(review): static, presumably because halo_communicate_* caches
+    //     a derived datatype through the pointer - confirm in halo.h
+    static MPI_Datatype dtypes[NDIMS - 1] = {
+      MPI_DOUBLE,
+#if NDIMS == 3
+      MPI_DOUBLE,
+#endif
+    };
+    if(0 != halo_communicate_in_y(domain, dtypes + 0, array)){
+      return 1;
+    }
+#if NDIMS == 3
+    if(0 != halo_communicate_in_z(domain, dtypes + 1, array)){
+      return 1;
+    }
+#endif
+  }
+  return 0;
+}
+
+/**
+ * @brief compute the vof flux through the y cell faces
+ * @param[in]     domain    : domain information
+ * @param[in]     fluid     : velocity in the y direction
+ * @param[in,out] interface : vof and normal (in), fluxy (out)
+ * @return                  : 0 on success,
+ *                              1 if updating the boundary / halo cells fails
+ */
+int compute_flux_y(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict uy = fluid->uy.data;
+  const double * restrict vof = interface->vof.data;
+  const normal_t * restrict normal = interface->normal.data;
+  double * restrict fluxy = interface->fluxy.data;
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      // use upwind information
+      //   jj: cell from which the face value is taken
+      //   y : position of the face in that cell (-0.5 or +0.5)
+      const double vel = UY(i, j);
+      const int    jj = vel < 0. ?    j : j - 1;
+      const double  y = vel < 0. ? -0.5 :  +0.5;
+      // evaluate flux
+      const double lvof = VOF(i, jj);
+      if(lvof < vofmin || 1. - vofmin < lvof){
+        // (almost) single-phase cell: use the cell value as it is
+        FLUXY(i, j) = vel * lvof;
+        continue;
+      }
+      // otherwise integrate the indicator function over the face
+      double flux = 0.;
+      for(int ii = 0; ii < NGAUSS; ii++){
+        const double w = gauss_ws[ii];
+        const double x = gauss_ps[ii];
+        flux += w * indicator(NORMAL(i, jj), (const double [NDIMS]){x, y});
+      }
+      FLUXY(i, j) = vel * flux;
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // use upwind information
+        //   jj: cell from which the face value is taken
+        //   y : position of the face in that cell (-0.5 or +0.5)
+        const double vel = UY(i, j, k);
+        const int    jj = vel < 0. ?    j : j - 1;
+        const double  y = vel < 0. ? -0.5 :  +0.5;
+        // evaluate flux
+        const double lvof = VOF(i, jj, k);
+        if(lvof < vofmin || 1. - vofmin < lvof){
+          // (almost) single-phase cell: use the cell value as it is
+          FLUXY(i, j, k) = vel * lvof;
+          continue;
+        }
+        // otherwise integrate the indicator function over the face
+        double flux = 0.;
+        for(int kk = 0; kk < NGAUSS; kk++){
+          for(int ii = 0; ii < NGAUSS; ii++){
+            const double w = gauss_ws[ii] * gauss_ws[kk];
+            const double x = gauss_ps[ii];
+            const double z = gauss_ps[kk];
+            flux += w * indicator(NORMAL(i, jj, k), (const double [NDIMS]){x, y, z});
+          }
+        }
+        FLUXY(i, j, k) = vel * flux;
+      }
+    }
+  }
+#endif
+  // report a failed halo exchange to the caller instead of discarding it
+  if(0 != update_boundaries(domain, &interface->fluxy)){
+    return 1;
+  }
+  return 0;
+}
+
diff --git a/src/interface/update/fluxz.c b/src/interface/update/fluxz.c
new file mode 100644
index 0000000..5ff7273
--- /dev/null
+++ b/src/interface/update/fluxz.c
@@ -0,0 +1,88 @@
+#if NDIMS == 3
+#include <mpi.h>
+#include "array.h"
+#include "domain.h"
+#include "halo.h"
+#include "fluid.h"
+#include "interface.h"
+#include "../internal.h"
+#include "internal.h"
+#include "array_macros/fluid/uz.h"
+#include "array_macros/interface/vof.h"
+#include "array_macros/interface/normal.h"
+#include "array_macros/interface/fluxz.h"
+
+// assign the x-edge values of the z flux and exchange its halo cells
+//   returns 0 on success, 1 if one of the halo communications fails
+static int update_boundaries(
+    const domain_t * domain,
+    array_t * array
+){
+  {
+    const int isize = domain->mysizes[0];
+    const int jsize = domain->mysizes[1];
+    const int ksize = domain->mysizes[2];
+    double * fluxz = array->data;
+    // dummy
+    //   the entries at i = 0 and i = isize + 1 are simply zeroed
+    //   NOTE(review): presumably just to keep them initialised - confirm
+    for(int k = 1; k <= ksize; k++){
+      for(int j = 1; j <= jsize; j++){
+        FLUXZ(      0, j, k) = 0.;
+        FLUXZ(isize+1, j, k) = 0.;
+      }
+    }
+  }
+  {
+    // one datatype slot for each communicated direction (y and z)
+    //   NOTE(review): static, presumably because halo_communicate_* caches
+    //     a derived datatype through the pointer - confirm in halo.h
+    static MPI_Datatype dtypes[NDIMS - 1] = {
+      MPI_DOUBLE,
+      MPI_DOUBLE,
+    };
+    if(0 != halo_communicate_in_y(domain, dtypes + 0, array)){
+      return 1;
+    }
+    if(0 != halo_communicate_in_z(domain, dtypes + 1, array)){
+      return 1;
+    }
+  }
+  return 0;
+}
+
+/**
+ * @brief compute the vof flux through the z cell faces
+ * @param[in]     domain    : domain information
+ * @param[in]     fluid     : velocity in the z direction
+ * @param[in,out] interface : vof and normal (in), fluxz (out)
+ * @return                  : 0 on success,
+ *                              1 if updating the boundary / halo cells fails
+ */
+int compute_flux_z(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  const double * restrict uz = fluid->uz.data;
+  const double * restrict vof = interface->vof.data;
+  const normal_t * restrict normal = interface->normal.data;
+  double * restrict fluxz = interface->fluxz.data;
+  // NOTE(review): this loop ends at k = ksize + 1,
+  //   whereas the y counterpart ends at j = jsize - confirm this is intended
+  for(int k = 1; k <= ksize + 1; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // use upwind information
+        //   kk: cell from which the face value is taken
+        //   z : position of the face in that cell (-0.5 or +0.5)
+        const double vel = UZ(i, j, k);
+        const int    kk = vel < 0. ?    k : k - 1;
+        const double  z = vel < 0. ? -0.5 :  +0.5;
+        // evaluate flux
+        const double lvof = VOF(i, j, kk);
+        if(lvof < vofmin || 1. - vofmin < lvof){
+          // (almost) single-phase cell: use the cell value as it is
+          FLUXZ(i, j, k) = vel * lvof;
+          continue;
+        }
+        // otherwise integrate the indicator function over the face
+        double flux = 0.;
+        for(int jj = 0; jj < NGAUSS; jj++){
+          for(int ii = 0; ii < NGAUSS; ii++){
+            const double w = gauss_ws[ii] * gauss_ws[jj];
+            const double x = gauss_ps[ii];
+            const double y = gauss_ps[jj];
+            flux += w * indicator(NORMAL(i, j, kk), (const double [NDIMS]){x, y, z});
+          }
+        }
+        FLUXZ(i, j, k) = vel * flux;
+      }
+    }
+  }
+  // report a failed halo exchange to the caller instead of discarding it
+  if(0 != update_boundaries(domain, &interface->fluxz)){
+    return 1;
+  }
+  return 0;
+}
+#endif
diff --git a/src/interface/update/internal.h b/src/interface/update/internal.h
new file mode 100644
index 0000000..b80a10d
--- /dev/null
+++ b/src/interface/update/internal.h
@@ -0,0 +1,31 @@
+// functions shared by the sources under src/interface/update
+#if !defined(INTERFACE_INTERNAL_UPDATE_H)
+#define INTERFACE_INTERNAL_UPDATE_H
+
+#include "interface.h"
+
+// diffused surface representation:
+//   0.5 * (1 + tanh(vofbeta * s)), where s is the planar surface function
+//   given by the normal n evaluated at the position x
+extern double indicator(
+    const normal_t n,
+    const vector_t x
+);
+
+// compute the vof flux through the x cell faces and store it in interface->fluxx
+extern int compute_flux_x(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+);
+
+// compute the vof flux through the y cell faces and store it in interface->fluxy
+extern int compute_flux_y(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+);
+
+#if NDIMS == 3
+// compute the vof flux through the z cell faces and store it in interface->fluxz
+extern int compute_flux_z(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    interface_t * interface
+);
+#endif
+
+#endif // INTERFACE_INTERNAL_UPDATE_H
diff --git a/src/interface/update/main.c b/src/interface/update/main.c
new file mode 100644
index 0000000..ad8c065
--- /dev/null
+++ b/src/interface/update/main.c
@@ -0,0 +1,207 @@
+#include <math.h>
+#include "runge_kutta.h"
+#include "domain.h"
+#include "fluid.h"
+#include "interface.h"
+#include "interface_solver.h"
+#include "../internal.h"
+#include "internal.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/domain/jdxf.h"
+#include "array_macros/domain/jdxc.h"
+#include "array_macros/interface/vof.h"
+#include "array_macros/interface/fluxx.h"
+#include "array_macros/interface/fluxy.h"
+#if NDIMS == 3
+#include "array_macros/interface/fluxz.h"
+#endif
+#include "array_macros/interface/src.h"
+
+// planar surface
+// evaluate the planar surface function at the position x:
+//   n[0] * x[0] + ... + n[NDIMS-1] * x[NDIMS-1] + n[NDIMS]
+//   i.e. the first NDIMS entries of n weight the coordinates,
+//   while the last entry n[NDIMS] is a constant offset
+inline static double surface_function(
+    const normal_t n,
+    const vector_t x
+){
+  return
+    + n[0] * x[0]
+    + n[1] * x[1]
+#if NDIMS == 3
+    + n[2] * x[2]
+#endif
+    + n[NDIMS];
+}
+
+// diffused surface representation
+// smoothed step function across the planar surface described by n:
+//   0.5 at the surface, approaching 0 / 1 on the two sides
+double indicator(
+    const normal_t n,
+    const vector_t x
+){
+  const double arg = vofbeta * surface_function(n, x);
+  return 0.5 * (1. + tanh(arg));
+}
+
+// keep the source term of the previous Runge-Kutta sub-step available
+//   by exchanging the "data" pointers of the two arrays (nothing is copied)
+//   always returns 0
+static int reset_srcs(
+    const size_t rkstep,
+    array_t * restrict srca,
+    array_t * restrict srcb
+){
+  if(0 == rkstep){
+    // first sub-step: there is no previous source term to stash
+    return 0;
+  }
+  // swap the buffers
+  double * const stashed = srca->data;
+  srca->data = srcb->data;
+  srcb->data = stashed;
+  return 0;
+}
+
+// compute the right-hand side of the vof equation for all the cells
+//   and store it in interface->src[rk_a]:
+//   for each direction, the flux on the negative-side face is added and
+//   the one on the positive-side face is subtracted;
+//   the x contributions are weighted by jdxf / hxxf and the sum is divided by jdxc,
+//   while the other directions are divided by the constants hy (and hz)
+//   always returns 0
+static int interface_compute_rhs(
+    const domain_t * domain,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf;
+  const double hy = domain->hy;
+#if NDIMS == 3
+  const double hz = domain->hz;
+#endif
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+  const double * restrict fluxx = interface->fluxx.data;
+  const double * restrict fluxy = interface->fluxy.data;
+#if NDIMS == 3
+  const double * restrict fluxz = interface->fluxz.data;
+#endif
+  // the buffer for the current sub-step is overwritten
+  double * restrict src = interface->src[rk_a].data;
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      // compute source of volume-of-fluid | 15
+      const double hx_xm = HXXF(i  );
+      const double hx_xp = HXXF(i+1);
+      const double jd_xm = JDXF(i  );
+      const double jd_x0 = JDXC(i  );
+      const double jd_xp = JDXF(i+1);
+      const double flux_xm = FLUXX(i  , j  );
+      const double flux_xp = FLUXX(i+1, j  );
+      const double flux_ym = FLUXY(i  , j  );
+      const double flux_yp = FLUXY(i  , j+1);
+      SRC(i, j) = 1. / jd_x0 * (
+          + jd_xm / hx_xm * flux_xm
+          - jd_xp / hx_xp * flux_xp
+          + jd_x0 / hy    * flux_ym
+          - jd_x0 / hy    * flux_yp
+      );
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // compute source of volume-of-fluid | 19
+        const double hx_xm = HXXF(i  );
+        const double hx_xp = HXXF(i+1);
+        const double jd_xm = JDXF(i  );
+        const double jd_x0 = JDXC(i  );
+        const double jd_xp = JDXF(i+1);
+        const double flux_xm = FLUXX(i  , j  , k  );
+        const double flux_xp = FLUXX(i+1, j  , k  );
+        const double flux_ym = FLUXY(i  , j  , k  );
+        const double flux_yp = FLUXY(i  , j+1, k  );
+        const double flux_zm = FLUXZ(i  , j  , k  );
+        const double flux_zp = FLUXZ(i  , j  , k+1);
+        SRC(i, j, k) = 1. / jd_x0 * (
+            + jd_xm / hx_xm * flux_xm
+            - jd_xp / hx_xp * flux_xp
+            + jd_x0 / hy    * flux_ym
+            - jd_x0 / hy    * flux_yp
+            + jd_x0 / hz    * flux_zm
+            - jd_x0 / hz    * flux_zp
+        );
+      }
+    }
+  }
+#endif
+  return 0;
+}
+
+// update the vof field in place using the stored source terms:
+//   vof += dt * rkcoefs[rkstep][rk_a] * src[rk_a]
+//   vof += dt * rkcoefs[rkstep][rk_b] * src[rk_b] (skipped when rkstep is 0)
+//   only the cells 1 .. size in each direction are touched
+//   always returns 0
+static int interface_advect_vof(
+    const domain_t * domain,
+    const size_t rkstep,
+    const double dt,
+    interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  double * restrict vof = interface->vof.data;
+  // update vof, alpha contribution
+  {
+    const double coef = rkcoefs[rkstep][rk_a];
+    const double * restrict src = interface->src[rk_a].data;
+#if NDIMS == 2
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        VOF(i, j) += dt * coef * SRC(i, j);
+      }
+    }
+#else
+    for(int k = 1; k <= ksize; k++){
+      for(int j = 1; j <= jsize; j++){
+        for(int i = 1; i <= isize; i++){
+          VOF(i, j, k) += dt * coef * SRC(i, j, k);
+        }
+      }
+    }
+#endif
+  }
+  // update vof, beta contribution
+  //   not applied for the first sub-step (rkstep == 0)
+  if(0 != rkstep){
+    const double coef = rkcoefs[rkstep][rk_b];
+    const double * restrict src = interface->src[rk_b].data;
+#if NDIMS == 2
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        VOF(i, j) += dt * coef * SRC(i, j);
+      }
+    }
+#else
+    for(int k = 1; k <= ksize; k++){
+      for(int j = 1; j <= jsize; j++){
+        for(int i = 1; i <= isize; i++){
+          VOF(i, j, k) += dt * coef * SRC(i, j, k);
+        }
+      }
+    }
+#endif
+  }
+  return 0;
+}
+
+/**
+ * @brief advance the vof field by one Runge-Kutta sub-step
+ * @param[in]     domain    : domain information
+ * @param[in]     rkstep    : index of the Runge-Kutta sub-step
+ * @param[in]     dt        : time-step size
+ * @param[in]     fluid     : velocity field used to advect the interface
+ * @param[in,out] interface : vof field to be updated, and its work arrays
+ * @return                  : 0 on success, 1 if one of the steps fails
+ */
+int interface_update_vof(
+    const domain_t * domain,
+    const size_t rkstep,
+    const double dt,
+    const fluid_t * fluid,
+    interface_t * interface
+){
+  // stash the source term of the previous sub-step
+  if(0 != reset_srcs(rkstep, interface->src + rk_a, interface->src + rk_b)){
+    return 1;
+  }
+  // fluxes on the cell faces; they involve halo communications, which can fail
+  if(0 != compute_flux_x(domain, fluid, interface)){
+    return 1;
+  }
+  if(0 != compute_flux_y(domain, fluid, interface)){
+    return 1;
+  }
+#if NDIMS == 3
+  if(0 != compute_flux_z(domain, fluid, interface)){
+    return 1;
+  }
+#endif
+  // right-hand side of the current sub-step, then the update itself
+  if(0 != interface_compute_rhs(domain, interface)){
+    return 1;
+  }
+  if(0 != interface_advect_vof(domain, rkstep, dt, interface)){
+    return 1;
+  }
+  // boundary and halo cells of the updated field
+  interface_update_boundaries_vof(domain, &interface->vof);
+  return 0;
+}
+
diff --git a/src/linear_system.c b/src/linear_system.c
new file mode 100644
index 0000000..4102d4b
--- /dev/null
+++ b/src/linear_system.c
@@ -0,0 +1,155 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include "memory.h"
+#include "sdecomp.h"
+#include "domain.h"
+#include "linear_system.h"
+#include "tdm.h"
+
+// total number of items of an array whose size in each dimension is given,
+//   i.e. the product of all the NDIMS entries
+static size_t get_nitems(
+    const size_t sizes[NDIMS]
+){
+  size_t product = sizes[0];
+  for(size_t n = 1; n < NDIMS; n++){
+    product *= sizes[n];
+  }
+  return product;
+}
+
+/**
+ * @brief initialise linear solver to update field implicitly
+ * @param[in]     info          : information about domain decomposition
+ * @param[in]     implicit      : treatment of the diffusive terms in each direction
+ * @param[in]     glsizes       : GLOBAL size of array
+ * @param[in,out] linear_system : structure storing buffers and plans to solve
+ *                                  linear systems in each direction,
+ *                                  which must not have been initialised yet
+ * @return                      : 0 on success, 1 if the object is already
+ *                                  initialised or if one of the steps fails
+ */
+int linear_system_init(
+    const sdecomp_info_t * info,
+    const bool implicit[NDIMS],
+    const size_t glsizes[NDIMS],
+    linear_system_t * linear_system
+){
+  // refuse to initialise the same object twice
+  if(linear_system->is_initialised){
+    printf("this linear_system object is already initialised\n");
+    return 1;
+  }
+  // keep the flags, they are needed again when finalising
+  memcpy(linear_system->implicit, implicit, sizeof(bool) * NDIMS);
+  // pencils (and their sizes) to store input and output of linear systems
+  double * restrict * x1pncl = &linear_system->x1pncl;
+  double * restrict * y1pncl = &linear_system->y1pncl;
+#if NDIMS == 3
+  double * restrict * z2pncl = &linear_system->z2pncl;
+#endif
+  // check size first
+  size_t * x1pncl_mysizes = linear_system->x1pncl_mysizes;
+  size_t * y1pncl_mysizes = linear_system->y1pncl_mysizes;
+#if NDIMS == 3
+  size_t * z2pncl_mysizes = linear_system->z2pncl_mysizes;
+#endif
+  for(int dim = 0; dim < NDIMS; dim++){
+    if(0 != sdecomp.get_pencil_mysize(info, SDECOMP_X1PENCIL, dim, glsizes[dim], x1pncl_mysizes + dim)) return 1;
+    if(0 != sdecomp.get_pencil_mysize(info, SDECOMP_Y1PENCIL, dim, glsizes[dim], y1pncl_mysizes + dim)) return 1;
+#if NDIMS == 3
+    if(0 != sdecomp.get_pencil_mysize(info, SDECOMP_Z2PENCIL, dim, glsizes[dim], z2pncl_mysizes + dim)) return 1;
+#endif
+  }
+  // allocate pencils if needed
+  // NOTE: x1pncl is not needed for fully-explicit case,
+  //   but I always allocate it here for simplicity (to store delta values)
+  // NOTE: pencils of the explicitly-treated directions are left NULL
+  *x1pncl =               memory_calloc(get_nitems(x1pncl_mysizes), sizeof(double));
+  *y1pncl = implicit[1] ? memory_calloc(get_nitems(y1pncl_mysizes), sizeof(double)) : NULL;
+#if NDIMS == 3
+  *z2pncl = implicit[2] ? memory_calloc(get_nitems(z2pncl_mysizes), sizeof(double)) : NULL;
+#endif
+  // initialise parallel matrix transpose if needed
+  // between x1 and y1
+  if(implicit[1]){
+    // forward (x1 to y1) and backward (y1 to x1) plans
+    sdecomp_transpose_plan_t ** plan_f = &linear_system->transposer_x1_to_y1;
+    sdecomp_transpose_plan_t ** plan_b = &linear_system->transposer_y1_to_x1;
+    if(0 != sdecomp.transpose.construct(info, SDECOMP_X1PENCIL, SDECOMP_Y1PENCIL, glsizes, sizeof(double), plan_f)) return 1;
+    if(0 != sdecomp.transpose.construct(info, SDECOMP_Y1PENCIL, SDECOMP_X1PENCIL, glsizes, sizeof(double), plan_b)) return 1;
+  }
+#if NDIMS == 3
+  // between x1 and z2
+  if(implicit[2]){
+    // forward (x1 to z2) and backward (z2 to x1) plans
+    sdecomp_transpose_plan_t ** plan_f = &linear_system->transposer_x1_to_z2;
+    sdecomp_transpose_plan_t ** plan_b = &linear_system->transposer_z2_to_x1;
+    if(0 != sdecomp.transpose.construct(info, SDECOMP_X1PENCIL, SDECOMP_Z2PENCIL, glsizes, sizeof(double), plan_f)) return 1;
+    if(0 != sdecomp.transpose.construct(info, SDECOMP_Z2PENCIL, SDECOMP_X1PENCIL, glsizes, sizeof(double), plan_b)) return 1;
+  }
+#endif
+  // initialise tri-diagonal matrix solvers
+  //   in each direction, the system size is the pencil size in that direction,
+  //   while the number of right-hand sides is the product of the other sizes
+  // Thomas algorithm in x direction
+  if(implicit[0]){
+    if(0 != tdm.construct(
+        /* size of system */ (int)(x1pncl_mysizes[0]),
+#if NDIMS == 2
+        /* number of rhs  */ (int)(x1pncl_mysizes[1]),
+#else
+        /* number of rhs  */ (int)(x1pncl_mysizes[1] * x1pncl_mysizes[2]),
+#endif
+        /* is periodic    */ false,
+        /* is complex     */ false,
+        /* output         */ &linear_system->tdm_x
+    )) return 1;
+  }
+  // Thomas algorithm in y direction
+  if(implicit[1]){
+    if(0 != tdm.construct(
+        /* size of system */ (int)(y1pncl_mysizes[1]),
+#if NDIMS == 2
+        /* number of rhs  */ (int)(y1pncl_mysizes[0]),
+#else
+        /* number of rhs  */ (int)(y1pncl_mysizes[2] * y1pncl_mysizes[0]),
+#endif
+        /* is periodic    */ true,
+        /* is complex     */ false,
+        /* output         */ &linear_system->tdm_y
+    )) return 1;
+  }
+#if NDIMS == 3
+  // Thomas algorithm in z direction
+  if(implicit[2]){
+    if(0 != tdm.construct(
+        /* size of system */ (int)(z2pncl_mysizes[2]),
+        /* number of rhs  */ (int)(z2pncl_mysizes[0] * z2pncl_mysizes[1]),
+        /* is periodic    */ true,
+        /* is complex     */ false,
+        /* output         */ &linear_system->tdm_z
+    )) return 1;
+  }
+#endif
+  // from now on the object can be used and has to be finalised
+  linear_system->is_initialised = true;
+  return 0;
+}
+
+/**
+ * @brief release all the resources held by a linear solver object
+ * @param[in,out] linear_system : object initialised by linear_system_init,
+ *                                  which is marked as uninitialised on success
+ * @return                      : 0 on success,
+ *                                  1 if the object is NULL or not initialised
+ */
+int linear_system_finalise(
+    linear_system_t * linear_system
+){
+  if(NULL == linear_system || !linear_system->is_initialised){
+    return 1;
+  }
+  // the flags stored when initialised tell which resources exist
+  const bool * implicit = linear_system->implicit;
+  // x1pncl is always allocated
+  memory_free(linear_system->x1pncl);
+  if(implicit[0]){
+    tdm.destruct(linear_system->tdm_x);
+  }
+  if(implicit[1]){
+    memory_free(linear_system->y1pncl);
+    sdecomp.transpose.destruct(linear_system->transposer_x1_to_y1);
+    sdecomp.transpose.destruct(linear_system->transposer_y1_to_x1);
+    tdm.destruct(linear_system->tdm_y);
+  }
+#if NDIMS == 3
+  if(implicit[2]){
+    memory_free(linear_system->z2pncl);
+    sdecomp.transpose.destruct(linear_system->transposer_x1_to_z2);
+    sdecomp.transpose.destruct(linear_system->transposer_z2_to_x1);
+    tdm.destruct(linear_system->tdm_z);
+  }
+#endif
+  // clear the flag, so that finalising this object again is rejected above
+  //   (instead of releasing the same resources twice),
+  //   and so that the object can be initialised again
+  linear_system->is_initialised = false;
+  return 0;
+}
+
diff --git a/src/logging/README.rst b/src/logging/README.rst
new file mode 100644
index 0000000..80d6656
--- /dev/null
+++ b/src/logging/README.rst
@@ -0,0 +1,26 @@
+########
+logging/
+########
+
+Functions to monitor the running simulation.
+
+* divergence.c
+
+   Compute and output maximum local divergence of the flow field.
+
+* energy.c
+
+   Compute and output the total squared velocity (in each direction) and the total squared temperature of the flow field.
+
+* internal.h
+
+   Declares private functions which are only used within this directory.
+
+* main.c
+
+   Call other logging functions.
+
+* momentum.c
+
+   Compute and output the net momentum in each direction.
+
diff --git a/src/logging/divergence.c b/src/logging/divergence.c
new file mode 100644
index 0000000..0678d8b
--- /dev/null
+++ b/src/logging/divergence.c
@@ -0,0 +1,116 @@
+#include <stdio.h>
+#include <math.h>
+#include "domain.h"
+#include "fluid.h"
+#include "fileio.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/domain/jdxf.h"
+#include "array_macros/domain/jdxc.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/uy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/uz.h"
+#endif
+#include "internal.h"
+
+/**
+ * @brief check divergence and write the maximum value
+ *          the maximum absolute value of the local divergence over all processes
+ *          is appended to the file, together with the simulation time
+ * @param[in] fname  : file name to which the log is written
+ * @param[in] domain : domain information
+ * @param[in] time   : current simulation time
+ * @param[in] fluid  : velocity
+ * @return           : error code (0 is returned even when the file cannot be opened)
+ */
+int logging_check_divergence(
+    const char fname[],
+    const domain_t * domain,
+    const double time,
+    const fluid_t * fluid
+){
+  const int root = 0;
+  int myrank = root;
+  MPI_Comm comm_cart = MPI_COMM_NULL;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  sdecomp.get_comm_cart(domain->info, &comm_cart);
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf;
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+  const double hy = domain->hy;
+#if NDIMS == 3
+  const double hz = domain->hz;
+#endif
+  const double * restrict ux = fluid->ux.data;
+  const double * restrict uy = fluid->uy.data;
+#if NDIMS == 3
+  const double * restrict uz = fluid->uz.data;
+#endif
+  // maximum of the absolute local divergence inside this process
+  double divmax = 0.;
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      // compute local divergence | 13
+      const double hx_xm = HXXF(i  );
+      const double hx_xp = HXXF(i+1);
+      const double jd_xm = JDXF(i  );
+      const double jd_x0 = JDXC(i  );
+      const double jd_xp = JDXF(i+1);
+      const double ux_xm = UX(i  , j  );
+      const double ux_xp = UX(i+1, j  );
+      const double uy_ym = UY(i  , j  );
+      const double uy_yp = UY(i  , j+1);
+      const double div = 1. / jd_x0 * (
+          - jd_xm / hx_xm * ux_xm + jd_xp / hx_xp * ux_xp
+          - jd_x0 / hy    * uy_ym + jd_x0 / hy    * uy_yp
+      );
+      // check maximum
+      divmax = fmax(divmax, fabs(div));
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // compute local divergence | 16
+        const double hx_xm = HXXF(i  );
+        const double hx_xp = HXXF(i+1);
+        const double jd_xm = JDXF(i  );
+        const double jd_x0 = JDXC(i  );
+        const double jd_xp = JDXF(i+1);
+        const double ux_xm = UX(i  , j  , k  );
+        const double ux_xp = UX(i+1, j  , k  );
+        const double uy_ym = UY(i  , j  , k  );
+        const double uy_yp = UY(i  , j+1, k  );
+        const double uz_zm = UZ(i  , j  , k  );
+        const double uz_zp = UZ(i  , j  , k+1);
+        const double div = 1. / jd_x0 * (
+            - jd_xm / hx_xm * ux_xm + jd_xp / hx_xp * ux_xp
+            - jd_x0 / hy    * uy_ym + jd_x0 / hy    * uy_yp
+            - jd_x0 / hz    * uz_zm + jd_x0 / hz    * uz_zp
+        );
+        // check maximum
+        divmax = fmax(divmax, fabs(div));
+      }
+    }
+  }
+#endif
+  // collect information among all processes
+  //   the root process reduces in place, so that its divmax holds the global maximum
+  const void * sendbuf = root == myrank ? MPI_IN_PLACE : &divmax;
+  void * recvbuf = &divmax;
+  MPI_Reduce(sendbuf, recvbuf, 1, MPI_DOUBLE, MPI_MAX, root, comm_cart);
+  // result is written to a file from the main process
+  if(root == myrank){
+    FILE * fp = fileio.fopen(fname, "a");
+    if(NULL == fp){
+      // NOTE(review): failing to open the log file is reported as success,
+      //   presumably since logging is best-effort - confirm
+      return 0;
+    }
+    fprintf(fp, "%8.2f % .1e\n", time, divmax);
+    fileio.fclose(fp);
+  }
+  return 0;
+}
+
diff --git a/src/logging/energy.c b/src/logging/energy.c
new file mode 100644
index 0000000..4553720
--- /dev/null
+++ b/src/logging/energy.c
@@ -0,0 +1,123 @@
+#include <stdio.h>
+#include <math.h>
+#include "domain.h"
+#include "fluid.h"
+#include "fileio.h"
+#include "array_macros/domain/jdxf.h"
+#include "array_macros/domain/jdxc.h"
+#include "array_macros/fluid/den.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/uy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/uz.h"
+#endif
+#include "internal.h"
+
+/**
+ * @brief compute the total kinetic energy carried by each velocity component
+ * @param[in] fname  : name of the file to which one line is appended
+ * @param[in] domain : grid sizes, Jacobians, and MPI information
+ * @param[in] time   : current simulation time, written as the first column
+ * @param[in] fluid  : density and velocity fields
+ * @return           : always 0, also when the file cannot be opened
+ */
+int logging_check_energy(
+    const char fname[],
+    const domain_t * domain,
+    const double time,
+    const fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+  const double * restrict den = fluid->den[1].data;
+  const double * restrict ux = fluid->ux.data;
+  const double * restrict uy = fluid->uy.data;
+#if NDIMS == 3
+  const double * restrict uz = fluid->uz.data;
+#endif
+  const int root = 0;
+  int myrank = root;
+  MPI_Comm comm_cart = MPI_COMM_NULL;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  sdecomp.get_comm_cart(domain->info, &comm_cart);
+  // 0.5 * density * velocity^2 summed for each component (contribution of this process)
+  double energies[NDIMS] = {0.};
+  // x component, weighted by JDXF
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 2; i <= isize; i++){
+      // density is averaged over the two cells adjacent in x
+      const double rho = 0.5 * DEN(i-1, j  ) + 0.5 * DEN(i  , j  );
+      const double vel = UX(i, j);
+      energies[0] += 0.5 * rho * pow(vel, 2.) * JDXF(i  );
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 2; i <= isize; i++){
+        // density is averaged over the two cells adjacent in x
+        const double rho = 0.5 * DEN(i-1, j  , k  ) + 0.5 * DEN(i  , j  , k  );
+        const double vel = UX(i, j, k);
+        energies[0] += 0.5 * rho * pow(vel, 2.) * JDXF(i  );
+      }
+    }
+  }
+#endif
+  // y component, weighted by JDXC
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      // density is averaged over the two cells adjacent in y
+      const double rho = 0.5 * DEN(i  , j-1) + 0.5 * DEN(i  , j  );
+      const double vel = UY(i, j);
+      energies[1] += 0.5 * rho * pow(vel, 2.) * JDXC(i  );
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // density is averaged over the two cells adjacent in y
+        const double rho = 0.5 * DEN(i  , j-1, k  ) + 0.5 * DEN(i  , j  , k  );
+        const double vel = UY(i, j, k);
+        energies[1] += 0.5 * rho * pow(vel, 2.) * JDXC(i  );
+      }
+    }
+  }
+#endif
+#if NDIMS == 3
+  // z component, weighted by JDXC
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // density is averaged over the two cells adjacent in z
+        const double rho = 0.5 * DEN(i  , j  , k-1) + 0.5 * DEN(i  , j  , k  );
+        const double vel = UZ(i, j, k);
+        energies[2] += 0.5 * rho * pow(vel, 2.) * JDXC(i  );
+      }
+    }
+  }
+#endif
+  // sum over all processes (received in place by the main process), which then appends one line
+  MPI_Reduce(root == myrank ? MPI_IN_PLACE : energies, energies, NDIMS, MPI_DOUBLE, MPI_SUM, root, comm_cart);
+  if(root != myrank){
+    return 0;
+  }
+  FILE * fp = fileio.fopen(fname, "a");
+  if(NULL == fp){
+    return 0;
+  }
+  fprintf(fp, "%8.2f ", time);
+  for(int n = 0; n < NDIMS; n++){
+    fprintf(fp, "% 18.15e%c", energies[n], n + 1 < NDIMS ? ' ' : '\n');
+  }
+  fileio.fclose(fp);
+  return 0;
+}
+
diff --git a/src/logging/extrema.c b/src/logging/extrema.c
new file mode 100644
index 0000000..26ba9e4
--- /dev/null
+++ b/src/logging/extrema.c
@@ -0,0 +1,93 @@
+#include <stdio.h>
+#include <math.h>
+#include "domain.h"
+#include "fluid.h"
+#include "fileio.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/uy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/uz.h"
+#endif
+#include "internal.h"
+
+// append "time max|ux| max|uy| (max|uz|)" to the file named fname;
+//   maxima are taken over all processes and written by the main process,
+//   always returns 0 (also when the file cannot be opened)
+int logging_check_extrema(
+    const char fname[],
+    const domain_t * domain,
+    const double time,
+    const fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict ux = fluid->ux.data;
+  const double * restrict uy = fluid->uy.data;
+#if NDIMS == 3
+  const double * restrict uz = fluid->uz.data;
+#endif
+  const int root = 0;
+  int myrank = root;
+  MPI_Comm comm_cart = MPI_COMM_NULL;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  sdecomp.get_comm_cart(domain->info, &comm_cart);
+  // largest magnitudes found by this process, one for each component
+  double peaks[NDIMS] = {0.};
+  // x component, i = 2 to isize
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 2; i <= isize; i++){
+      peaks[0] = fmax(peaks[0], fabs(UX(i, j)));
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 2; i <= isize; i++){
+        peaks[0] = fmax(peaks[0], fabs(UX(i, j, k)));
+      }
+    }
+  }
+#endif
+  // y component
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      peaks[1] = fmax(peaks[1], fabs(UY(i, j)));
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        peaks[1] = fmax(peaks[1], fabs(UY(i, j, k)));
+      }
+    }
+  }
+#endif
+#if NDIMS == 3
+  // z component
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        peaks[2] = fmax(peaks[2], fabs(UZ(i, j, k)));
+      }
+    }
+  }
+#endif
+  // maximum over all processes, received in place by the main process
+  MPI_Reduce(root == myrank ? MPI_IN_PLACE : peaks, peaks, NDIMS, MPI_DOUBLE, MPI_MAX, root, comm_cart);
+  if(root != myrank) return 0;
+  FILE * fp = fileio.fopen(fname, "a");
+  if(NULL == fp) return 0;
+  fprintf(fp, "%8.2f ", time);
+  for(int n = 0; n < NDIMS; n++){
+    fprintf(fp, "% 18.15e%c", peaks[n], n + 1 < NDIMS ? ' ' : '\n');
+  }
+  fileio.fclose(fp);
+  return 0;
+}
+
diff --git a/src/logging/interface.c b/src/logging/interface.c
new file mode 100644
index 0000000..b00dadc
--- /dev/null
+++ b/src/logging/interface.c
@@ -0,0 +1,139 @@
+#include <stdio.h>
+#include <math.h>
+#include "domain.h"
+#include "interface.h"
+#include "fileio.h"
+#include "array_macros/domain/jdxf.h"
+#include "array_macros/domain/jdxc.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/interface/vof.h"
+#include "internal.h"
+
+// append "time min max mean area" of the vof field to the file named fname:
+//   mean is the JDXC-weighted average, area sums the vof jumps across cell faces;
+//   always returns 0, also when the file cannot be opened
+int logging_check_vof(
+    const char fname[],
+    const domain_t * domain,
+    const double time,
+    const interface_t * interface
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * jdxf = domain->jdxf;
+  const double * jdxc = domain->jdxc;
+  const double * hxxf = domain->hxxf;
+  const double hy = domain->hy;
+#if NDIMS == 3
+  const double hz = domain->hz;
+#endif
+  const double * vof = interface->vof.data;
+  const int root = 0;
+  int myrank = root;
+  MPI_Comm comm_cart = MPI_COMM_NULL;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  sdecomp.get_comm_cart(domain->info, &comm_cart);
+  double lo = 1.;
+  double hi = 0.;
+  double totals[3] = {0.};
+  // extrema of vof, JDXC-weighted sum of vof (0), and sum of JDXC itself (1)
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      const double cell = JDXC(i  );
+      const double phi = VOF(i, j);
+      lo = fmin(lo, phi);
+      hi = fmax(hi, phi);
+      totals[0] += phi * cell;
+      totals[1] += cell;
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        const double cell = JDXC(i  );
+        const double phi = VOF(i, j, k);
+        lo = fmin(lo, phi);
+        hi = fmax(hi, phi);
+        totals[0] += phi * cell;
+        totals[1] += cell;
+      }
+    }
+  }
+#endif
+  // jumps across x faces (2), including both ends i = 1 and i = isize + 1
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize + 1; i++){
+      // difference between the two cells sharing the face, weighted by JDXF / HXXF
+      const double jump = fabs(VOF(i  , j  ) - VOF(i-1, j  ));
+      const double face = JDXF(i  ) / HXXF(i  );
+      totals[2] += jump * face;
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize + 1; i++){
+        // difference between the two cells sharing the face, weighted by JDXF / HXXF
+        const double jump = fabs(VOF(i  , j  , k  ) - VOF(i-1, j  , k  ));
+        const double face = JDXF(i  ) / HXXF(i  );
+        totals[2] += jump * face;
+      }
+    }
+  }
+#endif
+  // jumps across y faces (2)
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      // difference between the two cells sharing the face, weighted by JDXC / hy
+      const double jump = fabs(VOF(i  , j  ) - VOF(i  , j-1));
+      const double face = JDXC(i  ) / hy;
+      totals[2] += jump * face;
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // difference between the two cells sharing the face, weighted by JDXC / hy
+        const double jump = fabs(VOF(i  , j  , k  ) - VOF(i  , j-1, k  ));
+        const double face = JDXC(i  ) / hy;
+        totals[2] += jump * face;
+      }
+    }
+  }
+#endif
+  // jumps across z faces (2)
+#if NDIMS == 3
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // difference between the two cells sharing the face, weighted by JDXC / hz
+        const double jump = fabs(VOF(i  , j  , k  ) - VOF(i  , j  , k-1));
+        const double face = JDXC(i  ) / hz;
+        totals[2] += jump * face;
+      }
+    }
+  }
+#endif
+  MPI_Allreduce(MPI_IN_PLACE, &lo, 1, MPI_DOUBLE, MPI_MIN, comm_cart);
+  MPI_Allreduce(MPI_IN_PLACE, &hi, 1, MPI_DOUBLE, MPI_MAX, comm_cart);
+  MPI_Allreduce(MPI_IN_PLACE, totals, 3, MPI_DOUBLE, MPI_SUM, comm_cart);
+  if(root != myrank) return 0;
+  FILE * fp = fileio.fopen(fname, "a");
+  if(NULL == fp) return 0;
+  fprintf(fp, "%8.2f ", time);
+  fprintf(fp, "% 18.15e ", lo);
+  fprintf(fp, "% 18.15e ", hi);
+  fprintf(fp, "% 18.15e ", totals[0] / totals[1]);
+  fprintf(fp, "% 18.15e\n", totals[2]);
+  fileio.fclose(fp);
+  return 0;
+}
+
diff --git a/src/logging/internal.h b/src/logging/internal.h
new file mode 100644
index 0000000..2e02a2e
--- /dev/null
+++ b/src/logging/internal.h
@@ -0,0 +1,52 @@
+#if !defined(LOGGING_INTERNAL_H)
+#define LOGGING_INTERNAL_H
+
+// FOR INTERNAL USE
+
+#include "domain.h"
+#include "fluid.h"
+#include "interface.h"
+
+// append the time and the maximum absolute divergence of the velocity,
+//   taken over all processes, to the file named fname
+extern int logging_check_divergence(
+    const char fname[], const domain_t * domain,
+    const double time, const fluid_t * fluid
+);
+
+// append the time and the sum of density * velocity (weighted by the Jacobian)
+//   for each direction to the file named fname
+extern int logging_check_momentum(
+    const char fname[], const domain_t * domain,
+    const double time, const fluid_t * fluid
+);
+
+// append the time and the maximum absolute value of each velocity component
+//   to the file named fname
+extern int logging_check_extrema(
+    const char fname[], const domain_t * domain,
+    const double time, const fluid_t * fluid
+);
+
+// append the time and the sum of 0.5 * density * velocity^2 (weighted by the Jacobian)
+//   for each direction to the file named fname
+extern int logging_check_energy(
+    const char fname[], const domain_t * domain,
+    const double time, const fluid_t * fluid
+);
+
+// append the time and the sums of (1 / Re) * JDXF / HXXF * TXY
+//   over the x faces at both ends to the file named fname
+extern int logging_check_stress(
+    const char fname[], const domain_t * domain,
+    const double time, const fluid_t * fluid
+);
+
+// append the time, minimum, maximum, and weighted mean of vof,
+//   and the sum of its jumps across cell faces to the file named fname
+extern int logging_check_vof(
+    const char fname[], const domain_t * domain,
+    const double time, const interface_t * interface
+);
+
+#endif // LOGGING_INTERNAL_H
diff --git a/src/logging/main.c b/src/logging/main.c
new file mode 100644
index 0000000..5e331fe
--- /dev/null
+++ b/src/logging/main.c
@@ -0,0 +1,123 @@
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+#include "memory.h"
+#include "config.h"
+#include "domain.h"
+#include "fluid.h"
+#include "interface.h"
+#include "fileio.h"
+#include "logging.h"
+#include "internal.h"
+
+static double g_rate = 0.; // interval between logging events, read from "log_rate" by init
+static double g_next = 0.; // simulation time after which the next logging event is due
+
+/**
+ * @brief constructor - read the logging interval and schedule the first event
+ * @param[in] domain : domain information, used to identify the main process
+ * @param[in] time   : current simulation time
+ * @return           : 0 on success, 1 if "log_rate" cannot be obtained or is not positive
+ */
+static int init(
+    const domain_t * domain,
+    const double time
+){
+  const int root = 0;
+  int myrank = root;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  if(0 != config.get_double("log_rate", &g_rate)) return 1;
+  // a zero, negative, or NaN rate would give a NaN or ever-decreasing schedule
+  if(!(0. < g_rate)){
+    if(root == myrank) fprintf(stderr, "log_rate should be positive: % .3e\n", g_rate);
+    return 1;
+  }
+  g_next = g_rate * ceil(fmax(DBL_EPSILON, time) / g_rate);
+  if(root == myrank){
+    printf("LOGGING\n\tnext: % .3e\n\trate: % .3e\n", g_next, g_rate);
+    fflush(stdout);
+  }
+  return 0;
+}
+
+/**
+ * @brief report current step, time, time step size, and elapsed wall time
+ *          to the standard output and to a file
+ * @param[in] fname  : file name to which the log is appended
+ * @param[in] domain : information related to MPI domain decomposition
+ * @param[in] time   : current simulation time
+ * @param[in] step   : current time step
+ * @param[in] dt     : time step size
+ * @param[in] wtime  : elapsed wall time in seconds
+ */
+static void show_progress(
+    const char fname[],
+    const domain_t * domain,
+    const double time,
+    const size_t step,
+    const double dt,
+    const double wtime
+){
+  const int root = 0;
+  int myrank = root;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  // only the main process reports
+  if(root != myrank){
+    return;
+  }
+  // the standard output is updated even when the file cannot be opened
+  fprintf(stdout, "step %10zu, time %.3f, dt %.3e, elapsed %.1f [sec]\n", step, time, dt, wtime);
+  // the same line is appended to the file
+  FILE * fp = fileio.fopen(fname, "a");
+  if(NULL == fp){
+    return;
+  }
+  fprintf(fp, "step %10zu, time %.3f, dt %.3e, elapsed %.1f [sec]\n", step, time, dt, wtime);
+  fileio.fclose(fp);
+}
+
+/**
+ * @brief append one record to each log file under output/log, schedule the next event
+ * @param[in] domain    : information related to MPI domain decomposition
+ * @param[in] step      : current time step, used by the progress log
+ * @param[in] time      : current simulation time, written to every log
+ * @param[in] dt        : time step size, used by the progress log
+ * @param[in] wtime     : elapsed wall time, used by the progress log
+ * @param[in] fluid     : flow field, passed to the divergence / momentum / extrema / energy / stress loggers
+ * @param[in] interface : vof field, passed to the vof logger
+ */
+static void check_and_output(
+    const domain_t * domain,
+    const size_t step,
+    const double time,
+    const double dt,
+    const double wtime,
+    const fluid_t * fluid,
+    const interface_t * interface
+){
+  show_progress           ("output/log/progress.dat",   domain, time, step, dt, wtime);
+  logging_check_divergence("output/log/divergence.dat", domain, time, fluid);
+  logging_check_momentum  ("output/log/momentum.dat",   domain, time, fluid);
+  logging_check_extrema   ("output/log/extrema.dat",    domain, time, fluid);
+  logging_check_energy    ("output/log/energy.dat",     domain, time, fluid);
+  logging_check_stress    ("output/log/stress.dat",     domain, time, fluid);
+  logging_check_vof       ("output/log/vof.dat",        domain, time, interface);
+  g_next += g_rate; // schedule the next logging event (return values above are not checked)
+}
+
+/**
+ * @brief getter of g_next, the simulation time after which the next log is due
+ * @return : g_next, as set by init and advanced by check_and_output
+ */
+static double get_next_time(
+    void
+){
+  return g_next;
+}
+
+const logging_t logging = { // static functions of this file, gathered in one object
+  .init             = init,
+  .check_and_output = check_and_output,
+  .get_next_time    = get_next_time,
+};
+
diff --git a/src/logging/momentum.c b/src/logging/momentum.c
new file mode 100644
index 0000000..4ea99fb
--- /dev/null
+++ b/src/logging/momentum.c
@@ -0,0 +1,121 @@
+#include <stdio.h>
+#include "domain.h"
+#include "fluid.h"
+#include "fileio.h"
+#include "array_macros/domain/jdxf.h"
+#include "array_macros/domain/jdxc.h"
+#include "array_macros/fluid/den.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/uy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/uz.h"
+#endif
+#include "internal.h"
+
+/**
+ * @brief compute the total momentum in each direction
+ * @param[in] fname  : name of the file to which one line is appended
+ * @param[in] domain : grid sizes, Jacobians, and MPI information
+ * @param[in] time   : current simulation time, written as the first column
+ * @param[in] fluid  : density and velocity fields
+ * @return           : always 0, also when the file cannot be opened
+ */
+int logging_check_momentum(
+    const char fname[],
+    const domain_t * domain,
+    const double time,
+    const fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict jdxc = domain->jdxc;
+  const double * restrict den = fluid->den[1].data;
+  const double * restrict ux = fluid->ux.data;
+  const double * restrict uy = fluid->uy.data;
+#if NDIMS == 3
+  const double * restrict uz = fluid->uz.data;
+#endif
+  const int root = 0;
+  int myrank = root;
+  MPI_Comm comm_cart = MPI_COMM_NULL;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  sdecomp.get_comm_cart(domain->info, &comm_cart);
+  double momenta[NDIMS] = {0.};
+  // x component: density * ux, weighted by JDXF
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 2; i <= isize; i++){
+      // density is averaged over the two cells adjacent in x
+      const double rho = 0.5 * DEN(i-1, j  ) + 0.5 * DEN(i  , j  );
+      const double vel = UX(i, j);
+      momenta[0] += rho * vel * JDXF(i  );
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 2; i <= isize; i++){
+        // density is averaged over the two cells adjacent in x
+        const double rho = 0.5 * DEN(i-1, j  , k  ) + 0.5 * DEN(i  , j  , k  );
+        const double vel = UX(i, j, k);
+        momenta[0] += rho * vel * JDXF(i  );
+      }
+    }
+  }
+#endif
+  // y component: density * uy, weighted by JDXC
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize; i++){
+      // density is averaged over the two cells adjacent in y
+      const double rho = 0.5 * DEN(i  , j-1) + 0.5 * DEN(i  , j  );
+      const double vel = UY(i, j);
+      momenta[1] += rho * vel * JDXC(i  );
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // density is averaged over the two cells adjacent in y
+        const double rho = 0.5 * DEN(i  , j-1, k  ) + 0.5 * DEN(i  , j  , k  );
+        const double vel = UY(i, j, k);
+        momenta[1] += rho * vel * JDXC(i  );
+      }
+    }
+  }
+#endif
+#if NDIMS == 3
+  // z component: density * uz, weighted by JDXC
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize; i++){
+        // density is averaged over the two cells adjacent in z
+        const double rho = 0.5 * DEN(i  , j  , k-1) + 0.5 * DEN(i  , j  , k  );
+        const double vel = UZ(i, j, k);
+        momenta[2] += rho * vel * JDXC(i  );
+      }
+    }
+  }
+#endif
+  // sum over all processes (received in place by the main process), which then appends one line
+  MPI_Reduce(root == myrank ? MPI_IN_PLACE : momenta, momenta, NDIMS, MPI_DOUBLE, MPI_SUM, root, comm_cart);
+  if(root != myrank){
+    return 0;
+  }
+  FILE * fp = fileio.fopen(fname, "a");
+  if(NULL == fp){
+    return 0;
+  }
+  fprintf(fp, "%8.2f ", time);
+  for(int n = 0; n < NDIMS; n++){
+    fprintf(fp, "% 18.15e%c", momenta[n], n + 1 < NDIMS ? ' ' : '\n');
+  }
+  fileio.fclose(fp);
+  return 0;
+}
+
diff --git a/src/logging/stress.c b/src/logging/stress.c
new file mode 100644
index 0000000..a0438fe
--- /dev/null
+++ b/src/logging/stress.c
@@ -0,0 +1,63 @@
+#include <stdio.h>
+#include <math.h>
+#include "domain.h"
+#include "fluid.h"
+#include "fileio.h"
+#include "array_macros/domain/hxxf.h"
+#include "array_macros/domain/jdxf.h"
+#include "array_macros/fluid/txy.h"
+#include "internal.h"
+
+// append "time s_m s_p" to the file named fname, where s_m and s_p are the sums of
+//   (1 / Re) * JDXF / HXXF * TXY over the x faces i = 1 and i = isize + 1;
+//   always returns 0, also when the file cannot be opened
+int logging_check_stress(
+    const char fname[],
+    const domain_t * domain,
+    const double time,
+    const fluid_t * fluid
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  const double * restrict hxxf = domain->hxxf;
+  const double * restrict jdxf = domain->jdxf;
+  const double * restrict txy = fluid->txy.data;
+  const double diffusivity = 1. / fluid->Re;
+  const int root = 0;
+  int myrank = root;
+  MPI_Comm comm_cart = MPI_COMM_NULL;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  sdecomp.get_comm_cart(domain->info, &comm_cart);
+  // x-face indices of both ends, and the factors multiplied there
+  const int im =         1;
+  const int ip = isize + 1;
+  const double wm = diffusivity * JDXF(im) / HXXF(im);
+  const double wp = diffusivity * JDXF(ip) / HXXF(ip);
+  // accumulate the contribution of this process on the two faces
+  double vals[2] = {0., 0.};
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    vals[0] += wm * TXY(im, j);
+    vals[1] += wp * TXY(ip, j);
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      vals[0] += wm * TXY(im, j, k);
+      vals[1] += wp * TXY(ip, j, k);
+    }
+  }
+#endif
+  // sum over all processes, received in place by the main process
+  MPI_Reduce(root == myrank ? MPI_IN_PLACE : vals, vals, 2, MPI_DOUBLE, MPI_SUM, root, comm_cart);
+  if(root != myrank) return 0;
+  FILE * fp = fileio.fopen(fname, "a");
+  if(NULL == fp) return 0;
+  fprintf(fp, "%8.2f % 18.15e % 18.15e\n", time, vals[0], vals[1]);
+  fileio.fclose(fp);
+  return 0;
+}
+
diff --git a/src/main.c b/src/main.c
new file mode 100644
index 0000000..7709688
--- /dev/null
+++ b/src/main.c
@@ -0,0 +1,157 @@
+#include <stdio.h>
+#include <mpi.h>
+#include "timer.h"
+#include "domain.h"
+#include "fluid.h"
+#include "fluid_solver.h"
+#include "interface.h"
+#include "interface_solver.h"
+#include "integrate.h"
+#include "statistics.h"
+#include "save.h"
+#include "logging.h"
+#include "config.h"
+#include "fileio.h"
+
+// write step, time, domain, fluid, and interface to the directory given by save.prepare
+static int save_entrypoint(
+    const domain_t * domain,
+    const size_t step,
+    const double time,
+    const fluid_t * fluid,
+    const interface_t * interface
+){
+  int myrank = 0;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  char * dirname = NULL;
+  save.prepare(domain, step, &dirname);
+  if(0 == myrank){
+    fileio.w_serial(dirname, "step", 0, NULL, fileio.npy_size_t, sizeof(size_t), &step);
+    fileio.w_serial(dirname, "time", 0, NULL, fileio.npy_double, sizeof(double), &time);
+  }
+  domain_save(dirname, domain);
+  fluid_save(dirname, domain, fluid);
+  interface_save(dirname, domain, interface);
+  return 0;
+}
+
+/**
+ * @brief main function
+ * @param[in] argc : number of arguments (expect 2)
+ * @param[in] argv : name of the directory
+ *                     where a set of initial condition is contained
+ * @return         : 0 when the simulation runs to completion, 1 when it is aborted
+ */
+int main(
+    int argc,
+    char * argv[]
+){
+  // launch MPI, start timer
+  MPI_Init(NULL, NULL);
+  const int root = 0;
+  int myrank = root;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  const double tic = timer();
+  // exit status: stays non-zero unless the simulation runs to completion
+  int retval = 1;
+  // find name of directory where IC is stored
+  if(2 != argc){
+    if(root == myrank){
+      printf("directory name should be given as input\n");
+    }
+    goto abort;
+  }
+  const char * dirname_ic = argv[1];
+  // initialise fileio object
+  if(0 != fileio.init()){
+    goto abort;
+  }
+  // initialise time step and time units
+  size_t step = 0;
+  if(0 != fileio.r_serial(dirname_ic, "step", 0, NULL, fileio.npy_size_t, sizeof(size_t), &step)){
+    goto abort;
+  }
+  double time = 0.;
+  if(0 != fileio.r_serial(dirname_ic, "time", 0, NULL, fileio.npy_double, sizeof(double), &time)){
+    goto abort;
+  }
+  // initialise structures
+  domain_t domain = {0};
+  if(0 != domain_init(dirname_ic, &domain)){
+    goto abort;
+  }
+  fluid_t fluid = {0};
+  if(0 != fluid_init(dirname_ic, &domain, &fluid)){
+    goto abort;
+  }
+  interface_t interface = {0};
+  if(0 != interface_init(dirname_ic, &domain, &interface)){
+    goto abort;
+  }
+  // initialise auxiliary objects
+  if(0 != logging.init(&domain, time)){
+    goto abort;
+  }
+  if(0 != save.init(&domain, time)){
+    goto abort;
+  }
+  if(0 != statistics.init(&domain, time)){
+    goto abort;
+  }
+  // check termination conditions
+  double timemax = 0.;
+  if(0 != config.get_double("timemax", &timemax)){
+    goto abort;
+  }
+  double wtimemax = 0.;
+  if(0 != config.get_double("wtimemax", &wtimemax)){
+    goto abort;
+  }
+  // report
+  if(root == myrank){
+    printf("step: %zu, time: % .7e\n", step, time);
+    printf("timemax: % .7e, wtimemax: % .7e\n", timemax, wtimemax);
+  }
+  // main loop
+  for(;;){
+    // integrate for one time step, obtaining the time step size
+    double dt = 0.;
+    if(0 != integrate(&domain, &fluid, &interface, &dt)){
+      goto abort;
+    }
+    // update step and simulation / wall time
+    step += 1;
+    time += dt;
+    const double toc = timer();
+    // terminate when the simulation is finished
+    if(timemax < time){
+      break;
+    }
+    // or when the wall time limit is reached
+    if(wtimemax < toc - tic){
+      break;
+    }
+    // compute and output log regularly
+    if(logging.get_next_time() < time){
+      logging.check_and_output(&domain, step, time, dt, toc - tic, &fluid, &interface);
+    }
+    // save flow fields regularly
+    if(save.get_next_time() < time){
+      save_entrypoint(&domain, step, time, &fluid, &interface);
+    }
+    // collect statistics regularly
+    if(statistics.get_next_time() < time){
+      statistics.collect(&domain, &fluid, &interface);
+    }
+  }
+  // finalisation: save final flow fields
+  save_entrypoint(&domain, step, time, &fluid, &interface);
+  // save collected statistics
+  statistics.output(&domain, step);
+  retval = 0;
+  // finalise MPI
+abort:
+  MPI_Finalize();
+  return retval;
+}
+
diff --git a/src/memory.c b/src/memory.c
new file mode 100644
index 0000000..90c27bd
--- /dev/null
+++ b/src/memory.c
@@ -0,0 +1,40 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include "memory.h"
+
+/**
+ * @brief general-purpose memory allocator, which aborts all processes on failure
+ * @param[in] count : number of elements to be allocated
+ * @param[in] size  : size of each element
+ * @return          : pointer to the allocated (zero-initialised) buffer
+ */
+void * memory_calloc(
+    const size_t count,
+    const size_t size
+){
+  // try to allocate
+  void * ptr = calloc(count, size);
+  if(NULL == ptr){
+    // failed to allocate
+    FILE * stream = stderr;
+    fprintf(stream, "memory allocation error: calloc(%zu, %zu)\n", count, size);
+    fflush(stream);
+    // abort right away with a non-zero code: the other processes may not
+    //   fail and thus never reach a barrier placed here, leading to a hang
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+  return ptr;
+}
+
+/**
+ * @brief memory deallocator, the counterpart of memory_calloc
+ * @param[in] ptr : pointer to the buffer to be released, passed to free as it is
+ */
+void memory_free(
+    void * ptr
+){
+  // thin wrapper of free, no additional bookkeeping is done
+  free(ptr);
+}
+
diff --git a/src/param/boundary-condition.c b/src/param/boundary-condition.c
new file mode 100644
index 0000000..6b8c3fe
--- /dev/null
+++ b/src/param/boundary-condition.c
@@ -0,0 +1,9 @@
+#include "param.h"
+
+const double param_uy_xm = 0.; // presumably uy imposed on the negative-x boundary - TODO confirm
+const double param_uy_xp = 0.; // presumably uy imposed on the positive-x boundary - TODO confirm
+#if NDIMS == 3
+const double param_uz_xm = 0.; // presumably uz imposed on the negative-x boundary - TODO confirm
+const double param_uz_xp = 0.; // presumably uz imposed on the positive-x boundary - TODO confirm
+#endif
+
diff --git a/src/runge_kutta.c b/src/runge_kutta.c
new file mode 100644
index 0000000..c649aae
--- /dev/null
+++ b/src/runge_kutta.c
@@ -0,0 +1,16 @@
+#include "runge_kutta.h"
+
+const uint_fast8_t rk_a = 0; // presumably the position of the first value of each rkcoefs entry - TODO confirm
+const uint_fast8_t rk_b = 1; // presumably the position of the second value - TODO confirm
+const uint_fast8_t rk_g = 2; // presumably the position of the third value (the sum) - TODO confirm
+
+// coefficients of three-stage Runge-Kutta scheme, one (a, b) pair for each stage
+static const double a0 = +32. / 60., b0 =   0. / 60.;
+static const double a1 = +25. / 60., b1 = -17. / 60.;
+static const double a2 = +45. / 60., b2 = -25. / 60.;
+const rkcoef_t rkcoefs[RKSTEPMAX] = {
+  {a0, b0, a0 + b0}, // the third value of each entry is the sum of the first two
+  {a1, b1, a1 + b1},
+  {a2, b2, a2 + b2},
+};
+
diff --git a/src/save.c b/src/save.c
new file mode 100644
index 0000000..69a4bd1
--- /dev/null
+++ b/src/save.c
@@ -0,0 +1,118 @@
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+#include "sdecomp.h"
+#include "param.h"
+#include "memory.h"
+#include "domain.h"
+#include "save.h"
+#include "fileio.h"
+#include "config.h"
+
+// directory name is this prefix followed by the step, zero-padded to the given digits
+static const char g_dirname_prefix[] = {"output/save/step"};
+static const int g_dirname_ndigits = 10;
+
+// buffer holding the directory name (allocated by init) and its number of characters
+static char * g_dirname = NULL;
+static size_t g_dirname_nchars = 0;
+
+// scheduler: interval read from "save_rate", and time after which the next save is due
+static double g_rate = 0.;
+static double g_next = 0.;
+
+/**
+ * @brief constructor - schedule saving flow fields, allocate the directory name
+ * @param[in] domain : domain information, used to identify the main process
+ * @param[in] time   : current simulation time
+ * @return           : 0 on success, 1 if a parameter cannot be obtained or "save_rate" is not positive
+ */
+static int init(
+    const domain_t * domain,
+    const double time
+){
+  const int root = 0;
+  int myrank = root;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  // fetch timings
+  if(0 != config.get_double("save_rate", &g_rate)) return 1;
+  double after = 0.;
+  if(0 != config.get_double("save_after", &after)) return 1;
+  // a zero, negative, or NaN rate would give a NaN or ever-decreasing schedule
+  if(!(0. < g_rate)){
+    if(root == myrank) fprintf(stderr, "save_rate should be positive: % .3e\n", g_rate);
+    return 1;
+  }
+  // schedule next event: a multiple of the rate, not earlier than time and after
+  g_next = g_rate * ceil(
+      fmax(DBL_EPSILON, fmax(time, after)) / g_rate
+  );
+  // allocate directory name: prefix and digits, with two extra characters
+  g_dirname_nchars = strlen(g_dirname_prefix) + g_dirname_ndigits;
+  g_dirname = memory_calloc(g_dirname_nchars + 2, sizeof(char));
+  // report
+  if(root == myrank){
+    FILE * stream = stdout;
+    fprintf(stream, "SAVE\n");
+    fprintf(stream, "\tdest: %s\n", g_dirname_prefix);
+    fprintf(stream, "\tnext: % .3e\n", g_next);
+    fprintf(stream, "\trate: % .3e\n", g_rate);
+    fflush(stream);
+  }
+  return 0;
+}
+
+/**
+ * @brief compose the directory name of this step and create the directory
+ * @param[in]  domain  : information related to MPI domain decomposition
+ * @param[in]  step    : time step, zero-padded and appended to the prefix
+ * @param[out] dirname : set to the internal buffer which holds the name
+ * @return             : always 0
+ * @note the buffer is shared by all calls, i.e. a later call overwrites the name
+ */
+static int prepare(
+    const domain_t * domain,
+    const int step,
+    char ** dirname
+){
+  // identify the main process
+  const int root = 0;
+  int myrank = root;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  // name is the prefix followed by the step,
+  //   which is zero-padded to g_dirname_ndigits digits
+  const size_t nbytes = g_dirname_nchars + 1;
+  snprintf(
+      g_dirname, nbytes,
+      "%s%0*d", g_dirname_prefix, g_dirname_ndigits, step
+  );
+  *dirname = g_dirname;
+  // only the main process creates the directory;
+  //   a failure is ignored and the procedure continues anyway
+  if(root == myrank){
+    fileio.mkdir(g_dirname);
+  }
+  // the other processes wait until the directory is ready
+  MPI_Barrier(MPI_COMM_WORLD);
+  // schedule next saving event
+  g_next = g_next + g_rate;
+  return 0;
+}
+
+/**
+ * @brief getter of g_next, the simulation time after which the next save is due
+ * @return : g_next, as set by init and advanced by prepare
+ */
+static double get_next_time(
+    void
+){
+  return g_next;
+}
+
+const save_t save = { // static functions of this file, gathered in one object
+  .init          = init,
+  .prepare       = prepare,
+  .get_next_time = get_next_time,
+};
+
diff --git a/src/statistics.c b/src/statistics.c
new file mode 100644
index 0000000..a09d4d5
--- /dev/null
+++ b/src/statistics.c
@@ -0,0 +1,311 @@
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <float.h>
+#include "sdecomp.h"
+#include "param.h"
+#include "memory.h"
+#include "domain.h"
+#include "fluid.h"
+#include "interface.h"
+#include "statistics.h"
+#include "fileio.h"
+#include "config.h"
+#include "array_macros/fluid/ux.h"
+#include "array_macros/fluid/uy.h"
+#if NDIMS == 3
+#include "array_macros/fluid/uz.h"
+#endif
+#include "array_macros/interface/vof.h"
+#include "array_macros/statistics/ux1.h"
+#include "array_macros/statistics/uy1.h"
+#if NDIMS == 3
+#include "array_macros/statistics/uz1.h"
+#endif
+#include "array_macros/statistics/vof1.h"
+
+// parameters to specify directory name: prefix and number of digits of the zero-padded step
+static const char g_dirname_prefix[] = {"output/stat/step"};
+static const int g_dirname_ndigits = 10;
+
+// name of directory (buffer allocated by init) and its length without the terminator
+static char * g_dirname = NULL;
+static size_t g_dirname_nchars = 0;
+
+// scheduler: interval between two collections and time of the next collection
+static double g_rate = 0.;
+static double g_next = 0.;
+
+// data: number of samples collected so far and running sum of each sampled field
+static size_t g_num = 0;
+static array_t g_ux1 = {0};
+static array_t g_uy1 = {0};
+#if NDIMS == 3
+static array_t g_uz1 = {0};
+#endif
+static array_t g_vof1 = {0};
+
+/**
+ * @brief constructor - load schedule from config, allocate name buffer and arrays of sums
+ * @param[in] domain : information about domain decomposition and size
+ * @param[in] time   : current time (hereafter in free-fall time units)
+ * @return           : error code (1 if config or arrays fail); NOTE(review): stat_rate <= 0 is not rejected
+ */
+static int init(
+    const domain_t * domain,
+    const double time
+){
+  // fetch timings; first event: smallest positive multiple of g_rate at or after max(time, stat_after)
+  if(0 != config.get_double("stat_rate", &g_rate)){
+    return 1;
+  }
+  double after = 0.;
+  if(0 != config.get_double("stat_after", &after)){
+    return 1;
+  }
+  g_next = g_rate * ceil(
+      fmax(DBL_EPSILON, fmax(time, after)) / g_rate
+  );
+  // allocate directory name: prefix + digits, with spare room for the terminator
+  g_dirname_nchars =
+    + strlen(g_dirname_prefix)
+    + g_dirname_ndigits;
+  g_dirname = memory_calloc(g_dirname_nchars + 2, sizeof(char));
+  // prepare arrays which hold the sums, their extra extents are given by the *_NADDS macros
+  if(0 != array.prepare(domain, UX1_NADDS,  sizeof(double), &g_ux1))  return 1;
+  if(0 != array.prepare(domain, UY1_NADDS,  sizeof(double), &g_uy1))  return 1;
+#if NDIMS == 3
+  if(0 != array.prepare(domain, UZ1_NADDS,  sizeof(double), &g_uz1))  return 1;
+#endif
+  if(0 != array.prepare(domain, VOF1_NADDS, sizeof(double), &g_vof1)) return 1;
+  // report the schedule, from the main process only
+  const int root = 0;
+  int myrank = root;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  if(root == myrank){
+    FILE * stream = stdout;
+    fprintf(stream, "STATISTICS\n");
+    fprintf(stream, "\tdest: %s\n", g_dirname_prefix);
+    fprintf(stream, "\tnext: % .3e\n", g_next);
+    fprintf(stream, "\trate: % .3e\n", g_rate);
+    fflush(stream);
+  }
+  return 0;
+}
+
+/**
+ * @brief getter of a member: g_next, the time for which the next collection is scheduled
+ * @return : g_next
+ */
+static double get_next_time(
+    void
+){
+  return g_next;
+}
+
+/**
+ * @brief add the current ux to the running sum g_ux1, for i in [1 : isize+1] and all local j (and k)
+ * @param[in] domain : information related to MPI domain decomposition; the local sizes are referenced by the UX / UX1 macros
+ * @param[in] ux     : x velocity, accessed through the UX macro
+ */
+static void collect_mean_ux(
+    const domain_t * domain,
+    const double * restrict ux
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  double * restrict ux1 = g_ux1.data;
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 1; i <= isize + 1; i++){
+      UX1(i, j) += UX(i, j);
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 1; i <= isize + 1; i++){
+        UX1(i, j, k) += UX(i, j, k);
+      }
+    }
+  }
+#endif
+}
+
+/**
+ * @brief add the current uy to the running sum g_uy1, for i in [0 : isize+1] and all local j (and k)
+ * @param[in] domain : information related to MPI domain decomposition; the local sizes are referenced by the UY / UY1 macros
+ * @param[in] uy     : y velocity, accessed through the UY macro
+ */
+static void collect_mean_uy(
+    const domain_t * domain,
+    const double * restrict uy
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  double * restrict uy1 = g_uy1.data;
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 0; i <= isize + 1; i++){
+      UY1(i, j) += UY(i, j);
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 0; i <= isize + 1; i++){
+        UY1(i, j, k) += UY(i, j, k);
+      }
+    }
+  }
+#endif
+}
+
+#if NDIMS == 3
+/**
+ * @brief add the current uz to the running sum g_uz1, for i in [0 : isize+1] and all local j, k
+ * @param[in] domain : information related to MPI domain decomposition; the local sizes are referenced by the UZ / UZ1 macros
+ * @param[in] uz     : z velocity, accessed through the UZ macro
+ */
+static void collect_mean_uz(
+    const domain_t * domain,
+    const double * restrict uz
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+  const int ksize = domain->mysizes[2];
+  double * restrict uz1 = g_uz1.data;
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 0; i <= isize + 1; i++){
+        UZ1(i, j, k) += UZ(i, j, k);
+      }
+    }
+  }
+}
+#endif
+// add the current vof to the running sum g_vof1, for i in [0 : isize+1] and all local j (and k)
+static void collect_mean_vof(
+    const domain_t * domain,
+    const double * restrict vof
+){
+  const int isize = domain->mysizes[0];
+  const int jsize = domain->mysizes[1];
+#if NDIMS == 3
+  const int ksize = domain->mysizes[2];
+#endif
+  double * restrict vof1 = g_vof1.data;
+#if NDIMS == 2
+  for(int j = 1; j <= jsize; j++){
+    for(int i = 0; i <= isize + 1; i++){
+      VOF1(i, j) += VOF(i, j);
+    }
+  }
+#else
+  for(int k = 1; k <= ksize; k++){
+    for(int j = 1; j <= jsize; j++){
+      for(int i = 0; i <= isize + 1; i++){
+        VOF1(i, j, k) += VOF(i, j, k);
+      }
+    }
+  }
+#endif
+}
+
+/**
+ * @brief accumulate statistical data
+ * @param[in] domain    : information related to MPI domain decomposition
+ * @param[in] fluid     : flow field (ux, uy, and uz when NDIMS == 3 are sampled)
+ * @param[in] interface : interface information (vof is sampled)
+ * @return              : error code (always 0)
+ */
+static int collect(
+    const domain_t * domain,
+    const fluid_t * fluid,
+    const interface_t * interface
+){
+  // add the current fields to the running sums
+  collect_mean_ux(domain, fluid->ux.data);
+  collect_mean_uy(domain, fluid->uy.data);
+#if NDIMS == 3
+  collect_mean_uz(domain, fluid->uz.data);
+#endif
+  collect_mean_vof(domain, interface->vof.data);
+  // increment number of samples and schedule next event
+  g_num += 1;
+  g_next += g_rate;
+  return 0;
+}
+
+/**
+ * @brief save collected statistical data (number of samples, domain info, summed fields)
+ * @param[in] domain : information related to MPI domain decomposition
+ * @param[in] step   : current time step, used as the zero-padded suffix of the directory name
+ * @return           : error code (always 0; the results of the writers are not checked)
+ */
+static int output(
+    const domain_t * domain,
+    const size_t step
+){
+  // when no statistics are collected (g_num is 0),
+  //   no reason to save, so return early without creating anything
+  if(0 == g_num){
+    return 0;
+  }
+  // set directory name: prefix followed by the step padded to g_dirname_ndigits digits
+  snprintf(
+      g_dirname,
+      g_dirname_nchars + 1,
+      "%s%0*zu",
+      g_dirname_prefix,
+      g_dirname_ndigits,
+      step
+  );
+  // get rank of this process to identify the main process
+  const int root = 0;
+  int myrank = root;
+  sdecomp.get_comm_rank(domain->info, &myrank);
+  // create directory and save scalars from main process
+  if(root == myrank){
+    // although it may fail, anyway continue, which is designed to be safe
+    fileio.mkdir(g_dirname);
+    // save scalars (number of collected samples)
+    fileio.w_serial(g_dirname, "num", 0, NULL, fileio.npy_size_t, sizeof(size_t), &g_num);
+  }
+  // wait for the main process to complete making directory
+  MPI_Barrier(MPI_COMM_WORLD);
+  // save domain info (coordinates)
+  domain_save(g_dirname, domain);
+  // save collected statistics: one (file name, array) pair per summed field
+  typedef struct {
+    const char * name;
+    const array_t * array;
+  } variable_t;
+  const variable_t variables[] = {
+    {.name = "ux1",  .array = &g_ux1},
+    {.name = "uy1",  .array = &g_uy1},
+#if NDIMS == 3
+    {.name = "uz1",  .array = &g_uz1},
+#endif
+    {.name = "vof1", .array = &g_vof1},
+  };
+  for(size_t index = 0; index < sizeof(variables) / sizeof(variable_t); index++){
+    const variable_t * v = variables + index;
+    array.dump(domain, g_dirname, v->name, fileio.npy_double, v->array);
+  }
+  return 0;
+}
+
+const statistics_t statistics = {
+  .init          = init,
+  .collect       = collect,
+  .output        = output,
+  .get_next_time = get_next_time,
+};
+
diff --git a/src/tdm.c b/src/tdm.c
new file mode 100644
index 0000000..2347d08
--- /dev/null
+++ b/src/tdm.c
@@ -0,0 +1,325 @@
+#include <math.h>
+#include <float.h>
+#include <complex.h>
+#include <fftw3.h>
+#include "memory.h"
+#include "tdm.h"
+
+/**
+ * @brief kernel function to solve a tri-diagonal linear system (Thomas algorithm)
+ * @param[in]    n : matrix size (n <= 0 is a no-op, n == 1 is solved directly)
+ * @param[in]    l : lower  diagonal part
+ * @param[in]    c : center diagonal part
+ * @param[in]    u : upper  diagonal part
+ * @param[out]   v : work buffer (updated "u" is stored, items 0 to n-2 are written)
+ * @param[inout] q : right-hand-side & answers
+ * @return         : error code (always 0)
+ */
+#define GTSV(type) \
+  static int gtsv_##type( \
+      const int n, \
+      const double * restrict l, \
+      const double * restrict c, \
+      const double * restrict u, \
+      double * restrict v, \
+      type * restrict q \
+){ \
+    /* empty or 1x1 system: no neighbouring row exists, never touch v[n-2], q[n-2] | 4 */ \
+    if(n < 2){ \
+      if(1 == n) q[0] = fabs(c[0]) > DBL_EPSILON ? q[0] / c[0] : 0.; \
+      return 0; \
+    } \
+    /* divide the first row by center-diagonal term | 2 */ \
+    v[0] = u[0] / c[0]; \
+    q[0] = q[0] / c[0]; \
+    /* forward substitution | 6 */ \
+    for(int i = 1; i < n - 1; i++){ \
+      /* assume positive-definite system to skip zero-division checks */ \
+      double val = 1. / (c[i] - l[i] * v[i-1]); \
+      v[i] = val *      (u[i]                ); \
+      q[i] = val *      (q[i] - l[i] * q[i-1]); \
+    } \
+    /* last row, do the same thing but consider singularity (then zero mean) | 2 */ \
+    double val = c[n-1] - l[n-1] * v[n-2]; \
+    q[n-1] = fabs(val) > DBL_EPSILON ? 1. / val * (q[n-1] - l[n-1] * q[n-2]) : 0.; \
+    /* backward substitution | 3 */ \
+    for(int i = n - 2; i >= 0; i--){ \
+      q[i] -= v[i] * q[i+1]; \
+    } \
+    return 0; \
+  }
+
+/**
+ * @brief solve linear system
+ * @param[in]    n           : size of tri-diagonal matrix; NOTE(review): periodic branch reads q0[n-2], q1[0], so presumably n >= 2 - confirm
+ * @param[in]    nrhs        : how many right-hand-sides do you want to solve?
+ * @param[in]    is_periodic : periodic boundary condition is imposed (Sherman-Morrison formula is used)
+ *                               or not (normal Thomas algorithm is used)
+ * @param[in]    l           : pointer to lower- diagonal components
+ * @param[in]    c           : pointer to center-diagonal components
+ * @param[in]    u           : pointer to upper- diagonal components
+ * @param[out]   v           : work buffer passed to the kernel (updated "u" is stored)
+ * @param[inout] q           : right-hand-sides & answers, the j-th system is contiguous from q[j * n] to q[j * n + n - 1]
+ * @param[out]   q1          : work buffer used only when periodic (solution of the additional system, n-1 items)
+ * @return                   : error code
+ */
+#define TDM_SOLVE(type) \
+  static int tdm_solve_##type( \
+      const int n, \
+      const int nrhs, \
+      const bool is_periodic, \
+      const double * restrict l, \
+      const double * restrict c, \
+      const double * restrict u, \
+      double * restrict v, \
+      type * restrict q, \
+      double * restrict q1 \
+){ \
+    if(is_periodic){ \
+      /* solve additional system coming from periodicity | 7 */ \
+      for(int i = 0; i < n-1; i++){ \
+        q1[i] \
+          = i ==   0 ? -l[i] \
+          : i == n-2 ? -u[i] \
+          : 0.; \
+      } \
+      gtsv_double(n-1, l, c, u, v, q1); \
+      for(int j = 0; j < nrhs; j++){ \
+        /* solve normal system | 2 */ \
+        type *q0 = q + j * n; \
+        gtsv_##type(n-1, l, c, u, v, q0); \
+        /* find x_{n-1} | 3 */ \
+        type   num = q0[n-1] - u[n-1] * q0[0] - l[n-1] * q0[n-2]; \
+        double den = c [n-1] + u[n-1] * q1[0] + l[n-1] * q1[n-2]; \
+        q0[n-1] = fabs(den) < DBL_EPSILON ? 0. : num / den; \
+        /* solve original system | 3 */ \
+        for(int i = 0; i < n-1; i++){ \
+          q0[i] = q0[i] + q0[n-1] * q1[i]; \
+        } \
+      } \
+    }else{ \
+      for(int j = 0; j < nrhs; j++){ \
+        gtsv_##type(n, l, c, u, v, q + j * n); \
+      } \
+    } \
+    return 0; \
+  }
+
+// expand macros to define solvers
+GTSV(double)
+GTSV(fftw_complex)
+TDM_SOLVE(double)
+TDM_SOLVE(fftw_complex)
+
+// definition of tdm_info_t_ (allocated by construct, released by destruct)
+/**
+ * @struct tdm_info_t_
+ * @brief struct to keep information about tri-diagonal system and internal buffers
+ * @var size         : size of the system
+ * @var nrhs         : number of right-hand-side terms
+ * @var is_periodic  : periodic boundary condition is imposed or not
+ * @var is_complex   : data type of the right-hand-side terms is fftw_complex or not (double)
+ * @var l, c, u      : lower, center and upper-diagonal parts of the system (size items each)
+ * @var v            : internal buffer (updated "u" is stored, size items)
+ * @var q1           : internal buffer (additional right-hand-side, size - 1 items), NULL unless periodic and 1 < size
+ */
+struct tdm_info_t_ {
+  int size;
+  int nrhs;
+  bool is_periodic;
+  bool is_complex;
+  double * restrict l;
+  double * restrict c;
+  double * restrict u;
+  double * restrict v;
+  double * restrict q1;
+};
+
+/**
+ * @brief allocate and initialise tdm_info_t (to be released by destruct)
+ * @param[in]  size        : size of the system, should be positive
+ * @param[in]  nrhs        : number of right-hand-side terms, should be positive
+ * @param[in]  is_periodic : periodic boundary condition is imposed or not
+ * @param[in]  is_complex  : data type of the right-hand-side terms is fftw_complex or not (double)
+ * @param[out] info        : pointer to the resulting structure, NULL is assigned when an input is rejected
+ * @return                 : error code (1 when size or nrhs is not positive, 0 otherwise)
+ */
+static int construct(
+    const int size,
+    const int nrhs,
+    const bool is_periodic,
+    const bool is_complex,
+    tdm_info_t ** info
+){
+  // sanitise input, reporting the offending value
+  if(size <= 0){
+    printf("ERROR(%s): size should be positive: %d\n", __func__, size);
+    *info = NULL;
+    return 1;
+  }
+  if(nrhs <= 0){
+    printf("ERROR(%s): nrhs should be positive: %d\n", __func__, nrhs);
+    *info = NULL;
+    return 1;
+  }
+  *info = memory_calloc(1, sizeof(tdm_info_t));
+  // sizes
+  (*info)->size = size;
+  (*info)->nrhs = nrhs;
+  // flags
+  (*info)->is_periodic = is_periodic;
+  (*info)->is_complex  = is_complex;
+  // buffers: three diagonals (exposed by the getters) and work space (v, q1) for the solver
+  (*info)->l = memory_calloc(size, sizeof(double));
+  (*info)->c = memory_calloc(size, sizeof(double));
+  (*info)->u = memory_calloc(size, sizeof(double));
+  (*info)->v = memory_calloc(size, sizeof(double));
+  if(is_periodic && /* to avoid zero-size allocation */ 1 < size){
+    (*info)->q1 = memory_calloc(size - 1, sizeof(double));
+  }else{
+    (*info)->q1 = NULL;
+  }
+  return 0;
+}
+
+/**
+ * @brief return pointer to the internal buffer of the lower-diagonal components
+ * @param[in]  info : initialised by constructor
+ * @param[out] l    : pointer to the lower-diagonal buffer (untouched when info is NULL)
+ * @return          : error code (1 when info is NULL, 0 otherwise)
+ */
+static int get_l(
+    const tdm_info_t * info,
+    double * restrict * l
+){
+  if(NULL == info){
+    printf("ERROR(%s): info is NULL\n", __func__);
+    return 1;
+  }
+  *l = info->l;
+  return 0;
+}
+
+/**
+ * @brief return pointer to the internal buffer of the center-diagonal components
+ * @param[in]  info : initialised by constructor
+ * @param[out] c    : pointer to the center-diagonal buffer (untouched when info is NULL)
+ * @return          : error code (1 when info is NULL, 0 otherwise)
+ */
+static int get_c(
+    const tdm_info_t * info,
+    double * restrict * c
+){
+  if(NULL == info){
+    printf("ERROR(%s): info is NULL\n", __func__);
+    return 1;
+  }
+  *c = info->c;
+  return 0;
+}
+
+/**
+ * @brief return pointer to the internal buffer of the upper-diagonal components
+ * @param[in]  info : initialised by constructor
+ * @param[out] u    : pointer to the upper-diagonal buffer (untouched when info is NULL)
+ * @return          : error code (1 when info is NULL, 0 otherwise)
+ */
+static int get_u(
+    const tdm_info_t * info,
+    double * restrict * u
+){
+  if(NULL == info){
+    printf("ERROR(%s): info is NULL\n", __func__);
+    return 1;
+  }
+  *u = info->u;
+  return 0;
+}
+// getter of the size of the system: stored to *size, returns 1 (and leaves *size) when info is NULL
+static int get_size(
+    const tdm_info_t * info,
+    int * size
+){
+  if(NULL == info){
+    printf("ERROR(%s): info is NULL\n", __func__);
+    return 1;
+  }
+  *size = info->size;
+  return 0;
+}
+// getter of the number of right-hand-sides: stored to *nrhs, returns 1 (and leaves *nrhs) when info is NULL
+static int get_nrhs(
+    const tdm_info_t * info,
+    int * nrhs
+){
+  if(NULL == info){
+    printf("ERROR(%s): info is NULL\n", __func__);
+    return 1;
+  }
+  *nrhs = info->nrhs;
+  return 0;
+}
+
+/**
+ * @brief solve tri-diagonal systems for the given input, dispatching on the data type of the right-hand-side
+ * @param[in]     info : initialised by constructor
+ * @param[in,out] data : right-hand-side terms (fftw_complex if is_complex, double otherwise), overwritten by the result
+ * @return             : error code (1 when info is NULL, 0 otherwise; results of the kernels are not inspected)
+ */
+static int solve(
+    const tdm_info_t * info,
+    void * restrict data
+){
+  if(NULL == info){
+    printf("ERROR(%s): info is NULL\n", __func__);
+    return 1;
+  }
+  const int size = info->size;
+  const int nrhs = info->nrhs;
+  const bool is_periodic = info->is_periodic;
+  const bool is_complex  = info->is_complex;
+  const double * restrict l = info->l;
+  const double * restrict c = info->c;
+  const double * restrict u = info->u;
+  double * restrict v  = info->v;
+  double * restrict q1 = info->q1;
+  if(is_complex){
+    tdm_solve_fftw_complex(size, nrhs, is_periodic, l, c, u, v, data, q1);
+  }else{
+    tdm_solve_double(size, nrhs, is_periodic, l, c, u, v, data, q1);
+  }
+  return 0;
+}
+
+/**
+ * @brief deallocate internal buffers and the structure itself
+ * @param[in] info : initialised by constructor; NOTE(review): q1 may be NULL, memory_free presumably accepts it - confirm
+ * @return         : error code (1 when info is NULL, 0 otherwise)
+ */
+static int destruct(
+    tdm_info_t * info
+){
+  if(NULL == info){
+    printf("ERROR(%s): info is NULL\n", __func__);
+    return 1;
+  }
+  memory_free(info->l);
+  memory_free(info->c);
+  memory_free(info->u);
+  memory_free(info->v);
+  memory_free(info->q1);
+  memory_free(info);
+  return 0;
+}
+
+const tdm_t tdm = {
+  .construct = construct,
+  .get_l     = get_l,
+  .get_c     = get_c,
+  .get_u     = get_u,
+  .get_size  = get_size,
+  .get_nrhs  = get_nrhs,
+  .solve     = solve,
+  .destruct  = destruct,
+};
+
diff --git a/src/timer.c b/src/timer.c
new file mode 100644
index 0000000..bbf8a87
--- /dev/null
+++ b/src/timer.c
@@ -0,0 +1,18 @@
+#include <mpi.h>
+#include "timer.h"
+
+/**
+ * @brief get current time of the main process, shared by all processes (collective call)
+ * @return : value of MPI_Wtime on the main process, identical on every process
+ */
+double timer(
+    void
+){
+  const int root = 0;
+  // every process reads its own clock,
+  double time = MPI_Wtime();
+  // but the value of the main process overwrites it so that all processes agree
+  MPI_Bcast(&time, 1, MPI_DOUBLE, root, MPI_COMM_WORLD);
+  return time;
+}
+
diff --git a/tools/README.rst b/tools/README.rst
new file mode 100644
index 0000000..c229912
--- /dev/null
+++ b/tools/README.rst
@@ -0,0 +1,19 @@
+######
+tools/
+######
+
+****
+Note
+****
+
+Normally you do not have to pay attention to the files here.
+
+*****
+Files
+*****
+
+#. ``define_arrays.py``
+
+   This script generates the header files which define macros enabling the ``UX(i, j, k)`` array notation in C.
+   Since the generated headers are already included in the package, you do not have to run it again.
+
diff --git a/tools/define_arrays.py b/tools/define_arrays.py
new file mode 100644
index 0000000..4605438
--- /dev/null
+++ b/tools/define_arrays.py
@@ -0,0 +1,262 @@
+# define macros to access arrays easily
+# for 1D, x-face, x-center, and their distances are defined
+# for ND, arrays to hold velocity, pressure, temperature, etc. are defined
+
+import os
+import sys
+import re
+
+
+def get_lbound(lbound):
+    # number of extra items at the lower end -> first valid index, as a string
+    # e.g. -1 -> "2", 0 -> "1", +1 -> "0"
+    # (the first index is 1 when no extra item is requested)
+    lbound = - lbound + 1
+    return f"{lbound}"
+
+
+def get_ubound(ubound, prefix):
+    # number of extra items at the upper end -> last valid index, as a string
+    # the sign is always printed since the number is appended to the prefix
+    # e.g. -1 -> "isize-1", 0 -> "isize+0", +1 -> "isize+1"
+    return f"{prefix}{ubound:+d}"
+
+
+def get_nitems(lbound, ubound):
+    # get size of a vector relative to its prefix, as a signed string, from its bounds
+    # e.g.
+    #    return "+1" (isize+1 items) if lbound = "1" and ubound = "isize+1"
+    #    return "+2" (jsize+2 items) if lbound = "0" and ubound = "jsize+1"
+    # extract numbers by removing letters; an upper bound without a number means 0
+    s_ = re.sub("[a-zA-Z]", "", lbound.strip())
+    e_ = re.sub("[a-zA-Z]", "", ubound.strip())
+    s_ = int(s_)
+    if e_ == "":
+        e_ = 0
+    else:
+        e_ = int(e_)
+    nitems = f"{e_ - s_ + 1:+d}"
+    return nitems
+
+
+def get_index(index, lbound):
+    # get zero-based offset expression for an index whose lower-bound (string) is given
+    # e.g. return I-1 if lbound is  1 to map I =  1 to the head of vector
+    #      return I+1 if lbound is -1 to map I = -1 to the head of vector
+    # lbound "0" needs no shift; two spaces are appended, as wide as a shift like "-1"
+    if lbound == "0":
+        return f"{index}  "
+    else:
+        val = -int(lbound)
+        return f"{index}{val:+d}"
+
+
+def get_index_1d(lbound_0):
+    iindex = get_index("I", lbound_0)
+    string = (
+        f"({iindex})"
+    )
+    return string
+
+
+def get_index_2d(nitems_0, lbound_0, lbound_1):
+    iindex = get_index("I", lbound_0)
+    jindex = get_index("J", lbound_1)
+    string = (
+        f"({iindex})"
+        f" + "
+        f"(isize{nitems_0})"
+        f" * "
+        f"({jindex})"
+    )
+    return string
+
+
+def get_index_3d(nitems_0, nitems_1, lbound_0, lbound_1, lbound_2):
+    iindex = get_index("I", lbound_0)
+    jindex = get_index("J", lbound_1)
+    kindex = get_index("K", lbound_2)
+    string = (
+        f"({iindex})"
+        f" + "
+        f"(isize{nitems_0})"
+        f" * "
+        f"("
+        f"({jindex})"
+        f" + "
+        f"(jsize{nitems_1})"
+        f" * "
+        f"({kindex})"
+        f")"
+    )
+    return string
+
+
+def output(dname, vname, text):
+    fname = f"{dname}/{vname}.h"
+    # attach include-guard: upper-cased file path whose "/" and "." are replaced by "_"
+    include_guard = fname.replace("/", "_").replace(".", "_").upper()
+    header = (
+            f"#if !defined({include_guard})\n"
+            f"#define {include_guard}\n"
+            f"\n"
+            f"// This file is generated by {sys.argv[0]}\n"
+            f"\n"
+    )
+    footer = (
+            f"#endif // {include_guard}\n"
+    )
+    with open(fname, "w") as f:
+        f.write("".join([header, text, footer]))
+
+
+def gen_1d(dname, vname, bounds):
+    # write {dname}/{vname}.h: accessor macro and NADDS (the given bounds) of a 1D vector
+    lbound_0 = get_lbound(bounds[0])
+    ubound_0 = get_ubound(bounds[1], "isize")
+    index_1d = get_index_1d(lbound_0)
+    char_0 = "I"
+    text = (
+            f"// [{lbound_0} : {ubound_0}]\n"
+            f"#define {vname.upper()}({char_0})"
+            f" ({vname}[{index_1d}])\n"
+            f"#define {vname.upper()}_NADDS"
+            f" (int [2]){{{bounds[0]}, {bounds[1]}}}\n"
+            f"\n"
+    )
+    output(dname, vname, text)
+
+
+def gen_nd(dname, vname, only_3d, bounds):
+    # write {dname}/{vname}.h: accessor macro and NADDS, for NDIMS == 2 (unless only_3d) and 3
+    lbound_0 = get_lbound(bounds[0][0])
+    ubound_0 = get_ubound(bounds[0][1], "isize")
+    lbound_1 = get_lbound(bounds[1][0])
+    ubound_1 = get_ubound(bounds[1][1], "jsize")
+    lbound_2 = get_lbound(bounds[2][0])
+    ubound_2 = get_ubound(bounds[2][1], "ksize")
+    char_0 = "I"
+    char_1 = "J"
+    char_2 = "K"
+    nitems_0 = get_nitems(lbound_0, ubound_0)
+    nitems_1 = get_nitems(lbound_1, ubound_1)
+    index_2d = get_index_2d(
+            nitems_0,
+            lbound_0,
+            lbound_1
+    )
+    index_3d = get_index_3d(
+            nitems_0,
+            nitems_1,
+            lbound_0,
+            lbound_1,
+            lbound_2
+    )
+    text = str()
+    # 2D version, guarded by NDIMS == 2 (not emitted for z-related things)
+    if not only_3d:
+        text += (
+            f"#if NDIMS == 2\n"
+            f"// [{lbound_0} : {ubound_0}],"
+            f" [{lbound_1} : {ubound_1}]\n"
+            f"#define {vname.upper()}({char_0}, {char_1})"
+            f" ({vname}[{index_2d}])\n"
+            f"#define {vname.upper()}_NADDS"
+            f" (int [NDIMS][2]){{"
+            f" {{{bounds[0][0]}, {bounds[0][1]}}},"
+            f" {{{bounds[1][0]}, {bounds[1][1]}}},"
+            f" }}\n"
+            f"#endif\n"
+            f"\n"
+        )
+    # 3D version, guarded by NDIMS == 3 (always emitted)
+    text += (
+        f"#if NDIMS == 3\n"
+        f"// [{lbound_0} : {ubound_0}],"
+        f" [{lbound_1} : {ubound_1}],"
+        f" [{lbound_2} : {ubound_2}]\n"
+        f"#define {vname.upper()}({char_0}, {char_1}, {char_2})"
+        f" ({vname}[{index_3d}])\n"
+        f"#define {vname.upper()}_NADDS"
+        f" (int [NDIMS][2]){{"
+        f" {{{bounds[0][0]}, {bounds[0][1]}}},"
+        f" {{{bounds[1][0]}, {bounds[1][1]}}},"
+        f" {{{bounds[2][0]}, {bounds[2][1]}}},"
+        f" }}\n"
+        f"#endif\n"
+        f"\n"
+    )
+    output(dname, vname, text)
+
+
+def domain(root):
+    dname = f"{root}/domain"
+    os.system(f"rm {dname}/*.h")
+    gen_1d(dname, "xf",   (+0, +1))
+    gen_1d(dname, "xc",   (+1, +1))
+    # Jacobian determinant at x face and center
+    gen_1d(dname, "jdxf", (+0, +1))
+    gen_1d(dname, "jdxc", (+0, +0))
+    # scale factors in x at x face and center
+    gen_1d(dname, "hxxf", (+0, +1))
+    gen_1d(dname, "hxxc", (+0, +0))
+
+
+def fluid(root):
+    dname = f"{root}/fluid"
+    os.system(f"rm {dname}/*.h")
+    gen_nd(dname, "ux",    False, ((+0, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "uy",    False, ((+1, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "uz",    True,  ((+1, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "p",     False, ((+1, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "den",   False, ((+1, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "visux", False, ((+0, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "visuy", False, ((+1, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "visuz", True,  ((+1, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "txx",   False, ((+0, +0), (+1, +1), (+1, +1)))
+    gen_nd(dname, "txy",   False, ((+0, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "txz",   True,  ((+0, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "tyy",   False, ((+0, +0), (+1, +1), (+1, +1)))
+    gen_nd(dname, "tyz",   True,  ((+0, +0), (+1, +1), (+1, +1)))
+    gen_nd(dname, "tzz",   True,  ((+0, +0), (+1, +1), (+1, +1)))
+    gen_nd(dname, "psi",   False, ((+1, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "srcux", False, ((-1, +0), (+0, +0), (+0, +0)))
+    gen_nd(dname, "srcuy", False, ((+0, +0), (+0, +0), (+0, +0)))
+    gen_nd(dname, "srcuz", True,  ((+0, +0), (+0, +0), (+0, +0)))
+
+
+def interface(root):
+    dname = f"{root}/interface"
+    os.system(f"rm {dname}/*.h")
+    gen_nd(dname, "ifrcx",  False, ((-1, +0), (+0, +0), (+0, +0)))
+    gen_nd(dname, "ifrcy",  False, ((+0, +0), (+0, +0), (+0, +0)))
+    gen_nd(dname, "ifrcz",  True,  ((+0, +0), (+0, +0), (+0, +0)))
+    gen_nd(dname, "vof",    False, ((+1, +1), (+2, +2), (+2, +2)))
+    gen_nd(dname, "curv",   False, ((+1, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "dvof",   False, ((+0, +1), (+1, +2), (+1, +2)))
+    gen_nd(dname, "fluxx",  False, ((+0, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "fluxy",  False, ((+1, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "fluxz",  True,  ((+1, +1), (+1, +1), (+1, +1)))
+    gen_nd(dname, "normal", False, ((+0, +0), (+1, +1), (+1, +1)))
+    gen_nd(dname, "src",    False, ((+0, +0), (+0, +0), (+0, +0)))
+
+
+def statistics(root):
+    dname = f"{root}/statistics"
+    os.system(f"rm {dname}/*.h")
+    gen_nd(dname, "ux1",  False, ((+0, +1), (+0, +0), (+0, +0)))
+    gen_nd(dname, "uy1",  False, ((+1, +1), (+0, +0), (+0, +0)))
+    gen_nd(dname, "uz1",  True,  ((+1, +1), (+0, +0), (+0, +0)))
+    gen_nd(dname, "vof1", False, ((+1, +1), (+0, +0), (+0, +0)))
+
+
+if __name__ == "__main__":
+    root = "include/array_macros"
+    # coordinates in the wall-normal direction
+    domain(root)
+    # velocity, pressure, etc.
+    fluid(root)
+    # vof-related things
+    interface(root)
+    # arrays to store temporally-averaged statistics
+    statistics(root)