From 857a451646e1653287638cf98b2df65755db1987 Mon Sep 17 00:00:00 2001
From: Scott Wales
Date: Mon, 2 Nov 2020 11:05:37 +1100
Subject: [PATCH 1/4] Add benchmark functions

---
 doc/api.rst             |   5 +
 src/climtas/__init__.py |   1 +
 src/climtas/profile.py  | 241 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 247 insertions(+)
 create mode 100644 src/climtas/profile.py

diff --git a/doc/api.rst b/doc/api.rst
index 840f480..8d367c0 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -22,3 +22,8 @@ climtas.regrid
 --------------
 .. automodule:: climtas.regrid
    :members:
+
+climtas.profile
+---------------
+.. automodule:: climtas.profile
+   :members:
diff --git a/src/climtas/__init__.py b/src/climtas/__init__.py
index cd0a2ab..f6627ec 100644
--- a/src/climtas/__init__.py
+++ b/src/climtas/__init__.py
@@ -7,5 +7,6 @@
 from . import io
 from . import regrid
 from . import blocked
+from . import profile
 
 from .blocked import blocked_resample, blocked_groupby
diff --git a/src/climtas/profile.py b/src/climtas/profile.py
new file mode 100644
index 0000000..fef4468
--- /dev/null
+++ b/src/climtas/profile.py
@@ -0,0 +1,241 @@
+#!/g/data/hh5/public/apps/nci_scripts/python-analysis3
+# Copyright 2020 Scott Wales
+# author: Scott Wales
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Profiling Dask data processing
+
+* :func:`benchmark` runs a function with different chunks, returning the time
+  taken for each chunk setting
+
+* :func:`profile` runs a function with a single chunk setting, returning the
+  time taken in the different Dask stages as well as chunk information
+
+Profile results
+===============
+
+    time_total
+        Total time taken to process the data (seconds)
+    time_open
+        Time spent opening the dataset (seconds)
+    time_function
+        Time spent running the function (seconds)
+    time_optimize
+        Time spent optimizing the Dask graph (seconds)
+    time_load
+        Time spent computing the data with Dask (seconds)
+    chunks
+        Chunk shape
+    nchunks_in
+        Number of chunks in loaded data
+    nchunks_out
+        Number of chunks in function output
+    chunksize_in
+        Size of chunks in loaded data
+    chunksize_out
+        Size of chunks in function output
+    tasks_in
+        Dask graph size in loaded data
+    tasks_out
+        Dask graph size in function output
+    tasks_optimized
+        Dask graph size after optimizing function output
+"""
+
+from typing import Dict, Any, List
+import xarray
+import dask
+import time
+import pandas
+import numpy
+
+
+def benchmark(paths: str, variable: str, chunks: Dict[str,List[int]], function, run_count: int=3, mfdataset_args: Dict[str, Any]={}):
+    """
+    Profile a function on different chunks of data
+
+    Opens a dataset with :func:`xarray.open_mfdataset` with one of the chunk
+    options, then runs function on variable
+
+    >>> def func(da):
+    ...     return da.mean()
+    >>> climtas.profile.benchmark(
+    ...     '/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190101_*.nc',
+    ...     variable='t2m',
+    ...     function=func,
+    ...     chunks={'time':[93, 93], 'latitude': [91, 91], 'longitude': [180, 180*2]}) #doctest: +SKIP
+
+    Args:
+        paths: Paths to open (as :func:`xarray.open_mfdataset`)
+        variable: Variable in the dataset to use
+        chunks: Mapping of dimension name to a list of chunk sizes, one entry
+            for each run
+        function: Function that takes a :obj:`xarray.DataArray` (the variable)
+            and returns a :obj:`xarray.DataArray` to test the performance of
+        run_count: Number of times to run each profile (the minimum time is returned)
+        mfdataset_args: Extra arguments to pass to :func:`xarray.open_mfdataset`
+
+    Returns:
+        :obj:`pandas.DataFrame` with information from :func:`profile` for each
+        run
+    """
+
+    css = []
+    results = []
+    for values in zip(*chunks.values()):
+        cs = dict(zip(chunks.keys(), values))
+        results.append(profile(paths, variable, cs, function, run_count, mfdataset_args))
+
+    r = pandas.DataFrame(results)
+
+    return r
+
+
+def profile(paths: str, variable: str, chunks: Dict[str, int], function, run_count: int=3, mfdataset_args: Dict[str, Any]={}):
+    """
+    Run a function run_count times, returning the minimum time taken
+
+    >>> def func(da):
+    ...     return da.mean()
+    >>> climtas.profile.profile(
+    ...     '/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190101_*.nc',
+    ...     variable='t2m',
+    ...     function=func,
+    ...     chunks={'time':93, 'latitude': 91, 'longitude': 180}) #doctest: +SKIP
+    {'time_total': 9.561158710159361,
+     'time_open': 0.014718276914209127,
+     'time_function': 0.0033595040440559387,
+     'time_optimize': 0.01087462529540062,
+     'time_load': 9.529402975924313,
+     'chunks': {'time': 93, 'latitude': 91, 'longitude': 180},
+     'nchunks_in': 512,
+     'nchunks_out': 1,
+     'chunksize_in': '6.09 MB',
+     'chunksize_out': '4 B',
+     'tasks_in': 513,
+     'tasks_out': 1098,
+     'tasks_optimized': 1098}
+
+    Args:
+        paths: Paths to open (as :func:`xarray.open_mfdataset`)
+        variable: Variable in the dataset to use
+        chunks: Mapping of dimension name to chunk sizes
+        function: Function that takes a :obj:`xarray.DataArray` (the variable)
+            and returns a :obj:`dask.array.Array` to test the performance of
+        run_count: Number of times to run each profile (the minimum time is returned)
+        mfdataset_args: Extra arguments to pass to :func:`xarray.open_mfdataset`
+
+    Returns:
+        Dict[str, int] :ref:`profiling information`
+    """
+
+    result = profile_once(paths, variable, chunks, function, mfdataset_args)
+
+    for n in range(run_count - 1):
+        r = profile_once(paths, variable, chunks, function, mfdataset_args)
+
+        for k,v in r.items():
+            if k.startswith('time_') and v < result[k]:
+                result[k] = v
+
+    return result
+
+
+def profile_once(paths: str, variable: str, chunks: Dict[str, int], function, mfdataset_args: Dict[str, Any]={}):
+    """
+    Run a single profile instance
+
+    >>> def func(da):
+    ...     return da.mean()
+    >>> climtas.profile.profile_once(
+    ...     '/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190101_*.nc',
+    ...     variable='t2m',
+    ...     function=func,
+    ...     chunks={'time':93, 'latitude': 91, 'longitude': 180}) #doctest: +SKIP
+    {'time_total': 9.561158710159361,
+     'time_open': 0.014718276914209127,
+     'time_function': 0.0033595040440559387,
+     'time_optimize': 0.01087462529540062,
+     'time_load': 9.529402975924313,
+     'chunks': {'time': 93, 'latitude': 91, 'longitude': 180},
+     'nchunks_in': 512,
+     'nchunks_out': 1,
+     'chunksize_in': '6.09 MB',
+     'chunksize_out': '4 B',
+     'tasks_in': 513,
+     'tasks_out': 1098,
+     'tasks_optimized': 1098}
+    
+    Args:
+        paths: Paths to open (as :func:`xarray.open_mfdataset`)
+        variable: Variable in the dataset to use
+        chunks: Mapping of dimension name to chunk sizes
+        function: Function that takes a :obj:`xarray.DataArray` (the variable)
+            and returns a :obj:`dask.array.Array` to test the performance of
+        mfdataset_args: Extra arguments to pass to :func:`xarray.open_mfdataset`
+
+    Returns:
+        Dict[str, int] :ref:`profiling information`
+    """
+
+    results = {}
+
+    total_start = time.perf_counter()
+
+    open_start = time.perf_counter()
+    with xarray.open_mfdataset(paths, chunks=chunks, **mfdataset_args) as data:
+        open_end = time.perf_counter()
+
+        var = data[variable]
+        tasks_in = len(var.data.__dask_graph__())
+        chunks_in = var.data.npartitions
+        chunksize_in = dask.utils.format_bytes(numpy.prod(var.data.chunksize) * var.dtype.itemsize)
+
+        func_start = time.perf_counter()
+        r = function(var).data
+        func_end = time.perf_counter()
+
+        tasks = len(r.__dask_graph__())
+        chunksize = dask.utils.format_bytes(numpy.prod(r.chunksize) * r.dtype.itemsize)
+        chunks_out = r.npartitions
+
+        opt_start = time.perf_counter()
+        (opt,) = dask.optimize(r)
+        opt_end = time.perf_counter()
+
+        tasks_opt = len(opt.__dask_graph__())
+
+        load_start = time.perf_counter()
+        dask.compute(opt)
+        load_end = time.perf_counter()
+
+    total_end = time.perf_counter()
+
+    results['time_total'] = total_end - total_start
+    results['time_open'] = open_end - open_start
+    results['time_function'] = func_end - func_start
+    results['time_optimize'] = opt_end - opt_start
+    results['time_load'] = load_end - load_start
+    results['chunks'] = chunks
+    results['nchunks_in'] = chunks_in
+    results['nchunks_out'] = chunks_out
+    results['chunksize_in'] = chunksize_in
+    results['chunksize_out'] = chunksize
+    results['tasks_in'] = tasks_in
+    results['tasks_out'] = tasks
+    results['tasks_optimized'] = tasks_opt
+
+    return results
+

From 4aa9177adb029dad66b887960f169849ea38d5a1 Mon Sep 17 00:00:00 2001
From: Scott Wales
Date: Mon, 2 Nov 2020 11:06:49 +1100
Subject: [PATCH 2/4] Blacken

---
 src/climtas/profile.py | 67 ++++++++++++++++++++++++++++--------------
 1 file changed, 45 insertions(+), 22 deletions(-)

diff --git a/src/climtas/profile.py b/src/climtas/profile.py
index fef4468..4953ecf 100644
--- a/src/climtas/profile.py
+++ b/src/climtas/profile.py
@@ -61,7 +61,14 @@ import numpy
 
 
-def benchmark(paths: str, variable: str, chunks: Dict[str,List[int]], function, run_count: int=3, mfdataset_args: Dict[str, Any]={}):
+def benchmark(
+    paths: str,
+    variable: str,
+    chunks: Dict[str, List[int]],
+    function,
+    run_count: int = 3,
+    mfdataset_args: Dict[str, Any] = {},
+):
     """
     Profile a function on different chunks of data
 
@@ -95,14 +102,23 @@ def benchmark(paths: str, variable: str, chunks: Dict[str,List[int]], function,
     results = []
     for values in zip(*chunks.values()):
         cs = dict(zip(chunks.keys(), values))
-        results.append(profile(paths, variable, cs, function, run_count, mfdataset_args))
+        results.append(
+ profile(paths, variable, cs, function, run_count, mfdataset_args) + ) r = pandas.DataFrame(results) return r -def profile(paths: str, variable: str, chunks: Dict[str, int], function, run_count: int=3, mfdataset_args: Dict[str, Any]={}): +def profile( + paths: str, + variable: str, + chunks: Dict[str, int], + function, + run_count: int = 3, + mfdataset_args: Dict[str, Any] = {}, +): """ Run a function run_count times, returning the minimum time taken @@ -145,14 +161,20 @@ def profile(paths: str, variable: str, chunks: Dict[str, int], function, run_cou for n in range(run_count - 1): r = profile_once(paths, variable, chunks, function, mfdataset_args) - for k,v in r.items(): - if k.startswith('time_') and v < result[k]: + for k, v in r.items(): + if k.startswith("time_") and v < result[k]: result[k] = v return result -def profile_once(paths: str, variable: str, chunks: Dict[str, int], function, mfdataset_args: Dict[str, Any]={}): +def profile_once( + paths: str, + variable: str, + chunks: Dict[str, int], + function, + mfdataset_args: Dict[str, Any] = {}, +): """ Run a single profile instance @@ -176,7 +198,7 @@ def profile_once(paths: str, variable: str, chunks: Dict[str, int], function, mf 'tasks_in': 513, 'tasks_out': 1098, 'tasks_optimized': 1098} - + Args: paths: Paths to open (as :func:`xarray.open_mfdataset`) variable: Variable in the dataset to use @@ -201,7 +223,9 @@ def profile_once(paths: str, variable: str, chunks: Dict[str, int], function, mf var = data[variable] tasks_in = len(var.data.__dask_graph__()) chunks_in = var.data.npartitions - chunksize_in = dask.utils.format_bytes(numpy.prod(var.data.chunksize) * var.dtype.itemsize) + chunksize_in = dask.utils.format_bytes( + numpy.prod(var.data.chunksize) * var.dtype.itemsize + ) func_start = time.perf_counter() r = function(var).data @@ -223,19 +247,18 @@ def profile_once(paths: str, variable: str, chunks: Dict[str, int], function, mf total_end = time.perf_counter() - results['time_total'] = total_end - total_start - results['time_open'] = open_end - open_start - results['time_function'] = func_end - func_start - results['time_optimize'] = opt_end - opt_start - results['time_load'] = load_end - load_start - results['chunks'] = chunks - results['nchunks_in'] = chunks_in - results['nchunks_out'] = chunks_out - results['chunksize_in'] = chunksize_in - results['chunksize_out'] = chunksize - results['tasks_in'] = tasks_in - results['tasks_out'] = tasks - results['tasks_optimized'] = tasks_opt + results["time_total"] = total_end - total_start + results["time_open"] = open_end - open_start + results["time_function"] = func_end - func_start + results["time_optimize"] = opt_end - opt_start + results["time_load"] = load_end - load_start + results["chunks"] = chunks + results["nchunks_in"] = chunks_in + results["nchunks_out"] = chunks_out + results["chunksize_in"] = chunksize_in + results["chunksize_out"] = chunksize + results["tasks_in"] = tasks_in + results["tasks_out"] = tasks + results["tasks_optimized"] = tasks_opt return results - From 217faecf9deab9140c0a6f55a2f7aa0b8a41ee1c Mon Sep 17 00:00:00 2001 From: Scott Wales Date: Mon, 2 Nov 2020 11:10:33 +1100 Subject: [PATCH 3/4] Add type --- src/climtas/profile.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/climtas/profile.py b/src/climtas/profile.py index 4953ecf..29e4b1e 100644 --- a/src/climtas/profile.py +++ b/src/climtas/profile.py @@ -98,7 +98,6 @@ def benchmark( run """ - css = [] results = [] for values in zip(*chunks.values()): cs = 
dict(zip(chunks.keys(), values))
@@ -209,10 +208,10 @@ def profile_once(
         mfdataset_args: Extra arguments to pass to :func:`xarray.open_mfdataset`
 
     Returns:
-        Dict[str, int] :ref:`profiling information`
+        Dict[str, Any] :ref:`profiling information`
     """
 
-    results = {}
+    results: Dict[str, Any] = {}
 
     total_start = time.perf_counter()
 

From 6444814a04b06e7a8b2b4d65f71cedc2a3c45e08 Mon Sep 17 00:00:00 2001
From: Scott Wales
Date: Mon, 2 Nov 2020 11:38:58 +1100
Subject: [PATCH 4/4] Add notebook

---
 notebooks/Benchmarking.ipynb | 463 +++++++++++++++++++++++++++++++++++
 1 file changed, 463 insertions(+)
 create mode 100644 notebooks/Benchmarking.ipynb

diff --git a/notebooks/Benchmarking.ipynb b/notebooks/Benchmarking.ipynb
new file mode 100644
index 0000000..07deb74 100644
--- /dev/null
+++ b/notebooks/Benchmarking.ipynb
@@ -0,0 +1,463 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('0.16.1', '0.2.5+11.g217faec')"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import xarray\n",
+    "import climtas\n",
+    "\n",
+    "xarray.__version__, climtas.__version__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[0m\u001b[01;32m/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190101_20190131.nc\u001b[0m*\n",
+      "\u001b[01;32m/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190201_20190228.nc\u001b[0m*\n",
+      "\u001b[01;32m/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190301_20190331.nc\u001b[0m*\n",
+      "\u001b[01;32m/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190401_20190430.nc\u001b[0m*\n",
+      "\u001b[01;32m/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190501_20190531.nc\u001b[0m*\n",
+      "/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190601_20190630.nc\n",
+      "/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190701_20190731.nc\n",
+      "/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190801_20190831.nc\n",
+      "/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190901_20190930.nc\n",
+      "/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20191001_20191031.nc\n",
+      "/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20191101_20191130.nc\n",
+      "/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20191201_20191231.nc\n"
+     ]
+    }
+   ],
+   "source": [
+    "ls /g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_*.nc"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Function to benchmark\n",
+    "\n",
+    "We'll look at a simple function to start off with, one that just returns the mean of the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def func(da): \n",
+    "    return da.mean()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Running Once\n",
+    "\n",
+    "Just running it once gives a basic idea of performance. I've used the chunking in the NetCDF file as a starting point."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'time_total': 7.717563376761973,\n", + " 'time_open': 0.020742579828947783,\n", + " 'time_function': 0.003424877766519785,\n", + " 'time_optimize': 0.01196580519899726,\n", + " 'time_load': 7.677215476054698,\n", + " 'chunks': {'time': 93, 'latitude': 91, 'longitude': 180},\n", + " 'nchunks_in': 512,\n", + " 'nchunks_out': 1,\n", + " 'chunksize_in': '6.09 MB',\n", + " 'chunksize_out': '4 B',\n", + " 'tasks_in': 513,\n", + " 'tasks_out': 1098,\n", + " 'tasks_optimized': 1098}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "climtas.profile.profile_once('/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190101_*.nc',\n", + " variable='t2m',\n", + " function=func,\n", + " chunks={'time':93, 'latitude': 91, 'longitude': 180})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Benchmarking\n", + "\n", + "Now let's benchmark with a few different chunk shapes. The function gets run three times with each of the chunk options and the minimum time taken is returned" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
time_totaltime_opentime_functiontime_optimizetime_loadchunksnchunks_innchunks_outchunksize_inchunksize_outtasks_intasks_outtasks_optimized
08.3447630.0111510.0030140.0104188.316582{'time': 93, 'latitude': 91, 'longitude': 180}51216.09 MB4 B51310981098
18.2375210.0119470.0018750.0028998.213991{'time': 93, 'latitude': 182, 'longitude': 360}128124.37 MB4 B129276276
211.8807700.0083980.0044150.02043711.834581{'time': 93, 'latitude': 91, 'longitude': 90}102413.05 MB4 B102521962196
\n", + "
" + ], + "text/plain": [ + " time_total time_open time_function time_optimize time_load \\\n", + "0 8.344763 0.011151 0.003014 0.010418 8.316582 \n", + "1 8.237521 0.011947 0.001875 0.002899 8.213991 \n", + "2 11.880770 0.008398 0.004415 0.020437 11.834581 \n", + "\n", + " chunks nchunks_in nchunks_out \\\n", + "0 {'time': 93, 'latitude': 91, 'longitude': 180} 512 1 \n", + "1 {'time': 93, 'latitude': 182, 'longitude': 360} 128 1 \n", + "2 {'time': 93, 'latitude': 91, 'longitude': 90} 1024 1 \n", + "\n", + " chunksize_in chunksize_out tasks_in tasks_out tasks_optimized \n", + "0 6.09 MB 4 B 513 1098 1098 \n", + "1 24.37 MB 4 B 129 276 276 \n", + "2 3.05 MB 4 B 1025 2196 2196 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "climtas.profile.benchmark('/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_20190101_*.nc',\n", + " variable='t2m',\n", + " function=func,\n", + " chunks={'time':[93, 93, 93], 'latitude': [91, 91*2, 91], 'longitude': [180, 180*2, 180//2]})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Distributed\n", + "\n", + "Let's switch to Dask's distributed mode, and process a whole year of data on 4 cpus" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/scratch/w35/saw562/conda/envs/dev/lib/python3.8/site-packages/distributed/node.py:151: UserWarning: Port 8787 is already in use.\n", + "Perhaps you already have a cluster running?\n", + "Hosting the HTTP server on port 38883 instead\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 4
  • \n", + "
  • Cores: 4
  • \n", + "
  • Memory: 17.18 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import climtas.nci\n", + "client = climtas.nci.GadiClient()\n", + "client" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
time_totaltime_opentime_functiontime_optimizetime_loadchunksnchunks_innchunks_outchunksize_inchunksize_outtasks_intasks_outtasks_optimized
059.9224520.1194710.0186150.33310458.995246{'time': 93, 'latitude': 91, 'longitude': 180}614416.09 MB4 B123001933219332
146.9085120.1728090.0068540.05671146.660048{'time': 93, 'latitude': 182, 'longitude': 360}1536124.37 MB4 B308448604860
295.7613460.2131630.0341420.82963293.819402{'time': 93, 'latitude': 91, 'longitude': 90}1228813.05 MB4 B245883864038640
\n", + "
" + ], + "text/plain": [ + " time_total time_open time_function time_optimize time_load \\\n", + "0 59.922452 0.119471 0.018615 0.333104 58.995246 \n", + "1 46.908512 0.172809 0.006854 0.056711 46.660048 \n", + "2 95.761346 0.213163 0.034142 0.829632 93.819402 \n", + "\n", + " chunks nchunks_in nchunks_out \\\n", + "0 {'time': 93, 'latitude': 91, 'longitude': 180} 6144 1 \n", + "1 {'time': 93, 'latitude': 182, 'longitude': 360} 1536 1 \n", + "2 {'time': 93, 'latitude': 91, 'longitude': 90} 12288 1 \n", + "\n", + " chunksize_in chunksize_out tasks_in tasks_out tasks_optimized \n", + "0 6.09 MB 4 B 12300 19332 19332 \n", + "1 24.37 MB 4 B 3084 4860 4860 \n", + "2 3.05 MB 4 B 24588 38640 38640 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "climtas.profile.benchmark('/g/data/ub4/era5/netcdf/surface/t2m/2019/t2m_era5_global_*.nc',\n", + " variable='t2m',\n", + " function=func,\n", + " chunks={'time':[93, 93, 93], 'latitude': [91, 91*2, 91], 'longitude': [180, 180*2, 180//2]})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:conda-dev]", + "language": "python", + "name": "conda-env-conda-dev-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}