From 641614a5c7d71d42ebfe5e2951ca98f9e4f33278 Mon Sep 17 00:00:00 2001
From: Mike Hendricks
Date: Fri, 19 Jan 2024 12:01:01 -0800
Subject: [PATCH] Add per-site cache mechanism to speed up site resolve

The slowest part of using the hab cli is the globbing of config/distro
paths (especially for network paths on windows). Individually parsing
hundreds of json files is also slower than parsing a single json file
containing the same data, which caching helps out with.
---
 README.md                     | 108 ++++++++++++++------
 hab/cache.py                  | 186 ++++++++++++++++++++++++++++++++++
 hab/cli.py                    |  31 +++++-
 hab/parsers/config.py         |   3 +
 hab/parsers/distro_version.py | 114 +++++++++++++--------
 hab/parsers/hab_base.py       |  22 +++-
 hab/resolver.py               |  17 ++--
 hab/site.py                   |  47 ++++++++-
 8 files changed, 436 insertions(+), 92 deletions(-)
 create mode 100644 hab/cache.py

diff --git a/README.md b/README.md
index e404ed7..77cef6a 100644
--- a/README.md
+++ b/README.md
@@ -417,6 +417,41 @@ Note the order of left/middle/right in the test_paths variable. Also, for site
 file with it defined is used. The other path maps are picked up from the site
 file they are defined in.
 
+#### Platform Path Maps
+
+The site setting `platform_path_maps` is a dictionary. Each key is a unique name
+for a mapping, and its value is a dictionary of leading directory paths per platform.
+[PurePath.relative_to](https://docs.python.org/3/library/pathlib.html#pathlib.PurePath.relative_to)
+is used to match, so full directory names need to be used. The unique name allows
+multiple site json files to override the setting, and is also used when converting
+resolved file paths to str.format style paths (`{server-main}/folder/file.txt`).
+If multiple site json files specify the same key, the right-most site json file
+specifying that key is used. It is safe to use forward slashes for windows paths.
+
+```json
+{
+    "append": {
+        "platform_path_maps": {
+            "server-main": {
+                "linux": "/mnt/main",
+                "windows": "//example//main"
+            },
+            "server-dev": {
+                "linux": "/mnt/dev",
+                "windows": "//example//dev"
+            }
+        }
+    },
+    "set": {
+        "platforms": ["linux", "windows"]
+    }
+}
+```
+
+With these settings, if a path on a linux host starts with `/mnt/main`, the
+corresponding windows file path will be translated to `\\example\main`. Note the
+use of `platforms` to disable osx platform support.
+
 #### Hab Entry Points
 
 The site file can be used to replace some hab functionality with custom plugins.
@@ -509,6 +544,37 @@ Alternatively, you could create a second host site file named `c:\hab\host_no_gu
 put the gui disabling config in that file and on the host's you want to disable
 the gui prepend to `HAB_PATHS=c:\hab\host_no_gui.json;c:\hab\host.json;\\server\share\studio.json`.
 
+#### Habcache
+
+By default hab has to find and process all available configs and distros every
+time it's launched. This requires globbing the file paths in `config_paths` and
+`distro_paths` and parsing each file it finds. As you add more distro versions
+and configs this can slow down launching hab. This is especially true when they
+are stored on the network and when using windows.
+
+To address this you can add per-site habcache files. A habcache is a cross-platform
+collection of all of the files found for a specific site file's `config_paths`
+and `distro_paths` glob strings.
+
+To enable caching, run `hab cache /path/to/site_file.json`. This will create a
+habcache file next to `site_file.json`.
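+
+The generated habcache is a single json document that collects the raw contents
+of every config and distro file found by that site file's glob paths. As a rough
+sketch of its layout (the server paths and file names below are only examples),
+it looks something like this:
+
+```json
+{
+    "version": 1,
+    "config_paths": {
+        "//server/share/configs": {
+            "//server/share/configs/projectDummy.json": {"...": "raw config data"}
+        }
+    },
+    "distro_paths": {
+        "//server/share/distros": {
+            "//server/share/distros/maya-2024.0/.hab.json": {"...": "raw distro data"}
+        }
+    }
+}
+```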
+
+It will be named matching `site["site_cache_file_template"][0]`, which defaults
+to `{stem}.habcache`, where `stem` is the site filename without its extension.
+For the example command it would create the file `/path/to/site_file.habcache`.
+To ensure cross-platform support, make sure your `HAB_PATHS` configuration
+contains all of the required [`platform_path_maps`](#platform-path-maps) site
+mappings.
+
+While the `site_file.habcache` exists and `HAB_PATHS` includes `site_file.json`,
+hab will use the cached values unless the `--no-cache` flag is used. After
+adding, updating, or removing a config or distro, you will need to run the
+`hab cache` command again to update the cache with your changes. If you deploy
+distros with a CI system, you should add this command call to that process.
+
+The habcache is cross-platform as long as the hab site configuration loaded when
+calling `hab cache` has all of the required `platform_path_maps` defined. The
+cache will replace the start of file paths matching one of the current platform's
+mapping values with the mapping's key.
 
 ### Python version
 
@@ -528,9 +594,14 @@ the scripts:
 * `colorize`: If `hab dump` should colorize its output for ease of reading.
 * `config_paths`: Configures where URI configs are discovered. See below.
 * `distro_paths`: Configures where distros discovered. See below.
-* `platform_path_maps`: Configures mappings used to convert paths from one
-operating system to another. This is used by the freeze system to ensure that if
-unfrozen on another platform it will still work.
+* `ignored_distros`: Don't use distros that have this version number. This makes
+it possible for a CI to deploy some non-versioned copies of distros next to the
+versioned distros so non-hab workflows can access known file paths. For example,
+this could be used to put a `latest` folder next to each of the releases of a
+distro without having to remove the `.hab.json` file in that folder.
+* [`platform_path_maps`](#platform-path-maps): Configures mappings used to convert
+paths from one operating system to another. This is used by the freeze system to
+ensure that if unfrozen on another platform it will still work.
 * `platforms`: A list of platforms that are supported by these hab configurations.
 When using freeze, all of these platforms will be stored. Defaults to linux, osx, windows.
 * `prereleases`: If pre-release distros should be allowed. Works the same as
@@ -545,6 +616,8 @@ to override the default(as long as its not disabled.) `hab --prefs dump ...`.
 than this duration, force the user to re-save the URI returned for `-` when using
 the `--save-prefs` flag. To enable a timeout set this to a dictionary of kwargs
 to initialize a `datetime.timedelta` object.
+* `site_cache_file_template`: The str.format template defining the name of
+[habcache](#habcache) files.
 
 `config_paths` and `distro_paths` take a list of glob paths. For a given glob
 string in these variables you can not have duplicate values. For configs a
@@ -560,35 +633,6 @@ global shared configs/distros they are not working on. See
 [specifying distro version](#specifying-distro-version) for details on
 specifying a distro version in a git repo.
 
-`platform_path_maps` is a dictionary, the key is a unique name for each mapping,
-and value is a dictionary of leading paths for each platform. The unique name
-allows for multiple site json files to override the setting. If multiple site
-json files specify the same key, the right-most site json file specifying that
-key is used.
- -```json -{ - "append": { - "platform_path_maps": { - "server-main": { - "linux": "/mnt/main", - "windows": "\\\\example\\main" - }, - "server-dev": { - "linux": "/mnt/dev", - "windows": "\\\\example\\dev" - } - } - }, - "set": { - "platforms": ["linux", "windows"] - } -} -``` - -With these settings, if a path on a linux host, starts with `/mnt/main` when -generating the corresponding windows file path it will translate it to -`\\example\main`. Note the use of `platforms` to disable osx platform support. ### Distro diff --git a/hab/cache.py b/hab/cache.py new file mode 100644 index 0000000..071ee35 --- /dev/null +++ b/hab/cache.py @@ -0,0 +1,186 @@ +import glob +import json +import logging +from pathlib import Path + +from packaging.version import InvalidVersion + +from . import utils +from .errors import InvalidVersionError, _IgnoredVersionError + +logger = logging.getLogger(__name__) + + +class Cache: + """Used to save/restore cached data to speed up initialization of hab. + + The caches are stored per-site file as file next to the site file using the + same stem name. (Ie by default studio.json would have a cache file called + studio.cache). + + If this cache file exists it is used unless enabled is set to False. Cache + files are useful when you have some sort of CI setup to ensure the cache is + re-generated using `save_cache` any time you make changes to configs or + distros that site file references. + + Properties: + cache_template (dict): The str.format template used to find the cache files. + This template requires the kwarg `stem`. + enabled (bool): Used to disable using of the cached data forcing a full + glob and parse of files described by all site files. + """ + + def __init__(self, site): + self.site = site + self._cache = None + self.enabled = True + + # Get the template filename used to find the cache files on disk + self.cache_template = self.site.get("site_cache_file", "{stem}.cache") + + @property + def cached_keys(self): + """A dict of cache keys and how they should be processed. + {Name of key to cache: ("relative file glob", class used to process)} + """ + try: + return self._cached_keys + except AttributeError: + pass + + from .parsers import Config, DistroVersion + + self._cached_keys = { + "config_paths": ("*.json", Config), + "distro_paths": ("*/.hab.json", DistroVersion), + } + return self._cached_keys + + def cache(self, force=False): + if not self.enabled: + # If caching is disabled, never attempt to load the cache + return {} + + if self._cache is not None and not force: + return self._cache + + self._cache = {} + + # Process caches from right to left. This makes it so the left most + # cache_file is respected if any paths are duplicated. + for path in reversed(self.site.paths): + cache_file = self.site_cache_path(path) + if cache_file.is_file(): + logger.debug(f"Site cache loading: {cache_file!s}") + self.load_cache(cache_file) + + # Create a flattened cache removing the glob paths. 
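+        # The flattened dict maps each cached file path directly to its raw
+        # data ({key: {file_path: data}}), so lookups do not need to know which
+        # glob dir a given file was found under.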
+ flat_cache = {key: {} for key in self.cached_keys} + for key in self._cache: + for values in self._cache.get(key, {}).values(): + flat_cache[key].update(values) + + self._cache["flat"] = flat_cache + + return self._cache + + def config_paths(self, flat=False): + if flat: + return self.cache().get("flat", {}).get("config_paths", {}) + return self.cache().get("config_paths", {}) + + def distro_paths(self, flat=False): + if flat: + return self.cache().get("flat", {}).get("distro_paths", {}) + return self.cache().get("distro_paths", {}) + + def generate_cache(self, resolver, site_file, version=1): + """Generate a cache file of the current state defined by this site file. + This contains the raw values of each URI config and distro file including + version. If this cache exists it is used instead of searching the file + system for each path defined in config_paths or distro_paths defined in + the provided site file. Use this method any time changes are made that + hab needs to be aware of. Caching is enabled by the existence of this file. + """ + from .site import Site + + output = {"version": version} + + # read the site file to get paths to process + temp_site = Site([site_file]) + + for key, stats in self.cached_keys.items(): + glob_str, cls = stats + # Process each glob dir defined for this site + for dirname in temp_site.get(key, []): + cfg_paths = output.setdefault(key, {}).setdefault( + dirname.as_posix(), {} + ) + + # Add each found hab config to the cache + for path in sorted(glob.glob(str(dirname / glob_str))): + path = Path(path) + try: + data = cls(forest={}, resolver=resolver)._load( + path, cached=False + ) + except ( + InvalidVersion, + InvalidVersionError, + _IgnoredVersionError, + ) as error: + logger.debug(str(error)) + else: + cfg_paths[path.as_posix()] = data + + return output + + @classmethod + def iter_cache_paths(cls, name, paths, cache, glob_str=None, include_path=True): + """Yields path information stored in the cache falling back to glob if + not cached. + + Yields: + dirname: Each path stored in paths. + path + """ + for dirname in paths: + dn_posix = dirname.as_posix() + cached = dn_posix in cache + if cached: + logger.debug(f"Using cache for {name} dir: {dn_posix}") + paths = cache[dn_posix] + else: + logger.debug(f"Using glob for {name} dir: {dirname}") + # Fallback to globing the file system + if glob_str: + paths = sorted(glob.glob(str(dirname / glob_str))) + else: + paths = [] + if not include_path: + yield dirname, None, cached + else: + for path in paths: + yield dirname, path, cached + + def load_cache(self, filename): + """For each glob dir add or replace the contents. If a previous cache + has the same glob dir, it's cache is ignored. This expects that + load_cache is called from right to left for each path in `self.site.path`. 
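+
+        Args:
+            filename (pathlib.Path): The habcache file to read and merge into
+                the already loaded cache data.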
+ """ + contents = utils.load_json_file(filename) + for key in self.cached_keys: + if key in contents: + self._cache.setdefault(key, {}).update(contents[key]) + + def save_cache(self, resolver, site_file, version=1): + cache_file = self.site_cache_path(site_file) + cache = self.generate_cache(resolver, site_file, version=version) + + with cache_file.open("w") as fle: + json.dump(cache, fle, indent=4, cls=utils.HabJsonEncoder) + return cache_file + + def site_cache_path(self, path): + """Returns the name of the cache file for the given site file.""" + return path.parent / self.cache_template.format(stem=path.stem) diff --git a/hab/cli.py b/hab/cli.py index 581a679..e9f6a85 100644 --- a/hab/cli.py +++ b/hab/cli.py @@ -2,6 +2,7 @@ import re import sys import traceback +from datetime import datetime from pathlib import Path import click @@ -168,6 +169,7 @@ def __init__( dump_scripts=False, enable_user_prefs=None, enable_user_prefs_save=False, + cached=True, ): self.verbosity = verbosity self.script_dir = Path(script_dir or ".").resolve() @@ -179,6 +181,7 @@ def __init__( self.site_paths = site_paths if site_paths else [] self.enable_user_prefs = enable_user_prefs self.enable_user_prefs_save = enable_user_prefs_save + self.cached = cached @classmethod def log_context(cls, uri): @@ -192,6 +195,7 @@ def log_context(cls, uri): def resolver(self): if self._resolver is None: site = Site(self.site_paths) + site.cache.enabled = self.cached self._resolver = Resolver( site=site, prereleases=self.prereleases, @@ -393,7 +397,7 @@ def get_command(self, ctx, name): "--script-dir", callback=SharedSettings.set_ctx_instance, type=click.Path(file_okay=False, resolve_path=False), - help="This directory will contain the shell specific script files to enable" + help="This directory will contain the shell specific script files to enable " "this environment configuration.", ) @click.option( @@ -402,6 +406,13 @@ def get_command(self, ctx, name): help="The shell specific scripts created in script-dir will have this " "format and extension.", ) +@click.option( + "--cache/--no-cache", + "cached", + callback=SharedSettings.set_ctx_instance, + default=True, + help="Allow per-site caching of configs and distros.", +) @click.option( "--pre/--no-pre", "prereleases", @@ -663,6 +674,24 @@ def launch(settings, uri, alias, args): settings.write_script(uri, create_launch=True, launch=alias, exit=True, args=args) +# Cache command +@_cli.command() +@click.argument("path", type=click.Path(file_okay=True, resolve_path=True)) +@click.pass_obj +def cache(settings, path): + """Create/update the cache for a given site file. The path argument is the + site config file. To allow for cross-platform support you should make sure + you are loading the same site configuration that will be used by this cache + or at least a site configuration that defines the same `platform_path_maps`. + """ + path = Path(path) + click.echo(f"Caching: {path}") + s = datetime.now() + out = settings.resolver.site.cache.save_cache(settings.resolver, path) + e = datetime.now() + click.echo(f"Cache took: {e - s}, cache file: {out}") + + def cli(*args, **kwargs): """Runs the hab cli. If an exception is raised, only the exception message is printed and the stack trace is hidden. 
Use `hab -v ...` to enable showing diff --git a/hab/parsers/config.py b/hab/parsers/config.py index af775c4..a2a5d77 100644 --- a/hab/parsers/config.py +++ b/hab/parsers/config.py @@ -18,6 +18,9 @@ def __init__(self, *args, **kwargs): self._alias_mods = NotSet super().__init__(*args, **kwargs) + def _cache(self): + return self.resolver.site.cache.config_paths(flat=True) + @hab_property(process_order=120) def aliases(self): """Dict of the names and commands that need created to launch desired diff --git a/hab/parsers/distro_version.py b/hab/parsers/distro_version.py index b51c459..bbe830b 100644 --- a/hab/parsers/distro_version.py +++ b/hab/parsers/distro_version.py @@ -18,6 +18,66 @@ def __init__(self, *args, **kwargs): self._alias_mods = NotSet super().__init__(*args, **kwargs) + def _cache(self): + return self.resolver.site.cache.distro_paths(flat=True) + + def _resolve_version(self, data, filename): + """Sets and returns self.version to the correct value for this distro. + + This resolves the version from the several ways it can be stored to + simplify deployment and development. See InvalidVersionError for details + on how distros version can be set. + + Raises: + _IgnoredVersionError: Internal use, this version should not be processed. + InvalidVersionError: Raised if the version could not be resolved. + """ + version_txt = self.dirname / ".hab_version.txt" + + if "version" in data: + self.version = data["version"] + return self.version + elif version_txt.exists(): + self.version = version_txt.open().read().strip() + return self.version + + # If version is not defined in json data extract it from the parent + # directory name. This allows for simpler distribution without needing + # to modify version controlled files. + try: + self.version = self.dirname.name + return self.version + except InvalidVersion: + """The parent directory was not a valid version, attempt to get a + version using setuptools_scm. + """ + try: + from setuptools_scm import get_version + except ImportError as error: + raise InvalidVersionError(filename, error=error) from None + + def check_ignored_version(): + if self.dirname.name in self.resolver.ignored: + # This object is not added to the forest until super is called + raise _IgnoredVersionError( + 'Skipping "{}" its dirname is in the ignored list.'.format( + filename + ) + ) from None + + try: + self.version = get_version( + root=self.dirname, version_scheme="release-branch-semver" + ) + return self.version + except LookupError: + check_ignored_version() + raise InvalidVersionError(filename) from None + except Exception as error: + check_ignored_version() + # To make debugging easier include the original exception + raise InvalidVersionError(filename, error=error) from None + @hab_property() def aliases(self): """List of the names and commands that need created to launch desired @@ -38,6 +98,18 @@ def alias_mods(self): """ return self._alias_mods + def _load(self, filename, cached=True): + """Sets self.filename and parses the json file returning the data.""" + ret = super()._load(filename, cached=cached) + + # Resolve the version from the various supported ways its stored. 
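+        # Resolution order: a "version" key in the json data, a .hab_version.txt
+        # file in the distro folder, the folder's name, and finally setuptools_scm.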
+ self._resolve_version(ret, filename) + + if not cached: + # Ensure the version is stored on the returned dictionary + ret["version"] = str(self.version) + return ret + def load(self, filename): # Fill in the DistroVersion specific settings before calling super data = self._load(filename) @@ -46,48 +118,6 @@ def load(self, filename): # Store any alias_mods, they will be processed later when flattening self._alias_mods = data.get("alias_mods", NotSet) - # The version can be stored in several ways to make deployment and dev easier - version_txt = self.dirname / ".hab_version.txt" - if "version" in data: - self.version = data["version"] - elif version_txt.exists(): - self.version = version_txt.open().read().strip() - else: - # If version is not defined in json data extract it from the parent - # directory name. This allows for simpler distribution without needing - # to modify version controlled files. - try: - self.version = self.dirname.name - except InvalidVersion: - """The parent directory was not a valid version, attempt to get a - version using setuptools_scm. - """ - try: - from setuptools_scm import get_version - except ImportError as error: - raise InvalidVersionError(self.filename, error=error) from None - - def check_ignored_version(): - if self.dirname.name in self.resolver.ignored: - # This object is not added to the forest until super is called - raise _IgnoredVersionError( - 'Skipping "{}" its dirname is in the ignored list.'.format( - filename - ) - ) from None - - try: - self.version = get_version( - root=self.dirname, version_scheme="release-branch-semver" - ) - except LookupError: - check_ignored_version() - raise InvalidVersionError(self.filename) from None - except Exception as error: - check_ignored_version() - # To make debugging easier include the original exception - raise InvalidVersionError(self.filename, error=error) from None - # The name should be the version == specifier. self.distro_name = data.get("name") self.name = "{}=={}".format(self.distro_name, self.version) diff --git a/hab/parsers/hab_base.py b/hab/parsers/hab_base.py index f558059..981ba7f 100644 --- a/hab/parsers/hab_base.py +++ b/hab/parsers/hab_base.py @@ -62,6 +62,9 @@ def __repr__(self): cls = type(self) return "{}.{}('{}')".format(cls.__module__, cls.__name__, self.fullpath) + def _cache(self): + return {} + def _collect_values(self, node, props=None, default=False): """Recursively process this config node and its parents until all missing_values have been resolved or we run out of parents. @@ -517,10 +520,23 @@ def inherits(self): # Note: Sub-classes need to override this method to enable inheritance. return False - def _load(self, filename): - """Sets self.filename and parses the json file returning the data.""" + def _load(self, filename, cached=True): + """Sets self.filename and parses the json file returning the data dict. + + Args: + filename (pathlib.Path): The file to load. + cached (bool, optional): Enables loading of cached data instead of + loading the data from disk. 
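+
+        Returns:
+            dict: The raw data for this file, taken from the habcache when
+                available, otherwise parsed from the json file on disk.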
+ """ self.filename = Path(filename) - logger.debug('Loading "{}"'.format(filename)) + + if cached: + ret = self._cache().get(Path(filename).as_posix()) + if ret: + logger.debug(f'Cached: "{filename}"') + return ret + + logger.debug(f'Loading "{filename}"') return utils.load_json_file(self.filename) def load(self, filename, data=None): diff --git a/hab/resolver.py b/hab/resolver.py index a65916f..29f516c 100644 --- a/hab/resolver.py +++ b/hab/resolver.py @@ -1,6 +1,5 @@ # __all__ = ["Resolver"] -import glob import logging import anytree @@ -261,20 +260,18 @@ def find_distro(self, requirement): def parse_configs(self, config_paths, forest=None): if forest is None: forest = {} - for dirname in config_paths: - for path in sorted(glob.glob(str(dirname / "*.json"))): - Config(forest, self, path, root_paths=set((dirname,))) + for dirname, path in self.site.config_paths(config_paths): + Config(forest, self, path, root_paths=set((dirname,))) return forest def parse_distros(self, distro_paths, forest=None): if forest is None: forest = {} - for dirname in distro_paths: - for path in sorted(glob.glob(str(dirname / "*" / ".hab.json"))): - try: - DistroVersion(forest, self, path, root_paths=set((dirname,))) - except _IgnoredVersionError as error: - logger.debug(str(error)) + for dirname, path in self.site.distro_paths(distro_paths): + try: + DistroVersion(forest, self, path, root_paths=set((dirname,))) + except _IgnoredVersionError as error: + logger.debug(str(error)) return forest def resolve(self, uri): diff --git a/hab/site.py b/hab/site.py index 7ca9ed9..bd2278e 100644 --- a/hab/site.py +++ b/hab/site.py @@ -3,7 +3,10 @@ from collections import UserDict from pathlib import Path, PurePosixPath, PureWindowsPath +from colorama import Fore, Style + from . import utils +from .cache import Cache from .merge_dict import MergeDict logger = logging.getLogger(__name__) @@ -45,6 +48,7 @@ def __init__(self, paths=None, platform=None): paths = os.getenv("HAB_PATHS", "").split(os.pathsep) self.paths = [Path(os.path.expandvars(p)).expanduser() for p in paths if p] + self.cache = Cache(self) self.load() @property @@ -65,14 +69,35 @@ def dump(self, verbosity=0, color=None): if color is None: color = self.get("colorize", True) + def cached_fmt(path, cached): + if not cached: + return path + if color: + return f"{path} {Fore.YELLOW}(cached){Style.RESET_ALL}" + else: + return f"{path} (cached)" + # Include the paths used to configure this site object - site_ret = utils.dump_object( - {"HAB_PATHS": [str(p) for p in self.paths]}, color=color - ) + hab_paths = [] + for path in self.paths: + if verbosity: + # Indicate if a cache file exists for each site config file. + cache_file = self.cache.site_cache_path(path) + path = cached_fmt(path, cache_file.is_file()) + hab_paths.append(str(path)) + site_ret = utils.dump_object({"HAB_PATHS": hab_paths}, color=color) # Include all of the resolved site configurations ret = [] for prop, value in self.items(): - if verbosity < 1 and isinstance(value, dict): + if verbosity and prop in ("config_paths", "distro_paths"): + cache = getattr(self.cache, prop)() + paths = [] + for dirname, _, cached in self.cache.iter_cache_paths( + prop, value, cache, include_path=False + ): + paths.append(cached_fmt(dirname, cached)) + txt = utils.dump_object(paths, label=f"{prop}: ", color=color) + elif verbosity < 1 and isinstance(value, dict): # This is too complex for most site dumps, hide the details behind # a higher verbosity setting. 
txt = utils.dump_object( @@ -259,3 +284,17 @@ def standardize_platform_path_maps(self): mapping[platform] = PureWindowsPath(mapping[platform]) else: mapping[platform] = PurePosixPath(mapping[platform]) + + def config_paths(self, config_paths): + cache = self.cache.config_paths() + for dirname, path, _ in self.cache.iter_cache_paths( + "config_paths", config_paths, cache, "*.json" + ): + yield dirname, path + + def distro_paths(self, distro_paths): + cache = self.cache.distro_paths() + for dirname, path, _ in self.cache.iter_cache_paths( + "distro_paths", distro_paths, cache, "*/.hab.json" + ): + yield dirname, path