-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add per-site cache mechanism to speed up site resolve
The slowest part of using the hab cli is the globbing of config/distro paths (especially for network paths on Windows). Individually parsing hundreds of json files is also slower than parsing a single json file containing the same data, which caching helps out with.
- Loading branch information
1 parent
d1e97eb
commit 641614a
Showing
8 changed files
with
436 additions
and
92 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,186 @@ | ||
import glob | ||
import json | ||
import logging | ||
from pathlib import Path | ||
|
||
from packaging.version import InvalidVersion | ||
|
||
from . import utils | ||
from .errors import InvalidVersionError, _IgnoredVersionError | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class Cache:
    """Used to save/restore cached data to speed up initialization of hab.

    The caches are stored per-site file as a file next to the site file using
    the same stem name (ie. by default studio.json would have a cache file
    called studio.cache). If this cache file exists it is used unless enabled
    is set to False. Cache files are useful when you have some sort of CI
    setup to ensure the cache is re-generated using `save_cache` any time you
    make changes to configs or distros that site file references.

    Properties:
        cache_template (str): The str.format template used to find the cache
            files. This template requires the kwarg `stem`.
        enabled (bool): Used to disable using of the cached data forcing a full
            glob and parse of files described by all site files.
    """

    def __init__(self, site):
        self.site = site
        # Lazily populated by `cache()`; None means "not loaded yet".
        self._cache = None
        self.enabled = True

        # Get the template filename used to find the cache files on disk
        self.cache_template = self.site.get("site_cache_file", "{stem}.cache")

    @property
    def cached_keys(self):
        """A dict of cache keys and how they should be processed.

        {Name of key to cache: ("relative file glob", class used to process)}
        """
        # Memoized on first access. The import is deferred to avoid a
        # circular import between this module and .parsers.
        try:
            return self._cached_keys
        except AttributeError:
            pass

        from .parsers import Config, DistroVersion

        self._cached_keys = {
            "config_paths": ("*.json", Config),
            "distro_paths": ("*/.hab.json", DistroVersion),
        }
        return self._cached_keys

    def cache(self, force=False):
        """Returns the cached data, loading it from disk if required.

        Args:
            force (bool, optional): Reload the cache from disk even if it was
                already loaded.

        Returns:
            dict: The merged cache contents for all of `self.site.paths`,
                including a "flat" key with the glob dirs collapsed. Always
                an empty dict when `self.enabled` is False.
        """
        if not self.enabled:
            # If caching is disabled, never attempt to load the cache
            return {}

        if self._cache is not None and not force:
            return self._cache

        self._cache = {}

        # Process caches from right to left. This makes it so the left most
        # cache_file is respected if any paths are duplicated.
        for path in reversed(self.site.paths):
            cache_file = self.site_cache_path(path)
            if cache_file.is_file():
                logger.debug(f"Site cache loading: {cache_file!s}")
                self.load_cache(cache_file)

        # Create a flattened cache removing the glob paths. setdefault guards
        # against cache files containing keys not listed in cached_keys.
        flat_cache = {key: {} for key in self.cached_keys}
        for key in self._cache:
            for values in self._cache.get(key, {}).values():
                flat_cache.setdefault(key, {}).update(values)

        self._cache["flat"] = flat_cache

        return self._cache

    def config_paths(self, flat=False):
        """Returns the cached config data.

        Args:
            flat (bool, optional): If True, return the flattened cache with
                the per-glob-dir nesting removed.
        """
        if flat:
            return self.cache().get("flat", {}).get("config_paths", {})
        return self.cache().get("config_paths", {})

    def distro_paths(self, flat=False):
        """Returns the cached distro data.

        Args:
            flat (bool, optional): If True, return the flattened cache with
                the per-glob-dir nesting removed.
        """
        if flat:
            return self.cache().get("flat", {}).get("distro_paths", {})
        return self.cache().get("distro_paths", {})

    def generate_cache(self, resolver, site_file, version=1):
        """Generate a cache file of the current state defined by this site file.

        This contains the raw values of each URI config and distro file including
        version. If this cache exists it is used instead of searching the file
        system for each path defined in config_paths or distro_paths defined in
        the provided site file. Use this method any time changes are made that
        hab needs to be aware of. Caching is enabled by the existence of this file.

        Args:
            resolver: The resolver passed to each parser class when loading
                the raw file contents.
            site_file (pathlib.Path): The site file whose config/distro paths
                are globbed and parsed.
            version (int, optional): Stored in the output under "version".

        Returns:
            dict: The data to be written to the cache file.
        """
        # Deferred to avoid a circular import between this module and .site.
        from .site import Site

        output = {"version": version}

        # read the site file to get paths to process
        temp_site = Site([site_file])

        for key, stats in self.cached_keys.items():
            glob_str, cls = stats
            # Process each glob dir defined for this site
            for dirname in temp_site.get(key, []):
                cfg_paths = output.setdefault(key, {}).setdefault(
                    dirname.as_posix(), {}
                )

                # Add each found hab config to the cache
                for path in sorted(glob.glob(str(dirname / glob_str))):
                    path = Path(path)
                    try:
                        data = cls(forest={}, resolver=resolver)._load(
                            path, cached=False
                        )
                    except (
                        InvalidVersion,
                        InvalidVersionError,
                        _IgnoredVersionError,
                    ) as error:
                        # Skip files hab is configured to ignore or that
                        # don't have a parsable version.
                        logger.debug(str(error))
                    else:
                        cfg_paths[path.as_posix()] = data

        return output

    @classmethod
    def iter_cache_paths(cls, name, paths, cache, glob_str=None, include_path=True):
        """Yields path information stored in the cache falling back to glob if
        not cached.

        Args:
            name (str): Name of the key being processed, used for log messages.
            paths (list): Directories to process.
            cache (dict): Maps each directory's posix path to its cached
                contents. Directories missing from this dict are globbed.
            glob_str (str, optional): Relative glob pattern used when a
                directory is not in the cache. If falsy, nothing is yielded
                for un-cached directories (other than the dirname itself when
                include_path is False).
            include_path (bool, optional): When False, yield a single item
                per dirname with path set to None instead of each found path.

        Yields:
            tuple: `(dirname, path, cached)` for each path found for each
                dirname. `path` is None when include_path is False. `cached`
                indicates whether the result came from the cache or a glob.
        """
        for dirname in paths:
            dn_posix = dirname.as_posix()
            cached = dn_posix in cache
            if cached:
                logger.debug(f"Using cache for {name} dir: {dn_posix}")
                found = cache[dn_posix]
            else:
                logger.debug(f"Using glob for {name} dir: {dirname}")
                # Fallback to globing the file system
                if glob_str:
                    found = sorted(glob.glob(str(dirname / glob_str)))
                else:
                    found = []
            if not include_path:
                yield dirname, None, cached
            else:
                for path in found:
                    yield dirname, path, cached

    def load_cache(self, filename):
        """For each glob dir add or replace the contents. If a previous cache
        has the same glob dir, its cache is ignored. This expects that
        load_cache is called from right to left for each path in
        `self.site.paths`.

        Args:
            filename (pathlib.Path): The cache file to load. Only keys listed
                in `self.cached_keys` are read from it.
        """
        contents = utils.load_json_file(filename)
        for key in self.cached_keys:
            if key in contents:
                self._cache.setdefault(key, {}).update(contents[key])

    def save_cache(self, resolver, site_file, version=1):
        """Generates and writes the cache for the given site file.

        Args:
            resolver: Passed through to `generate_cache`.
            site_file (pathlib.Path): The site file to generate a cache for.
            version (int, optional): Passed through to `generate_cache`.

        Returns:
            pathlib.Path: The cache file that was written.
        """
        cache_file = self.site_cache_path(site_file)
        cache = self.generate_cache(resolver, site_file, version=version)

        with cache_file.open("w") as fle:
            json.dump(cache, fle, indent=4, cls=utils.HabJsonEncoder)
        return cache_file

    def site_cache_path(self, path):
        """Returns the name of the cache file for the given site file."""
        return path.parent / self.cache_template.format(stem=path.stem)
Oops, something went wrong.