diff --git a/src/diffinsights_web/apps/contributors.py b/src/diffinsights_web/apps/contributors.py index 041a215..1ee4392 100644 --- a/src/diffinsights_web/apps/contributors.py +++ b/src/diffinsights_web/apps/contributors.py @@ -5,12 +5,15 @@ import panel as pn import diffinsights_web.utils.notifications as notifications -from diffinsights_web.datastore.timeline import TimelineDataStore, find_dataset_dir +from diffinsights_web.datastore.linesstats import LinesStatsDataStore +from diffinsights_web.datastore.timeline import TimelineDataStore +from diffinsights_web.datastore import find_dataset_dir from diffinsights_web.utils.notifications import onload_callback from diffinsights_web.views.authorsgrid import AuthorInfo, AuthorsGrid from diffinsights_web.views.dataexplorer import TimelineJSONViewer, TimelinePerspective, \ TimelineDataFrameEnum, perspective_pane from diffinsights_web.views.info import ContributorsHeader, RepoPlotHeader, ContributionsPercHeader +from diffinsights_web.views.plots.sankey import SankeyPlot from diffinsights_web.views.plots.timeseries import TimeseriesPlot from diffinsights_web.widgets.caching import ClearCacheButton @@ -27,33 +30,43 @@ pn.state.onload(onload_callback) dataset_dir = find_dataset_dir() -data_store = TimelineDataStore(dataset_dir=dataset_dir) +timeline_data_store = TimelineDataStore(dataset_dir=dataset_dir) +lines_stats_data_store = LinesStatsDataStore( + dataset_dir=dataset_dir, + timeseries_file=timeline_data_store.select_file_widget, + repo_name=timeline_data_store.select_repo_widget, +) page_header = ContributorsHeader( - repo=data_store.select_repo_widget, - freq=data_store.resample_frequency_widget, - end_date=data_store.timeline_max_date_rx, + repo=timeline_data_store.select_repo_widget, + freq=timeline_data_store.resample_frequency_widget, + end_date=timeline_data_store.timeline_max_date_rx, +) +sankey_plot = SankeyPlot( + data_store=lines_stats_data_store, + from_date_str=page_header.select_period_from_widget, ) timeseries_plot = TimeseriesPlot( - data_store=data_store, + data_store=timeline_data_store, column_name=page_header.select_contribution_type_widget, from_date_str=page_header.select_period_from_widget, + sankey_plot=sankey_plot, ) timeseries_plot_header = RepoPlotHeader( - freq=data_store.resample_frequency_widget, + freq=timeline_data_store.resample_frequency_widget, column_name=page_header.select_contribution_type_widget, plot=timeseries_plot, ) contributions_perc_header = ContributionsPercHeader( - data_store=data_store, + data_store=timeline_data_store, from_date_str=page_header.select_period_from_widget, ) authors_info_panel = AuthorInfo( - data_store=data_store, + data_store=timeline_data_store, authors_info_df=timeseries_plot.authors_info_df_rx, ) authors_grid = AuthorsGrid( - data_store=data_store, + data_store=timeline_data_store, main_plot=timeseries_plot, authors_info_df=timeseries_plot.authors_info_df_rx, top_n=authors_info_panel.top_n_widget, @@ -65,7 +78,7 @@ title="Contributors Graph", # TODO: make title dynamic favicon="favicon.svg", sidebar=[ - data_store, + timeline_data_store, *authors_info_panel.widgets(), pn.layout.Divider(), # - - - - - - - - - - - - - @@ -88,11 +101,11 @@ authors_grid, ], ) -timeline_perspective = TimelinePerspective(data_store=data_store) +timeline_perspective = TimelinePerspective(data_store=timeline_data_store) template.main.extend([ pn.layout.Divider(), pn.Tabs( - ('JSON', TimelineJSONViewer(data_store=data_store)), + ('JSON', TimelineJSONViewer(data_store=timeline_data_store)), ('data', 
timeline_perspective.panel(TimelineDataFrameEnum.TIMELINE_DATA)), ('resampled', timeline_perspective.panel(TimelineDataFrameEnum.RESAMPLED_DATA)), ('by author+resampled', timeline_perspective.panel(TimelineDataFrameEnum.BY_AUTHOR_DATA)), @@ -101,7 +114,7 @@ perspective_pane( df=timeseries_plot.authors_info_df_rx, title=pn.rx("Authors info for repo={repo!r}, from={from_date!r}") \ - .format(repo=data_store.select_repo_widget, + .format(repo=timeline_data_store.select_repo_widget, from_date=page_header.select_period_from_widget) ) ), diff --git a/src/diffinsights_web/datastore/__init__.py b/src/diffinsights_web/datastore/__init__.py index e69de29..00ec0fd 100644 --- a/src/diffinsights_web/datastore/__init__.py +++ b/src/diffinsights_web/datastore/__init__.py @@ -0,0 +1,17 @@ +from pathlib import Path +from typing import Optional + +import panel as pn + +DATASET_DIR = 'data/examples/stats' + + +@pn.cache +def find_dataset_dir() -> Optional[Path]: + for TOP_DIR in ['', '..', '../..']: + full_dir = Path(TOP_DIR).joinpath(DATASET_DIR) + + if full_dir.is_dir(): + return full_dir + + return None diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py new file mode 100644 index 0000000..d37a6f8 --- /dev/null +++ b/src/diffinsights_web/datastore/linesstats.py @@ -0,0 +1,336 @@ +import json +from collections import Counter, defaultdict +from collections.abc import Container, Iterable +from pathlib import Path, PurePosixPath +from typing import Union, Optional + +import panel as pn +import param + + +def get_lines_stats_data(dataset_dir: str, timeseries_file: str) -> Optional[dict]: + timeseries_file_path = Path(timeseries_file) + if not timeseries_file_path.is_absolute(): + timeseries_file_path = Path(dataset_dir).joinpath(timeseries_file) + + dataset_dir = timeseries_file_path.parent + lines_stats_file = timeseries_file_path.name.replace('.timeline.', '.lines-stats.') + file_path = dataset_dir.joinpath(lines_stats_file) + + if file_path.is_file(): + with open(file_path, mode='r') as json_fp: + return json.load(json_fp) + else: + return None + + +def count_file_x_line_in_lines_stats(lines_stats_data: Optional[dict], + repo_name: str, + change_type: str = "+/-", + prefix: str = 'type.') -> Optional[Counter]: + #print(f"count_file_line_in_lines_stats(..., {repo_name=}, {change_type=}, {prefix=})") + if lines_stats_data is None: + return None + + result = Counter() + + for dataset, dataset_data in lines_stats_data.items(): + for bug_or_repo, lines_data in dataset_data.items(): + if bug_or_repo != repo_name: + #print(f" - skipping: {bug_or_repo!r} != {repo_name!r}") + continue + + for patch_file, patch_data in lines_data.items(): + for file_name, file_data in patch_data.items(): + if change_type not in file_data: + continue + + for line_info, n_lines in file_data[change_type].items(): + if not line_info.startswith(prefix): + continue + + result[(file_name, line_info)] += n_lines + + return result + + +def sorted_changed_files(lines_stats_counter: Optional[Counter]) -> Optional[list[str]]: + if lines_stats_counter is None: + return None + + counts = Counter() + for kv, n_lines in lines_stats_counter.items(): + file_name = kv[0] + counts[file_name] += n_lines + + return [elem[0] for elem in counts.most_common()] + + +def limit_count_to_selected_files(lines_stats_counter: Counter, + files: Union[Container[str], Iterable[str]]) -> Counter: + return Counter({ + kv: n_lines for kv, n_lines in lines_stats_counter.items() + if kv[0] in files + }) + + +def 
sankey_triples_from_counter(data_counter: Counter) -> list[tuple[str, str, int]]: + return [(p[0], p[1], v) for p, v in data_counter.items()] + + +def sankey_counter_from_triples(data_list: list[tuple[str, str, int]]) -> Counter: + return Counter({(p_f, p_t): v for p_f, p_t, v in data_list}) + + +def path_to_dirs_only_counter(data_counter: Counter) -> Counter: + result = Counter() + + for (p, l), v in data_counter.items(): + #print(f"{p} ={v}=> {l}") + p_path = PurePosixPath(p) + result[(str(p_path.parent), l)] += v + for p_f, p_t in zip(p_path.parent.parents, p_path.parents): + #print(f"- ({p_f}, {p_t})") + result[(str(p_f), str(p_t))] += v + + return result + + +def add_dashdash_dirs_to_counter(data_counter: Counter) -> Counter: + res = data_counter.copy() + + xsankey_data_sets = { + 'dir-to-dir': set(), + 'dir-to-line': set(), + } + #xsankey_data_cntr = Counter() + xsankey_data_line = defaultdict(set) + + for (p_f, p_t), v in data_counter.items(): + if p_t.startswith('type.'): + xsankey_data_sets['dir-to-line'].add(p_f) + #xsankey_data_cntr[p_f] += v + xsankey_data_line[p_f].add(p_t) + else: + xsankey_data_sets['dir-to-dir'].add(p_f) + + xsankey_data_sets['intersection'] = xsankey_data_sets['dir-to-dir'] & xsankey_data_sets['dir-to-line'] + + #xsankey_data_extracted = {k: v for k, v in xsankey_data_cntr.items() if k in xsankey_data_sets['intersection']} + + for d in xsankey_data_sets['intersection']: + #print(f"{d!r}:") + for l in xsankey_data_line[d]: + #print(f" {l!r}") + res[(f"__{d}__", l)] = res[(d, l)] + res[(d, f"__{d}__")] += res[(d, l)] + del res[(d, l)] + + return res + + +def reduce_sankey_from_tail(data_counter: Counter) -> Counter: + res = data_counter.copy() + + #print("reduce_sankey_from_tail():") + + max_level = 0 + for (p_f, _) in data_counter.keys(): + n_dashes = p_f.count('/') + if n_dashes > max_level: + max_level = n_dashes + + #print(f" {max_level=}") + + to_delete = lambda x: x.count('/') == max_level + can_delete = True + + helper_info = { + 'delete-contents': defaultdict(dict), + 'to-prev': {} + } + + # sanity check + for k, v in data_counter.items(): + (p_f, p_t) = k + if to_delete(p_f): + if not p_t.startswith('type.'): + #print(f" {p_f!r} is not final: {p_f!r} =[{v}]=> {p_t!r}") + can_delete = False + else: + helper_info['delete-contents'][p_f][p_t] = v + + if to_delete(p_t): + helper_info['to-prev'][p_t] = p_f + + #print(f" {can_delete=}") + + if can_delete: + to_prev_dict = {} + for p_t, p_f in helper_info['to-prev'].items(): + if (p_f, f"__{p_f}__") in data_counter: + #print(f"({p_f}, __{p_f}__): {xsankey_cntr_5[(p_f, f'__{p_f}__')]}") + to_prev_dict[f"__{p_f}__"] = p_f + + #print(f" extra 'to-prev':{len(to_prev_dict)}") + helper_info['to-prev'] |= to_prev_dict + + for k, v in data_counter.items(): + (p_f, p_t) = k + if (p_f in helper_info['to-prev'] and + p_t.startswith('type.')): + helper_info['delete-contents'][p_f][p_t] = v + + for k, v in data_counter.items(): # we are changing res + (p_f, p_t) = k + if p_t in helper_info['to-prev'] and p_f == helper_info['to-prev'][p_t]: + #print(f"({p_f}, {p_t}): {v})") + for kk, vv in helper_info['delete-contents'][p_t].items(): + res[(p_f, kk)] += vv + #print(f" ({p_f}, {kk}) += {vv} => {res[(p_f, kk)]}") + del res[(p_f, p_t)] + if p_f in helper_info['to-prev']: + del res[(p_f, p_t)] + + return res + + +def reduce_sankey_thin_out(data_counter: Counter, + threshold_ratio: float = 0.005) -> Counter: + #print("reduce_sankey_thin_out():") + # TODO: use threshold on max value, not on sum of values + + total_lines = 0 + 
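# sum the values on edges leaving the root node '.'; every changed line is
+    # attributed to exactly one such edge, giving the total the threshold is relative to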
for (p_f, p_t), v in data_counter.items(): + if p_f != '.': + continue + total_lines += v + + #print(f" {total_lines=}") + #print(f" threshold={threshold_ratio}*{total_lines}={threshold_ratio * total_lines}") + + data_info = { + 'to-remove': set() + } + + for (p_f, p_t), v in data_counter.items(): + if v < threshold_ratio * total_lines: + #print(f" - ({p_f}, {p_t}): {v} {'*' if p_t.startswith('type.') else ' '}") + data_info['to-remove'].add(p_f) + + data_info |= { + 'delete-contents': defaultdict(dict), + 'to-prev': {}, + 'can-remove': set(), + } + + #print(" gathering data:") + + for (p_f, p_t), v in data_counter.items(): + # want to remove, and can remove + if p_f in data_info['to-remove'] and p_t.startswith('type.'): + #print(f" - saving data for ({p_f}, {p_t}): {v}") + data_info['delete-contents'][p_f][p_t] = v + + for (p_f, p_t), v in data_counter.items(): + if p_t in data_info['to-remove'] and p_t in data_info['delete-contents']: + data_info['to-prev'][p_t] = p_f + + total_width = 0 + for v in data_info['delete-contents'][p_t].values(): + total_width += v + if total_width < threshold_ratio * total_lines: + if f"__{p_f}__" == p_t: + #print(f" ! ({p_f}) -> ({p_t}) -> {data_info['delete-contents'][p_t]}") + pass + elif p_f == ".": + #print(f" # ({p_f}) -> ({p_t}) -> {data_info['delete-contents'][p_t]}") + pass + else: + #print(f" + ({p_f}) => ({p_t}) => {data_info['delete-contents'][p_t]}") + data_info['can-remove'].add(p_t) + else: + #print(f" - ({p_f}) -> ({p_t}) -> {data_info['delete-contents'][p_t]}") + pass + + ## ------------------------------------------------------- + ## actual removal + res = data_counter.copy() + + #print(" deleting/compressing:") + for k, v in data_counter.items(): # we are changing res + (p_f, p_t) = k + if p_t in data_info['can-remove']: + if p_t in data_info['to-prev'] and p_f == data_info['to-prev'][p_t]: + #print(f" - ({p_f}, {p_t}): {v})") + for kk, vv in data_info['delete-contents'][p_t].items(): + res[(p_f, kk)] += vv + #print(f" ({p_f}, {kk}) += {vv} => {res[(p_f, kk)]}") + del res[(p_f, p_t)] + + if p_f in data_info['can-remove']: + if p_f in data_info['to-prev']: + del res[(p_f, p_t)] + + return res + + +def process_sankey(lines_stats_counter: Optional[Counter], + max_files: Optional[int] = None, + threshold: float = 0.0): + if lines_stats_counter is None: + return None + + changed_files = sorted_changed_files(lines_stats_counter=lines_stats_counter) + if max_files is not None: + lines_stats_counter = limit_count_to_selected_files( + lines_stats_counter=lines_stats_counter, + files=changed_files[:max_files] + ) + + sankey_counter = path_to_dirs_only_counter(lines_stats_counter) + sankey_counter = add_dashdash_dirs_to_counter(sankey_counter) + if 0.0 < threshold < 1.0: + sankey_counter = reduce_sankey_thin_out(sankey_counter, threshold_ratio=threshold) + + sankey_triples = sankey_triples_from_counter(sankey_counter) + + return sankey_triples + + +class LinesStatsDataStore(pn.viewable.Viewer): + dataset_dir = param.Foldername( + constant=True, + doc="Dataset directory with *.lines-stats.*.json files " + "(used if `timeseries_file_path` is relative path)", + ) + timeseries_file = param.String( + allow_refs=True, # to allow widgets and reactive expressions + doc="Selected JSON file with timeline data to find lines-stats companion for" + ) + repo_name = param.String( + allow_refs=True, # allow for reactive expressions, and widgets + doc="Name of the repository, for selecting data", + ) + + def __init__(self, **params): + super().__init__(**params) + + 
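# NOTE: reactive chain: (timeseries_file, repo_name) -> lines-stats JSON
+        # -> per-(file, line type) counter -> Sankey triples;
+        # changing either ref/widget recomputes everything downstream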
self.lines_stats_data_rx = pn.rx(get_lines_stats_data)( + dataset_dir=self.dataset_dir, # does not change, no need for rx + timeseries_file=self.param.timeseries_file.rx(), + ) + self.lines_stats_counter_rx = pn.rx(count_file_x_line_in_lines_stats)( + lines_stats_data=self.lines_stats_data_rx, + repo_name=self.param.repo_name.rx(), + ) + + self.num_files_widget = pn.widgets.Select( + name="top N files", + options=[10,100,None], + value=100, + ) + self.sankey_data_rx = pn.rx(process_sankey)( + lines_stats_counter=self.lines_stats_counter_rx, + max_files=self.num_files_widget, + ) diff --git a/src/diffinsights_web/datastore/timeline.py b/src/diffinsights_web/datastore/timeline.py index bed54de..5c234bf 100644 --- a/src/diffinsights_web/datastore/timeline.py +++ b/src/diffinsights_web/datastore/timeline.py @@ -10,20 +10,6 @@ from diffinsights_web.utils.notifications import warning_notification -DATASET_DIR = 'data/examples/stats' - - -@pn.cache -def find_dataset_dir() -> Optional[Path]: - for TOP_DIR in ['', '..', '../..']: - full_dir = Path(TOP_DIR).joinpath(DATASET_DIR) - - if full_dir.is_dir(): - return full_dir - - return None - - @pn.cache def find_timeline_files(dataset_dir: Union[Path, str, param.Path, None]) -> dict[str, str]: if dataset_dir is None: diff --git a/src/diffinsights_web/views/__init__.py b/src/diffinsights_web/views/__init__.py index c0ce42a..e4b71ab 100644 --- a/src/diffinsights_web/views/__init__.py +++ b/src/diffinsights_web/views/__init__.py @@ -17,6 +17,7 @@ def __init__(self, **params): class SpecialColumnEnum(Enum): LINE_TYPES_PERC = "timeline|KIND [%]" LINE_TYPES_PERC_HEATMAP = "heatmap|±KIND [%]" + SANKEY_DIAGRAM = "sankey|SANKEY" NO_PLOT = "" @@ -31,6 +32,7 @@ class SpecialColumnEnum(Enum): # special cases: "Line types distribution [%]": SpecialColumnEnum.LINE_TYPES_PERC.value, "Line types heatmap ±[%]": SpecialColumnEnum.LINE_TYPES_PERC_HEATMAP.value, + "Flow from path to line type": SpecialColumnEnum.SANKEY_DIAGRAM.value, "No plot": SpecialColumnEnum.NO_PLOT.value # this special value should be last } column_to_contribution = { diff --git a/src/diffinsights_web/views/info.py b/src/diffinsights_web/views/info.py index b5ecf35..db7782a 100644 --- a/src/diffinsights_web/views/info.py +++ b/src/diffinsights_web/views/info.py @@ -98,10 +98,18 @@ def __init__(self, **params): name="Contributions:", options=contribution_types_map, value="timeline|n_commits", # first value in contribution_types_map + # NOTE: disabled_options does not seem to work, no disabling (???) + # therefore there is no code that does disabling and enabling of this + #disabled_options=[ + # SpecialColumnEnum.SANKEY_DIAGRAM.value, # need .lines-stats.purpose-to-type.json + #], # style width=200, margin=(self.widget_top_margin, 0), # last widget, use x margin of 0 ) + #print(f"{self.select_contribution_type_widget.value=}") + #print(f"{self.select_contribution_type_widget.options=}") + #print(f"{self.select_contribution_type_widget.disabled_options=}") def update_period_selector(self, new_value: datetime.datetime) -> None: #print(f"ContributorsHeader.update_period_from_selector({new_value=})") @@ -126,7 +134,17 @@ def sampling_info(resample_freq: str, if '|' in column: plot_type, _ = column.split('|', maxsplit=2) - if plot_type not in {"timeline", "heatmap"}: + if plot_type == "sankey": + # Sankey diagrams do not use resampling + return f""" +
        <p><strong>Distribution of changed line types based on the directory structure</strong></p>
+        <p>
+        Using commits
+        from {html_date_humane(min_max_date[0])}
+        to {html_date_humane(min_max_date[1])}
+        </p>
+ """ + + elif plot_type not in {"timeline", "heatmap"}: print(f"sampling_info(): got unexpected plot type of {plot_type!r}") return f"No support for {plot_type} plot type, for plotting {column!r}" diff --git a/src/diffinsights_web/views/plots/sankey.py b/src/diffinsights_web/views/plots/sankey.py new file mode 100644 index 0000000..2e0f86c --- /dev/null +++ b/src/diffinsights_web/views/plots/sankey.py @@ -0,0 +1,48 @@ +from pathlib import Path +from typing import Optional + +import holoviews as hv +import panel as pn +import param + +from diffinsights_web.datastore.linesstats import LinesStatsDataStore + + +def sankey_plot_from_triples(sankey_data: list[tuple[str, str, int]], + width: int = 800, + height: int = 400) -> hv.Sankey: + return hv.Sankey(sankey_data).opts( + edge_color_index=1, + width=width, + height=height, + ) + + +def plot_sankey(sankey_data: Optional[list[tuple[str, str, int]]], + timeseries_file: str, + width: int = 800, + height: int = 400): + if isinstance(sankey_data, param.rx): + sankey_data = sankey_data.rx.value + + if sankey_data is None or len(sankey_data) == 0: + return pn.pane.HTML( + "
<p>No data needed to create a Sankey diagram was found for "
+            f"{Path(timeseries_file).name!r}</p>
") + else: + #print(f"plot_sankey(): {type(sankey_data)=}") + return sankey_plot_from_triples(sankey_data, width, height) + + +class SankeyPlot(pn.viewable.Viewer): + data_store = param.ClassSelector(class_=LinesStatsDataStore) + # allow_refs=True is here to allow widgets + from_date_str = param.String(allow_refs=True) # TODO: implement support for it + + def __init__(self, **params): + super().__init__(**params) + + self.plot_sankey_rx = pn.rx(plot_sankey)( + sankey_data=self.data_store.sankey_data_rx, + timeseries_file=self.data_store.param.timeseries_file.rx(), + ) diff --git a/src/diffinsights_web/views/plots/timeseries.py b/src/diffinsights_web/views/plots/timeseries.py index 78f9678..90869e1 100644 --- a/src/diffinsights_web/views/plots/timeseries.py +++ b/src/diffinsights_web/views/plots/timeseries.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Optional import holoviews as hv @@ -14,6 +15,7 @@ get_date_range, get_value_range, filter_df_by_from_date, authors_info_df, author_timeline_df_freq from diffinsights_web.utils.notifications import warning_notification from diffinsights_web.views import TimelineView, SpecialColumnEnum, column_to_contribution +from diffinsights_web.views.plots.sankey import SankeyPlot def line_type_sorting_key(column_name: str) -> int: @@ -257,6 +259,11 @@ class TimeseriesPlot(TimelineView): # allow_refs=True is here to allow widgets column_name = param.String(allow_refs=True) from_date_str = param.String(allow_refs=True) + sankey_plot = param.ClassSelector( + default=None, + allow_None=True, + class_=SankeyPlot, + ) def __init__(self, **params): super().__init__(**params) @@ -288,12 +295,16 @@ def __init__(self, **params): from_date_str=self.param.from_date_str.rx(), ) + plot_widgets = { + 'timeline': self.plot_commits_rx, + 'heatmap': self.plot_heatmap_rx, + } + if self.sankey_plot is not None: + plot_widgets['sankey'] = self.sankey_plot.plot_sankey_rx + self.select_plot_rx = pn.rx(self.select_plot)( column=self.param.column_name.rx(), - plot_widgets={ - 'timeline': self.plot_commits_rx, - 'heatmap': self.plot_heatmap_rx, - }, + plot_widgets=plot_widgets, ) self.select_plot_theme_widget = pn.widgets.Select( @@ -355,6 +366,26 @@ def select_plot(self, column: str, plot_widgets: dict, height: int = 350): #print(f"TimeseriesPlot.select_plot({column=}, ...): returning error message") return pn.pane.HTML(f"Unknown plot type {plot_type}") + @param.depends('data_store.select_file_widget.param', watch=True, on_init=True) + def check_is_sankey_possible(self): + pathname = self.data_store.select_file_widget.value + #print(f"check_is_sankey_possible(): {pathname=},") + + stem = Path(pathname) + while stem.suffix in ['.timeline', '']: + stem = stem.with_suffix('') + #print(f" {stem=}") + + checked_file = stem.with_suffix('.lines-stats.purpose-to-type.json') + result = checked_file.is_file() + #print(f" {checked_file=}, {result=}") + #print(f" {self.param.column_name=}") + #print(f" {self.column_name=}") + #if result: + # print(f" can have sankey ({result=})") + + return result + def __panel__(self) -> pn.viewable.Viewable: if self.column_name == SpecialColumnEnum.NO_PLOT.value: return pn.Spacer(height=0) @@ -385,11 +416,17 @@ def __init__(self, **params): xlim=self.main_plot.date_range_rx, ylim=self.main_plot.value_range_rx, # TODO: allow to switch between totals, max N, and own ) + self.plot_heatmap_rx = pn.rx(plot_heatmap)( + resampled_df = self.resampled_df_rx, + from_date_str = self.main_plot.param.from_date_str.rx(), + figsize = (8, 3.75), + ) 
self.select_plot_rx = pn.rx(self.main_plot.select_plot)( column=self.main_plot.param.column_name.rx(), plot_widgets={ 'timeline': self.plot_commits_rx, + 'heatmap': self.plot_heatmap_rx, }, height=256, ) diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py new file mode 100644 index 0000000..7e540d2 --- /dev/null +++ b/tests/test_datastore_linesstats.py @@ -0,0 +1,150 @@ +import pytest + +hv = pytest.importorskip("holoviews") +param = pytest.importorskip("param") +pn = pytest.importorskip("panel") + +from diffinsights_web.datastore import find_dataset_dir +from diffinsights_web.datastore.timeline import TimelineDataStore +from diffinsights_web.datastore.linesstats import LinesStatsDataStore, sorted_changed_files, \ + limit_count_to_selected_files, path_to_dirs_only_counter, reduce_sankey_from_tail, reduce_sankey_thin_out + + +def test_timeseries_file_no_such_file(): + data_store = LinesStatsDataStore( + name='test_repo_name_no_such_file', + dataset_dir=find_dataset_dir(), + timeseries_file='does-not-exist', + repo_name='repo-does-not-exist', + ) + + # DEBUG + #print(f"{data_store=}") + #print(f"{data_store.lines_stats_data_rx=}") + #print(f"{data_store.lines_stats_data_rx._obj=}") + #print(f"{data_store.lines_stats_data_rx._operation=}") + #print(f"{data_store.lines_stats_data_rx.rx.value=}") + + actual = data_store.lines_stats_data_rx.rx.value + assert actual is None, \ + "LinesDataStore returns None for data if lines-stats file does not exist" + + actual = data_store.lines_stats_counter_rx.rx.value + assert actual is None, \ + "LinesDataStore returns None for counter if lines-stats file does not exist" + + actual = data_store.sankey_data_rx.rx.value + assert actual is None, \ + "LinesDataStore returns None for Sankey data if lines-stats file does not exist" + + +def test_timeseries_file_from_widget_default_value(): + dataset_dir = find_dataset_dir() + data_store = TimelineDataStore(dataset_dir=dataset_dir) + + lines_stats = LinesStatsDataStore( + dataset_dir='.', # should be ignored, not tested + timeseries_file=data_store.select_file_widget, + repo_name=data_store.select_repo_widget, + ) + + actual = lines_stats.lines_stats_data_rx.rx.value + assert actual is None or isinstance(actual, dict), \ + "No crashes, returned something for value from widget" + + +def test_timeseries_file_hellogitworld(): + lines_stats = LinesStatsDataStore( + dataset_dir='data/examples/stats', # directory part, relative to top directory of project + timeseries_file='hellogitworld.timeline.purpose-to-type.json', # filename part + repo_name='hellogitworld', + ) + actual = lines_stats.lines_stats_data_rx.rx.value + + assert isinstance(actual, dict), \ + "correctly found lines-stats file, retrieved data, did not return None" + assert 'data/examples/annotations/hellogitworld' in actual, \ + "hellogitworld lines-stats file came from hellogitworld annotations" + assert 'hellogitworld' in actual['data/examples/annotations/hellogitworld'], \ + "data nicknamed 'hellogitworld' in hellogitworld lines-stats file" + + data = actual['data/examples/annotations/hellogitworld']['hellogitworld'] + assert len(data.keys()) > 0, \ + "there is data from multiple files with annotation data" + + actual = lines_stats.lines_stats_counter_rx.rx.value + #print(f"{len(actual)=}") + #print(f"{actual.keys()=}") + assert ('README.txt', 'type.documentation') in actual, \ + "there were changes marked as documentation lines to 'README.txt' file" + assert actual[('README.txt', 'type.documentation')] > 0, \ + "there 
was a non-zero number of changes marked as documentation to 'README.txt' file"
+    assert ('README.txt', 'type.code') not in actual, \
+        "there were no changes marked as code lines to 'README.txt' file"
+
+    actual = sorted_changed_files(lines_stats.lines_stats_counter_rx.rx.value)
+    assert actual[0] == 'src/Main.groovy', \
+        "file with most changes was 'src/Main.groovy'"
+
+    selected_files = actual[:3]
+    actual = limit_count_to_selected_files(
+        lines_stats_counter=lines_stats.lines_stats_counter_rx.rx.value,
+        files=selected_files,
+    )
+    assert len(actual) >= len(selected_files), \
+        "at least one counter entry for each selected file"
+
+    counter_limited = actual
+    actual = sorted_changed_files(counter_limited)
+    assert actual == selected_files, \
+        "after filtering, the list of files equals the filter list (here the filter list came from the counter)"
+
+    actual = path_to_dirs_only_counter(counter_limited)
+    assert ('.', 'src') in actual, \
+        "path from the top dir to the 'src' subdirectory is present"
+    assert ('src', 'type.code') in actual, \
+        "code line contributions to 'src/Main.groovy' were aggregated into 'src' contributions"
+
+    starting_counter = actual
+    actual = reduce_sankey_from_tail(starting_counter)
+    assert len(actual) < len(starting_counter), \
+        "removed at least one node from the Sankey diagram"
+    # TODO: check that it removed only the last level
+
+    actual = reduce_sankey_thin_out(starting_counter, threshold_ratio=0.5)
+    assert len(actual) < len(starting_counter), \
+        "removed at least one node from the Sankey diagram"
+    # TODO: add more checks
+
+    actual = lines_stats.sankey_data_rx.rx.value
+    assert len(actual) > 0, \
+        "there is data to create the Sankey diagram from"
+
+
+def test_switch_repos_same_file():
+    lines_stats = LinesStatsDataStore(
+        dataset_dir='data/examples/stats',  # directory part, relative to top directory of project
+        timeseries_file='hellogitworld.timeline.purpose-to-type.json',  # filename part
+        repo_name='hellogitworld',
+    )
+    actual = lines_stats.lines_stats_data_rx.rx.value
+
+    assert isinstance(actual, dict), \
+        "correctly found lines-stats file, retrieved data, did not return None"
+
+    lines_stats.timeseries_file = 'does-not-exist-directly'
+    actual = lines_stats.lines_stats_data_rx.rx.value
+    assert actual is None, \
+        "switching to a non-existent file clears retrieved data, makes it None"
+
+    actual = lines_stats.lines_stats_counter_rx.rx.value
+    assert actual is None, \
+        "switching to a non-existent file clears the stats counter, makes it None"
+
+    actual = lines_stats.sankey_data_rx.rx.value
+    assert actual is None, \
+        "switching to a non-existent file clears the computed Sankey data, makes it None"
+
+
+# TODO: add test for sankey_triples_from_counter() and sankey_counter_from_triples()
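+
+
+# A first step toward the TODO above: a minimal sketch of a round-trip check,
+# using only helpers defined in linesstats.py; the Counter below is made-up
+# example data, not taken from any dataset file.
+def test_sankey_triples_counter_round_trip():
+    from collections import Counter
+
+    from diffinsights_web.datastore.linesstats import sankey_counter_from_triples, \
+        sankey_triples_from_counter
+
+    counter = Counter({
+        ('.', 'src'): 12,
+        ('src', 'type.code'): 10,
+        ('src', 'type.documentation'): 2,
+    })
+    triples = sankey_triples_from_counter(counter)
+
+    assert set(triples) == {('.', 'src', 12), ('src', 'type.code', 10), ('src', 'type.documentation', 2)}, \
+        "counter entries turn into (from, to, value) triples"
+    assert sankey_counter_from_triples(triples) == counter, \
+        "converting a counter to triples and back yields the original counter"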