From a3cf773fa7ba89cc9440f267680ded632cb50e3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Thu, 5 Dec 2024 18:04:01 +0100
Subject: [PATCH 01/29] diffinsights_web: Prepare for introducing Sankey
 diagram

A Sankey diagram is a visualisation technique for displaying
flows: https://www.data-to-viz.com/graph/sankey.html

In the case of PatchScope it would be flow through the directory
structure to the type of changed line at the end.

This commit just reserves space for Sankey diagram in "Contributions:"
widget, though it possibly gets a better name.  There is also
(temporarily) protection added to not crash when it is selected
because it is not implemented yet.

It should be, for the time being, a disabled option, but for some reason
(bug in Panel) it is not disabled, and user is able to select it (!).

https://panel.holoviz.org/reference/widgets/Select.html#core

> - `disabled_options` (list): Optional list of `options` that are disabled,
>    i.e. unusable and un-clickable. If `options` is a dictionary the list
>    items must be dictionary values.
---
 src/diffinsights_web/views/__init__.py         | 2 ++
 src/diffinsights_web/views/info.py             | 4 ++++
 src/diffinsights_web/views/plots/timeseries.py | 5 +++++
 3 files changed, 11 insertions(+)
diff --git a/src/diffinsights_web/views/__init__.py b/src/diffinsights_web/views/__init__.py
index f5c967b..6b928cc 100644
--- a/src/diffinsights_web/views/__init__.py
+++ b/src/diffinsights_web/views/__init__.py
@@ -16,6 +16,7 @@ def __init__(self, **params):
 
 class SpecialColumnEnum(Enum):
     LINE_TYPES_PERC = "timeline|KIND [%]"
+    SANKEY_DIAGRAM = "sankey|SANKEY"
     NO_PLOT = "<NO PLOT>"
 
 
@@ -29,6 +30,7 @@ class SpecialColumnEnum(Enum):
     "Patch spreading (lines)": "timeline|diff.groups_spread",
     # special cases:
     "Line types distribution [%]": SpecialColumnEnum.LINE_TYPES_PERC.value,
+    "Flow from path to line type": SpecialColumnEnum.SANKEY_DIAGRAM.value,
     "No plot": SpecialColumnEnum.NO_PLOT.value  # this special value should be last
 }
 column_to_contribution = {
diff --git a/src/diffinsights_web/views/info.py b/src/diffinsights_web/views/info.py
index dcfe4da..9bbd2d3 100644
--- a/src/diffinsights_web/views/info.py
+++ b/src/diffinsights_web/views/info.py
@@ -98,6 +98,10 @@ def __init__(self, **params):
             name="Contributions:",
             options=contribution_types_map,
             value="timeline|n_commits",  # first value in contribution_types_map
+            # NOTE: disabled_options does not seem to work, no disabling (???)
+            disabled_options=[
+                SpecialColumnEnum.SANKEY_DIAGRAM.value,  # need <name>.lines-stats.purpose-to-type.json
+            ],
             # style
             width=200,
             margin=(self.widget_top_margin, 0),  # last widget, use x margin of 0
diff --git a/src/diffinsights_web/views/plots/timeseries.py b/src/diffinsights_web/views/plots/timeseries.py
index 27325fa..2ed0439 100644
--- a/src/diffinsights_web/views/plots/timeseries.py
+++ b/src/diffinsights_web/views/plots/timeseries.py
@@ -48,6 +48,11 @@ def plot_commits(resampled_df: pd.DataFrame,
     if column == SpecialColumnEnum.NO_PLOT.value:
         return
 
+    # TODO: temporary
+    if column == SpecialColumnEnum.SANKEY_DIAGRAM.value:
+        warning_notification('Sankey diagram not implemented yet')
+        return
+
     filtered_df = filter_df_by_from_date(resampled_df, from_date_str)
 
     hvplot_kwargs = {}

From 45cc85b4f7f63a38481bb1fab041c0ee9de80bd4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Thu, 5 Dec 2024 18:31:42 +0100
Subject: [PATCH 02/29] diffinsights_web: Introduce check_is_sankey_possible()
 method

This method in the TimeseriesPlot class (though I am a bit unsure if it
is the correct place for it) is intended to be run automatically via the
@param.depends decorator, and manually if needed, to check whether
creating a Sankey plot is possible - that is, whether we have the data
(which needs a different file than the '*.timeline.*.json' one).

It was planned that this method would enable or disable the appropriate
selection option in "Contributions:", but it turns out that accessing
the widget is more complex than it looks at first glance.
---
 src/diffinsights_web/views/info.py            |  3 +++
 .../views/plots/timeseries.py                 | 21 +++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/src/diffinsights_web/views/info.py b/src/diffinsights_web/views/info.py
index 9bbd2d3..f2ff4e6 100644
--- a/src/diffinsights_web/views/info.py
+++ b/src/diffinsights_web/views/info.py
@@ -106,6 +106,9 @@ def __init__(self, **params):
             width=200,
             margin=(self.widget_top_margin, 0),  # last widget, use x margin of 0
         )
+        #print(f"{self.select_contribution_type_widget.value=}")
+        #print(f"{self.select_contribution_type_widget.options=}")
+        #print(f"{self.select_contribution_type_widget.disabled_options=}")
 
     def update_period_selector(self, new_value: datetime.datetime) -> None:
         #print(f"ContributorsHeader.update_period_from_selector({new_value=})")
diff --git a/src/diffinsights_web/views/plots/timeseries.py b/src/diffinsights_web/views/plots/timeseries.py
index 2ed0439..34e6bef 100644
--- a/src/diffinsights_web/views/plots/timeseries.py
+++ b/src/diffinsights_web/views/plots/timeseries.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Optional
 
 import pandas as pd
@@ -293,6 +294,26 @@ def select_plot(self, column: str, plot_widgets: dict, height: int = 350):
             #print(f"TimeseriesPlot.select_plot({column=}, ...): returning error message")
             return pn.pane.HTML(f"Unknown plot type <strong>{plot_type}</strong>")
 
+    @param.depends('data_store.select_file_widget.param', watch=True, on_init=True)
+    def check_is_sankey_possible(self):
+        pathname = self.data_store.select_file_widget.value
+        #print(f"check_is_sankey_possible(): {pathname=},")
+
+        stem = Path(pathname)
+        while stem.suffix in ['.timeline', '']:
+            stem = stem.with_suffix('')
+        #print(f"  {stem=}")
+
+        checked_file = stem.with_suffix('.lines-stats.purpose-to-type.json')
+        result = checked_file.is_file()
+        #print(f"  {checked_file=}, {result=}")
+        #print(f"  {self.param.column_name=}")
+        #print(f"  {self.column_name=}")
+        #if result:
+        #    print(f"  can have sankey ({result=})")
+
+        return result
+
     def __panel__(self) -> pn.viewable.Viewable:
         if self.column_name == SpecialColumnEnum.NO_PLOT.value:
             return pn.Spacer(height=0)

From 2f7701d0937c3e2fd6625be125f987142c67e652 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sat, 7 Dec 2024 18:28:56 +0100
Subject: [PATCH 03/29] diffinsights_web: Move find_dataset_dir() to
 datastore/__init__.py

This function is to be a common function for a new datastore to be
introduced, one that reads *.lines-stats{,.*}.json files.
---
 src/diffinsights_web/apps/contributors.py  |  3 ++-
 src/diffinsights_web/datastore/__init__.py | 17 +++++++++++++++++
 src/diffinsights_web/datastore/timeline.py | 14 --------------
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/src/diffinsights_web/apps/contributors.py b/src/diffinsights_web/apps/contributors.py
index 041a215..14e68ed 100644
--- a/src/diffinsights_web/apps/contributors.py
+++ b/src/diffinsights_web/apps/contributors.py
@@ -5,7 +5,8 @@
 import panel as pn
 
 import diffinsights_web.utils.notifications as notifications
-from diffinsights_web.datastore.timeline import TimelineDataStore, find_dataset_dir
+from diffinsights_web.datastore.timeline import TimelineDataStore
+from diffinsights_web.datastore import find_dataset_dir
 from diffinsights_web.utils.notifications import onload_callback
 from diffinsights_web.views.authorsgrid import AuthorInfo, AuthorsGrid
 from diffinsights_web.views.dataexplorer import TimelineJSONViewer, TimelinePerspective, \
diff --git a/src/diffinsights_web/datastore/__init__.py b/src/diffinsights_web/datastore/__init__.py
index e69de29..00ec0fd 100644
--- a/src/diffinsights_web/datastore/__init__.py
+++ b/src/diffinsights_web/datastore/__init__.py
@@ -0,0 +1,17 @@
+from pathlib import Path
+from typing import Optional
+
+import panel as pn
+
+DATASET_DIR = 'data/examples/stats'
+
+
+@pn.cache
+def find_dataset_dir() -> Optional[Path]:
+    for TOP_DIR in ['', '..', '../..']:
+        full_dir = Path(TOP_DIR).joinpath(DATASET_DIR)
+
+        if full_dir.is_dir():
+            return full_dir
+
+    return None
diff --git a/src/diffinsights_web/datastore/timeline.py b/src/diffinsights_web/datastore/timeline.py
index bed54de..5c234bf 100644
--- a/src/diffinsights_web/datastore/timeline.py
+++ b/src/diffinsights_web/datastore/timeline.py
@@ -10,20 +10,6 @@
 from diffinsights_web.utils.notifications import warning_notification
 
 
-DATASET_DIR = 'data/examples/stats'
-
-
-@pn.cache
-def find_dataset_dir() -> Optional[Path]:
-    for TOP_DIR in ['', '..', '../..']:
-        full_dir = Path(TOP_DIR).joinpath(DATASET_DIR)
-
-        if full_dir.is_dir():
-            return full_dir
-
-    return None
-
-
 @pn.cache
 def find_timeline_files(dataset_dir: Union[Path, str, param.Path, None]) -> dict[str, str]:
     if dataset_dir is None:

From 7f9ada3ffde5e3321738243bf6834566193dbbd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sat, 7 Dec 2024 20:48:07 +0100
Subject: [PATCH 04/29] diffinsights_web: Introduce datastore/linesstats.py

Currently only the data reading part is implemented.  As this datastore
is intended to be an auxiliary data store, to augment timeline data
store with data required to create Sankey diagram, it is intended to try
to find a companion data file to the one used by the timeline data
store.

Also added some basic tests, which are skipped if .[web] prerequisites
are not available.
---
 src/diffinsights_web/datastore/linesstats.py | 44 ++++++++++++++
 tests/test_datastore_linesstats.py           | 61 ++++++++++++++++++++
 2 files changed, 105 insertions(+)
 create mode 100644 src/diffinsights_web/datastore/linesstats.py
 create mode 100644 tests/test_datastore_linesstats.py

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
new file mode 100644
index 0000000..6dc8edd
--- /dev/null
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -0,0 +1,44 @@
+import json
+from pathlib import Path
+from typing import Union, Optional
+
+import panel as pn
+import param
+
+from diffinsights_web.utils.notifications import warning_notification
+
+
+def get_lines_stats_data(dataset_dir: str, timeseries_file: str) -> Optional[dict]:
+    timeseries_file_path = Path(timeseries_file)
+    if not timeseries_file_path.is_absolute():
+        timeseries_file_path = Path(dataset_dir).joinpath(timeseries_file)
+
+    dataset_dir = timeseries_file_path.parent
+    lines_stats_file = timeseries_file_path.name.replace('.timeline.', '.lines-stats.')
+    file_path = dataset_dir.joinpath(lines_stats_file)
+
+    if file_path.is_file():
+        with open(file_path, mode='r') as json_fp:
+            return json.load(json_fp)
+    else:
+        return None
+
+
+class LinesStatsDataStore(pn.viewable.Viewer):
+    dataset_dir = param.Foldername(
+        constant=True,
+        doc="Dataset directory with *.lines-stats.*.json files "
+            "(used if `timeseries_file_path` is relative path)",
+    )
+    timeseries_file = param.String(
+        allow_refs=True,  # to allow widgets and reactive expressions
+        doc="Selected JSON file with timeline data to find lines-stats companion for"
+    )
+
+    def __init__(self, **params):
+        super().__init__(**params)
+
+        self.lines_stats_data_rx = pn.rx(get_lines_stats_data)(
+            dataset_dir=self.dataset_dir,
+            timeseries_file=self.timeseries_file,
+        )
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
new file mode 100644
index 0000000..e860e97
--- /dev/null
+++ b/tests/test_datastore_linesstats.py
@@ -0,0 +1,61 @@
+import pytest
+
+from diffinsights_web.datastore import find_dataset_dir
+from diffinsights_web.datastore.timeline import TimelineDataStore
+from diffinsights_web.datastore.linesstats import LinesStatsDataStore
+
+param = pytest.importorskip("param")
+panel = pytest.importorskip("panel")
+
+
+def test_timeseries_file_no_such_file():
+    data_store = LinesStatsDataStore(
+        name='test_repo_name_no_such_file',
+        dataset_dir=find_dataset_dir(),
+        timeseries_file='does-not-exist',
+    )
+
+    # DEBUG
+    #print(f"{data_store=}")
+    #print(f"{data_store.lines_stats_data_rx=}")
+    #print(f"{data_store.lines_stats_data_rx._obj=}")
+    #print(f"{data_store.lines_stats_data_rx._operation=}")
+    #print(f"{data_store.lines_stats_data_rx.rx.value=}")
+
+    actual = data_store.lines_stats_data_rx.rx.value
+    assert actual is None, \
+        "LinesDataStore returns None if lines-stats file does not exist"
+
+
+def test_timeseries_file_from_widget_default_value():
+    dataset_dir = find_dataset_dir()
+    data_store = TimelineDataStore(dataset_dir=dataset_dir)
+
+    lines_stats = LinesStatsDataStore(
+        dataset_dir='.',  # should be ignored, not tested
+        timeseries_file=data_store.select_file_widget,
+    )
+
+    actual = lines_stats.lines_stats_data_rx.rx.value
+    assert actual is None or isinstance(actual, dict), \
+        "No crashes, returned something for value from widget"
+
+
+def test_timeseries_file_hellogitworld():
+    lines_stats = LinesStatsDataStore(
+        dataset_dir='data/examples/stats',  # directory part, relative to top directory of project
+        timeseries_file='hellogitworld.timeline.purpose-to-type.json',  #filename part
+    )
+    actual = lines_stats.lines_stats_data_rx.rx.value
+
+    assert isinstance(actual, dict), \
+        "correctly found lines-stats file, retrieved data, did not return None"
+    assert 'data/examples/annotations/hellogitworld' in actual, \
+        "hellogitworld lines-stats file came from hellogitworld annotations"
+    assert 'hellogitworld' in actual['data/examples/annotations/hellogitworld'], \
+        "data nicknamed 'hellogitworld' in hellogitworld lines-stats file"
+
+    data = actual['data/examples/annotations/hellogitworld']['hellogitworld']
+    assert len(data.keys()) > 0, \
+        "there is data from multiple files with annotation data"
+

From 5172721075c5184efeab5a9835ca44dd6b58dc9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sat, 7 Dec 2024 23:09:59 +0100
Subject: [PATCH 05/29] diffinsights_web: Add
 count_file_x_line_in_lines_stats()

This is the first step in extracting data needed to create Sankey diagram
from the lines stats data.  Code taken from "Sankey for whole timeline"
section in 'notebooks/panel/02-contributors_graph.ipynb' notebook.

The LinesStatsDataStore had to be modified to take an additional
parameter, namely `repo_name`, so that it would extract data
corresponding to the data being extracted from timeline data.
---
 src/diffinsights_web/datastore/linesstats.py | 35 ++++++++++++++++++++
 tests/test_datastore_linesstats.py           | 14 +++++++-
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index 6dc8edd..fc2fcab 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -1,4 +1,5 @@
 import json
+from collections import Counter
 from pathlib import Path
 from typing import Union, Optional
 
@@ -24,6 +25,32 @@ def get_lines_stats_data(dataset_dir: str, timeseries_file: str) -> Optional[dic
         return None
 
 
+def count_file_x_line_in_lines_stats(lines_stats_data: dict,
+                                     repo_name: str,
+                                     change_type: str = "+/-",
+                                     prefix: str = 'type.') -> Counter:
+    #print(f"count_file_line_in_lines_stats(..., {repo_name=}, {change_type=}, {prefix=})")
+    result = Counter()
+
+    for dataset, dataset_data in lines_stats_data.items():
+        for bug_or_repo, lines_data in dataset_data.items():
+            if bug_or_repo != repo_name:
+                print(f"    - skipping: {bug_or_repo!r} != {repo_name!r}")
+
+            for patch_file, patch_data in lines_data.items():
+                for file_name, file_data in patch_data.items():
+                    if change_type not in file_data:
+                        continue
+
+                    for line_info, n_lines in file_data[change_type].items():
+                        if not line_info.startswith(prefix):
+                            continue
+
+                        result[(file_name, line_info)] += n_lines
+
+    return result
+
+
 class LinesStatsDataStore(pn.viewable.Viewer):
     dataset_dir = param.Foldername(
         constant=True,
@@ -34,6 +61,10 @@ class LinesStatsDataStore(pn.viewable.Viewer):
         allow_refs=True,  # to allow widgets and reactive expressions
         doc="Selected JSON file with timeline data to find lines-stats companion for"
     )
+    repo_name = param.String(
+        allow_refs=True,  # allow for reactive expressions, and widgets
+        doc="Name of the repository, for selecting data",
+    )
 
     def __init__(self, **params):
         super().__init__(**params)
@@ -42,3 +73,7 @@ def __init__(self, **params):
             dataset_dir=self.dataset_dir,
             timeseries_file=self.timeseries_file,
         )
+        self.lines_stats_counter_rx = pn.rx(count_file_x_line_in_lines_stats)(
+            lines_stats_data=self.lines_stats_data_rx,
+            repo_name=self.repo_name,
+        )
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index e860e97..fdf91f7 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -13,6 +13,7 @@ def test_timeseries_file_no_such_file():
         name='test_repo_name_no_such_file',
         dataset_dir=find_dataset_dir(),
         timeseries_file='does-not-exist',
+        repo_name='repo-does-not-exist',
     )
 
     # DEBUG
@@ -34,6 +35,7 @@ def test_timeseries_file_from_widget_default_value():
     lines_stats = LinesStatsDataStore(
         dataset_dir='.',  # should be ignored, not tested
         timeseries_file=data_store.select_file_widget,
+        repo_name=data_store.select_repo_widget,
     )
 
     actual = lines_stats.lines_stats_data_rx.rx.value
@@ -44,7 +46,8 @@ def test_timeseries_file_from_widget_default_value():
 def test_timeseries_file_hellogitworld():
     lines_stats = LinesStatsDataStore(
         dataset_dir='data/examples/stats',  # directory part, relative to top directory of project
-        timeseries_file='hellogitworld.timeline.purpose-to-type.json',  #filename part
+        timeseries_file='hellogitworld.timeline.purpose-to-type.json',  # filename part
+        repo_name='hellogitworld',
     )
     actual = lines_stats.lines_stats_data_rx.rx.value
 
@@ -59,3 +62,12 @@ def test_timeseries_file_hellogitworld():
     assert len(data.keys()) > 0, \
         "there is data from multiple files with annotation data"
 
+    actual = lines_stats.lines_stats_counter_rx.rx.value
+    #print(f"{len(actual)=}")
+    #print(f"{actual.keys()=}")
+    assert ('README.txt', 'type.documentation') in actual, \
+        "there were changes marked as documentation lines to 'README.txt' file"
+    assert actual[('README.txt', 'type.documentation')] > 0, \
+        "there were non-zero amount of changes marked as documentation to 'README.txt' file"
+    assert ('README.txt', 'type.code') not in actual, \
+        "there were no changes marked as code lines to 'README.txt' file"

From 25dbb2d772a8d61d2117522d306b91983af409ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 02:27:12 +0100
Subject: [PATCH 06/29] diffinsights_web: Add
 LineStatsDataStore.sorted_changed_files()

Probably it would be changed to be a function, in order to make it
easier to make reactive expression out of it.  It is not something
specific to the LineStatsDataStore class...

This function will be used to implement cutoff, to limit Sankey diagram
to top N most changed files (instead of using all files).
---
 src/diffinsights_web/datastore/linesstats.py | 8 ++++++++
 tests/test_datastore_linesstats.py           | 4 ++++
 2 files changed, 12 insertions(+)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index fc2fcab..78d29df 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -77,3 +77,11 @@ def __init__(self, **params):
             lines_stats_data=self.lines_stats_data_rx,
             repo_name=self.repo_name,
         )
+
+    def sorted_changed_files(self):
+        counts = Counter()
+        for kv, n_lines in self.lines_stats_counter_rx.rx.value.items():
+            file_name = kv[0]
+            counts[file_name] += n_lines
+
+        return counts.most_common()
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index fdf91f7..3940db4 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -71,3 +71,7 @@ def test_timeseries_file_hellogitworld():
         "there were non-zero amount of changes marked as documentation to 'README.txt' file"
     assert ('README.txt', 'type.code') not in actual, \
         "there were no changes marked as code lines to 'README.txt' file"
+
+    actual = lines_stats.sorted_changed_files()
+    assert actual[0][0] == 'src/Main.groovy', \
+        "file with most changes was 'src/Main.groovy'"

From 7390261a98f1264cb86fbb135fda0871bdb8b63d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 09:21:31 +0100
Subject: [PATCH 07/29] diffinsights_web: Make .sorted_changed_files() return
 files only

The count of changed lines (of any type) for a given file is now dropped
from the .sorted_changed_files() method output, but we might want to
revert this change (go back on that), if we want to implement the
percentage cutoff based on the number of changed lines (for example,
keeping only files that contribute to 80% of changed lines).
---
 src/diffinsights_web/datastore/linesstats.py | 4 ++--
 tests/test_datastore_linesstats.py           | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index 78d29df..f053d82 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -78,10 +78,10 @@ def __init__(self, **params):
             repo_name=self.repo_name,
         )
 
-    def sorted_changed_files(self):
+    def sorted_changed_files(self) -> list[str]:
         counts = Counter()
         for kv, n_lines in self.lines_stats_counter_rx.rx.value.items():
             file_name = kv[0]
             counts[file_name] += n_lines
 
-        return counts.most_common()
+        return [elem[0] for elem in counts.most_common()]
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index 3940db4..a1512ff 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -73,5 +73,5 @@ def test_timeseries_file_hellogitworld():
         "there were no changes marked as code lines to 'README.txt' file"
 
     actual = lines_stats.sorted_changed_files()
-    assert actual[0][0] == 'src/Main.groovy', \
+    assert actual[0] == 'src/Main.groovy', \
         "file with most changes was 'src/Main.groovy'"

From fe2c5cf359149ba0cd73e18722027d72df972bfe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 09:44:13 +0100
Subject: [PATCH 08/29] diffinsights_web: Make sorted_changed_files() into a
 function

It would need to be tested if the chain of reactive expressions
works correctly, in the future.  The sorted_changed_files() function
may be called from the inside of a more encompassing function.
---
 src/diffinsights_web/datastore/linesstats.py | 16 +++++++++-------
 tests/test_datastore_linesstats.py           |  4 ++--
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index f053d82..6ee4523 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -51,6 +51,15 @@ def count_file_x_line_in_lines_stats(lines_stats_data: dict,
     return result
 
 
+def sorted_changed_files(lines_stats_counter: Counter) -> list[str]:
+    counts = Counter()
+    for kv, n_lines in lines_stats_counter.items():
+        file_name = kv[0]
+        counts[file_name] += n_lines
+
+    return [elem[0] for elem in counts.most_common()]
+
+
 class LinesStatsDataStore(pn.viewable.Viewer):
     dataset_dir = param.Foldername(
         constant=True,
@@ -78,10 +87,3 @@ def __init__(self, **params):
             repo_name=self.repo_name,
         )
 
-    def sorted_changed_files(self) -> list[str]:
-        counts = Counter()
-        for kv, n_lines in self.lines_stats_counter_rx.rx.value.items():
-            file_name = kv[0]
-            counts[file_name] += n_lines
-
-        return [elem[0] for elem in counts.most_common()]
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index a1512ff..30f0950 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -2,7 +2,7 @@
 
 from diffinsights_web.datastore import find_dataset_dir
 from diffinsights_web.datastore.timeline import TimelineDataStore
-from diffinsights_web.datastore.linesstats import LinesStatsDataStore
+from diffinsights_web.datastore.linesstats import LinesStatsDataStore, sorted_changed_files
 
 param = pytest.importorskip("param")
 panel = pytest.importorskip("panel")
@@ -72,6 +72,6 @@ def test_timeseries_file_hellogitworld():
     assert ('README.txt', 'type.code') not in actual, \
         "there were no changes marked as code lines to 'README.txt' file"
 
-    actual = lines_stats.sorted_changed_files()
+    actual = sorted_changed_files(lines_stats.lines_stats_counter_rx.rx.value)
     assert actual[0] == 'src/Main.groovy', \
         "file with most changes was 'src/Main.groovy'"

From 4296335b45cd34bada669edf0a4a369d88d39241 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 13:14:34 +0100
Subject: [PATCH 09/29] diffinsights_web: Add limit_count_to_selected_files()

When creating Sankey diagram for all changes, without limiting the
number of nodes the diagram might be too busy to be readable.  The just
introduced function, together with the sorted_changed_files(), could be
used for filtering out irrelevant data.
---
 src/diffinsights_web/datastore/linesstats.py |  9 +++++++++
 tests/test_datastore_linesstats.py           | 16 +++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index 6ee4523..fc40d3d 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -1,5 +1,6 @@
 import json
 from collections import Counter
+from collections.abc import Container, Iterable
 from pathlib import Path
 from typing import Union, Optional
 
@@ -60,6 +61,14 @@ def sorted_changed_files(lines_stats_counter: Counter) -> list[str]:
     return [elem[0] for elem in counts.most_common()]
 
 
+def limit_count_to_selected_files(lines_stats_counter: Counter,
+                                  files: Union[Container[str], Iterable[str]]) -> Counter:
+    return Counter({
+        kv: n_lines for kv, n_lines in lines_stats_counter.items()
+        if kv[0] in files
+    })
+
+
 class LinesStatsDataStore(pn.viewable.Viewer):
     dataset_dir = param.Foldername(
         constant=True,
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index 30f0950..adccdf7 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -2,7 +2,8 @@
 
 from diffinsights_web.datastore import find_dataset_dir
 from diffinsights_web.datastore.timeline import TimelineDataStore
-from diffinsights_web.datastore.linesstats import LinesStatsDataStore, sorted_changed_files
+from diffinsights_web.datastore.linesstats import LinesStatsDataStore, sorted_changed_files, \
+    limit_count_to_selected_files
 
 param = pytest.importorskip("param")
 panel = pytest.importorskip("panel")
@@ -75,3 +76,16 @@ def test_timeseries_file_hellogitworld():
     actual = sorted_changed_files(lines_stats.lines_stats_counter_rx.rx.value)
     assert actual[0] == 'src/Main.groovy', \
         "file with most changes was 'src/Main.groovy'"
+
+    selected_files = actual[:3]
+    actual = limit_count_to_selected_files(
+        lines_stats_counter=lines_stats.lines_stats_counter_rx.rx.value,
+        files=selected_files,
+    )
+    assert len(actual) >= len(selected_files), \
+        "at least one counter entry for each file"
+
+    counter_limited = actual
+    actual = sorted_changed_files(counter_limited)
+    assert actual == selected_files, \
+        "list of files after filtering is filter list, if filter list is from counter"

From 16e9d09777998820113967e135c9ad17990d323d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 13:24:29 +0100
Subject: [PATCH 10/29] diffinsights_web: Add sankey_triples_from_counter() and
 its reverse

This will be used to extract the data in the format that hv.Sankey()
accepts, and that can be easily turned into pd.DataFrame.
---
 src/diffinsights_web/datastore/linesstats.py | 8 ++++++++
 tests/test_datastore_linesstats.py           | 3 +++
 2 files changed, 11 insertions(+)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index fc40d3d..87fc4ab 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -69,6 +69,14 @@ def limit_count_to_selected_files(lines_stats_counter: Counter,
     })
 
 
+def sankey_triples_from_counter(data_counter: Counter) -> list[tuple[str, str, int]]:
+    return [(p[0], p[1], v) for p, v in data_counter.items()]
+
+
+def sankey_counter_from_triples(data_list: list[tuple[str, str, int]]) -> Counter:
+    return Counter({(p_f, p_t): v for p_f, p_t, v in data_list})
+
+
 class LinesStatsDataStore(pn.viewable.Viewer):
     dataset_dir = param.Foldername(
         constant=True,
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index adccdf7..e4cdb6d 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -89,3 +89,6 @@ def test_timeseries_file_hellogitworld():
     actual = sorted_changed_files(counter_limited)
     assert actual == selected_files, \
         "list of files after filtering is filter list, if filter list is from counter"
+
+
+# TODO: add test for sankey_triples_from_counter() and sankey_counter_from_triples()

From 91a0857746aff1024709c0016b66826cb1eeac0d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 13:48:45 +0100
Subject: [PATCH 11/29] diffinsights_web: Add path_to_dirs_only_counter()

This function creates graph of directory structure from the lines types
statistics for individual changed files.  Here, each directory is
credited sum of contributions of all files in that directory.

To reduce clutter, final component i.e. files was removed from the
consideration, or rather -- not [re-]added.
---
 src/diffinsights_web/datastore/linesstats.py | 16 +++++++++++++++-
 tests/test_datastore_linesstats.py           |  8 +++++++-
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index 87fc4ab..41b6b8b 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -1,7 +1,7 @@
 import json
 from collections import Counter
 from collections.abc import Container, Iterable
-from pathlib import Path
+from pathlib import Path, PurePosixPath
 from typing import Union, Optional
 
 import panel as pn
@@ -77,6 +77,20 @@ def sankey_counter_from_triples(data_list: list[tuple[str, str, int]]) -> Counte
     return Counter({(p_f, p_t): v for p_f, p_t, v in data_list})
 
 
+def path_to_dirs_only_counter(data_counter: Counter) -> Counter:
+    result = Counter()
+
+    for (p, l), v in data_counter.items():
+        # print(f"{p} ={v}=> {l}")
+        p_path = PurePosixPath(p)
+        result[(str(p_path.parent), l)] += v
+        for p_f, p_t in zip(p_path.parent.parents, p_path.parents):
+            # print(f"- ({p_f}, {p_t})")
+            result[(str(p_f), str(p_t))] += v
+
+    return result
+
+
 class LinesStatsDataStore(pn.viewable.Viewer):
     dataset_dir = param.Foldername(
         constant=True,
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index e4cdb6d..bbb244b 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -3,7 +3,7 @@
 from diffinsights_web.datastore import find_dataset_dir
 from diffinsights_web.datastore.timeline import TimelineDataStore
 from diffinsights_web.datastore.linesstats import LinesStatsDataStore, sorted_changed_files, \
-    limit_count_to_selected_files
+    limit_count_to_selected_files, path_to_dirs_only_counter
 
 param = pytest.importorskip("param")
 panel = pytest.importorskip("panel")
@@ -90,5 +90,11 @@ def test_timeseries_file_hellogitworld():
     assert actual == selected_files, \
         "list of files after filtering is filter list, if filter list is from counter"
 
+    actual = path_to_dirs_only_counter(counter_limited)
+    assert ('.', 'src') in actual, \
+        "path from top dir to 'src' subdirectory present"
+    assert ('src', 'type.code') in actual, \
+        "'src/Main.groovy' lines of code contributions changed to 'src' contributions"
+
 
 # TODO: add test for sankey_triples_from_counter() and sankey_counter_from_triples()

From 044982b1abe72f4cc88817d7e82402115c8674e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 13:59:28 +0100
Subject: [PATCH 12/29] diffinsights_web: Add add_dashdash_dirs_to_counter(),
 not tested
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This function adds __<dir>__ nodes that gather contributions from all
files that are in <dir>, and not in one of its subdirectories.

Idea taken from "Wolves in Developers’ Clothing: Analyzing the
Software Engineering Practice in the XZ Utils Supply Chain Attack"
preprint, Fig. 3 therein.
---
 src/diffinsights_web/datastore/linesstats.py | 33 ++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index 41b6b8b..021ae9e 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -91,6 +91,39 @@ def path_to_dirs_only_counter(data_counter: Counter) -> Counter:
     return result
 
 
+def add_dashdash_dirs_to_counter(data_counter: Counter) -> Counter:
+    res = data_counter.copy()
+
+    xsankey_data_sets = {
+        'dir-to-dir': set(),
+        'dir-to-line': set(),
+    }
+    #xsankey_data_cntr = Counter()
+    xsankey_data_line = defaultdict(set)
+
+    for (p_f, p_t), v in data_counter.items():
+        if p_t.startswith('type.'):
+            xsankey_data_sets['dir-to-line'].add(p_f)
+            #xsankey_data_cntr[p_f] += v
+            xsankey_data_line[p_f].add(p_t)
+        else:
+            xsankey_data_sets['dir-to-dir'].add(p_f)
+
+    xsankey_data_sets['intersection'] = xsankey_data_sets['dir-to-dir'] & xsankey_data_sets['dir-to-line']
+
+    #xsankey_data_extracted = {k: v for k, v in xsankey_data_cntr.items() if k in xsankey_data_sets['intersection']}
+
+    for d in xsankey_data_sets['intersection']:
+        #print(f"{d!r}:")
+        for l in xsankey_data_line[d]:
+            #print(f"    {l!r}")
+            res[(f"__{d}__", l)]  = res[(d, l)]
+            res[(d, f"__{d}__")] += res[(d, l)]
+            del res[(d, l)]
+
+    return res
+
+
 class LinesStatsDataStore(pn.viewable.Viewer):
     dataset_dir = param.Foldername(
         constant=True,

From f23c9db2b2c8748d57917d013ed8bb212164dc3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 14:45:18 +0100
Subject: [PATCH 13/29] diffinsights_web: Add reduce_sankey_from_tail()

First attempt at reducing the number of nodes in the Sankey diagram,
while keeping its overall shape.

This function would remove only the last level, from the tail end.
---
 src/diffinsights_web/datastore/linesstats.py | 68 +++++++++++++++++++-
 tests/test_datastore_linesstats.py           |  8 ++-
 2 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index 021ae9e..aea1487 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -1,5 +1,5 @@
 import json
-from collections import Counter
+from collections import Counter, defaultdict
 from collections.abc import Container, Iterable
 from pathlib import Path, PurePosixPath
 from typing import Union, Optional
@@ -124,6 +124,72 @@ def add_dashdash_dirs_to_counter(data_counter: Counter) -> Counter:
     return res
 
 
+def reduce_sankey_from_tail(data_counter: Counter) -> Counter:
+    res = data_counter.copy()
+
+    #print("reduce_sankey_from_tail():")
+
+    max_level = 0
+    for (p_f, _) in data_counter.keys():
+        n_dashes = p_f.count('/')
+        if n_dashes > max_level:
+            max_level = n_dashes
+
+    #print(f"  {max_level=}")
+
+    to_delete = lambda x: x.count('/') == max_level
+    can_delete = True
+
+    helper_info = {
+        'delete-contents': defaultdict(dict),
+        'to-prev': {}
+    }
+
+    # sanity check
+    for k, v in data_counter.items():
+        (p_f, p_t) = k
+        if to_delete(p_f):
+            if not p_t.startswith('type.'):
+                #print(f"  {p_f!r} is not final: {p_f!r} =[{v}]=> {p_t!r}")
+                can_delete = False
+            else:
+                helper_info['delete-contents'][p_f][p_t] = v
+
+        if to_delete(p_t):
+            helper_info['to-prev'][p_t] = p_f
+
+    #print(f"  {can_delete=}")
+
+    if can_delete:
+        to_prev_dict = {}
+        for p_t, p_f in helper_info['to-prev'].items():
+            if (p_f, f"__{p_f}__") in data_counter:
+                #print(f"({p_f}, __{p_f}__): {xsankey_cntr_5[(p_f, f'__{p_f}__')]}")
+                to_prev_dict[f"__{p_f}__"] = p_f
+
+        #print(f"  extra 'to-prev':{len(to_prev_dict)}")
+        helper_info['to-prev'] |= to_prev_dict
+
+        for k, v in data_counter.items():
+            (p_f, p_t) = k
+            if (p_f in helper_info['to-prev'] and
+                p_t.startswith('type.')):
+                helper_info['delete-contents'][p_f][p_t] = v
+
+        for k, v in data_counter.items():  # we are changing res
+            (p_f, p_t) = k
+            if p_t in helper_info['to-prev'] and p_f == helper_info['to-prev'][p_t]:
+                #print(f"({p_f}, {p_t}): {v})")
+                for kk, vv in helper_info['delete-contents'][p_t].items():
+                    res[(p_f, kk)] += vv
+                    #print(f"  ({p_f}, {kk}) += {vv} => {res[(p_f, kk)]}")
+                del res[(p_f, p_t)]
+            if p_f in helper_info['to-prev']:
+                del res[(p_f, p_t)]
+
+    return res
+
+
 class LinesStatsDataStore(pn.viewable.Viewer):
     dataset_dir = param.Foldername(
         constant=True,
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index bbb244b..25766cc 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -3,7 +3,7 @@
 from diffinsights_web.datastore import find_dataset_dir
 from diffinsights_web.datastore.timeline import TimelineDataStore
 from diffinsights_web.datastore.linesstats import LinesStatsDataStore, sorted_changed_files, \
-    limit_count_to_selected_files, path_to_dirs_only_counter
+    limit_count_to_selected_files, path_to_dirs_only_counter, reduce_sankey_from_tail
 
 param = pytest.importorskip("param")
 panel = pytest.importorskip("panel")
@@ -97,4 +97,10 @@ def test_timeseries_file_hellogitworld():
         "'src/Main.groovy' lines of code contributions changed to 'src' contributions"
 
 
+    starting_counter = actual
+    actual = reduce_sankey_from_tail(starting_counter)
+    assert len(actual) < len(starting_counter), \
+        "removed at least one node from Sankey diagram"
+
+
 # TODO: add test for sankey_triples_from_counter() and sankey_counter_from_triples()

From 41b11ddfdfabe025459fc849763a06b316094270 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 15:45:40 +0100
Subject: [PATCH 14/29] diffinsights_web: Add reduce_sankey_thin_out()

Another function to reduce number of nodes in Sankey diagram by
assigning their contributions to the parent node (there is only one
parent node).

It is quite conservative, and not the best at removing nodes.
Needs rethinking, so that it would be able to compress / reduce
chain of nodes (with all edges on the chain being thin edges).
---
 src/diffinsights_web/datastore/linesstats.py | 80 ++++++++++++++++++++
 tests/test_datastore_linesstats.py           |  8 +-
 2 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index aea1487..823efa6 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -190,6 +190,86 @@ def reduce_sankey_from_tail(data_counter: Counter) -> Counter:
     return res
 
 
+def reduce_sankey_thin_out(data_counter: Counter,
+                           threshold_ratio: float = 0.005) -> Counter:
+    #print("reduce_sankey_thin_out():")
+    # TODO: use threshold on max value, not on sum of values
+
+    total_lines = 0
+    for (p_f, p_t), v in data_counter.items():
+        if p_f != '.':
+            continue
+        total_lines += v
+
+    #print(f"  {total_lines=}")
+    #print(f"  threshold={threshold_ratio}*{total_lines}={threshold_ratio * total_lines}")
+
+    data_info = {
+        'to-remove': set()
+    }
+
+    for (p_f, p_t), v in data_counter.items():
+        if v < threshold_ratio * total_lines:
+            #print(f"  - ({p_f}, {p_t}): {v} {'*' if p_t.startswith('type.') else ' '}")
+            data_info['to-remove'].add(p_f)
+
+    data_info |= {
+        'delete-contents': defaultdict(dict),
+        'to-prev': {},
+        'can-remove': set(),
+    }
+
+    #print("  gathering data:")
+
+    for (p_f, p_t), v in data_counter.items():
+        # want to remove, and can remove
+        if p_f in data_info['to-remove'] and p_t.startswith('type.'):
+            #print(f"   - saving data for ({p_f}, {p_t}): {v}")
+            data_info['delete-contents'][p_f][p_t] = v
+
+    for (p_f, p_t), v in data_counter.items():
+        if p_t in data_info['to-remove'] and p_t in data_info['delete-contents']:
+            data_info['to-prev'][p_t] = p_f
+
+            total_width = 0
+            for v in data_info['delete-contents'][p_t].values():
+                total_width += v
+            if total_width < threshold_ratio * total_lines:
+                if f"__{p_f}__" == p_t:
+                    #print(f"   ! ({p_f}) -> ({p_t}) -> {data_info['delete-contents'][p_t]}")
+                    pass
+                elif p_f == ".":
+                    #print(f"   # ({p_f}) -> ({p_t}) -> {data_info['delete-contents'][p_t]}")
+                    pass
+                else:
+                    #print(f"   + ({p_f}) => ({p_t}) => {data_info['delete-contents'][p_t]}")
+                    data_info['can-remove'].add(p_t)
+            else:
+                #print(f"  - ({p_f}) -> ({p_t}) -> {data_info['delete-contents'][p_t]}")
+                pass
+
+    ## -------------------------------------------------------
+    ## actual removal
+    res = data_counter.copy()
+
+    #print("  deleting/compressing:")
+    for k, v in data_counter.items():  # we are changing res
+        (p_f, p_t) = k
+        if p_t in data_info['can-remove']:
+            if p_t in data_info['to-prev'] and p_f == data_info['to-prev'][p_t]:
+                #print(f"  - ({p_f}, {p_t}): {v})")
+                for kk, vv in data_info['delete-contents'][p_t].items():
+                    res[(p_f, kk)] += vv
+                    #print(f"  ({p_f}, {kk}) += {vv} => {res[(p_f, kk)]}")
+                del res[(p_f, p_t)]
+
+        if p_f in data_info['can-remove']:
+            if p_f in data_info['to-prev']:
+                del res[(p_f, p_t)]
+
+    return res
+
+
 class LinesStatsDataStore(pn.viewable.Viewer):
     dataset_dir = param.Foldername(
         constant=True,
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index 25766cc..f61042d 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -3,7 +3,7 @@
 from diffinsights_web.datastore import find_dataset_dir
 from diffinsights_web.datastore.timeline import TimelineDataStore
 from diffinsights_web.datastore.linesstats import LinesStatsDataStore, sorted_changed_files, \
-    limit_count_to_selected_files, path_to_dirs_only_counter, reduce_sankey_from_tail
+    limit_count_to_selected_files, path_to_dirs_only_counter, reduce_sankey_from_tail, reduce_sankey_thin_out
 
 param = pytest.importorskip("param")
 panel = pytest.importorskip("panel")
@@ -101,6 +101,12 @@ def test_timeseries_file_hellogitworld():
     actual = reduce_sankey_from_tail(starting_counter)
     assert len(actual) < len(starting_counter), \
         "removed at least one node from Sankey diagram"
+    # TODO: check that it removed only last level
+
+    actual = reduce_sankey_thin_out(starting_counter, threshold_ratio=0.5)
+    assert len(actual) < len(starting_counter), \
+        "removed at least one node from Sankey diagram"
+    # TODO: add more checks
 
 
 # TODO: add test for sankey_triples_from_counter() and sankey_counter_from_triples()

From 92aaf6e1e9d60a9576c749785248c0f14f6aa7f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 15:53:19 +0100
Subject: [PATCH 15/29] diffinsights_web: Add sankey_plot_from_triples() for
 rough preview

It is here to quickly get something out; a cleaner version will be
produced not from tuples, but from a DataFrame (where it is easier to
perform some of the adjustments).

Untested!
---
 src/diffinsights_web/datastore/linesstats.py | 5 +++++
 tests/test_datastore_linesstats.py           | 1 +
 2 files changed, 6 insertions(+)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index 823efa6..0d6580a 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -4,6 +4,7 @@
 from pathlib import Path, PurePosixPath
 from typing import Union, Optional
 
+import holoviews as hv
 import panel as pn
 import param
 
@@ -270,6 +271,10 @@ def reduce_sankey_thin_out(data_counter: Counter,
     return res
 
 
+def sankey_plot_from_triples(sankey_data: list[tuple[str, str, int]], width: int = 800, height: int = 400) -> hv.Sankey:
+    return hv.Sankey(sankey_data).opts(edge_color_index=1, width=width, height=height)
+
+
 class LinesStatsDataStore(pn.viewable.Viewer):
     dataset_dir = param.Foldername(
         constant=True,
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index f61042d..77b7ff9 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -5,6 +5,7 @@
 from diffinsights_web.datastore.linesstats import LinesStatsDataStore, sorted_changed_files, \
     limit_count_to_selected_files, path_to_dirs_only_counter, reduce_sankey_from_tail, reduce_sankey_thin_out
 
+hv = pytest.importorskip("holoviews")
 param = pytest.importorskip("param")
 panel = pytest.importorskip("panel")
 

From dedfc461e989b893dad2ccd475102f4902774e9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 16:01:31 +0100
Subject: [PATCH 16/29] diffinsights_web: Handle input of None in
 count_file_x_line_in_lines_stats()

This fixes the problem where there is no file with relevant data, and
not every function took notice.  The fact that there is no data is
handled by returning None.
---
 src/diffinsights_web/datastore/linesstats.py | 7 +++++--
 tests/test_datastore_linesstats.py           | 6 +++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index 0d6580a..f5c17b8 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -27,11 +27,14 @@ def get_lines_stats_data(dataset_dir: str, timeseries_file: str) -> Optional[dic
         return None
 
 
-def count_file_x_line_in_lines_stats(lines_stats_data: dict,
+def count_file_x_line_in_lines_stats(lines_stats_data: Optional[dict],
                                      repo_name: str,
                                      change_type: str = "+/-",
-                                     prefix: str = 'type.') -> Counter:
+                                     prefix: str = 'type.') -> Optional[Counter]:
     #print(f"count_file_line_in_lines_stats(..., {repo_name=}, {change_type=}, {prefix=})")
+    if lines_stats_data is None:
+        return None
+
     result = Counter()
 
     for dataset, dataset_data in lines_stats_data.items():
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index 77b7ff9..3aad95a 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -27,7 +27,11 @@ def test_timeseries_file_no_such_file():
 
     actual = data_store.lines_stats_data_rx.rx.value
     assert actual is None, \
-        "LinesDataStore returns None if lines-stats file does not exist"
+        "LinesDataStore returns None for data if lines-stats file does not exist"
+
+    actual = data_store.lines_stats_counter_rx.rx.value
+    assert actual is None, \
+        "LinesDataStore returns None for counter if lines-stats file does not exist"
 
 
 def test_timeseries_file_from_widget_default_value():

From 1793c11b6180a8010b2122db440b587de85bc2b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 20:18:46 +0100
Subject: [PATCH 17/29] diffinsights_web: Add process_sankey()

This function, taken from the "Sankey for whole timeline" subsection
in 'notebooks/panel/02-contributors_graph.ipynb', filters and processes
data extracted from *.lines-stats.*.json file, turning it into a format
suitable for creating a Sankey diagram.

Next step would be to actually draw such plot.
---
 src/diffinsights_web/datastore/linesstats.py | 29 ++++++++++++++++++++
 tests/test_datastore_linesstats.py           |  4 +++
 2 files changed, 33 insertions(+)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index f5c17b8..f6afaa6 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -278,6 +278,26 @@ def sankey_plot_from_triples(sankey_data: list[tuple[str, str, int]], width: int
     return hv.Sankey(sankey_data).opts(edge_color_index=1, width=width, height=height)
 
 
+def process_sankey(lines_stats_counter: Counter,
+                   max_files: Optional[int] = None,
+                   threshold: float = 0.0):
+    changed_files = sorted_changed_files(lines_stats_counter=lines_stats_counter)
+    if max_files is not None:
+        lines_stats_counter = limit_count_to_selected_files(
+            lines_stats_counter=lines_stats_counter,
+            files=changed_files[:max_files]
+        )
+
+    sankey_counter = path_to_dirs_only_counter(lines_stats_counter)
+    sankey_counter = add_dashdash_dirs_to_counter(sankey_counter)
+    if 0.0 < threshold < 1.0:
+        sankey_counter = reduce_sankey_thin_out(sankey_counter, threshold_ratio=threshold)
+
+    sankey_triples = sankey_triples_from_counter(sankey_counter)
+
+    return sankey_triples
+
+
 class LinesStatsDataStore(pn.viewable.Viewer):
     dataset_dir = param.Foldername(
         constant=True,
@@ -305,3 +325,12 @@ def __init__(self, **params):
             repo_name=self.repo_name,
         )
 
+        self.num_files_widget = pn.widgets.Select(
+            name="top N files",
+            options=[10,100,None],
+            value=100,
+        )
+        self.sankey_data_rx = pn.rx(process_sankey)(
+            lines_stats_counter=self.lines_stats_counter_rx,
+            max_files=self.num_files_widget,
+        )
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index 3aad95a..332bf45 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -113,5 +113,9 @@ def test_timeseries_file_hellogitworld():
         "removed at least one node from Sankey diagram"
     # TODO: add more checks
 
+    actual = lines_stats.sankey_data_rx.rx.value
+    assert len(actual) > 0, \
+        "there is something to create Sankey diagram from"
+
 
 # TODO: add test for sankey_triples_from_counter() and sankey_counter_from_triples()

From 02a74331bc5511f7f63f882906da56c5051f7397 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 20:44:23 +0100
Subject: [PATCH 18/29] diffinsights_web: Fix handling of missing lines-stats
 file

The protocol / API is to return None when there is no data for Sankey
diagram.  These changes made the code propagate value of None correctly.
---
 src/diffinsights_web/datastore/linesstats.py | 10 ++++++++--
 tests/test_datastore_linesstats.py           |  4 ++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index f6afaa6..d20cf51 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -56,7 +56,10 @@ def count_file_x_line_in_lines_stats(lines_stats_data: Optional[dict],
     return result
 
 
-def sorted_changed_files(lines_stats_counter: Counter) -> list[str]:
+def sorted_changed_files(lines_stats_counter: Optional[Counter]) -> Optional[list[str]]:
+    if lines_stats_counter is None:
+        return None
+
     counts = Counter()
     for kv, n_lines in lines_stats_counter.items():
         file_name = kv[0]
@@ -278,9 +281,12 @@ def sankey_plot_from_triples(sankey_data: list[tuple[str, str, int]], width: int
     return hv.Sankey(sankey_data).opts(edge_color_index=1, width=width, height=height)
 
 
-def process_sankey(lines_stats_counter: Counter,
+def process_sankey(lines_stats_counter: Optional[Counter],
                    max_files: Optional[int] = None,
                    threshold: float = 0.0):
+    if lines_stats_counter is None:
+        return None
+
     changed_files = sorted_changed_files(lines_stats_counter=lines_stats_counter)
     if max_files is not None:
         lines_stats_counter = limit_count_to_selected_files(
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index 332bf45..e762614 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -33,6 +33,10 @@ def test_timeseries_file_no_such_file():
     assert actual is None, \
         "LinesDataStore returns None for counter if lines-stats file does not exist"
 
+    actual = data_store.sankey_data_rx.rx.value
+    assert actual is None, \
+        "LinesDataStore returns None for Sankey data if lines-stats file does not exist"
+
 
 def test_timeseries_file_from_widget_default_value():
     dataset_dir = find_dataset_dir()

From 060193cf61d3a3801505c70b6353d71d47804eed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 21:47:56 +0100
Subject: [PATCH 19/29] diffinsights_web: Add views/plots/sankey.py

Move sankey_plot_from_triples() to the new file, and create the
SankeyPlot class.

Currently not used, and not tested.  Doesn't actually even try to use
from_date_str parameter, for now.
---
 src/diffinsights_web/datastore/linesstats.py |  7 ----
 src/diffinsights_web/views/plots/sankey.py   | 41 ++++++++++++++++++++
 2 files changed, 41 insertions(+), 7 deletions(-)
 create mode 100644 src/diffinsights_web/views/plots/sankey.py

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index d20cf51..f48e496 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -4,12 +4,9 @@
 from pathlib import Path, PurePosixPath
 from typing import Union, Optional
 
-import holoviews as hv
 import panel as pn
 import param
 
-from diffinsights_web.utils.notifications import warning_notification
-
 
 def get_lines_stats_data(dataset_dir: str, timeseries_file: str) -> Optional[dict]:
     timeseries_file_path = Path(timeseries_file)
@@ -277,10 +274,6 @@ def reduce_sankey_thin_out(data_counter: Counter,
     return res
 
 
-def sankey_plot_from_triples(sankey_data: list[tuple[str, str, int]], width: int = 800, height: int = 400) -> hv.Sankey:
-    return hv.Sankey(sankey_data).opts(edge_color_index=1, width=width, height=height)
-
-
 def process_sankey(lines_stats_counter: Optional[Counter],
                    max_files: Optional[int] = None,
                    threshold: float = 0.0):
diff --git a/src/diffinsights_web/views/plots/sankey.py b/src/diffinsights_web/views/plots/sankey.py
new file mode 100644
index 0000000..8c83247
--- /dev/null
+++ b/src/diffinsights_web/views/plots/sankey.py
@@ -0,0 +1,41 @@
+from typing import Optional
+
+import holoviews as hv
+import panel as pn
+import param
+
+from diffinsights_web.datastore.linesstats import LinesStatsDataStore
+
+
+def sankey_plot_from_triples(sankey_data: list[tuple[str, str, int]],
+                             width: int = 800,
+                             height: int = 400) -> hv.Sankey:
+    return hv.Sankey(sankey_data).opts(
+        edge_color_index=1,
+        width=width,
+        height=height
+    )
+
+
+def plot_sankey(sankey_data: Optional[list[tuple[str, str, int]]],
+                timeline_file: str,
+                width: int = 800,
+                height: int = 400):
+    if sankey_data is None:
+        return pn.pane.HTML(f"No data needed to create Sankey diagram found for {timeline_file!r}")
+    else:
+        return sankey_plot_from_triples(sankey_data, width, height)
+
+
+class SankeyPlot(pn.viewable.Viewer):
+    data_store = param.ClassSelector(class_=LinesStatsDataStore)
+    # allow_refs=True is here to allow widgets
+    from_date_str = param.String(allow_refs=True)  # TODO: implement support for it
+
+    def __init__(self, **params):
+        super().__init__(**params)
+
+        self.plot_sankey_rx = pn.rx(plot_sankey(
+            sankey_data=self.data_store.sankey_data_rx,
+            timeline_file=self.data_store.timeline_file,
+        ))

From 1b64ead7d02c718992019b214b45ea2ab711921c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 23:10:43 +0100
Subject: [PATCH 20/29] diffinsights_web: Rename data_store to
 timeline_data_store in contributors.py

This simple rename is in preparation for having more than one data
store.  We will be adding LinesStatsDataStore in the next commit.
---
 src/diffinsights_web/apps/contributors.py | 26 +++++++++++------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/diffinsights_web/apps/contributors.py b/src/diffinsights_web/apps/contributors.py
index 14e68ed..9da6e94 100644
--- a/src/diffinsights_web/apps/contributors.py
+++ b/src/diffinsights_web/apps/contributors.py
@@ -28,33 +28,33 @@
 pn.state.onload(onload_callback)
 
 dataset_dir = find_dataset_dir()
-data_store = TimelineDataStore(dataset_dir=dataset_dir)
+timeline_data_store = TimelineDataStore(dataset_dir=dataset_dir)
 
 page_header = ContributorsHeader(
-    repo=data_store.select_repo_widget,
-    freq=data_store.resample_frequency_widget,
-    end_date=data_store.timeline_max_date_rx,
+    repo=timeline_data_store.select_repo_widget,
+    freq=timeline_data_store.resample_frequency_widget,
+    end_date=timeline_data_store.timeline_max_date_rx,
 )
 timeseries_plot = TimeseriesPlot(
-    data_store=data_store,
+    data_store=timeline_data_store,
     column_name=page_header.select_contribution_type_widget,
     from_date_str=page_header.select_period_from_widget,
 )
 timeseries_plot_header = RepoPlotHeader(
-    freq=data_store.resample_frequency_widget,
+    freq=timeline_data_store.resample_frequency_widget,
     column_name=page_header.select_contribution_type_widget,
     plot=timeseries_plot,
 )
 contributions_perc_header = ContributionsPercHeader(
-    data_store=data_store,
+    data_store=timeline_data_store,
     from_date_str=page_header.select_period_from_widget,
 )
 authors_info_panel = AuthorInfo(
-    data_store=data_store,
+    data_store=timeline_data_store,
     authors_info_df=timeseries_plot.authors_info_df_rx,
 )
 authors_grid = AuthorsGrid(
-    data_store=data_store,
+    data_store=timeline_data_store,
     main_plot=timeseries_plot,
     authors_info_df=timeseries_plot.authors_info_df_rx,
     top_n=authors_info_panel.top_n_widget,
@@ -66,7 +66,7 @@
     title="Contributors Graph",  # TODO: make title dynamic
     favicon="favicon.svg",
     sidebar=[
-        data_store,
+        timeline_data_store,
         *authors_info_panel.widgets(),
 
         pn.layout.Divider(),  # - - - - - - - - - - - - -
@@ -89,11 +89,11 @@
         authors_grid,
     ],
 )
-timeline_perspective = TimelinePerspective(data_store=data_store)
+timeline_perspective = TimelinePerspective(data_store=timeline_data_store)
 template.main.extend([
     pn.layout.Divider(),
     pn.Tabs(
-        ('JSON', TimelineJSONViewer(data_store=data_store)),
+        ('JSON', TimelineJSONViewer(data_store=timeline_data_store)),
         ('data', timeline_perspective.panel(TimelineDataFrameEnum.TIMELINE_DATA)),
         ('resampled', timeline_perspective.panel(TimelineDataFrameEnum.RESAMPLED_DATA)),
         ('by author+resampled', timeline_perspective.panel(TimelineDataFrameEnum.BY_AUTHOR_DATA)),
@@ -102,7 +102,7 @@
             perspective_pane(
                 df=timeseries_plot.authors_info_df_rx,
                 title=pn.rx("Authors info for repo={repo!r}, from={from_date!r}") \
-                    .format(repo=data_store.select_repo_widget,
+                    .format(repo=timeline_data_store.select_repo_widget,
                             from_date=page_header.select_period_from_widget)
             )
         ),

From bc010491cbaad474e30c5ac329298e4988abd2b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 23:12:06 +0100
Subject: [PATCH 21/29] diffinsights_web: Add SankeyPlot to list of possible
 plots

Currently it just barely works; it does not respect the "Period:" (a.k.a.
`from_date_str`) widget, and it looks like it does not refresh on
repository change.  Also, the Sankey diagram produced is quite ugly, and
it includes tools we do not want.
---
 src/diffinsights_web/apps/contributors.py      | 12 ++++++++++++
 src/diffinsights_web/views/plots/sankey.py     | 14 +++++++++-----
 src/diffinsights_web/views/plots/timeseries.py | 16 +++++++++++++---
 3 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/src/diffinsights_web/apps/contributors.py b/src/diffinsights_web/apps/contributors.py
index 9da6e94..1ee4392 100644
--- a/src/diffinsights_web/apps/contributors.py
+++ b/src/diffinsights_web/apps/contributors.py
@@ -5,6 +5,7 @@
 import panel as pn
 
 import diffinsights_web.utils.notifications as notifications
+from diffinsights_web.datastore.linesstats import LinesStatsDataStore
 from diffinsights_web.datastore.timeline import TimelineDataStore
 from diffinsights_web.datastore import find_dataset_dir
 from diffinsights_web.utils.notifications import onload_callback
@@ -12,6 +13,7 @@
 from diffinsights_web.views.dataexplorer import TimelineJSONViewer, TimelinePerspective, \
     TimelineDataFrameEnum, perspective_pane
 from diffinsights_web.views.info import ContributorsHeader, RepoPlotHeader, ContributionsPercHeader
+from diffinsights_web.views.plots.sankey import SankeyPlot
 from diffinsights_web.views.plots.timeseries import TimeseriesPlot
 from diffinsights_web.widgets.caching import ClearCacheButton
 
@@ -29,16 +31,26 @@
 
 dataset_dir = find_dataset_dir()
 timeline_data_store = TimelineDataStore(dataset_dir=dataset_dir)
+lines_stats_data_store = LinesStatsDataStore(
+    dataset_dir=dataset_dir,
+    timeseries_file=timeline_data_store.select_file_widget,
+    repo_name=timeline_data_store.select_repo_widget,
+)
 
 page_header = ContributorsHeader(
     repo=timeline_data_store.select_repo_widget,
     freq=timeline_data_store.resample_frequency_widget,
     end_date=timeline_data_store.timeline_max_date_rx,
 )
+sankey_plot = SankeyPlot(
+    data_store=lines_stats_data_store,
+    from_date_str=page_header.select_period_from_widget,
+)
 timeseries_plot = TimeseriesPlot(
     data_store=timeline_data_store,
     column_name=page_header.select_contribution_type_widget,
     from_date_str=page_header.select_period_from_widget,
+    sankey_plot=sankey_plot,
 )
 timeseries_plot_header = RepoPlotHeader(
     freq=timeline_data_store.resample_frequency_widget,
diff --git a/src/diffinsights_web/views/plots/sankey.py b/src/diffinsights_web/views/plots/sankey.py
index 8c83247..5c99852 100644
--- a/src/diffinsights_web/views/plots/sankey.py
+++ b/src/diffinsights_web/views/plots/sankey.py
@@ -13,18 +13,22 @@ def sankey_plot_from_triples(sankey_data: list[tuple[str, str, int]],
     return hv.Sankey(sankey_data).opts(
         edge_color_index=1,
         width=width,
-        height=height
+        height=height,
     )
 
 
 def plot_sankey(sankey_data: Optional[list[tuple[str, str, int]]],
-                timeline_file: str,
+                timeseries_file: str,
                 width: int = 800,
                 height: int = 400):
     if sankey_data is None:
-        return pn.pane.HTML(f"No data needed to create Sankey diagram found for {timeline_file!r}")
+        return pn.pane.HTML(f"No data needed to create Sankey diagram found for {timeseries_file!r}")
     else:
-        return sankey_plot_from_triples(sankey_data, width, height)
+        #print(f"plot_sankey(): {type(sankey_data)=}")
+        if isinstance(sankey_data, param.rx):
+            return sankey_plot_from_triples(sankey_data.rx.value, width, height)
+        else:
+            return sankey_plot_from_triples(sankey_data, width, height)
 
 
 class SankeyPlot(pn.viewable.Viewer):
@@ -37,5 +41,5 @@ def __init__(self, **params):
 
         self.plot_sankey_rx = pn.rx(plot_sankey(
             sankey_data=self.data_store.sankey_data_rx,
-            timeline_file=self.data_store.timeline_file,
+            timeseries_file=self.data_store.timeseries_file,
         ))
diff --git a/src/diffinsights_web/views/plots/timeseries.py b/src/diffinsights_web/views/plots/timeseries.py
index 34e6bef..a1d1302 100644
--- a/src/diffinsights_web/views/plots/timeseries.py
+++ b/src/diffinsights_web/views/plots/timeseries.py
@@ -11,6 +11,7 @@
     get_date_range, get_value_range, filter_df_by_from_date, authors_info_df, author_timeline_df_freq
 from diffinsights_web.utils.notifications import warning_notification
 from diffinsights_web.views import TimelineView, SpecialColumnEnum, column_to_contribution
+from diffinsights_web.views.plots.sankey import SankeyPlot
 
 
 def line_type_sorting_key(column_name: str) -> int:
@@ -215,6 +216,11 @@ class TimeseriesPlot(TimelineView):
     # allow_refs=True is here to allow widgets
     column_name = param.String(allow_refs=True)
     from_date_str = param.String(allow_refs=True)
+    sankey_plot = param.ClassSelector(
+        default=None,
+        allow_None=True,
+        class_=SankeyPlot,
+    )
 
     def __init__(self, **params):
         super().__init__(**params)
@@ -241,11 +247,15 @@ def __init__(self, **params):
             from_date_str=self.param.from_date_str.rx(),
         )
 
+        plot_widgets = {
+            'timeline': self.plot_commits_rx,
+        }
+        if self.sankey_plot is not None:
+            plot_widgets['sankey'] = self.sankey_plot.plot_sankey_rx
+
         self.select_plot_rx = pn.rx(self.select_plot)(
             column=self.param.column_name.rx(),
-            plot_widgets={
-                'timeline': self.plot_commits_rx,
-            },
+            plot_widgets=plot_widgets,
         )
 
         self.select_plot_theme_widget = pn.widgets.Select(

From 1a15dddf18eb4b55e9a30c06d0ee5cee8331f2ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 23:32:09 +0100
Subject: [PATCH 22/29] diffinsights_web: Add support for "sankey" plot type to
 sampling_info()

Note that Sankey diagrams do not use resampling, therefore the
description of the plot must be different than for other plots.
---
 src/diffinsights_web/views/info.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/diffinsights_web/views/info.py b/src/diffinsights_web/views/info.py
index f2ff4e6..c5ff3e6 100644
--- a/src/diffinsights_web/views/info.py
+++ b/src/diffinsights_web/views/info.py
@@ -133,7 +133,17 @@ def sampling_info(resample_freq: str,
     if '|' in column:
         plot_type, _ = column.split('|', maxsplit=2)
 
-    if plot_type != "timeline":
+    if plot_type == "sankey":
+        # Sankey diagrams do not use resampling
+        return f"""
+        <p><strong>Distribution of changed lines types based on the directory structure</strong></p>
+        <p><s>Using commits
+        from {html_date_humane(min_max_date[0])}
+        to {html_date_humane(min_max_date[1])}
+        </s></p>
+        """
+
+    elif plot_type != "timeline":
         print(f"sampling_info(): expected plot_type of 'timeline', got {plot_type=}")
         return f"No support for <strong>{plot_type}</strong> plot type, for plotting <em>{column!r}</em>"
 

From 2354116f20878cdfc3fb8fd454fd6ec538a4fd28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Sun, 8 Dec 2024 23:40:40 +0100
Subject: [PATCH 23/29] diffinsights_web: Turn off disabling SANKEY_DIAGRAM for
 "Contributions:"

Perhaps turned off temporarily; the best solution would be to disable
the option if the data for Sankey plot is not available (unless the option
was selected: then disable it as soon as other option was selected).

This is something for the future.
---
 src/diffinsights_web/views/info.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/diffinsights_web/views/info.py b/src/diffinsights_web/views/info.py
index c5ff3e6..2c190b3 100644
--- a/src/diffinsights_web/views/info.py
+++ b/src/diffinsights_web/views/info.py
@@ -99,9 +99,10 @@ def __init__(self, **params):
             options=contribution_types_map,
             value="timeline|n_commits",  # first value in contribution_types_map
             # NOTE: disabled_options does not seem to work, no disabling (???)
-            disabled_options=[
-                SpecialColumnEnum.SANKEY_DIAGRAM.value,  # need <name>.lines-stats.purpose-to-type.json
-            ],
+            #       therefore there is no code that does disabling and enabling of this
+            #disabled_options=[
+            #    SpecialColumnEnum.SANKEY_DIAGRAM.value,  # need <name>.lines-stats.purpose-to-type.json
+            #],
             # style
             width=200,
             margin=(self.widget_top_margin, 0),  # last widget, use x margin of 0

From 20aac4ba2a50ac801a4f22e8737005841eab20ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Mon, 9 Dec 2024 01:07:47 +0100
Subject: [PATCH 24/29] diffinsights_web: Fix LinesStatsDataStore not
 refreshing on file change

If we want a value to change whenever a parameter changes, that
parameter must be passed as a reactive expression (or we need to use
something other than pn.rx(), like pn.bind(), or watchers and triggers).

Test the fix, adding test_switch_repos_same_file().
---
 src/diffinsights_web/datastore/linesstats.py |  6 ++---
 tests/test_datastore_linesstats.py           | 25 ++++++++++++++++++++
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index f48e496..b1a8280 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -316,12 +316,12 @@ def __init__(self, **params):
         super().__init__(**params)
 
         self.lines_stats_data_rx = pn.rx(get_lines_stats_data)(
-            dataset_dir=self.dataset_dir,
-            timeseries_file=self.timeseries_file,
+            dataset_dir=self.dataset_dir,  # does not change, no need for rx
+            timeseries_file=self.param.timeseries_file.rx(),
         )
         self.lines_stats_counter_rx = pn.rx(count_file_x_line_in_lines_stats)(
             lines_stats_data=self.lines_stats_data_rx,
-            repo_name=self.repo_name,
+            repo_name=self.param.repo_name.rx(),
         )
 
         self.num_files_widget = pn.widgets.Select(
diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index e762614..ce67091 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -122,4 +122,29 @@ def test_timeseries_file_hellogitworld():
         "there is something to create Sankey diagram from"
 
 
+def test_switch_repos_same_file():
+    lines_stats = LinesStatsDataStore(
+        dataset_dir='data/examples/stats',  # directory part, relative to top directory of project
+        timeseries_file='hellogitworld.timeline.purpose-to-type.json',  # filename part
+        repo_name='hellogitworld',
+    )
+    actual = lines_stats.lines_stats_data_rx.rx.value
+
+    assert isinstance(actual, dict), \
+        "correctly found lines-stats file, retrieved data, did not return None"
+
+    lines_stats.timeseries_file = 'does-not-exist-directly'
+    actual = lines_stats.lines_stats_data_rx.rx.value
+    assert actual is None, \
+        "switching to not-existing file clears retrieved data, makes it None"
+
+    actual = lines_stats.lines_stats_counter_rx.rx.value
+    assert actual is None, \
+        "switching to not-existing file clears stats counter, makes it None"
+
+    actual = lines_stats.sankey_data_rx.rx.value
+    assert actual is None, \
+        "switching to not-existing file clears computed sankey data, makes it None"
+
+
 # TODO: add test for sankey_triples_from_counter() and sankey_counter_from_triples()

From 337d9cff756543f3d05a406a3e37649d9afa14f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Mon, 9 Dec 2024 02:02:00 +0100
Subject: [PATCH 25/29] diffinsights_web: Fix SankeyPlot not refreshing on file
 change

The bug was twofold:
- it was not using reactive expression for timeseries file to watch
- the pn.rx() construct was wrong, done on function result instead
  on function itself (on the function call)
---
 src/diffinsights_web/views/plots/sankey.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/diffinsights_web/views/plots/sankey.py b/src/diffinsights_web/views/plots/sankey.py
index 5c99852..dc87f37 100644
--- a/src/diffinsights_web/views/plots/sankey.py
+++ b/src/diffinsights_web/views/plots/sankey.py
@@ -39,7 +39,7 @@ class SankeyPlot(pn.viewable.Viewer):
     def __init__(self, **params):
         super().__init__(**params)
 
-        self.plot_sankey_rx = pn.rx(plot_sankey(
+        self.plot_sankey_rx = pn.rx(plot_sankey)(
             sankey_data=self.data_store.sankey_data_rx,
-            timeseries_file=self.data_store.timeseries_file,
-        ))
+            timeseries_file=self.data_store.param.timeseries_file.rx(),
+        )

From f6e4d141088226f9c67155018961fb95fc23069d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Mon, 9 Dec 2024 02:11:19 +0100
Subject: [PATCH 26/29] diffinsights_web: Remove leftover debug-print, fix
 unlikely case

When parsing data from the lines stats file, we need to skip data if repos
do not match, not merely print that they do not match.
---
 src/diffinsights_web/datastore/linesstats.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/diffinsights_web/datastore/linesstats.py b/src/diffinsights_web/datastore/linesstats.py
index b1a8280..d37a6f8 100644
--- a/src/diffinsights_web/datastore/linesstats.py
+++ b/src/diffinsights_web/datastore/linesstats.py
@@ -37,7 +37,8 @@ def count_file_x_line_in_lines_stats(lines_stats_data: Optional[dict],
     for dataset, dataset_data in lines_stats_data.items():
         for bug_or_repo, lines_data in dataset_data.items():
             if bug_or_repo != repo_name:
-                print(f"    - skipping: {bug_or_repo!r} != {repo_name!r}")
+                #print(f"    - skipping: {bug_or_repo!r} != {repo_name!r}")
+                continue
 
             for patch_file, patch_data in lines_data.items():
                 for file_name, file_data in patch_data.items():
@@ -85,11 +86,11 @@ def path_to_dirs_only_counter(data_counter: Counter) -> Counter:
     result = Counter()
 
     for (p, l), v in data_counter.items():
-        # print(f"{p} ={v}=> {l}")
+        #print(f"{p} ={v}=> {l}")
         p_path = PurePosixPath(p)
         result[(str(p_path.parent), l)] += v
         for p_f, p_t in zip(p_path.parent.parents, p_path.parents):
-            # print(f"- ({p_f}, {p_t})")
+            #print(f"- ({p_f}, {p_t})")
             result[(str(p_f), str(p_t))] += v
 
     return result

From 2e02bca60172063c651dd82fae4a13f59d678184 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Mon, 9 Dec 2024 02:13:20 +0100
Subject: [PATCH 27/29] diffinsights_web: Improve message when Sankey diagram
 is not possible

Use just name of the file, and not full path, to explain why making
Sankey diagram is impossible.
---
 src/diffinsights_web/views/plots/sankey.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/diffinsights_web/views/plots/sankey.py b/src/diffinsights_web/views/plots/sankey.py
index dc87f37..4e26f68 100644
--- a/src/diffinsights_web/views/plots/sankey.py
+++ b/src/diffinsights_web/views/plots/sankey.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Optional
 
 import holoviews as hv
@@ -22,7 +23,9 @@ def plot_sankey(sankey_data: Optional[list[tuple[str, str, int]]],
                 width: int = 800,
                 height: int = 400):
     if sankey_data is None:
-        return pn.pane.HTML(f"No data needed to create Sankey diagram found for {timeseries_file!r}")
+        return pn.pane.HTML(
+            "<p>No data needed to create Sankey diagram found for "
+            f"<tt>{Path(timeseries_file).name!r}</tt></p>")
     else:
         #print(f"plot_sankey(): {type(sankey_data)=}")
         if isinstance(sankey_data, param.rx):

From 32bf65ce2dc5b22ef83010c59b8bea1dc2b0da54 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Mon, 9 Dec 2024 02:27:34 +0100
Subject: [PATCH 28/29] diffinsights_web: File with Sankey data -> file without
 transition works

Ensure that selecting the Sankey diagram for a JSON file that has data for
the diagram, and then selecting a different JSON file, one that does not
have that data, works as intended.

While at it, this commit slightly simplifies the plot_sankey() function.
---
 src/diffinsights_web/views/plots/sankey.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/diffinsights_web/views/plots/sankey.py b/src/diffinsights_web/views/plots/sankey.py
index 4e26f68..2e0f86c 100644
--- a/src/diffinsights_web/views/plots/sankey.py
+++ b/src/diffinsights_web/views/plots/sankey.py
@@ -22,16 +22,16 @@ def plot_sankey(sankey_data: Optional[list[tuple[str, str, int]]],
                 timeseries_file: str,
                 width: int = 800,
                 height: int = 400):
-    if sankey_data is None:
+    if isinstance(sankey_data, param.rx):
+        sankey_data = sankey_data.rx.value
+
+    if sankey_data is None or len(sankey_data) == 0:
         return pn.pane.HTML(
             "<p>No data needed to create Sankey diagram found for "
             f"<tt>{Path(timeseries_file).name!r}</tt></p>")
     else:
         #print(f"plot_sankey(): {type(sankey_data)=}")
-        if isinstance(sankey_data, param.rx):
-            return sankey_plot_from_triples(sankey_data.rx.value, width, height)
-        else:
-            return sankey_plot_from_triples(sankey_data, width, height)
+        return sankey_plot_from_triples(sankey_data, width, height)
 
 
 class SankeyPlot(pn.viewable.Viewer):

From 0341d795d6beb148c78a8f323b50be9860faadda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Nar=C4=99bski?= <jnareb@mat.umk.pl>
Date: Mon, 9 Dec 2024 03:52:01 +0100
Subject: [PATCH 29/29] test_datastore_linesstats.py: Move importorskip earlier

The pytest.importorskip calls must come before local imports, which might
try to import a module like 'panel' that is not available (e.g. in the
GitHub Actions environment).

While at it, import 'panel' as 'pn' (with .importorskip), like in
other places.
---
 tests/test_datastore_linesstats.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_datastore_linesstats.py b/tests/test_datastore_linesstats.py
index ce67091..7e540d2 100644
--- a/tests/test_datastore_linesstats.py
+++ b/tests/test_datastore_linesstats.py
@@ -1,14 +1,14 @@
 import pytest
 
+hv = pytest.importorskip("holoviews")
+param = pytest.importorskip("param")
+pn = pytest.importorskip("panel")
+
 from diffinsights_web.datastore import find_dataset_dir
 from diffinsights_web.datastore.timeline import TimelineDataStore
 from diffinsights_web.datastore.linesstats import LinesStatsDataStore, sorted_changed_files, \
     limit_count_to_selected_files, path_to_dirs_only_counter, reduce_sankey_from_tail, reduce_sankey_thin_out
 
-hv = pytest.importorskip("holoviews")
-param = pytest.importorskip("param")
-panel = pytest.importorskip("panel")
-
 
 def test_timeseries_file_no_such_file():
     data_store = LinesStatsDataStore(