Merge pull request #54 from ncusi/visualization - headers and authors…

… grid Visualization: Add "type.<line kind> [%]", main plot header, and authors grid
ncusi · Nov 26, 2024 · e0f18f0 · e0f18f0
2 parents 2200c2c + 52774cb
commit e0f18f0
Show file tree

Hide file tree

Showing 8 changed files with 513 additions and 74 deletions.
diff --git a/src/diffinsights_web/apps/contributors.py b/src/diffinsights_web/apps/contributors.py
@@ -1,12 +1,17 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+from typing import Optional
+
+import pandas as pd
 import panel as pn
 
 import diffinsights_web.utils.notifications as notifications
-from diffinsights_web.datastore.timeline import TimelineDataStore, find_dataset_dir
+from diffinsights_web.datastore.timeline import TimelineDataStore, find_dataset_dir, author_timeline_df
 from diffinsights_web.utils.notifications import onload_callback
-from diffinsights_web.views.dataexplorer import TimelineJSONViewer, TimelinePerspective, TimelineDataFrameEnum
-from diffinsights_web.views.info import ContributorsHeader
+from diffinsights_web.views.authorsgrid import AuthorInfo, AuthorsGrid
+from diffinsights_web.views.dataexplorer import TimelineJSONViewer, TimelinePerspective, TimelineDataFrameEnum, \
+    perspective_pane
+from diffinsights_web.views.info import ContributorsHeader, RepoPlotHeader
 from diffinsights_web.views.plots.timeseries import TimeseriesPlot
 from diffinsights_web.widgets.caching import ClearCacheButton
 
@@ -33,6 +38,26 @@
     column_name=page_header.select_contribution_type_widget,
     from_date_str=page_header.select_period_from_widget,
 )
+timeseries_plot_header = RepoPlotHeader(
+    freq=data_store.resample_frequency_widget,
+    column_name=page_header.select_contribution_type_widget,
+    plot=timeseries_plot,
+)
+#authors_info_panel = AuthorInfo(
+#    data_store=data_store,
+#    authors_info_df=timeseries_plot.authors_info_df_rx,
+#)
+top_n_widget = pn.widgets.Select(
+    name="top N",
+    options=[4, 10, 32],
+    value=4,
+)
+authors_grid = AuthorsGrid(
+    data_store=data_store,
+    main_plot=timeseries_plot,
+    authors_info_df=timeseries_plot.authors_info_df_rx,
+    top_n=top_n_widget,
+)
 
 # Create the dashboard layout
 template = pn.template.MaterialTemplate(
@@ -41,15 +66,26 @@
     favicon="favicon.svg",
     sidebar=[
         data_store,
-        pn.layout.Divider(),
+        #*authors_info_panel.widgets(),
+        top_n_widget,
+
+        pn.layout.Divider(),  # - - - - - - - - - - - - -
+
         timeseries_plot.select_plot_theme_widget,
         ClearCacheButton(),
     ],
     main=[
         pn.Column(
             page_header,
         ),
-        timeseries_plot,
+        pn.Card(
+            pn.Column(
+                timeseries_plot_header,
+                timeseries_plot,
+            ),
+            collapsible=False, hide_header=True,
+        ),
+        authors_grid,
     ],
 )
 timeline_perspective = TimelinePerspective(data_store=data_store)
@@ -60,6 +96,16 @@
         ('data', timeline_perspective.panel(TimelineDataFrameEnum.TIMELINE_DATA)),
         ('resampled', timeline_perspective.panel(TimelineDataFrameEnum.RESAMPLED_DATA)),
         ('by author+resampled', timeline_perspective.panel(TimelineDataFrameEnum.BY_AUTHOR_DATA)),
+        (
+            'authors info',
+            perspective_pane(
+                df=timeseries_plot.authors_info_df_rx,
+                title=pn.rx("Authors info for repo={repo!r}, from={from_date!r}") \
+                    .format(repo=data_store.select_repo_widget,
+                            from_date=page_header.select_period_from_widget)
+            )
+        ),
+        #('selected author', authors_info_panel),
     ),
 ])
 

diff --git a/src/diffinsights_web/datastore/timeline.py b/src/diffinsights_web/datastore/timeline.py
@@ -8,7 +8,6 @@
 
 from diffinsights_web.utils.notifications import warning_notification
 
-
 DATASET_DIR = 'data/examples/stats'
 
 
@@ -144,6 +143,25 @@ def add_pm_count_perc(resampled_df: pd.DataFrame,
         elif col.startswith('-:'):
             resampled_df.loc[:, col_perc] = resampled_df[col] / resampled_df['-:count']
 
+    for col in pm_count_cols:
+        if col in {'-:count', '+:count'}:  # '-:count' or '+:count'
+            continue
+
+        # previous loop ensured that both "-:<column>" and "+:<column>" exists
+        if col.startswith('-:'):  # we need only one of those
+            continue
+
+        col_base = col[2:]  # remove "+:" prefix
+        col_base_perc = f"{col_base} [%]"
+        if col_base_perc in resampled_df.columns:
+            # print(f"  SKIP {col_base_perc}")
+            continue
+
+        resampled_df.loc[:, col_base_perc] = (
+                (resampled_df[f"-:{col_base}"] + resampled_df[f"+:{col_base}"]) /
+                (resampled_df['-:count'] + resampled_df['+:count'])
+        )
+
     #print(f"  returned DataFrame(<{hex(id(resampled_df))}>)")
     return resampled_df
 
@@ -185,6 +203,68 @@ def resample_timeline(timeline_df: pd.DataFrame,
     return add_pm_count_perc(df_agg, pm_count_cols)
 
 
+def author_timeline_df(resample_by_author_df: pd.DataFrame, author_id: str) -> pd.DataFrame:
+    """Select the part of per-author data that belongs to a single author
+
+    :param resample_by_author_df: dataframe indexed (on its first index level)
+        by author identifier; presumably the per-author resampled timeline,
+        verify against the caller
+    :param author_id: index label of the author to extract
+    :return: result of the label-based lookup `.loc[author_id]`
+    """
+    selected_rows = resample_by_author_df.loc[author_id]
+    return selected_rows
+
+
+@pn.cache
+def get_date_range(timeline_df: pd.DataFrame, from_date_str: str):
+    """Compute the (start, end) range of author dates in the timeline
+
+    The start of the range is the earliest 'author_date', unless a non-empty
+    `from_date_str` parses to a later date, in which case that date is used.
+    The end of the range is always the latest 'author_date'.
+
+    :param timeline_df: dataframe with an 'author_date' column
+    :param from_date_str: lower bound for the range, parsed day-first
+        and as UTC; empty value means "no lower bound"
+    :return: tuple of (start of range, end of range)
+    """
+    # TODO: create reactive component or bound function to compute from_date to avoid recalculations
+    # TODO: use parsed `from_date` instead of using raw `from_date_str`
+    author_dates = timeline_df['author_date']
+    range_start = author_dates.min()
+    range_end = author_dates.max()
+
+    if from_date_str:
+        requested_start = pd.to_datetime(from_date_str, dayfirst=True, utc=True)
+        range_start = max(range_start, requested_start)
+
+    return range_start, range_end
+
+
+@pn.cache
+def get_value_range(timeline_df: pd.DataFrame, column: str = 'n_commits'):
+    """Compute the (minimum, maximum) of values in the given column
+
+    :param timeline_df: dataframe to examine
+    :param column: name of the column whose extremes are wanted
+    :return: tuple of (smallest value, largest value) in `column`
+    """
+    values = timeline_df[column]
+    lowest, highest = values.min(), values.max()
+    return lowest, highest
+
+
+# NOTE: consider putting the filter earlier in the pipeline (needs profiling / benchmarking?)
+# TODO: replace `from_date_str` (raw string) with `from_date` (parsed value)
+def filter_df_by_from_date(resampled_df: pd.DataFrame,
+                           from_date_str: str,
+                           date_column: Optional[str] = None) -> pd.DataFrame:
+    """Keep only rows dated at or after the date given by `from_date_str`
+
+    The dataframe is returned unchanged if `from_date_str` is empty,
+    if it cannot be parsed as a date, or if `date_column` has a dtype
+    that is neither datetime nor numeric; in the last two cases a warning
+    notification is issued.
+
+    :param resampled_df: dataframe to filter
+    :param from_date_str: lower bound for dates, in DD.MM.YYYY format
+        (parsed day-first, as UTC); empty value turns filtering off
+    :param date_column: name of the column holding the dates; when None,
+        the index of `resampled_df` is compared against the date instead.
+        Datetime columns are compared directly, numeric columns are assumed
+        to hold UNIX timestamps (seconds since epoch).
+    :return: filtered dataframe, or `resampled_df` itself if not filtered
+    """
+    if not from_date_str:
+        return resampled_df
+
+    try:
+        # the `from_date_str` is in DD.MM.YYYY format
+        from_date = pd.to_datetime(from_date_str, dayfirst=True, utc=True)
+    except ValueError as err:
+        # NOTE: should not happen, value should be validated earlier
+        warning_notification(f"from={from_date_str!r} is not a valid date: {err}")
+        return resampled_df
+
+    if date_column is None:
+        return resampled_df[resampled_df.index >= from_date]
+
+    date_values = resampled_df[date_column]
+    # NOTE: this must check for *datetime*, not timedelta: durations cannot be
+    # compared with a point in time, and datetime columns would be rejected
+    if pd.api.types.is_datetime64_any_dtype(date_values):
+        return resampled_df[date_values >= from_date]
+    if pd.api.types.is_numeric_dtype(date_values):
+        # assume numeric date column is UNIX timestamp
+        return resampled_df[date_values >= from_date.timestamp()]
+
+    warning_notification(f"unsupported type {resampled_df.dtypes[date_column]!r} "
+                         f"for column {date_column!r}")
+    return resampled_df
+
+
 # mapping from display name to frequency alias
 # see table in https://pandas.pydata.org/docs/user_guide/timeseries.html#dateoffset-objects
 time_series_frequencies = {
@@ -204,6 +284,37 @@ def resample_timeline(timeline_df: pd.DataFrame,
 }
 
 
+def authors_info_df(timeline_df: pd.DataFrame,
+                    column: str = 'n_commits',
+                    from_date_str: str = '') -> pd.DataFrame:
+    """Summarize contributions per author, sorted by the selected column
+
+    Rows are grouped by 'author.email'; every column named by the keys of
+    `agg_func_mapping()` is summed, and the most common 'author.name'
+    is selected for each author.
+
+    :param timeline_df: timeline data; needs 'author.email', 'author.name'
+        and 'author.timestamp' columns, together with the columns named
+        by the keys of `agg_func_mapping()`
+    :param column: column to sort by (descending); falls back to the first
+        key of `agg_func_mapping()` if it is not one of them
+    :param from_date_str: if not empty, only rows at or after this date
+        are taken into account (see `filter_df_by_from_date()`)
+    :return: dataframe indexed by 'author.email', with '+:count', '-:count'
+        and 'author.name' renamed to 'p_count', 'm_count' and 'author_name'
+    """
+    def most_common_name(names: pd.Series):
+        # `Series.mode()` returns *all* most common values (sorted); using it
+        # directly as aggregation would give an array instead of a single
+        # name when there is a tie, so take the first one
+        # https://stackoverflow.com/questions/15222754/groupby-pandas-dataframe-and-select-most-common-value
+        modes = names.mode()
+        return modes.iloc[0] if not modes.empty else None
+
+    info_columns = list(agg_func_mapping().keys())
+
+    # sanity check
+    if column not in info_columns:
+        column = info_columns[0]
+
+    filtered_df = filter_df_by_from_date(timeline_df, from_date_str,
+                                         date_column='author.timestamp')
+
+    df = filtered_df\
+        .groupby(by='author.email')[info_columns + ['author.name']]\
+        .agg({
+            col: 'sum' for col in info_columns
+        } | {
+            'author.name': most_common_name,
+        })\
+        .sort_values(by=column, ascending=False)\
+        .rename(columns={
+            '+:count': 'p_count',
+            '-:count': 'm_count',
+            'author.name': 'author_name',
+        })
+
+    return df
+
+
 class TimelineDataStore(pn.viewable.Viewer):
     dataset_dir = param.Foldername(
         constant=True,

diff --git a/src/diffinsights_web/utils/avatars.py b/src/diffinsights_web/utils/avatars.py
@@ -0,0 +1,25 @@
+import hashlib
+from urllib.parse import urlencode
+
+import panel as pn
+
+
+@pn.cache
+def gravatar_url(email: str, size: int = 16) -> str:
+    """Construct the URL of the Gravatar image for the given e-mail address
+
+    :param email: e-mail address of the user; surrounding whitespace
+        and letter case do not matter
+    :param size: requested size of the (square) image, in pixels
+    :return: URL of the avatar image
+    """
+    # https://docs.gravatar.com/api/avatars/python/
+
+    # Gravatar hashes the address after trimming surrounding whitespace
+    # and lowercasing it; skipping either step gives a different avatar
+    normalized_email = email.strip().lower()
+
+    # Generate the SHA256 hash of the normalized email
+    email_hash = hashlib.sha256(normalized_email.encode('utf-8')).hexdigest()
+
+    # https://docs.gravatar.com/api/avatars/images/
+    # Construct the URL with encoded query parameters
+    query_params = urlencode({'s': str(size)})  # NOTE: will be needed for 'd' parameter
+    url = f"https://www.gravatar.com/avatar/{email_hash}?{query_params}"
+
+    return url
diff --git a/src/diffinsights_web/utils/humanize.py b/src/diffinsights_web/utils/humanize.py
@@ -0,0 +1,24 @@
+"""Provide human-readable value, together with machine-readable HTML metadata/microdata"""
+import os
+
+import pandas as pd
+
+
+def html_date_humane(date: pd.Timestamp) -> str:
+    """Format date as HTML `<time>` element with human-readable contents
+
+    The text is the day of month (without leading zero), abbreviated
+    month name, and year, for example "6 Nov 2024"; the `datetime` attribute
+    holds the value in ISO 8601 format.
+
+    :param date: the date (timestamp) to format
+    :return: HTML fragment, `<time datetime="...">...</time>`
+    """
+    # NOTE: use `date.day` rather than platform-specific '%-d' / '%#d'
+    # to get the day of month without zero-padding; '%b' is the abbreviated
+    # month name ('%a' would be the name of the day of the week)
+    date_text = f"{date.day} {date.strftime('%b %Y')}"
+
+    return f'<time datetime="{date.isoformat()}">{date_text}</time>'
+
+
+def html_int_humane(val: int) -> str:
+    """Format integer as HTML `<data>` element with digits grouped in thousands
+
+    Groups of digits are separated by a thin space; the `value` attribute
+    holds the plain, machine-readable number.
+
+    :param val: the number to format
+    :return: HTML fragment, `<data value="...">...</data>`
+    """
+    # Unicode thin space (breakable in HTML), &thinsp;
+    # NOTE: written as an escape, because the literal character cannot be
+    # told apart from an ordinary space and is easily lost in copy'n'paste
+    thousands_sep = "\u2009"
+
+    res = f'{val:,}'.replace(",", thousands_sep)
+
+    return f'<data value="{val}" style="white-space: nowrap;">{res}</data>'