Skip to content

Commit

Permalink
Merge pull request #54 from ncusi/visualization - headers and authors…
Browse files Browse the repository at this point in the history
… grid

Visualization: Add "type.<line kind> [%]", main plot header, and authors grid
  • Loading branch information
jnareb authored Nov 26, 2024
2 parents 2200c2c + 52774cb commit e0f18f0
Show file tree
Hide file tree
Showing 8 changed files with 513 additions and 74 deletions.
56 changes: 51 additions & 5 deletions src/diffinsights_web/apps/contributors.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,17 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from typing import Optional

Check failure on line 3 in src/diffinsights_web/apps/contributors.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F401)

src/diffinsights_web/apps/contributors.py:3:20: F401 `typing.Optional` imported but unused

import pandas as pd

Check failure on line 5 in src/diffinsights_web/apps/contributors.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F401)

src/diffinsights_web/apps/contributors.py:5:18: F401 `pandas` imported but unused
import panel as pn

import diffinsights_web.utils.notifications as notifications
from diffinsights_web.datastore.timeline import TimelineDataStore, find_dataset_dir
from diffinsights_web.datastore.timeline import TimelineDataStore, find_dataset_dir, author_timeline_df

Check failure on line 9 in src/diffinsights_web/apps/contributors.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F401)

src/diffinsights_web/apps/contributors.py:9:86: F401 `diffinsights_web.datastore.timeline.author_timeline_df` imported but unused
from diffinsights_web.utils.notifications import onload_callback
from diffinsights_web.views.dataexplorer import TimelineJSONViewer, TimelinePerspective, TimelineDataFrameEnum
from diffinsights_web.views.info import ContributorsHeader
from diffinsights_web.views.authorsgrid import AuthorInfo, AuthorsGrid

Check failure on line 11 in src/diffinsights_web/apps/contributors.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F401)

src/diffinsights_web/apps/contributors.py:11:48: F401 `diffinsights_web.views.authorsgrid.AuthorInfo` imported but unused
from diffinsights_web.views.dataexplorer import TimelineJSONViewer, TimelinePerspective, TimelineDataFrameEnum, \
perspective_pane
from diffinsights_web.views.info import ContributorsHeader, RepoPlotHeader
from diffinsights_web.views.plots.timeseries import TimeseriesPlot
from diffinsights_web.widgets.caching import ClearCacheButton

Expand All @@ -33,6 +38,26 @@
column_name=page_header.select_contribution_type_widget,
from_date_str=page_header.select_period_from_widget,
)
timeseries_plot_header = RepoPlotHeader(
freq=data_store.resample_frequency_widget,
column_name=page_header.select_contribution_type_widget,
plot=timeseries_plot,
)
#authors_info_panel = AuthorInfo(
# data_store=data_store,
# authors_info_df=timeseries_plot.authors_info_df_rx,
#)
top_n_widget = pn.widgets.Select(
name="top N",
options=[4, 10, 32],
value=4,
)
authors_grid = AuthorsGrid(
data_store=data_store,
main_plot=timeseries_plot,
authors_info_df=timeseries_plot.authors_info_df_rx,
top_n=top_n_widget,
)

# Create the dashboard layout
template = pn.template.MaterialTemplate(
Expand All @@ -41,15 +66,26 @@
favicon="favicon.svg",
sidebar=[
data_store,
pn.layout.Divider(),
#*authors_info_panel.widgets(),
top_n_widget,

pn.layout.Divider(), # - - - - - - - - - - - - -

timeseries_plot.select_plot_theme_widget,
ClearCacheButton(),
],
main=[
pn.Column(
page_header,
),
timeseries_plot,
pn.Card(
pn.Column(
timeseries_plot_header,
timeseries_plot,
),
collapsible=False, hide_header=True,
),
authors_grid,
],
)
timeline_perspective = TimelinePerspective(data_store=data_store)
Expand All @@ -60,6 +96,16 @@
('data', timeline_perspective.panel(TimelineDataFrameEnum.TIMELINE_DATA)),
('resampled', timeline_perspective.panel(TimelineDataFrameEnum.RESAMPLED_DATA)),
('by author+resampled', timeline_perspective.panel(TimelineDataFrameEnum.BY_AUTHOR_DATA)),
(
'authors info',
perspective_pane(
df=timeseries_plot.authors_info_df_rx,
title=pn.rx("Authors info for repo={repo!r}, from={from_date!r}") \
.format(repo=data_store.select_repo_widget,
from_date=page_header.select_period_from_widget)
)
),
#('selected author', authors_info_panel),
),
])

Expand Down
113 changes: 112 additions & 1 deletion src/diffinsights_web/datastore/timeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

from diffinsights_web.utils.notifications import warning_notification


DATASET_DIR = 'data/examples/stats'


Expand Down Expand Up @@ -144,6 +143,25 @@ def add_pm_count_perc(resampled_df: pd.DataFrame,
elif col.startswith('-:'):
resampled_df.loc[:, col_perc] = resampled_df[col] / resampled_df['-:count']

for col in pm_count_cols:
if col in {'-:count', '+:count'}: # '-:count' or '+:count'
continue

# previous loop ensured that both "-:<column>" and "+:<column>" exists
if col.startswith('-:'): # we need only one of those
continue

col_base = col[2:] # remove "+:" prefix
col_base_perc = f"{col_base} [%]"
if col_base_perc in resampled_df.columns:
# print(f" SKIP {col_base_perc}")
continue

resampled_df.loc[:, col_base_perc] = (
(resampled_df[f"-:{col_base}"] + resampled_df[f"+:{col_base}"]) /
(resampled_df['-:count'] + resampled_df['+:count'])
)

#print(f" returned DataFrame(<{hex(id(resampled_df))}>)")
return resampled_df

Expand Down Expand Up @@ -185,6 +203,68 @@ def resample_timeline(timeline_df: pd.DataFrame,
return add_pm_count_perc(df_agg, pm_count_cols)


def author_timeline_df(resample_by_author_df: pd.DataFrame, author_id: str) -> pd.DataFrame:
    """Select the part of `resample_by_author_df` labelled with `author_id`.

    This is a plain label lookup (`.loc[author_id]`) on the frame's index.

    NOTE(review): presumably the frame is indexed by author first, e.g. by
    an (author, date) MultiIndex - confirm against the caller.

    :param resample_by_author_df: dataframe whose index contains `author_id`
    :param author_id: index label of the author to select
    :return: the selection for the given author
    """
    author_rows = resample_by_author_df.loc[author_id]
    return author_rows


@pn.cache
def get_date_range(timeline_df: pd.DataFrame, from_date_str: str):
    """Compute the (start, end) pair of dates from the 'author_date' column.

    The end is always the latest 'author_date'.  The start is the earliest
    'author_date', unless a non-empty `from_date_str` is given, in which
    case the later of the two (earliest date, parsed `from_date_str`) is used.

    :param timeline_df: dataframe with an 'author_date' column
    :param from_date_str: start date as a string, parsed day-first and
        as UTC; an empty string means "no lower limit"
    :return: tuple of (start date, end date)
    """
    # TODO: create reactive component or bound function to compute from_date to avoid recalculations
    # TODO: use parsed `from_date` instead of using raw `from_date_str`
    author_dates = timeline_df['author_date']
    earliest = author_dates.min()
    latest = author_dates.max()

    if from_date_str:
        requested_start = pd.to_datetime(from_date_str, dayfirst=True, utc=True)
        # never start before the first date actually present in the data
        earliest = max(earliest, requested_start)

    return earliest, latest


@pn.cache
def get_value_range(timeline_df: pd.DataFrame, column: str = 'n_commits'):
    """Return the smallest and the largest value found in `column`.

    :param timeline_df: dataframe that contains `column`
    :param column: name of the column to examine, 'n_commits' by default
    :return: tuple of (minimum, maximum) of `timeline_df[column]`
    """
    values = timeline_df[column]
    lowest, highest = values.min(), values.max()
    return lowest, highest


# NOTE: consider putting the filter earlier in the pipeline (needs profiling / benchmarking?)
# TODO: replace `from_date_str` (raw string) with `from_date` (parsed value)
def filter_df_by_from_date(resampled_df: pd.DataFrame,
                           from_date_str: str,
                           date_column: Optional[str] = None) -> pd.DataFrame:
    """Keep only the rows of `resampled_df` dated at or after `from_date_str`.

    The date is taken from the index when `date_column` is None, otherwise
    from the given column.  A date column may be either of datetime dtype
    (compared directly with the parsed date) or numeric (assumed to hold
    UNIX timestamps, compared with the parsed date's timestamp).

    The dataframe is returned unfiltered, after a warning notification,
    when `from_date_str` cannot be parsed or when `date_column` has
    an unsupported dtype.  An empty `from_date_str` means "no filtering".

    NOTE(review): `from_date_str` is parsed as a UTC-aware timestamp, so
    the index / datetime column is expected to be timezone-aware as well.

    :param resampled_df: dataframe to filter
    :param from_date_str: start date in DD.MM.YYYY format, or empty string
    :param date_column: name of the column holding the dates;
        None means that the dates are in the index
    :return: filtered dataframe, or `resampled_df` itself if not filtered
    """
    if not from_date_str:
        return resampled_df

    try:
        # the `from_date_str` is in DD.MM.YYYY format
        from_date = pd.to_datetime(from_date_str, dayfirst=True, utc=True)
    except ValueError as err:
        # NOTE: should not happen, value should be validated earlier
        warning_notification(f"from={from_date_str!r} is not a valid date: {err}")
        return resampled_df

    if date_column is None:
        return resampled_df[resampled_df.index >= from_date]

    dates = resampled_df[date_column]
    # a date can only be compared against datetime values, not timedeltas
    if pd.api.types.is_datetime64_any_dtype(dates):
        return resampled_df[dates >= from_date]
    if pd.api.types.is_numeric_dtype(dates):
        # assume numeric date column is UNIX timestamp
        return resampled_df[dates >= from_date.timestamp()]

    warning_notification(f"unsupported type {resampled_df.dtypes[date_column]!r} "
                         f"for column {date_column!r}")
    return resampled_df


# mapping from display name to frequency alias
# see table in https://pandas.pydata.org/docs/user_guide/timeseries.html#dateoffset-objects
time_series_frequencies = {
Expand All @@ -204,6 +284,37 @@ def resample_timeline(timeline_df: pd.DataFrame,
}


def _most_common_value(values: pd.Series):
    """Return one most frequent value of `values`, or None if there is none.

    `Series.mode()` returns *all* the tied most frequent values (sorted),
    and an empty result when there are no non-missing values; reduce that
    to a single scalar so that it is usable as an aggregation result.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else None


def authors_info_df(timeline_df: pd.DataFrame,
                    column: str = 'n_commits',
                    from_date_str: str = '') -> pd.DataFrame:
    """Summarize contributions per author, sorted by `column` descending.

    Rows of `timeline_df` are first filtered by `from_date_str` (using the
    'author.timestamp' column), then grouped by 'author.email'.  Every
    column named by the keys of `agg_func_mapping()` is summed per author,
    and the most common 'author.name' is selected as the author's name.

    In the result '+:count', '-:count' and 'author.name' are renamed to
    'p_count', 'm_count' and 'author_name', respectively.

    :param timeline_df: per-commit data, with 'author.email', 'author.name'
        and 'author.timestamp' columns, and the columns to be summed
    :param column: column to sort by; if it is not one of the summed
        columns, the first of the summed columns is used instead
    :param from_date_str: start date, passed to `filter_df_by_from_date()`;
        empty string by default
    :return: dataframe indexed by 'author.email', one row per author
    """
    info_columns = list(agg_func_mapping().keys())

    # sanity check
    if column not in info_columns:
        column = info_columns[0]

    filtered_df = filter_df_by_from_date(timeline_df, from_date_str,
                                         date_column='author.timestamp')

    agg_spec = {col: 'sum' for col in info_columns}
    # https://stackoverflow.com/questions/15222754/groupby-pandas-dataframe-and-select-most-common-value
    # NOTE: bare `pd.Series.mode` yields multiple values on ties
    agg_spec['author.name'] = _most_common_value

    df = (
        filtered_df
        .groupby(by='author.email')[info_columns + ['author.name']]
        .agg(agg_spec)
        .sort_values(by=column, ascending=False)  # sort before renaming
        .rename(columns={
            '+:count': 'p_count',
            '-:count': 'm_count',
            'author.name': 'author_name',
        })
    )

    return df


class TimelineDataStore(pn.viewable.Viewer):
dataset_dir = param.Foldername(
constant=True,
Expand Down
25 changes: 25 additions & 0 deletions src/diffinsights_web/utils/avatars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import hashlib
from urllib.parse import urlencode

import panel as pn


@pn.cache
def gravatar_url(email: str, size: int = 16) -> str:
    """Return the URL of the Gravatar avatar image for `email`.

    The e-mail address is normalized the way Gravatar requires before
    hashing: surrounding whitespace is trimmed and all characters are
    lower-cased; the normalized address is then hashed with SHA256.

    :param email: e-mail address of the avatar's owner
    :param size: requested image size, passed as the 's' query parameter
    :return: URL of the avatar image
    """
    # https://docs.gravatar.com/api/avatars/python/

    # Trim and lowercase the email (both are required by Gravatar),
    # then encode it to bytes
    email_encoded = email.strip().lower().encode('utf-8')

    # Generate the SHA256 hash of the email
    email_hash = hashlib.sha256(email_encoded).hexdigest()

    # https://docs.gravatar.com/api/avatars/images/
    # Construct the URL with encoded query parameters
    query_params = urlencode({'s': str(size)})  # NOTE: will be needed for 'd' parameter

    return f"https://www.gravatar.com/avatar/{email_hash}?{query_params}"
24 changes: 24 additions & 0 deletions src/diffinsights_web/utils/humanize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Provide human-readable value, together with machine-readable HTML metadata/microdata"""
import os

import pandas as pd


def html_date_humane(date: pd.Timestamp) -> str:
    """Format `date` as a human-readable HTML `<time>` element.

    The visible text is "<day> <abbreviated month name> <year>", for
    example "5 Nov 2024", with the day of the month not zero-padded where
    the platform provides a format code for that.  The machine-readable
    value (the `datetime` attribute) is `date.isoformat()`.

    :param date: the date to format
    :return: HTML fragment with the `<time>` element
    """
    # '%b' is the abbreviated month name ('%a' would be the weekday name)
    date_format = '%d %b %Y'
    # the flag that turns off zero-padding of the day is platform-specific
    if os.name == 'nt':
        date_format = '%#d %b %Y'
    elif os.name == 'posix':
        date_format = '%-d %b %Y'

    return f'<time datetime="{date.isoformat()}">{date.strftime(date_format)}</time>'


def html_int_humane(val: int) -> str:
    """Format the integer `val` as a human-readable HTML `<data>` element.

    The visible text has groups of thousands separated by a thin space,
    for example "1 234 567"; the machine-readable value (the `value`
    attribute) is the plain number.  The element is styled so that the
    number is not broken across lines.

    :param val: the number to format
    :return: HTML fragment with the `<data>` element
    """
    # Unicode thin space (breakable in HTML), &thinsp;
    # spelled as an escape, because the literal character is invisible
    # and indistinguishable from an ordinary space
    thousands_sep = "\u2009"

    res = f'{val:,}'.replace(",", thousands_sep)

    return f'<data value="{val}" style="white-space: nowrap;">{res}</data>'
Loading

0 comments on commit e0f18f0

Please sign in to comment.