diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..62519cd
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+_site
+.sass-cache
+.jekyll-cache
+.jekyll-metadata
+vendor
+.bundle
diff --git a/404.html b/404.html
new file mode 100644
index 0000000..086a5c9
--- /dev/null
+++ b/404.html
@@ -0,0 +1,25 @@
+---
+permalink: /404.html
+layout: default
+---
Page not found :(
The requested page could not be found.

Finally I've decided to start blogging!
-
For some time I've been procrastinating about starting to write down my thoughts, experiments & experiences --- giving myself the excuse of a busy schedule. But being unable to do something due to lack of time is most probably a sign of time mismanagement.
-
-
So I started planning how I'm going to spend my week --- just like sprint planning, where on each Saturday evening I sketch out the things I'm going to do that week, with day-level granularity. After following this ritual for the last 5 months, I've discovered it's not only easy to accommodate lots of tasks in 86,400 seconds, but also to accomplish 100% of them if planned properly. I see this planning as an art, where there's nothing optimal; rather it's a gradual progression towards betterment, where failure is a friendly teacher ✅.
-
-
When I started following this ritual, I used to sketch the whole plan in my head. After a few weeks I decided to use pencil & paper, and was able to plan better --- I was accomplishing >90% of what I planned.
-
-
During this period, I taught myself how to wake up early in the morning. All it takes is keeping the alarm out of the bed's reach, so that I'm forced to get out of bed to stop the alarm, rather than snoozing it. In the last 5 months, I've worked on 3 large-scale production-grade open-source software systems. Maybe some other day I'll talk about them. Reading more blogs, listening to podcasts and reading books are among the things I've become better at doing. I've seen improvement in the way I manage my tasks at the workplace --- completing more good quality tasks in less time. Empowered by better time management, I've seen improvement in my relationship with other human beings & nature ( it's important 😉 ).
-
-
But the more important part is understanding that scheduling is an art --- collecting feedback every day, accumulating it over the week & sitting down on Saturday evening to make a better plan --- is an infinite loop.

I've noticed getting inside the infinite loop is hard, but once you're in, it's beautiful here 🥰. Speaking from my heart: as I'm writing this, I feel writing is indeed impactful.
-
-
- Have a great time !
-
Last week I implemented multiple variants of the highly parallelizable cryptographic hash function BLAKE3 using SYCL, and today I'd like to present the collective understanding I gained while implementing & benchmarking BLAKE3, targeting heterogeneous accelerator platform(s). The BLAKE3 cryptographic hash function lends itself well to data parallel execution environments like SYCL/ OpenCL. Speaking from a high level design point of view, it consists of the following two steps.

1. Split the input byte array into 1024 -bytes wide chunks and compress each chunk independently, producing one output chaining value ( a leaf node ) per chunk.
2. Construct a Binary Merkle Tree over those leaf nodes, pairwise compressing chaining values until the root of the tree is reached.
-
The root of the tree ( 32 -bytes wide ) is the desired cryptographic hash of the input byte array. Both of these steps are good candidates for data parallelism. Note, step-1 produces N -many leaf nodes of the Binary Merkle Tree, which are used for finding the root of the Merkle Tree in step-2, i.e. step-2 is data dependent on step-1.
-
-
In this document, I'll be working with an input byte array of length M -bytes such that M = N * 1024, where N = 2^i, i = {1, 2, 3 ...}. That means after execution of step-1 of BLAKE3, I should have a power-of-2 count of leaf nodes ( = N ), which will be used for computing the root of a fully balanced Binary Merkle Tree. This will simplify both explanation & implementation. I'll walk you through the following two techniques of implementing BLAKE3.

1. approach_1 : each SYCL work-item compresses a single 1024 -bytes chunk.
2. approach_2 : each SYCL work-item compresses multiple ( 2/ 4/ 8/ 16 ) chunks together, SIMD style.

Let me start with the first approach, which is simpler.
-
-
Let us assume I've an 8KB input, which I take as a byte array ( say const sycl::uchar * ) and split into 8 equal sized chunks. Now each of these 1024 -bytes wide chunks can be compressed in parallel. For doing so, I'll dispatch 8 work-items, with work-group size W ( W <= 8 && 8 % W == 0 ), where each work-item executes the compress( ... ) function, consuming a 1024 -bytes input message into the hash state. Once all these 8 work-items complete their execution, each of them outputs a 64 -bytes chaining value ( which is actually the BLAKE3 hash state matrix of that chunk ), from which the first 32 -bytes are taken as the output chaining value of that chunk. These output chaining values are used as leaf nodes of the Binary Merkle Tree, which I'm about to construct.
-
-
- In final step of computation, I construct a Binary Merkle Tree from N ( = 8 ) output chaining values.
- As Binary Merkle Tree is a hierarchical structure, I need to dispatch multiple rounds of kernels, respecting data dependency.
- To be more specific, in this case 3 ( = log2(N), where N = 8 ) rounds will be required.
- In first dispatch round, I'll dispatch 4 work-items, who will read ( total ) 4 consecutive pairs of output chaining values and interpret each pair of chaining values as left and
- right child of ( to be computed ) parent node, placed right next to each other ( in ltr order ), as depicted below.
-
- Computing parent node involves compressing a pair of chaining values, while setting some flags denoting that parent chaining value will be output of compress( ... ), - where each chaining value is of 32 -bytes, - making total of 64 -bytes input to compress( ... ). After completion of this dispatch round, we should have 4 parent nodes, who live just above leaf nodes. - In next dispatch round, I've to ask for 2 work-items, each will compress two consecutive chaining values ( which were computed during last round ) - and produce total 2 parent nodes, who live just below root of the tree ( to be computed in next dispatch round ). -
-
In the final round, it suffices to dispatch just a single task which takes 64 -bytes of input ( read two chaining values, which are the two immediate children of the root of the tree; these were computed during the last round --- thus the data dependency ) and produces the 32 -bytes output chaining value which is the root of the Merkle Tree. This root is our desired BLAKE3 hash. Also note, before the root of the tree can be computed, the flag denoting that the output of this compress( ... ) invocation is the root chaining value of the BLAKE3 Merkle Tree needs to be set.
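For illustration only, the round-by-round merging described above can be sketched on the host side like this ( a minimal Go sketch, not the actual SYCL kernels; compressParent is a hypothetical stand-in for compress( ... ) invoked with the parent/ root flags ):

```go
// merkleRoot halves the node count in every round: "work-item" i merges
// nodes 2i and 2i+1 into one parent, until only the root remains.
func merkleRoot(leaves [][32]byte,
	compressParent func(left, right [32]byte, isRoot bool) [32]byte) [32]byte {

	nodes := leaves
	for len(nodes) > 1 {
		parents := make([][32]byte, len(nodes)/2)
		for i := range parents {
			// the ROOT flag only matters in the very last round
			parents[i] = compressParent(nodes[2*i], nodes[2*i+1], len(nodes) == 2)
		}
		nodes = parents
	}
	return nodes[0]
}
```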
-
-
- A pictorial demonstration might be helpful at this moment.
-
- Empowered with this high level knowledge of algorithmic construction of BLAKE3, it's good time to dive into often mentioned compress( ... ) - function. Simply speaking compression starts with 32 -bytes input chaining value and 64 -bytes input message, it consumes whole message into BLAKE3 hash state - ( in multiple rounds, while also employing message permutation, using predefined indexing tricks ) - and produces output of 64 -bytes, which is nothing but hash state after consuming whole input message inside it. First 32 -bytes of output is taken as chaining value - which is either used as input to next stage of computation or as final root chaining value i.e. BLAKE3 digest. Let's emphasize on BLAKE3 hash state. -
-- BLAKE3 hash state is 64 -bytes wide; as BLAKE3 word size is 32 -bit, hash state can be represented using an array of 16 elements where each element is 32 -bit wide - unsigned integer i.e. sycl::uint. When compressing 64 -bytes message, BLAKE3 consumes input message in 7 rounds, while at end of each round ( except last one ) - permutes 64 -bytes message in a predefined way. At end of applying 7 rounds, it takes first 32 -bytes of hash state, which has now consumed permuted variants of 64 -bytes input - message, as output chaining value. Each round of BLAKE3 compression consists of bit wise manipulation of 32 -bit wide hash state words. -
Note the indices passed as arguments to the g( ... ) function from blake3_round( ... ): the first four g( ... ) invocations mix the first eight message words with the hash state column-wise, while the last four g( ... ) invocations mix the remaining eight message words with the hash state diagonally. It's possible to reduce the four vertical mixing invocations into a single function call, where all four columns are mixed in parallel, if I represent the hash state as an array of 4 vectors ( a SYCL intrinsic ), where each vector is of type sycl::uint4, as shown below. With this new representation of the hash state, diagonal mixing also enjoys a boost, where all four diagonals of the hash state matrix can be mixed in parallel.
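For readers who want the indexing spelled out, here's a scalar sketch of g( ... ) and blake3_round( ... ) in Go ( written per my reading of the BLAKE3 specification; the actual implementation is vectorized SYCL C++, so treat this as reference pseudocode rather than the kernel itself ):

```go
import "math/bits"

// g is the BLAKE3 quarter-round: it mixes two message words mx, my into the
// four state words indexed by a, b, c, d ( rotation constants 16, 12, 8, 7 ).
func g(state *[16]uint32, a, b, c, d int, mx, my uint32) {
	state[a] = state[a] + state[b] + mx
	state[d] = bits.RotateLeft32(state[d]^state[a], -16)
	state[c] = state[c] + state[d]
	state[b] = bits.RotateLeft32(state[b]^state[c], -12)
	state[a] = state[a] + state[b] + my
	state[d] = bits.RotateLeft32(state[d]^state[a], -8)
	state[c] = state[c] + state[d]
	state[b] = bits.RotateLeft32(state[b]^state[c], -7)
}

// blake3Round applies one round: four column mixes consuming message words
// 0..7, followed by four diagonal mixes consuming message words 8..15.
func blake3Round(state *[16]uint32, m *[16]uint32) {
	g(state, 0, 4, 8, 12, m[0], m[1])
	g(state, 1, 5, 9, 13, m[2], m[3])
	g(state, 2, 6, 10, 14, m[4], m[5])
	g(state, 3, 7, 11, 15, m[6], m[7])

	g(state, 0, 5, 10, 15, m[8], m[9])
	g(state, 1, 6, 11, 12, m[10], m[11])
	g(state, 2, 7, 8, 13, m[12], m[13])
	g(state, 3, 4, 9, 14, m[14], m[15])
}
```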
-- With this new representation of hash state column-wise mixing looks like below. -
-
- But keeping hash state as 4x4 matrix comes with its own requirement, where it needs to be diagonalised such that each diagonal of 4x4 matrix is now in same column, before
- diagonal mixing can be applied. After diagonal mixing, hash state needs to get back to its original form, which calls for undoing the diagonalisation previously performed.
- Diagonalisation involves rotating each of four vectors leftwards by row index of respective vector i.e. {0, 1, 2, 3} in 4x4 state matrix. Note, vector lane rotation doesn't rotate
- each lane content ( sycl::uint ), instead it rotates whole vector by N ( < 4, because each row has 4 lanes ) places.
- I make use of vector swizzle operators provided by SYCL vector intrinsic API for rotating vector.
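The SYCL implementation does these rotations with sycl::uint4 swizzles; as a plain scalar sketch of the same idea ( illustrative only ):

```go
// rotateLanes returns the row rotated left by n lanes; lane contents are untouched.
func rotateLanes(row [4]uint32, n int) [4]uint32 {
	var out [4]uint32
	for i := 0; i < 4; i++ {
		out[i] = row[(i+n)%4]
	}
	return out
}

// diagonalize rotates row i of the 4x4 state left by i lanes, lining each
// diagonal up in a single column; undiagonalize rotates the rows back.
func diagonalize(state *[4][4]uint32) {
	for i := 1; i < 4; i++ {
		state[i] = rotateLanes(state[i], i)
	}
}

func undiagonalize(state *[4][4]uint32) {
	for i := 1; i < 4; i++ {
		state[i] = rotateLanes(state[i], 4-i)
	}
}
```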
-
-
- A pictorial depiction looks like below.
-
- Note the color coding, which shows how diagonalisation helps in bringing each of 4 diagonals of 4x4 hash state matrix in same column. This makes applying diagonal mixing - much easier ( and faster ) on 128 -bit vectors i.e. sycl::uint4. -
-- Following code snippet can perform diagonal mixing on four 128 -bit vectors i.e. sycl::uint4[4]. This will consume last 8 message words of total 64 -bytes - input message into hash state. Note, other than which message words are consumed, diagonal mixing is just same as column-wise mixing, because - we've arranged columns to be so. This means, in implementation both of these mixings can be replaced using preprocessor directives or other compile-time - code generation means. -
-- After diagonal mixing diagonalisation will be undone, rotating 4x4 state matrix vectors rightwards by row index of respective vector i.e. {0, 1, 2, 3} in 4x4 state matrix. Following code snippet should bring back - hash state is desired form, preparing it for next round. -
During compression of the 64 -bytes message input, after each round of mixing, the sixteen message words ( 16 x 4 = 64 -bytes total message ) are permuted in the following manner, and the permuted output is used as the input message words of the next round.
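Expressed as code ( the permutation constant below is taken from my reading of the BLAKE3 specification; a small Go sketch, not the SYCL kernel ):

```go
// msgPermutation is the fixed BLAKE3 message word permutation applied between rounds.
var msgPermutation = [16]int{2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}

// permuteWords rearranges the sixteen 32-bit message words for the next round.
func permuteWords(m *[16]uint32) {
	var out [16]uint32
	for i, idx := range msgPermutation {
		out[i] = m[idx]
	}
	*m = out
}
```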
Now let us go back to chunk compression, where we had a 1024 -bytes input ( as sycl::uchar * ); each compress( ... ) invocation takes 64 contiguous bytes to mix with the 4x4 hash state matrix, which means we've to iterate 16 times for processing the whole chunk. Each of these 64 -bytes pieces is called a block, and 16 of them make a chunk. For the first block in a chunk, a predefined constant input chaining value is used, but all subsequent 15 blocks use the previous block's 32 -bytes output chaining value as their input chaining value. Note, the BLAKE3 hash state is 64 -bytes and the input chaining value is 32 -bytes, so the remaining 32 -bytes of the hash state ( i.e. the last two rows of the 4x4 hash state matrix ) come from predefined constants & other parameters passed to compress( ... ), which include flags denoting whether this block is the first/ last of this chunk or whether the output chaining value will be a parent/ root node of the BLAKE3 Merkle Tree, the block length, the chunk index etc.. After applying compress( ... ), the first 32 -bytes of the hash state are taken as the output chaining value, to be used as the input chaining value of the next block in the same chunk. Below is a pictorial demonstration for ease of understanding.
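Alongside that picture, the per-chunk loop can be summarised with the following sketch ( Go, for illustration; compressFn stands in for the compress( ... ) routine above, and the flag values are the CHUNK_START/ CHUNK_END bits from the BLAKE3 specification ):

```go
// Flag bits marking the first and the last block of a chunk, per the BLAKE3 spec.
const (
	chunkStart uint32 = 1 << 0
	chunkEnd   uint32 = 1 << 1
)

// compressChunk consumes one 1024-byte chunk, 64 bytes ( one block ) at a time.
// iv is the predefined constant input chaining value used for the first block;
// every later block chains on the previous block's output chaining value.
func compressChunk(chunk []byte, chunkIndex uint64, iv [8]uint32,
	compressFn func(cv [8]uint32, block []byte, counter uint64, blockLen, flags uint32) [8]uint32) [8]uint32 {

	cv := iv
	for block := 0; block < 16; block++ {
		flags := uint32(0)
		if block == 0 {
			flags |= chunkStart
		}
		if block == 15 {
			flags |= chunkEnd
		}
		cv = compressFn(cv, chunk[block*64:(block+1)*64], chunkIndex, 64, flags)
	}
	return cv // output chaining value of this chunk, a Merkle Tree leaf
}
```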
-- Last block of each chunk produces 32 -bytes output chaining value, which is considered to be leaf node of Binary Merkle Tree. After all chunks are compressed & we've - N -many output chaining values, interpreted as leaf nodes of Binary Merkle Tree. Usual parallel Binary Merkle Tree construction algorithm can be applied. Note, for - merging two consecutive chaining values ( at leaf level ) into single parent chaining value, compress( ... ) function is invoked with a predefined constant input - chaining value, along with two consecutive chaining values being interpreted as 64 -bytes input message. Some input flags are passed to denote that its output chaining value will be a parent node. - While computing root chaining value ( read target digest of input byte array ), two immediate child nodes just below root are compressed - into single chaining value, while passing some flags to denote this is root node being computed. -
-
- It's good time to give second approach a go, where each SYCL work-item compresses more than one chunk.
-
-
- Remember in first approach of parallel BLAKE3, I used a 4x4 matrix of 64 -bytes for representing hash state, when compressing each block of total 1024 -bytes wide chunk.
- But this time, hash state is represented using a 16 x N matrix, where N = {2, 4, 8, 16} and ith row of state matrix holds N -many different chunk's hash state
- word at index i; so N -many different chunk's hash states are represented in N -many columns of 16 x N shared state matrix. That means, with N = 4, sixteen 128 -bit
- vectors will be used for representing whole hash state of 4 chunks. Programmatically I'd like to represent it using following syntax.
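The original post shows the SYCL declaration ( sixteen sycl::uint4 values for N = 4 ) at this point; a rough, hypothetical equivalent of that memory layout, written in Go just to make the indexing explicit:

```go
// lanes is the number of chunks compressed together by one work-item ( N = 4 here ).
const lanes = 4

// state[i][j] holds word i of chunk j's hash state, i.e. every chunk's hash
// state occupies one column of this 16 x lanes matrix.
var state [16][lanes]uint32
```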
-
With N ( = 4 ) chunks being compressed together, each SYCL work-item mixes a total of 4096 -bytes of input message into the hash state, each 1024 -bytes chunk split into 16 blocks, each 64 -bytes wide. Sixteen rounds are required for compressing those 4096 -bytes. In each round, the i-th block of all N chunks is compressed together. Note the color coding used in the following demonstration, where I attempt to show you how message words ( 32 -bit wide ) from each block are chosen to construct 128 -bit vectors ( using sycl::uint4 ), which are used during column-wise and diagonal mixing. I'd also like you to note that there are no diagonalisation and undiagonalisation steps required in this SIMD style mixing, because each chunk's hash state is actually a 16 word vector, which is a column of the 16 x N state matrix. After the first block is processed, which consumes 64 -bytes of message from each of the four chunks ( i.e. the first block of each chunk ), the output chaining values of the four chunks are prepared by taking the top 8 x N portion of the state matrix, while the lower 8 x N portion ( read the last 8 rows ) is dropped. This should produce a 32 -bytes output chaining value for each chunk, which will be used as the input chaining value for that respective chunk when processing block i+1 of all N ( = 4 ) chunks.
-
-
- After all sixteen blocks from all chunks are compressed into hash state, 16 x N state matrix is truncated to 8 x N matrix ( by dropping last 8 rows ),
- which holds N -many output chaining values of N -many chunks. These N -many output chaining values are considered as N -many leaf nodes of BLAKE3 Merkle Tree,
- which will be constructed once all work-items complete compressing N -many chunks each.
-
-
- Binary Merklization algorithm doesn't anyhow change in second approach.
-
- Note, when N = 2, sixteen 64 -bit wide SIMD registers are used for representing hash state of two chunks, which are compressed in parallel. Similarly, for N = {4, 8, 16}
- sixteen {128, 256, 512} -bit registers ( respectively ) will be used for representing hash state of N chunks. On modern CPUs which support avx512* instructions
- 512 -bit vectors can help boosting this style of leveraging arbitrary many SIMD lanes.
-
-
- For understanding opportunities of using SIMD for parallelizing BLAKE3 on relatively large input byte arrays, I suggest you take a look at BLAKE3
- specification's
- section 5.3.
-
As you've now a better understanding of the two aforementioned approaches for computing the BLAKE3 hash, I'd like to present you with benchmark results. In the following tables, you'll see I'm taking a random input of N -bytes; transferring the whole input to the accelerator's accompanying memory; invoking the BLAKE3 kernel with the on-device data pointer; waiting for all computation steps to complete; and finally transferring the 32 -bytes digest ( which is the output chaining value of the root node of the Binary Merkle Tree in BLAKE3 ) to preallocated memory on the host. Note, all these numbers represent mean values obtained after executing the same kernel with the same input size/ arguments K ( = 8 ) -many times.
BLAKE3 Hash using approach_1

| Input Size | Accelerator | Kernel Execution Time | Host -> Device Tx Time | Host <- Device Tx Time |
| --- | --- | --- | --- | --- |
| 64 MB | Tesla V100-SXM2-16GB | 844.598250 us | 6.166145 ms | 6.973250 us |
| 64 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 6.239875 ms | 9.797500 ms | 2.525625 us |
| 64 MB | Intel(R) Iris(R) Xe MAX Graphics | 4.974242 ms | 17.749401 ms | 1.319500 us |
| 128 MB | Tesla V100-SXM2-16GB | 1.800964 ms | 12.269974 ms | 7.080000 us |
| 128 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 8.187520 ms | 20.664062 ms | 1.242000 us |
| 128 MB | Intel(R) Iris(R) Xe MAX Graphics | 9.812348 ms | 35.475108 ms | 1.319500 us |
| 256 MB | Tesla V100-SXM2-16GB | 3.267731 ms | 24.462952 ms | 6.805500 us |
| 256 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 8.853032 ms | 32.455801 ms | 1.047125 us |
| 256 MB | Intel(R) Iris(R) Xe MAX Graphics | 19.465823 ms | 70.886068 ms | 1.293500 us |
| 512 MB | Tesla V100-SXM2-16GB | 5.998047 ms | 48.833740 ms | 6.713750 us |
| 512 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 14.807205 ms | 48.242437 ms | 1.063000 us |
| 512 MB | Intel(R) Iris(R) Xe MAX Graphics | 39.271700 ms | 141.716997 ms | 1.313000 us |
| 1024 MB | Tesla V100-SXM2-16GB | 11.915527 ms | 97.573730 ms | 8.423000 us |
| 1024 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 22.864140 ms | 79.047688 ms | 1.088500 us |
| 1024 MB | Intel(R) Iris(R) Xe MAX Graphics | 77.556440 ms | 283.341799 ms | 1.534000 us |
In the above table, you should see the three accelerators I targeted for benchmarking BLAKE3 SIMD approach_1, where two of them are GPUs from two different vendors and one is a 64 -core CPU from Intel. You'll notice Nvidia's Tesla V100 GPU performs best on all input sizes. Being a 64 -core CPU, in the majority of cases the Xeon performs better compared to Intel's Iris Xe MAX GPU. Note, when the input size is 64 MB, the Intel GPU performs a little better than the 64 -core CPU. But as the input size increases, kernel execution time on the Intel GPU starts to increase quickly, though on the Intel CPU the execution times for the 64 MB, 128 MB and 256 MB input sizes are pretty close to each other.
-
-
As the output size is constant ( read 32 -bytes ), the device to host data transfer cost is not much of a concern. But since the input data size is variable, the host to device input data transfer cost can help us answer whether it's worth transferring a large byte array to an accelerator for computing the BLAKE3 hash.
-
-
Comparing between multiple accelerators ( with the same input size ), it shows that as the input data size increases, the host to device data transfer cost increases quickly for the GPUs ( even surpassing the input data transfer cost on the CPU for the same size ), which makes sense because those accelerators are connected to the host over a PCIe bus. When comparing the input data transfer cost of Nvidia's GPU and Intel's CPU, I see that below 512 MB of input the cost was lower for the GPU, but at 512 MB both of them take around the same time. For both GPUs from the two different vendors, I see their host to device data transfer cost increases linearly as the input size is doubled, because both of them are connected to the host CPU using PCIe, which doesn't have high bandwidth. Due to these relatively high input data transfer costs, it may not always be beneficial to use this accelerated BLAKE3 implementation, where data needs to be explicitly transferred to the accelerator's local DRAM --- it may end up defeating the whole purpose of speeding up. Just to make it more evident, notice in the above table that for a 1 GB input size on the Nvidia Tesla V100 GPU, the input transfer is ~8x costlier than the actual computation of the BLAKE3 hash.
-
-
- Lastly I'd like to draw your attention to device to host data transfer cost ( transferring 32 -bytes digest back to host ), where
- you should notice, on Nvidia's Tesla V100 GPU it's ~(6 - 7)x more expensive to transfer 32 -bytes ( over PCIe ) to host, when compared to
- Intel's GPU.
-
- In approach_2 of computing BLAKE3, I'm compressing {2, 4, 8, 16} chunks together & in following table I present kernel execution time and host <-> device data transfer - cost, by timing SYCL events obtained as result of submitting respective commands on SYCL queue, which has profiling enabled. -
BLAKE3 Hash using approach_2, compressing {2, 4, 8, 16} chunks together

| Input Size | Accelerator | SIMD Width | Kernel Execution Time | Host -> Device Tx Time | Host <- Device Tx Time |
| --- | --- | --- | --- | --- | --- |
| 64 MB | Tesla V100-SXM2-16GB | 64 -bit | 1.016358 ms | 6.172363 ms | 7.568375 us |
| 64 MB | Tesla V100-SXM2-16GB | 128 -bit | 923.828375 us | 6.168457 ms | 7.323875 us |
| 64 MB | Tesla V100-SXM2-16GB | 256 -bit | 1.318848 ms | 6.168945 ms | 7.812625 us |
| 64 MB | Tesla V100-SXM2-16GB | 512 -bit | 2.055176 ms | 6.176270 ms | 10.254000 us |
| 64 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 64 -bit | 7.193866 ms | 13.689200 ms | 4.531500 us |
| 64 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 128 -bit | 6.739462 ms | 14.008103 ms | 2.967625 us |
| 64 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 256 -bit | 7.261953 ms | 14.829467 ms | 2.978000 us |
| 64 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 512 -bit | 11.546031 ms | 13.229008 ms | 1.385125 us |
| 64 MB | Intel(R) Iris(R) Xe MAX Graphics | 64 -bit | 3.106389 ms | 17.748458 ms | 1.365000 us |
| 64 MB | Intel(R) Iris(R) Xe MAX Graphics | 128 -bit | 28.628951 ms | 17.749823 ms | 1.332500 us |
| 64 MB | Intel(R) Iris(R) Xe MAX Graphics | 256 -bit | 56.188691 ms | 17.748861 ms | 1.326000 us |
| 64 MB | Intel(R) Iris(R) Xe MAX Graphics | 512 -bit | 105.559818 ms | 17.749823 ms | 1.365000 us |
| 256 MB | Tesla V100-SXM2-16GB | 64 -bit | 3.539550 ms | 24.455078 ms | 7.080250 us |
| 256 MB | Tesla V100-SXM2-16GB | 128 -bit | 4.190674 ms | 24.442871 ms | 7.080000 us |
| 256 MB | Tesla V100-SXM2-16GB | 256 -bit | 5.203370 ms | 24.459961 ms | 7.568250 us |
| 256 MB | Tesla V100-SXM2-16GB | 512 -bit | 13.925293 ms | 24.453369 ms | 7.568375 us |
| 256 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 64 -bit | 10.928828 ms | 33.915237 ms | 967.625000 ns |
| 256 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 128 -bit | 8.854166 ms | 32.901272 ms | 976.500000 ns |
| 256 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 256 -bit | 10.290331 ms | 33.643110 ms | 1.030125 us |
| 256 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 512 -bit | 18.790299 ms | 33.722702 ms | 966.125000 ns |
| 256 MB | Intel(R) Iris(R) Xe MAX Graphics | 64 -bit | 11.941254 ms | 70.892191 ms | 1.326000 us |
| 256 MB | Intel(R) Iris(R) Xe MAX Graphics | 128 -bit | 110.007846 ms | 70.894857 ms | 1.339000 us |
| 256 MB | Intel(R) Iris(R) Xe MAX Graphics | 256 -bit | 245.655891 ms | 70.883748 ms | 1.391000 us |
| 256 MB | Intel(R) Iris(R) Xe MAX Graphics | 512 -bit | 475.246200 ms | 70.886621 ms | 1.332500 us |
| 1024 MB | Tesla V100-SXM2-16GB | 64 -bit | 11.715087 ms | 97.482910 ms | 9.765625 us |
| 1024 MB | Tesla V100-SXM2-16GB | 128 -bit | 12.184326 ms | 97.552734 ms | 8.300750 us |
| 1024 MB | Tesla V100-SXM2-16GB | 256 -bit | 18.732911 ms | 97.577148 ms | 7.812250 us |
| 1024 MB | Tesla V100-SXM2-16GB | 512 -bit | 52.898436 ms | 97.524170 ms | 8.056625 us |
| 1024 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 64 -bit | 35.084335 ms | 76.414034 ms | 903.625000 ns |
| 1024 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 128 -bit | 25.805447 ms | 79.800968 ms | 1.052625 us |
| 1024 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 256 -bit | 28.765474 ms | 80.076494 ms | 1.017875 us |
| 1024 MB | Intel(R) Xeon(R) Platinum 8358 CPU @ 2.60GHz | 512 -bit | 61.698307 ms | 75.633475 ms | 1.107500 us |
| 1024 MB | Intel(R) Iris(R) Xe MAX Graphics | 64 -bit | 47.515533 ms | 283.342299 ms | 1.319500 us |
| 1024 MB | Intel(R) Iris(R) Xe MAX Graphics | 128 -bit | 431.324205 ms | 283.344665 ms | 1.306500 us |
| 1024 MB | Intel(R) Iris(R) Xe MAX Graphics | 256 -bit | 938.669251 ms | 283.345946 ms | 2.164500 us |
| 1024 MB | Intel(R) Iris(R) Xe MAX Graphics | 512 -bit | 1.843786 s | 283.342267 ms | 2.164500 us |
In the above table, the host <-> device data transfer cost is not something I'm interested in; instead I want you to notice how changing the SIMD lane count from 2 to 16 ( by doubling ) affects kernel execution time on a given accelerator with a given input size. For example, let us zoom into the rows where the input size is 256 MB: on the Tesla V100 GPU, as the number of SIMD lanes used is doubled ( i.e. the #-of chunks being compressed together by each SYCL work-item is doubled ), the kernel execution time roughly doubles. Now notice how kernel execution performs on the Intel Iris Xe MAX GPU for a 256 MB input, using both approach_{1, 2} ( with SIMD lanes {2, 4, 8, 16} ). With approach_1, kernel execution takes ~19 ms, but when 2 chunks are compressed together ( using approach_2 ), resources are better utilized, resulting in an improved kernel execution time of ~12 ms. But kernel execution becomes ~9x costlier as soon as 4 chunks are compressed together, because now each work-item compresses 4 chunks in parallel, while representing the hash state using sixteen 128 -bit wide vectors. Thus each work-item ends up using too many resources ( read register file entries for keeping the hash state ), which results in register spilling --- portions of the hash state are now placed in far distant and high latency global memory. Accessing global memory involves going through multiple memory hierarchies i.e. global memory and two levels of caches etc., each of which adds its own latency. Beyond compressing 4 chunks together, as I keep increasing the #-of chunks to compress together ( say 8/ 16 ), the execution time also keeps doubling. Notice, for a 1024 MB input size on the Intel Iris Xe MAX GPU, when 2 chunks are compressed together the kernel execution time is ~47 ms, but when 4 chunks are compressed together the execution time suffers heavily --- increasing ~9x ! This trend follows, as kernel execution time is doubled and quadrupled ( compared to compressing two chunks together ) when 8 and 16 chunks are compressed in parallel, respectively.
-
-
Now let us explore whether a similar kind of behaviour is visible when compressing multiple chunks together on the Intel CPU. Let me begin with a 64 MB input: when only a single chunk is compressed by each SYCL work-item ( read approach_1 ), kernel execution time is ~6 ms. As I increase the #-of chunks being compressed together, the execution time stays almost the same until we reach SIMD_LANE_COUNT = 16. Due to the limited number of 512 -bit registers, not many work-items can execute at a time, which is reflected when sixteen 512 -bit registers are used for representing the hash state, compressing 16 chunks in parallel. When I look at both the 256 MB & 1024 MB input sizes, I see a similar pattern, where compressing 4 chunks together performs better than compressing {2, 8, 16} chunks together. When 4 chunks are compressed together, sixteen 128 -bit vectors are used by each SYCL work-item for representing the hash state. Now there are 32 of these 512 -bit register files on each core of this machine, and sixteen 128 -bit vectors fit into four 512 -bit registers, which means on each core 8 SYCL work-items should be able to proceed in parallel without any register spilling.
-
- With these benchmark results in mind, I'll say, approach_1 of BLAKE3, where each SYCL work-item compresses single chunk performs much better ( almost always ) compared - to approach_2, where it's possible to compress 2/ 4/ 8/ 16 chunks in parallel, by each SYCL work-item. -
-
- For sake of reproducibility and future reference, I keep blake3's SYCL implementation here.
- Here I've presented benchmark results of only three accelerators, but you can find more of them for different input sizes here.
- You may be interested in BLAKE3 implementation approach_{1, 2}, which you can find in this file.
-
-
Note, this implementation might be helpful when hashing large input sizes, but whether it really pays off when the host -> device data transfer cost is much higher than the kernel execution time is an important question.
-
-
If you happen to be interested in using BLAKE3 for constructing a Merkle Tree, you may check out my other project. Binary Merklization using BLAKE3 is much easier, as it requires me to hash only a 64 -bytes input and produce a 32 -bytes digest, which is an intermediate node of the Merkle Tree. This kind of hashing is called 2-to-1 hashing, where two BLAKE3 digests are concatenated & hashed. You may notice, 2-to-1 hashing is simpler ( and cheaper ) because I need to compress only one chunk, which has only one block in itself. And that chunk is the root node of BLAKE3's internal Merkle Tree, meaning BLAKE3's internal Merklization requirement is not a requirement anymore. I need to make just a single call to the compress( ... ) function with proper flags to denote that this is the only node in BLAKE3's internal Merkle Tree --- compute-wise cheap ! So for generating all intermediate nodes of a Binary Merkle Tree when N -many ( read N is a power of 2 ) leaf nodes are provided ( read each leaf node is a BLAKE3 digest ), using the BLAKE3 hash, the above linked implementation can come in handy. Benchmark details of the Binary Merklization implementation using BLAKE3 for different input sizes on different accelerator platforms can be found here. While we're at it, it can be interesting to take a look at ( and compare ) the Binary Merklization implementation using the Rescue Prime hash, which I wrote a few weeks ago. It's clearly visible how performant BLAKE3 is compared to Rescue Prime, when the Binary Merklization benchmarks are compared !
-
- Have a great time ! -
-
-
Some time ago I started working on pub0sub - Fast, Light-weight, Ordered Pub/Sub System --- built on top of async I/O, leveraging the power of the kernel event loop.

The main idea behind it was to write a piece of software ( along with an SDK ) which can be used for publishing arbitrary length binary messages to N-many topics; subscribing to N-many topics --- listening for messages published on each of them; and last but not least, one powerful Pub/Sub Hub ( i.e. Router ) which easily solves C10K by leveraging the power of async I/O.
-
-
- The aforementioned problem statement is solved, which is why I decided to update problem statement.
- Now it looks like pub0sub - Distributed, Fast, Light-weight, Ordered Pub/Sub
- System --- solving
- C1M easily while leveraging power of kernel event loop & p2p networking.
-
-
By making pub0sub distributed, I get to handle 1M concurrent connections, where nodes form a mesh network for chatting about topic interest(s) & forwarding messages when needed --- a collaborative effort among peers. I chose to use libp2p for networking purposes, for being so modular --- enabling easy horizontal scalability, while taking care of stream multiplexing, security, peer-discovery etc.
-
-
- Here I propose primary design of system !
-
- Multiple pub0sub nodes can discover & connect to each other using DHT - ( distributed hash table ) - powered peer discovery mechanism, built right into libp2p - and eventually form a mesh network. If network has N participant(s), each participant is going to - maintain - connection with other N-1 peer(s), where N > 0. These participants of p2p network are going to chat - with each - other over bi-directional stream. Things nodes need to talk about 👇 -
| Operation | Interpretation |
| --- | --- |
| Topic subscription | Letting peers know of interest in some topics |
| Topic subscription ACK | Peer saying it has noted it down & will forward published messages if it sees any |
| Topic unsubscription | Announcing it is not interested in those topics anymore |
| Topic unsubscription ACK | Peer saying it has removed the entry & will no longer forward published messages |
| Published message forwarding | Passing a published message to an interested peer |
| Periodic heartbeat | Network health check |
- As each of aforementioned operations require to pass different message formats, I'm going to define - respective wire formats. But before I get into wire format, writing to/ reading from stream - I'd like to spend some time in going through high level overview of network operation. -
-
Say two nodes form a cluster --- one node has a topic_1 subscriber connected to it, while the other one has a publisher connected to it, willing to publish messages on topic_1. After the first node finds out it has one subscriber interested in messages from topic_1, it asks its peer 0hub node to inform it whenever that peer sees any message targeted at topic_1. The publisher then sends a publish intent to the network, which triggers an event saying the network has received some message on topic_1 for which the first node has an interested subscriber. The two nodes chat over the p2p network, resulting in message forwarding, which enables the first node to deliver the message published on topic_1 to its subscriber.
-
-
- When noticed carefully, network follows certain protocols
-
- Let's take another scenario. -
-
- Continuing previous scenario, after sometime subscriber doesn't anymore want to receive
- messages published on topic_1, so it sends unsubscription intent to network. As a result
- of it, respective 0hub node decides to broadcast same to network, because it found it doesn't have any other
- subscribers who're interested in messages of topic_1. All peers who kept record of this node
- being interested in topic_1, updates their respective interest table, ensuring when in future
- it receives message published on topic_1, it won't forward to first peer.
-
-
This way of showing interest in topics when a peer has some subscribers to feed, or announcing lack of interest when all subscribers of a certain topic unsubscribe, allows the network to pass published messages only when needed, eventually consuming less bandwidth. I call it Lazy Pushing.
-
- With more peers, network interaction may look like 👇 from high level, where 0hub nodes - form p2p mesh network, other participants are mere clients. -
-- Say, one subscriber shows interest in receiving messages from {topic_1, topic_2, topic_3} - but the 0hub node it's connected to doesn't have any publisher - of any of those topics. As soon as 0hub node learns it has subscriber - to feed messages of - {topic_1, topic_2, topic_3}, following protocol it announces that intent to other peers. - Each of other peers record it & as soon as they receive any message published on any of these topics - they forward those to respective peers. -
-- I'll now spend some time in specifying wire-format of messages exchanged between peers. -
-- Each message exchanged between peers over p2p layer, needs to have two parts -
-- Just by reading header part receiver must be able to understand two things -
-- First question can be answered by checking very first byte of message. Each operation - is denoted by unique opcode. There're 255 possible opcodes, though only 6 of them are in use as of - now. -
| Interpretation | Opcode |
| --- | --- |
| Heartbeat | 1 |
| Topic subscription | 2 |
| Topic subscription ACK | 3 |
| Topic unsubscription | 4 |
| Topic unsubscription ACK | 5 |
| Message Forward | 6 |
By reading the next 4 bytes of the header, the receiver understands how many more bytes it should read from the stream so that it can successfully deserialise the message, depending upon the opcode. Each opcode denotes a different message wire-format, resulting in the invocation of different deserialisation logic upon reception.
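A minimal sketch of that header read in Go ( illustrative; the byte order of the length field is my assumption, not something the proposal pins down ):

```go
import (
	"encoding/binary"
	"io"
)

// readHeader reads the fixed 5-byte header: a 1-byte opcode followed by a
// 4-byte body length, telling the receiver how many more bytes to consume.
func readHeader(r io.Reader) (opcode byte, bodyLen uint32, err error) {
	var hdr [5]byte
	if _, err = io.ReadFull(r, hdr[:]); err != nil {
		return 0, 0, err
	}
	return hdr[0], binary.BigEndian.Uint32(hdr[1:]), nil
}
```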
-
-
- Above is a high level wire-format, which is applicable for each of messages. But I'd like
- to define how BODY of message is serialised/ deserialised for different opcodes. Starting
- with
- how it looks like when announcing interest in listening to some topics.
-
- Note, above image is nothing but magnification of message BODY when opcode ∈ {2, 4}.
- Requirement
- is peer needs to announce it wants to receive all messages published on topics, because it has
- some subscribers interested in those. Receiver side when reading from stream, knows how many
- bytes it needs to read from stream for completely consuming BODY.
-
-
It starts by reading the first 1-byte, which encodes how many of the next bytes it should read to form one meaningful topic name. Now it has either consumed all bytes of the BODY or some of them are left. If some are left, it'll again consume 1-byte, carrying the instruction for figuring out the next topic name. This way, it'll keep reading until it has exhausted all bytes of the BODY. By the end it must have successfully constructed a structured object in the respective environment, containing the topics some peer wants to get notified of.
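That decode loop, sketched in Go ( illustrative only; the real SDK types differ ):

```go
import "errors"

// readTopics decodes the BODY of a subscription/ unsubscription message
// ( opcode 2 or 4 ): a sequence of ( 1-byte length, topic name ) pairs,
// consumed until the BODY is exhausted.
func readTopics(body []byte) ([]string, error) {
	var topics []string
	for len(body) > 0 {
		l := int(body[0])
		if l > len(body)-1 {
			return nil, errors.New("malformed topic list")
		}
		topics = append(topics, string(body[1:1+l]))
		body = body[1+l:]
	}
	return topics, nil
}
```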
-
-
- Similar wire-format is followed for serialising BODY when announcing lack of interest in some
- topics.
-
-
Both of the aforementioned opcodes expect to hear back with ACK messages i.e. opcode ∈ {3, 5}, where the BODY can be encoded by putting a binary value denoting success/ failure. These are expected to be received within a stipulated time window after the interest ( opcode 2 ) / lack of interest ( opcode 4 ) message is sent to a peer. If not received, for opcode 2, the message will be resent up to N-times. If still not received, it results in connection termination with that peer for not following the protocol.
-
-
- But if peer is waiting for ACK of message with opcode 4, it doesn't resend, because of being low
- priority. Of course
- it might result into network wasting some bandwidth for passing some published message which could have been
- avoided. If any forwarded message
- from any topic to which peer is not interested in, is received even after lack of interest message
- was broadcast ( opcode 4 )
- it can be ignored by receiver. Receiving peer also sends another message to respective peer with
- opcode 4, stating it's
- not interested in these topics --- just like repeating self. This is done so that next time network can save some bandwidth.
-
-
- Finally I'll cover how to serialise/ deserialise forwarded message to/ from stream.
-
Start by reading the first byte of the BODY, which encodes how many topics this message is targeted to. A message can be targeted to 255 topics at max. The receiver now knows how many topics it should be reading from the stream. So it starts by reading the next 1 byte, encoding the first topic's byte length; it then knows how many of the next bytes to read for figuring out the first topic name. It has just read one topic name. Similarly it'll continue reading more topic names until all are read off. After N topic names are read, it'll read 4 bytes, encoding how many of the next bytes it needs to read for extracting the actual message content.
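A compact Go sketch of that deserialisation ( again illustrative; byte order assumed ):

```go
import (
	"encoding/binary"
	"errors"
)

// readForwarded decodes the BODY of a forwarded message ( opcode 6 ): a 1-byte
// topic count, ( 1-byte length, name ) per topic, then a 4-byte data length
// followed by the payload.
func readForwarded(body []byte) (topics []string, data []byte, err error) {
	if len(body) < 1 {
		return nil, nil, errors.New("short body")
	}
	count := int(body[0])
	body = body[1:]
	for i := 0; i < count; i++ {
		if len(body) < 1 || int(body[0]) > len(body)-1 {
			return nil, nil, errors.New("malformed topic name")
		}
		l := int(body[0])
		topics = append(topics, string(body[1:1+l]))
		body = body[1+l:]
	}
	if len(body) < 4 {
		return nil, nil, errors.New("short body")
	}
	dataLen := int(binary.BigEndian.Uint32(body[:4]))
	body = body[4:]
	if dataLen != len(body) {
		return nil, nil, errors.New("data length mismatch")
	}
	return topics, body, nil
}
```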
-
-
- Eventually it'll reach end, constructing structured data by consuming stream. This is how forwarded
- messages
- are recovered from stream by some peer who showed interest in getting notified when some message is
- published
- on topics of interest. After getting structured data, receipient 0hub
- node can
- send message to subscribers connected to it directly, interested in any of topics this message is
- published on.
-
-
- Reader may notice, a slight difference in encoding variable number topic list, between
- previous two diagrams.
- When encoding to be forwarded message ( opcode 6 ), peer encodes topic count in first 1-byte of
- BODY part of message.
- This is required, otherwise during deserialisation receiver won't be able to understand where in
- stream it should stop
- reading topic names & start reading 4-byte lengthy actual message content's length field.
-
-
- But same is not required for message sent with opcode ∈ {2, 4}, because there's nothing more to read
- after topic name list
- and receiver already knows length of BODY part of message, so it knows how long to read from
- stream.
-
- Let's go through one example
-
-
Say a 0hub peer wants to announce its interest in messages published on topic_1, topic_2. The serialised message for this operation looks like
-
| Message Part | Field Name | Field Byte Length | Field Value |
| --- | --- | --- | --- |
| Header | Opcode | 1 | 2 |
| Header | Body Length | 4 | 16 |
| Body | Topic-1 Length | 1 | 7 |
| Body | Topic-1 Name | 7 | topic_1 |
| Body | Topic-2 Length | 1 | 7 |
| Body | Topic-2 Name | 7 | topic_2 |
21 bytes of data are to be sent to each peer, resulting in (N-1) * 21 bytes of data broadcast in total, where N > 0 & N is the #-of participants in the mesh network.
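The serialiser producing those 21 bytes could be sketched like this ( illustrative Go; the actual SDK may differ, and the byte order of the length field is assumed ):

```go
import "encoding/binary"

// encodeTopicIntent serialises a subscription ( opcode 2 ) or unsubscription
// ( opcode 4 ) message for the given topics, matching the example above.
func encodeTopicIntent(opcode byte, topics []string) []byte {
	var body []byte
	for _, t := range topics {
		body = append(body, byte(len(t)))
		body = append(body, t...)
	}

	var lenField [4]byte
	binary.BigEndian.PutUint32(lenField[:], uint32(len(body)))

	msg := append([]byte{opcode}, lenField[:]...)
	return append(msg, body...)
}

// encodeTopicIntent(2, []string{"topic_1", "topic_2"}) yields 5 + 16 = 21 bytes.
```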
-
-
- Similarly by following aforementioned example, message of lack of interest to topics ( opcode 4 )
- can be published on network.
-
-
- Finally I'll go through one last example showing serialisation of to be forwarded message i.e.
- opcode 6.
- Assuming this message is published on topic_1, topic_2 & content of message is
- hello.
-
| Message Part | Field Name | Field Byte Length | Field Value |
| --- | --- | --- | --- |
| Header | Opcode | 1 | 6 |
| Header | Body Length | 4 | 26 |
| Body | Topic Count | 1 | 2 |
| Body | Topic-1 Length | 1 | 7 |
| Body | Topic-1 Name | 7 | topic_1 |
| Body | Topic-2 Length | 1 | 7 |
| Body | Topic-2 Name | 7 | topic_2 |
| Body | Data Length | 4 | 5 |
| Body | Data | 5 | hello |
This results in sending 31 bytes of data to each of those peers which showed interest in topic_1, topic_2 --- not to all N-1 remaining participants of the mesh network. Lazy Pushing at work.
-
- Peers need to periodically send heartbeat messages for checking health of long-lived network
- connections
- to other peers. Opcode 1 is reserved for this purpose, where BODY of message ∈ {ping, pong}.
-
-
- If reader has covered whole proposal, they probably understand this is by no means a final version
- of design.
- Improvements like not forming strongly connected mesh helps in reducing huge bandwidth cost --- can
- be taken into consideration to further
- enhance protocol. Message authentication can be added so that peers only accept connection request
- from other peers
- who are trusted, when such setup is desired.
-
-
- Existing pub0sub implementation is here.
-
- Your feedback will be invaluable. Have a great time ! -
-
A few weeks back I started working on pub0sub - Fast, Light-weight, Ordered Pub/Sub System, leveraging the power of the kernel event loop, addressing C10K while running on a consumer grade machine.

pub0sub can easily handle > 10k concurrent connections even on a consumer grade machine, because it doesn't follow the conventional way of writing TCP servers in Go. Generally, one go-routine accepts TCP connections & spawns a new go-routine for handling each connection throughout its lifetime. This way, if the objective is to handle > 10k concurrent connections, there are > 10k go-routines. The Go scheduler needs to perform expensive context switching for running go-routines on the underlying OS threads. For > 10k go-routines the cost of context switching is pretty high, while no useful work gets accomplished. Also the stack memory requirement for > 10k go-routines is not negligible.
-
-
Avoiding the aforementioned path helps in discovering another potential way, where I can ask the kernel event loop to watch file descriptors of interest & only inform me when some action needs to be taken. At any time either of two completion events can happen on a socket { READ, WRITE } --- pre-scheduled reading or writing on the socket has been completed, giving an opportunity to act on it & schedule the next operation. There are no more > 10k go-routines, rather only 2 go-routines --- one used for listening & accepting TCP connections; another for watching & responding to I/O events.
-
-
- It'll be perfectly okay to
- add more watcher go-routines ( static, done at system startup phase ), each managing its own kernel event loop and watching some
- delegated sockets. But in that case newly accepted connections
- need to be fairly distributed among all event watching loops otherwise some of them becomes hotspot, resulting into
- performance degradation. Some watcher does more socket watching, some does less. Even it's possible some topics
- in pub/sub are popular and all subscribers interested in those topics needs to be distributed across
- available watchers. For orchestrating N-sockets on available M-watchers, where N >>> M, I need to keep additional
- state information in listener go-routine. It can also be further explored whether dynamic watcher adding/ removing in runtime as per system state
- brings any improvement or not.
-
-
- Back to current implementation, as a result, lesser time spent in context switching more time spent doing actual work. As soon as new connection
- is accepted, it's delegated to watcher. On the other hand watcher waits for I/O completion events &
- as soon as some of them are available to act on; it starts looping over them one-by-one ---
- processing each & scheduling next action on socket.
-
This model of writing TCP servers is performant, but brings in some complexities. Previously I could manage each connection's whole life-cycle in its own go-routine --- separation of concern was well respected. As a result the implementation was easier to reason about.
-
-
- Say in first model, server has two clients --- each being managed in its own go-routine. Pub/Sub Hub is waiting to
- read message from respective sockets, where each message has two parts
-
| Part | Size ( in bytes ) | Purpose |
| --- | --- | --- |
| Envelope | 5 | Keeps OPCODE, BodyLength |
| Body | N | Keeps BodyLength-bytes actual data |
For one client, its go-routine may be reading the envelope, while for another one the body is being read after envelope reading is done. I'd like to highlight that messages are separated into two parts because it helps in determining the length of the variable sized body, where the envelope length is fixed at 5-bytes.
-
- But attaining same behaviour when delegating reading from/ writing to sockets to watcher is little more involved.
-
-
- For reading message envelope, request is issued; watcher informs when envelope is read. Then envelope is deserialised to
- figure out how many more bytes to read from socket for consuming message body. N-bytes body reading is
- again delegated to watcher, which informs as soon as it's done. Now if there're M-clients connected at this moment
- each of them may be reading any possible part of message. There could be also different kinds of messages --- where OPCODE is
- encoded in envelope along with body length. Reading handler function needs to remember where it left off
- last time & what exactly it was doing then, so that it can keep processing later part of message.
- This calls for additional state keeping --- resulting into more memory allocation than first model.
-
-
- It's like when first time envelope is read, reading handler function understands what's intention of client
- and how many more bytes it needs to read from socket to construct message. It puts an entry in ongoing reading
- table, indexed by socket, along with OPCODE so that appropriate deserialisation
- handler can be invoked when body reading will be completed; issues N-bytes body reading request; moves on to next event processing step.
- After sometime when body reading is done on this socket, watcher informs, socket is looked up in ongoing reading table
- to understand what has happened till now & what to do next with body. And finally intended action is taken
- on received message and socket entry from ongoing reading table is removed.
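A minimal sketch of what such an entry in the ongoing reading table might hold ( field names are illustrative, not the actual pub0sub types ):

```go
// readState remembers, per socket, where the reader left off between
// asynchronous completion events.
type readState struct {
	opcode    byte   // opcode announced by the already-read envelope
	remaining uint32 // body bytes still expected on this socket
}

// ongoing is the "ongoing reading table", keyed by the connection's file
// descriptor, consulted whenever the watcher reports a completed read.
var ongoing = make(map[int]*readState)
```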
-
- In pub0sub there're two kinds of clients i.e. { publisher, subscriber }. Each - of them interact with Pub/Sub server i.e. 0hub with different intention - resulting into different message format. Opcode helps read handler understand how to deserialise - message body or what kind of actions to take on deserialised, structured message - & how to eventually respond back to client. -
-
- With all these pieces Pub/Sub Hub implementation 0hub shows quite
- promising performance. I've tested it with 16k concurrent connections on consumer grade machine. I believe
- if it's tested in containerised environment where virtual overlay networking can be easily used
- and more ports ( = more clients ) in total are available, 0hub
- will break its own record. Some other day I'd like to run that experiment.
-
-
Recently I started noticing issues with long lived TCP connections --- resulting in abnormal connection termination. I suspect this is because long lived TCP connections might stay idle for a long time if publishers are not publishing often or subscribers have subscribed to topics that receive infrequent updates. To address this situation I plan to add periodic heartbeat message passing between client & hub. Heartbeat messages will be 5-bytes --- only an envelope, no body. Having no body means the last 4-bytes of the message envelope will be holding 0, and only the first byte contains the opcode --- 10 for ping, 11 for pong. For maintaining backward compatibility the envelope size can't be changed, which is why I'm wasting 4 bytes in the envelope by only storing 0.
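In other words ( illustrative Go, matching the layout just described ):

```go
// Heartbeat messages are envelope-only: the opcode in the first byte
// ( 10 = PING, 11 = PONG ) and a zeroed 4-byte body-length field.
var (
	pingFrame = []byte{10, 0, 0, 0, 0}
	pongFrame = []byte{11, 0, 0, 0, 0}
)
```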
-
Every 30 seconds 0hub sends a PING ( opcode = 10 ) message to all connected publishers & subscribers and expects to hear back with a PONG ( opcode = 11 ) message. For all those who responded back, their next health check is scheduled at t+30. Those who didn't respond back will be pinged again up to 3 times at max, each after a 30 second delay. If they still don't respond back, the hub terminates the connection with them while cleaning up all resources associated with the respective client.
-
-
- This will hopefully help hub in maintaining connection & related resources only for healthy & active clients, while enabling it
- in estimating how many subscribers to receive published message on a topic with better precision.
- This estimation calculation will pose a challenge during implementing distributed version of pub0sub --- which I'll
- face very soon.
-
-
- These PING/ PONG messages are simply an overhead, though unavoidable, consuming bandwidth overtime. But I can probably
- reduce #-of health checking done. When hub and client has recently communicated for sake of their
- usual business procedure, it's quite evident connection is active --- health check can be avoided. Idea is to only do
- health check when hub hasn't heard from client for some time.
-
-
- If some active publisher sends message publish intent every < 30s, it can avoid explicit health check cost. On other hand subscribers
- listening to active topic i.e. frequent update receiving topic, can avoid
- health check because connection issues will be caught when attempting to push update. That's why I call
- health check LAZY.
- As each health check message & response wastes 4-bytes for sake of backward compatibility, it's better
- to keep its usage as low as possible.
-
-
- Another way I'm looking at --- it's possible to send PING ( opcode = 10 ) message from hub to client
- of only 1-byte length i.e. OPCODE part of whole message, but when client responds back with PONG ( opcode = 11 )
- then need to send only envelope i.e. 5-bytes, as proposed 👆. This way communication pattern becomes
- somewhat asymmetric, but helps in saving 4-bytes, resulting into health check round-trip with 6-bytes
- instead of previous 10-bytes.
-
-
- Current version of pub0sub is here.
- I'd love to get feedback and have a great time !
-
For the last few months I've been working at the TCP level more often than I generally do. During this period I designed and implemented a few systems where multiple participants talk to each other over TCP while following a custom application level protocol. I learned that most TCP applications written in ( specifically ) Golang can be done in a slightly different way, so that applications don't end up spawning one go-routine per accepted connection --- resulting in thousands of active go-routines when talking to thousands of concurrent peers. Rather than handling each peer in its own go-routine, proactively attempting to read from the socket & spending most of its time in blocked mode, keeping only one socket watcher go-routine which is responsible for reporting any READ/ WRITE completion event happening on any of the delegated sockets --- consumes way less resources. It excels at reducing the scope of context switching by bringing the possible go-routine count to a minimum. As a result of it, the Golang scheduler only needs to manage a few go-routines now, where previously the scheduler had to orchestrate thousands of go-routines on N system threads. I ran some experiments and the result was promising --- TCP servers were able to easily handle 100k concurrent connections when following the second approach.
-- Following 3 different approaches, I develop key-value database where clients can send read/ write requests - over TCP. I challenge each implementation with 100k concurrent connections - and collect statistics of their performance, resource consumption, execution trace etc. under load; - all running on consumer-grade machines in containerised environment i.e. Docker. -
The application I develop is quite simple but it captures the essence of a TCP application. It's a remote ( not necessarily geographically ) in-memory KV database, to which clients connect over TCP & maintain that connection throughout their life time. During their life time they perform either of two possible operations in a randomised manner.

1. READ : look up the VALUE associated with some KEY.
2. WRITE : associate a VALUE with some KEY.

In both of the cases clients expect to hear back from the server. In the response frame the VALUE associated with the KEY is returned. For a WRITE request, the VALUE in the response frame must be equal to what was sent in the request frame. On the server side all reading/ writing is done in a concurrent safe manner --- by acquiring mutex locks. Only for a write request is the exclusive r/w lock held i.e. the critical section of code; otherwise a normal read-only lock is held --- allowing fulfilment of multiple READ requests concurrently.
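On the server side that locking discipline boils down to something like this ( a minimal Go sketch, not the benchmarked implementation ):

```go
import "sync"

// store is the in-memory KV database: WRITE requests take the exclusive lock
// ( the critical section ), READ requests share the read lock, so many READs
// can be served concurrently.
type store struct {
	mu   sync.RWMutex
	data map[string][]byte
}

func newStore() *store {
	return &store{data: make(map[string][]byte)}
}

func (s *store) read(key string) []byte {
	s.mu.RLock()
	defer s.mu.RUnlock()
	return s.data[key]
}

func (s *store) write(key string, value []byte) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.data[key] = value
}
```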
-
- For performing desired operations, clients send structured data frames
- over TCP; server extracts that out from socket; performs action as specified
- in message envelope i.e. opcode field; responds back with response frame.
-
-
Each message sent over the wire has two parts, where the envelope carries the operation kind i.e. {READ, WRITE} & how many more bytes the server needs to read from the stream to construct a structured message. Clients always expect to receive only one kind of frame in response.
-
For a READ frame, sent when a client is interested in looking up the VALUE associated with a KEY, the body just holds the key, preceded by the key length in a 1 byte field. Notice, the body length field in the envelope is 2 bytes, allowing at max 65535 bytes of body, but only 256 bytes can actually be written in the body due to the key length field in the body being 1 byte. This is done intentionally for keeping the illustration simple.

Practically, the max READ frame size over the wire will be
-
| Field | Max Theoretical Size ( in bytes ) | Max Practical Size ( in bytes ) |
| --- | --- | --- |
| Envelope | 3 | 3 |
| Body | 65535 | 256 |
| Total | 65538 | 259 |
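For concreteness, building such a READ frame on the client side might look like this ( illustrative Go; the opcode value and byte order are assumptions ):

```go
import "encoding/binary"

// opRead is an illustrative opcode value; the actual constant lives in the
// linked implementation.
const opRead = 1

// encodeReadFrame builds a READ request: a 3-byte envelope ( 1-byte opcode,
// 2-byte body length ) followed by a body of 1-byte key length plus the key.
func encodeReadFrame(key string) []byte {
	body := append([]byte{byte(len(key))}, key...)

	frame := make([]byte, 3, 3+len(body))
	frame[0] = opRead
	binary.BigEndian.PutUint16(frame[1:3], uint16(len(body)))
	return append(frame, body...)
}
```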
- WRITE frame carries little more data, which is sent when client is interested
- in associating VALUE with some KEY, because it carries both key, value & each of them are preceded
- with respective length in 1 byte field. Same scene here, practically WRITE frame's body will be at max 512 bytes
- though it's allowed to be at max 65535 bytes theoretically, as written in body length field in stream.
-
-
- Limits WRITE frame size will be
-
| Field | Max Theoretical Size ( in bytes ) | Max Practical Size ( in bytes ) |
| --- | --- | --- |
| Envelope | 3 | 3 |
| Body | 65535 | 512 |
| Total | 65538 | 515 |
- In response of READ/ WRITE request client expects to receive one RESPONSE frame, where VALUE - associated with KEY is encoded, where length of VALUE precedes it, encoded 1 byte --- signaling - client how many more bytes to read from stream to construct response. Good thing about - response frame, it doesn't waste any space, just allows sending 255 bytes VALUE at max. -
| Field | Max Theoretical Size ( in bytes ) | Max Practical Size ( in bytes ) |
| --- | --- | --- |
| Envelope | 2 | 2 |
| Body | 255 | 255 |
| Total | 257 | 257 |
- Now I'd like to spend some time in specifying how each of 3 approaches work.
- For ease of addressing, I'll refer to them from now on as {v1 => 1, v2 => 2, v3 => 3}.
-
-
- Model v1 is the popular way of writing TCP servers in Go, where one listener go-routine
- keeps listening on a host:port; it accepts connections & spawns a new go-routine for handling
- each connection throughout its life time. This model respects separation of concerns well & operations
- happening on a socket are easier to reason about due to the clean structure. But one thing to notice: each go-routine
- kept alive for handling a concurrent connection spends a lot of its time in blocked state --- proactively waiting to read
- from its socket.
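-
- A minimal sketch of that skeleton is below --- the listener accepts, one handler go-routine per connection;
- the address is illustrative & writing the RESPONSE frame back is elided.
-

```go
package main

import (
	"encoding/binary"
	"io"
	"log"
	"net"
)

func main() {
	lis, err := net.Listen("tcp", "127.0.0.1:7000") // illustrative host:port
	if err != nil {
		log.Fatal(err)
	}
	for {
		conn, err := lis.Accept()
		if err != nil {
			log.Printf("accept: %v", err)
			continue
		}
		// one go-routine per accepted connection, alive for its whole life time
		go handleConnection(conn)
	}
}

func handleConnection(conn net.Conn) {
	defer conn.Close()
	env := make([]byte, 3) // envelope: opcode + 2-byte body length
	for {
		// blocking reads --- this is where the go-routine spends most of its
		// time, waiting for the peer's next frame
		if _, err := io.ReadFull(conn, env); err != nil {
			return
		}
		body := make([]byte, binary.BigEndian.Uint16(env[1:]))
		if _, err := io.ReadFull(conn, body); err != nil {
			return
		}
		// act on the KV store & write a RESPONSE frame back ( elided )
	}
}
```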
-
- Model v2 is slightly different from v1: rather than spawning
- one go-routine per accepted connection, all accepted connections are delegated to
- one watcher go-routine, which runs one kernel event loop and learns about READ/ WRITE
- completion events on the sockets being watched. Every now and then the event loop informs the
- watcher go-routine of READ/ WRITE completion events, providing it with the opportunity to
- take action on the accomplished task and schedule the next operation on the socket asynchronously.
-
-
- This mode of operation has some similarity with libuv --- which powers NodeJS's event loop.
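-
- A rough, Linux-only sketch of that shape is below, driving epoll directly via golang.org/x/sys/unix.
- It's only meant to illustrate the single-watcher structure --- the actual implementation may well sit on a
- cross-platform event-loop library ( kqueue would be needed on MacOS ), and frame parsing is elided.
-

```go
package main

import (
	"log"

	"golang.org/x/sys/unix"
)

func main() {
	// raw listening socket, so accepted fds can be handed straight to epoll
	lfd, err := unix.Socket(unix.AF_INET, unix.SOCK_STREAM, 0)
	if err != nil {
		log.Fatal(err)
	}
	addr := &unix.SockaddrInet4{Port: 7000, Addr: [4]byte{127, 0, 0, 1}}
	if err := unix.Bind(lfd, addr); err != nil {
		log.Fatal(err)
	}
	if err := unix.Listen(lfd, 1024); err != nil {
		log.Fatal(err)
	}

	epfd, err := unix.EpollCreate1(0)
	if err != nil {
		log.Fatal(err)
	}

	// listener go-routine: accept & delegate every socket to the one watcher
	go func() {
		for {
			cfd, _, err := unix.Accept(lfd)
			if err != nil {
				continue
			}
			ev := unix.EpollEvent{Events: unix.EPOLLIN, Fd: int32(cfd)}
			unix.EpollCtl(epfd, unix.EPOLL_CTL_ADD, cfd, &ev)
		}
	}()

	// watcher: one kernel event loop for all delegated sockets
	events := make([]unix.EpollEvent, 128)
	buf := make([]byte, 64*1024)
	for {
		n, err := unix.EpollWait(epfd, events, -1)
		if err != nil {
			continue // e.g. EINTR
		}
		for i := 0; i < n; i++ {
			fd := int(events[i].Fd)
			m, err := unix.Read(fd, buf)
			if err != nil || m == 0 {
				unix.EpollCtl(epfd, unix.EPOLL_CTL_DEL, fd, nil)
				unix.Close(fd)
				continue
			}
			// parse buf[:m] as a frame, act on the store,
			// schedule the RESPONSE write ( elided )
		}
	}
}
```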
-
- I'd call model v3 a generic version of model v2, where N watcher go-routines
- run N kernel event loops and each accepted connection is delegated to one of these
- available watchers for the rest of its life time. Whenever a READ/ WRITE completion event occurs on
- some socket, the event loop notifies the respective watcher go-routine, which invokes the handle{READ,WRITE} method
- to take action on the completed event and schedule the next operation on the socket, to be completed asynchronously.
-
-
- Using this model calls for a socket orchestration technique --- connections should be fairly
- distributed among all available watcher go-routines. The goal of orchestration is to avoid creating hot-spots i.e.
- some watcher go-routine managing lots of sockets while another manages only a few, which defeats the whole purpose
- of model v3. One naive orchestration technique is modular arithmetic, where the
- M-th accepted connection is delegated to the ( M % N )-th watcher go-routine, where M > 0, N > 0, N = #-of watcher go-routines.
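-
- In Go that naive scheme is a one-liner ( a channel per watcher is just one illustrative way of delegating ):
-

```go
package orchestrator

import "net"

// delegate hands the m-th accepted connection to the ( m % n )-th watcher
// go-routine, each of which receives its sockets over its own channel.
func delegate(watchers []chan net.Conn, m int, conn net.Conn) {
	watchers[m%len(watchers)] <- conn
}
```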
-
-
- One problem I see with this scheme: assuming peer connections are generally long-lived,
- some watcher might end up managing all the long-lived peers, while some other watcher go-routine,
- which happened to receive the sockets that were unfortunately not long-lived, will manage only a few --- creating an
- imbalance in socket watching delegation i.e. a hotspot, resulting in bad performance.
- What I think can be done: rather than blindly orchestrating sockets using the naive round-robin technique,
- it's better to keep a feedback loop from the watcher go-routines, so that they can inform the
- listener go-routine of their current status i.e. how many delegated sockets are they managing
- now ? how many of them are active in terms of READ/ WRITE operation frequency --- a rolling average
- over a finite timespan ? etc., allowing the listener go-routine to make a more informed decision before it
- delegates an accepted connection to some watcher. This brings in management flexibility.
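-
- A hedged sketch of such feedback-driven delegation ( here simply "pick the least-loaded watcher";
- the Watcher type & its load metric are illustrative, not the actual implementation ):
-

```go
package orchestrator

import (
	"net"
	"sync/atomic"
)

// Watcher is an illustrative handle the listener keeps per watcher go-routine.
// Load is the feedback signal --- e.g. sockets currently managed, or a rolling
// average of READ/ WRITE activity --- updated by the watcher itself.
type Watcher struct {
	Conns chan net.Conn
	Load  int64
}

// pickWatcher replaces blind round-robin: delegate to the least-loaded watcher.
func pickWatcher(watchers []*Watcher) *Watcher {
	best := watchers[0]
	for _, w := range watchers[1:] {
		if atomic.LoadInt64(&w.Load) < atomic.LoadInt64(&best.Load) {
			best = w
		}
	}
	return best
}

func delegateLeastLoaded(watchers []*Watcher, conn net.Conn) {
	w := pickWatcher(watchers)
	atomic.AddInt64(&w.Load, 1) // the watcher decrements this when the socket closes
	w.Conns <- conn
}
```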
-
- It's time to run these models on a real machine and collect statistics. I've prepared parallel
- benchmarking testcases, where in each round of benchmarking one client connects to the TCP server
- and sends two frames in order. The first frame is a read request for some KEY; the client waits for the response and consumes
- it ( if some other client has already set a VALUE for that KEY ); then it sends a write request
- with a KEY, VALUE pair and waits for the response, expecting to see the VALUE in the response matching what it sent
- in the write request. Each benchmark is performed 8 times, to get average statistics.
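-
- Roughly, such a testcase can be shaped like the sketch below ( the frame exchange itself is elided and
- the address is illustrative ); repeating each benchmark 8 times is then a matter of go test -bench . -count 8.
-

```go
package bench

import (
	"net"
	"testing"
)

// BenchmarkReadWrite mirrors one benchmarking round: connect, send a READ
// frame for some KEY, consume the response, then send a WRITE frame and
// verify the echoed VALUE.
func BenchmarkReadWrite(b *testing.B) {
	b.RunParallel(func(pb *testing.PB) {
		conn, err := net.Dial("tcp", "127.0.0.1:7000")
		if err != nil {
			b.Error(err)
			return
		}
		defer conn.Close()
		for pb.Next() {
			// 1. READ request for KEY, wait for & consume the response ( elided )
			// 2. WRITE request with KEY, VALUE, check the echoed VALUE ( elided )
		}
	})
}
```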
-
-
- I do a parallel benchmark of model v1 on two machines running GNU/Linux & MacOS,
- where each round takes ~34k ns on GNU/Linux, but it's relatively on the higher side
- when run on MacOS --- ~45k ns.
-
-
- For model v2, MacOS takes less time for each round than it took in model v1. But that's not
- true for GNU/Linux --- rather the round time almost doubled.
-
-
- In case of model v3, GNU/Linux and MacOS both keep their trends
- intact --- the average benchmark round completion time keeps increasing on the former
- and keeps decreasing on the latter.
-
-
- Now I plan to stress test the 3 models on both the GNU/Linux & MacOS platforms with 8k
- concurrent connections, where each client connects to the TCP server, sends
- read & write requests in order, and waits for the respective response
- in both cases.
-
-
- When model v1 is stress tested, it completes a lot faster on GNU/Linux, given
- that machine enjoys the benefit of a faster CPU.
-
-
- With 8k concurrent connections model v2 takes almost the same time to complete
- on both the GNU/Linux & MacOS platforms.
-
-
- The time required for completing stress testing with model v3
- is almost unchanged on MacOS, but on GNU/Linux it increases slightly.
-
-
- Go's trace tool is helpful in getting deeper insight into what happened
- while the program was running. So I collect program execution traces when running the
- test cases. These are collected on a MacOS machine with an Intel i5 CPU @ 2.4GHz.
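-
- For reference, a trace can be captured either by passing -trace trace.out to go test, or
- programmatically as in the little sketch below; either way, go tool trace trace.out opens the result.
-

```go
package main

import (
	"log"
	"os"
	"runtime/trace"
	"time"
)

func main() {
	f, err := os.Create("trace.out")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	// everything between Start & Stop lands in the trace, which can then be
	// inspected with go tool trace ( go-routine states, scheduler wait etc. )
	if err := trace.Start(f); err != nil {
		log.Fatal(err)
	}
	defer trace.Stop()

	time.Sleep(100 * time.Millisecond) // stand-in for the actual workload
}
```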
-
-
- While looking at how the major go-routines spent their time in model v1,
- I found that the listener go-routine, which accepts connections and spawns a new go-routine
- for handling each one, spends a major portion of its time in blocked state --- which is understandable
- because it's waiting for new connections to arrive.
-
-
- If I now look at how the spawned connection handler go-routines spent their time,
- I see they have also spent most of their time waiting for network IO. This also
- makes sense given the fact that, in model v1, each connection is handled in its
- own go-routine, resulting in each of those go-routines proactively waiting
- to read from its socket --- waiting for a network IO event.
-
-
- Next I look at model v2's execution trace. It has two major go-routines i.e. {listener, watcher}.
- The listener does the same job in all 3 models --- wait for an incoming connection; accept it; prepare the connection
- handling phase ( different in each model ); keep looping --- which is why network IO based blocking is evident in its trace.
-
-
- When I look at model v2's watcher go-routine trace, it doesn't spend any time waiting
- for network IO --- it makes a blocking call where it waits for the accumulation of a few READ/ WRITE
- completion events from the underlying kernel event loop; as soon as that returns, it starts looping over them and takes the necessary actions.
- This single function is the equivalent of what the N-many handleConnection go-routines do in model v1.
- When the scheduler wait column is checked in each of these traces, it's easy to see that every go-routine
- spawned needs to be scheduled onto underlying OS threads to actually run, and scheduling is not cheap
- when there are 100k go-routines.
-
-
- At last I take a look at the trace of model v3, where I run 4 watcher go-routines,
- each managing a subset of the accepted connections. The listener go-routine's trace is similar
- to what I found in the other models.
-
-
- The downside of having more go-routines is scheduling cost --- here I run 4 watcher go-routines
- with 4 different kernel event loops, with subsets of sockets delegated to them, resulting
- in more time spent in the scheduler wait stage. Also notice that though there are 4 watcher go-routines
- ready to do their job, not all are being used. That's because during the test
- in which the trace is collected, only one connection request is sent from the client side, resulting
- in only one socket being managed by one of the available watcher go-routines.
-
-
- Finally it's time for the 100k concurrent connection challenge.
-
-
- The problem I face is how to run 100k clients on my machine. A
- network port identifier is 16 bits, which allows me to run 65536 ( 1 << 16 )
- clients at max. Leaving out the lower 1024 port numbers, I still need ~40k more clients. It's all happening
- because I've only one IP address i.e. 127.0.0.1. I can make use of some virtual
- networking technique, where I get a different subnet and multiple virtual machines are
- allocated IP addresses from that subnet. Each of those virtual machines runs one copy of the client
- application; to be more specific, each of them runs N ( < 65536 ) clients. This way
- I can easily get to the 100k client target.
-
-
- I choose Docker for its virtual networking capability, where each client container runs 16k clients;
- only 6 such containers, together hitting one server container i.e. {v1, v2, v3}_server, are
- enough to simulate a ~100k concurrent connection scenario.
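-
- Each client container then just runs a simulator along the lines of the sketch below ( the count, port
- & container name are illustrative; the per-connection READ/ WRITE loop is elided ):
-

```go
package main

import (
	"log"
	"net"
	"sync"
)

const clientsPerContainer = 16_000

func main() {
	var wg sync.WaitGroup
	for i := 0; i < clientsPerContainer; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// the server container is reachable by its name on the Docker network
			conn, err := net.Dial("tcp", "v1_server:7000")
			if err != nil {
				log.Printf("dial: %v", err)
				return
			}
			defer conn.Close()
			// keep the connection alive & randomly interleave READ/ WRITE
			// requests, consuming each RESPONSE frame ( elided )
		}()
	}
	wg.Wait()
}
```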
-
-
- I start with model v1 --- in total 7 containers to run, one for the server, the others for clients.
- I see CPU usage of ~60%, which suddenly moves to ~100%. The memory usage is due to the high number of key value lookups
- happening concurrently.
-
- Similarly models v2 and v3 are simulated, where one TCP server manages
- ~100k concurrent connections, each client attempting to randomly read/ write some
- randomly generated key, with the respective value returned back to them in response.
-
-
- I notice, for model v1, the PID count of the v1_server container i.e. the TCP server
- is 33, denoting 33 OS threads created in this containerised environment --- handling
- 100k active go-routines requires lots of underlying OS threads, a sign of context
- switching. Now I look at the same field for models v{2, 3}: they require ~11 OS threads
- for serving 100k concurrent connections --- seemingly saving some context switching cost.
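-
- The same numbers can also be watched from inside the server process --- a small, hedged sketch follows
- ( the threadcreate profile counts threads ever created, so it's an upper bound on live OS threads ):
-

```go
package main

import (
	"fmt"
	"runtime"
	"runtime/pprof"
	"time"
)

func main() {
	for {
		// OS threads created so far vs currently live go-routines
		fmt.Printf("os threads created: %d, go-routines: %d\n",
			pprof.Lookup("threadcreate").Count(),
			runtime.NumGoroutine())
		time.Sleep(5 * time.Second)
	}
}
```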
-
- I note, each model is capable of handling 100k concurrent connections in the simulated
- environment. Each of these models has its own benefits & downsides: with model v1
- the program structure is easier to understand, and it's the natural & familiar way of
- writing TCP applications in Golang; with model v2, the cost of context switching
- can be reduced by drastically cutting the #-of active go-routines, but it's no silver bullet. On the other hand model v3, which is a generic
- version of model v2, is able to leverage the power of more than one event loop, each managing a subset of the accepted
- connections --- a sharded architecture, resulting in less mutex lock contention, given the orchestration
- technique fits well.
-
-
- For almost all standard TCP applications, model v1 is a good fit; model v2 or model v3 ( with a better orchestrator )
- can be used when extreme performance is required, while paying a relatively lower cost.
-
-
- I keep the implementations powering these findings in this repository
- for future reference.
-
- I plan to put these models through the C1M challenge ( i.e. managing 1M concurrent connections ) ---
- some other day I'll talk about it. Have a great time !
-