improve script - currently this produces a mostly flat json

asoplata · Jan 30, 2025 · 3961a20 · 3961a20
1 parent 54ef4d1
commit 3961a20
Showing 1 changed file with 46 additions and 28 deletions.
diff --git a/scripts/convert_notebooks.py b/scripts/convert_notebooks.py
@@ -7,7 +7,6 @@
 import nbformat
 import markdown
 from nbconvert.preprocessors import ExecutePreprocessor
-from bs4 import BeautifulSoup
 
 
 def execute_notebook(notebook_path):
@@ -31,37 +30,56 @@ def save_plot_as_image(img_data, img_filename, output_dir):
     return
 
 
-def html_to_hierarchical_json(
-        html: str,
-        filename: str,
-        ):
-    soup = BeautifulSoup(html, 'html.parser')
-    hierarchy = {filename: {}}
-    stack = []
+def html_to_hierarchical_json(html: str, filename: str):
+    """
+    Convert html into hierarchical json
+    """
+    # variable for processed json output
+    contents = {filename: {}}
+
+    # variables to track the current section's content and metadata
+    current_html = None
+    current_title = None
+    current_level = None
+
+    # split html into lines while removing empty lines
+    lines = [line.strip() for line in html.splitlines() if line.strip()]
+
+    for i, line in enumerate(lines):
+        # identify lines with header tags
+        line_match = re.match(r'(<h[1-6]>)(.*?)(</h[1-6]>)', line)
+
+        if line_match:
+            # when a new header is found, save the previous section
+            if current_title:
+                contents[filename][current_title] = {}
+                contents[filename][current_title]['level'] = current_level
+                contents[filename][current_title]['html'] = \
+                    '\n'.join(current_html)
+
+            # get the title and level of the new section
+            current_level = line_match.group(1).strip()
+            current_title = line_match.group(2).strip()
 
-    for tag in soup.find_all(re.compile(r'h[1-6]')):
-        level = int(tag.name[1])
-        title = tag.get_text(strip=True)
-        contents = str(tag) + ''.join(
-            str(sibling) for sibling in tag.find_next_siblings()
-            if not re.match(r'h[1-6]', sibling.name)
-        )
-        section = {"contents": contents}
+            # start a new section with the previous line
+            current_html = [lines[i-1]]
 
-        while stack and stack[-1][1] >= level:
-            stack.pop()
+        elif current_html is not None:
+            # add new html lines
+            current_html.append(lines[i-1])
 
-        if stack:
-            parent = stack[-1][0]
-            if "sections" not in parent:
-                parent["sections"] = {}
-            parent["sections"][title] = section
-        else:
-            hierarchy[filename][title] = section
+    # save the last section
+    if current_title:
+        # append the last line
+        current_html.append(line)
 
-        stack.append((section, level))
+        # update contents
+        contents[filename][current_title] = {}
+        contents[filename][current_title]['level'] = current_level
+        contents[filename][current_title]['html'] = \
+            '\n'.join(current_html)
 
-    return hierarchy
+    return contents
 
 
 def extract_html_from_notebook(
@@ -258,7 +276,7 @@ def convert_notebooks_to_html(
 # %%
 def test_nb_conversion():
 
-    input_folder = "../content/05_erps"
+    input_folder = "../tests"
 
     convert_notebooks_to_html(
         input_folder,