-
Notifications
You must be signed in to change notification settings - Fork 571
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Change the logic for getting the headers: parse the markdown cell content to HTML and use BeautifulSoup.select.
- Loading branch information
1 parent
e833379
commit 8623ce8
Showing
6 changed files
with
478 additions
and
46 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"id": "8253d8be-f1ee-4e5c-a868-416d78ac4d9f", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"[(1, 'Heading 1'), (2, 'Heading 2')]\n", | ||
"<bound method TitleExtractorRenderer.heading of <__main__.TitleExtractorRenderer object at 0x77e46cf82120>>\n", | ||
"<bound method TitleExtractorRenderer.html_block of <__main__.TitleExtractorRenderer object at 0x77e46cf82120>>\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import mistune\n", | ||
"import re\n", | ||
"\n", | ||
"class TitleExtractorRenderer(mistune.HTMLRenderer):\n", | ||
" def __init__(self):\n", | ||
" super().__init__()\n", | ||
" self.titles = []\n", | ||
"\n", | ||
" # Override the heading method for Markdown headings\n", | ||
" def heading(self, text, level):\n", | ||
" self.titles.append((level, text))\n", | ||
" return '' # return empty since we only want to extract\n", | ||
"\n", | ||
" # Override the html_block method to handle raw HTML\n", | ||
" def html_block(self, html):\n", | ||
" # Regex to find HTML headings <h1> to <h6>\n", | ||
" matches = re.findall(r'<h([1-6])>(.*?)</h\\1>', html, re.IGNORECASE)\n", | ||
" for level, text in matches:\n", | ||
" self.titles.append((int(level), text))\n", | ||
" print(text);\n", | ||
" return '' # return empty as we're only extracting titles\n", | ||
"\n", | ||
"# Create an instance of the renderer and Markdown parser\n", | ||
"renderer = TitleExtractorRenderer()\n", | ||
"markdown = mistune.create_markdown(renderer=renderer)\n", | ||
"\n", | ||
"# Parse your markdown input\n", | ||
"markdown_text = \"\"\"\n", | ||
"# Heading 1\n", | ||
"\n", | ||
"Some paragraph here.\n", | ||
"\n", | ||
"## Heading 2\n", | ||
"\n", | ||
"<h1>HTML Heading 1</h1>\n", | ||
"<h2>HTML Heading 2</h2>\n", | ||
"\"\"\"\n", | ||
"\n", | ||
"# Process the markdown to extract titles\n", | ||
"markdown(markdown_text)\n", | ||
"\n", | ||
"# Print the extracted titles\n", | ||
"print(renderer.titles)\n", | ||
"print(renderer.heading)\n", | ||
"print(renderer.html_block)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "5009b588-7bb0-46b8-8460-c3eb7f63582f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import mistune\n", | ||
"import re\n", | ||
"\n", | ||
"class TitleExtractorRenderer(mistune.HTMLRenderer):\n", | ||
" def __init__(self):\n", | ||
" super().__init__()\n", | ||
" self.titles = []\n", | ||
"\n", | ||
" # Override the heading method for Markdown headings\n", | ||
" def heading(self, text, level):\n", | ||
" self.titles.append((level, text))\n", | ||
" return '' # return empty since we only want to extract\n", | ||
"\n", | ||
" # Override the html_block method to handle raw HTML\n", | ||
" def html_block(self, html):\n", | ||
" # Regex to find HTML headings <h1> to <h6>\n", | ||
" matches = re.findall(r'<h([1-6])>(.*?)</h\\1>', html, re.IGNORECASE)\n", | ||
" for level, text in matches:\n", | ||
" self.titles.append((int(level), text))\n", | ||
" print(text);\n", | ||
" return '' # return empty as we're only extracting titles\n", | ||
"\n", | ||
"# Create an instance of the renderer and Markdown parser\n", | ||
"renderer = TitleExtractorRenderer()\n", | ||
"markdown = mistune.create_markdown(renderer=renderer)\n", | ||
"\n", | ||
"# Parse your markdown input\n", | ||
"markdown_text = \"\"\"\n", | ||
"# Heading 1\n", | ||
"\n", | ||
"Some paragraph here.\n", | ||
"\n", | ||
"## Heading 2\n", | ||
"\n", | ||
"<h1>HTML Heading 1</h1>\n", | ||
"<h2>HTML Heading 2</h2>\n", | ||
"\"\"\"\n", | ||
"\n", | ||
"# Process the markdown to extract titles\n", | ||
"markdown(markdown_text)\n", | ||
"\n", | ||
"# Print the extracted titles\n", | ||
"print(renderer.titles)\n", | ||
"print(renderer.heading)\n", | ||
"print(renderer.html_block)\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "0b71a9ab-4b5a-4b48-9f59-0ba7e92b3999", | ||
"metadata": {}, | ||
"source": [ | ||
"# Main title" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "4f05b735-2329-444f-bb54-40aa23d3aa81", | ||
"metadata": {}, | ||
"source": [ | ||
"## paragraph 1" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "9fd42aae-d388-4aaf-9fb6-8a27f5869dff", | ||
"metadata": {}, | ||
"source": [ | ||
"<h2> paragraph 2 </h2>" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "828c2b99-a9f7-424a-8241-e9f812cde8a9", | ||
"metadata": {}, | ||
"source": [ | ||
"<h2>\n", | ||
"paragraph3\n", | ||
"</h2>" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "34b9e91f-b4d9-481e-9b97-fc5766c98ce6", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"ename": "NameError", | ||
"evalue": "name 'extrac' is not defined", | ||
"output_type": "error", | ||
"traceback": [ | ||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | ||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", | ||
"Cell \u001b[0;32mIn[1], line 68\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[38;5;66;03m# print(\"header_level:\", header_level)\u001b[39;00m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;66;03m# print(\"raw_text:\", raw_text)\u001b[39;00m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m titles_array\n\u001b[0;32m---> 68\u001b[0m \u001b[43mextrac\u001b[49m\n", | ||
"\u001b[0;31mNameError\u001b[0m: name 'extrac' is not defined" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"import mistune\n", | ||
"import re\n", | ||
"from mistune.renderers.markdown import MarkdownRenderer\n", | ||
"from nbformat import NotebookNode\n", | ||
"\n", | ||
"class HeadingExtractor(MarkdownRenderer):\n", | ||
" \"\"\"A renderer to capture headings\"\"\"\n", | ||
"\n", | ||
" def __init__(self):\n", | ||
" \"\"\"Initialize the class.\"\"\"\n", | ||
" super().__init__()\n", | ||
" self.headings = []\n", | ||
"\n", | ||
" def heading(self, text, level):\n", | ||
" \"\"\"Return an empty string for the headings to avoid outputting them.\"\"\"\n", | ||
" matches = re.findall(r'<h[1-6]>.*?<\\/h[1-6]>', text)\n", | ||
" print(matches)\n", | ||
" \n", | ||
" for level, text in matches:\n", | ||
" # You can use int() to convert the level to an integer\n", | ||
" self.headings.append((int(level), text.strip())) # .strip() removes any leading/trailing whitespace\n", | ||
" print(f\"Level: {level}, Text: {text.strip()}\")\n", | ||
" # self.headings.append((level, text))\n", | ||
" return \"\"\n", | ||
"\n", | ||
"\n", | ||
"def extract_titles_from_notebook_node(nb: NotebookNode):\n", | ||
" \"\"\"Create a Markdown parser with the HeadingExtractor renderer to collect all the headings of a notebook\n", | ||
" The input argument is the notebooknode from which a single string with all the markdown content concatenated\n", | ||
" The output is an array containing information about the headings such as their level, their text content, an identifier and a href that can be used in case of html converter.s\"\"\"\n", | ||
"\n", | ||
" markdown_collection = \"\"\n", | ||
"\n", | ||
" for cell in nb.cells:\n", | ||
" if cell.cell_type == \"markdown\":\n", | ||
" lines = cell.source.splitlines()\n", | ||
" for line in lines:\n", | ||
" newline= line\n", | ||
" \n", | ||
" if line.startswith('#'):\n", | ||
" newline = mistune.html(newline)\n", | ||
" \n", | ||
" #print(\"line:\", line)\n", | ||
" #print('newline:', newline)\n", | ||
" markdown_collection = markdown_collection + newline.strip() + \"\\n\"\n", | ||
" #print(markdown_collection)\n", | ||
" titles_array = []\n", | ||
" renderer = HeadingExtractor()\n", | ||
" extract_titles = mistune.create_markdown(renderer=renderer)\n", | ||
" extract_titles(markdown_collection)\n", | ||
" headings = renderer.headings\n", | ||
" print(\"Titles:\", headings)\n", | ||
"\n", | ||
" # Iterate on all headings to get the necessary information on the various titles\n", | ||
" for __, title in headings:\n", | ||
" children = title[\"children\"]\n", | ||
" attrs = title[\"attrs\"]\n", | ||
" raw_text = children[0][\"raw\"]\n", | ||
" header_level = attrs[\"level\"]\n", | ||
" id = raw_text.replace(\" \", \"-\")\n", | ||
" href = \"#\" + id\n", | ||
" titles_array.append([header_level, raw_text, id, href])\n", | ||
" # print(\"header_level:\", header_level)\n", | ||
" # print(\"raw_text:\", raw_text)\n", | ||
" return titles_array\n", | ||
"\n", | ||
"\n", | ||
"extract_titles_from_notebook_node()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "11d24acf-e8ba-4cf4-9443-c505d5687811", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
Oops, something went wrong.