-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: expose functionality to extract unit content
- Loading branch information
Showing
6 changed files
with
400 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
""" | ||
Contains all functions imported from the platform. | ||
We know these functions will be available at run time, but they | ||
cannot be imported normally. | ||
""" | ||
|
||
|
||
def get_text_transcript(video_block):
    """
    Fetch the transcript of a video block in text format.

    Asks the platform's ``get_transcript`` for the ``txt`` output format and
    returns the first element of the resulting triple. Returns ``None`` when
    the platform raises ``NotFoundError``.
    """
    # Platform modules only exist at run time, hence the function-level imports.
    # pylint: disable=import-error, import-outside-toplevel
    from xmodule.exceptions import NotFoundError
    from xmodule.video_block.transcripts_utils import get_transcript

    try:
        text, _, _ = get_transcript(video_block, output_format='txt')
    except NotFoundError:
        # Some old videos simply have no transcript; that is not an error here.
        return None
    else:
        return text
|
||
|
||
def get_single_block(request, user_id, course_id, usage_key_string, course=None, will_recheck_access=False):
    """
    Load a single xblock through the platform.

    All arguments are forwarded positionally, in the same order, to the
    platform's ``load_single_xblock`` and its result is returned unchanged.
    """
    # Platform modules only exist at run time, hence the function-level import.
    # pylint: disable=import-error, import-outside-toplevel
    from lms.djangoapps.courseware.block_renderer import load_single_xblock

    loaded_block = load_single_xblock(
        request,
        user_id,
        course_id,
        usage_key_string,
        course,
        will_recheck_access,
    )
    return loaded_block
|
||
|
||
def traverse_block_pre_order(start_node, get_children, filter_func=None):
    """
    Traverse a DAG or tree in pre-order.

    Delegates to the platform's ``traverse_pre_order``, forwarding the three
    arguments positionally and returning whatever it returns.
    """
    # Platform modules only exist at run time, hence the function-level import.
    # pylint: disable=import-error, import-outside-toplevel
    from openedx.core.lib.graph_traversals import traverse_pre_order as platform_traverse

    traversal = platform_traverse(start_node, get_children, filter_func)
    return traversal
|
||
|
||
def block_leaf_filter(block):
    """
    Filter that keeps only leaf nodes.

    Delegates to the platform's ``leaf_filter`` and returns its result for
    the given block.
    """
    # Platform modules only exist at run time, hence the function-level import.
    # pylint: disable=import-error, import-outside-toplevel
    from openedx.core.lib.graph_traversals import leaf_filter as platform_leaf_filter

    verdict = platform_leaf_filter(block)
    return verdict
|
||
|
||
def block_get_children(block):
    """
    Return the children of a given block.

    Delegates to the platform's ``get_children`` and returns its result
    unchanged.
    """
    # Platform modules only exist at run time, hence the function-level import.
    # pylint: disable=import-error, import-outside-toplevel
    from openedx.core.lib.graph_traversals import get_children as platform_get_children

    children = platform_get_children(block)
    return children
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
""" | ||
Text manipulation utils. | ||
""" | ||
|
||
from html.parser import HTMLParser | ||
from re import sub | ||
|
||
from django.conf import settings | ||
|
||
|
||
def cleanup_text(text):
    """
    Remove litter from replacing or manipulating text.

    The clean-up happens in three steps:

    1. every run of whitespace other than CR/LF is collapsed to one space;
    2. every whitespace run that contains a newline becomes a single newline,
       which removes blank lines and the spaces surrounding line breaks;
    3. leading and trailing whitespace is trimmed.

    Arguments:
        text (str): the text to clean.

    Returns:
        str: the cleaned text.
    """
    # Collapse horizontal whitespace but leave line breaks for the next step.
    cleaned = sub(r'[^\S\r\n]+', ' ', text)
    # One pass handles both repeated newlines and spaces around newlines,
    # because \s also matches the newline characters themselves.
    cleaned = sub(r'\s*\n\s*', '\n', cleaned)
    # Trim both ends, including trailing whitespace not preceded by a newline.
    return cleaned.strip()
|
||
|
||
class _HTMLToTextHelper(HTMLParser):  # lint-amnesty, pylint: disable=abstract-method
    """
    Helper class for html_to_text below.

    Collects the text found while parsing HTML. Text that directly follows the
    start tag of an element listed in the ``HTML_TAGS_TO_REMOVE`` setting is
    skipped; collection resumes at the next tag, whether it opens or closes.
    """

    # Whether the text currently being parsed should be collected.
    _is_content = True

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), no need to repeat it.
        super().__init__()
        self.fed = []

    def handle_starttag(self, tag, _):
        """On each tag, check whether this is a tag we think is content."""
        # Read the setting on every call so that overrides are honoured.
        tags_to_filter = getattr(settings, 'HTML_TAGS_TO_REMOVE', None)
        self._is_content = not (tags_to_filter and tag in tags_to_filter)

    def handle_endtag(self, tag):
        """Resume collecting text once an element is closed."""
        # Without this, text following a filtered element (for example after
        # </script>) would be dropped until another start tag shows up.
        self._is_content = True

    def handle_data(self, data):
        """Handle tag data by appending text we think is content."""
        if self._is_content:
            self.fed.append(data)

    def handle_entityref(self, name):
        """If there is an entity, append the reference to the text."""
        # Only invoked by HTMLParser when convert_charrefs is disabled;
        # otherwise entities arrive already decoded through handle_data.
        if self._is_content:
            self.fed.append(f'&{name};')

    def get_data(self):
        """Join together the separate data chunks into one cohesive string."""
        return ''.join(self.fed)
|
||
|
||
def html_to_text(html):
    """
    Strip the html tags off of the text to return plaintext.

    Arguments:
        html (str): the markup to convert.

    Returns:
        str: the text collected by the parser, passed through cleanup_text.
    """
    htmlstripper = _HTMLToTextHelper()
    htmlstripper.feed(html)
    # feed() may hold back trailing text (for example when it ends with an
    # unterminated "&..." sequence such as "AT&T"); close() flushes it.
    htmlstripper.close()

    return cleanup_text(htmlstripper.get_data())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.