-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: expose functionality to extract unit content #37
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,16 @@ | ||
""" | ||
Library for the learning_assistant app. | ||
""" | ||
from learning_assistant.constants import ACCEPTED_CATEGORY_TYPES, CATEGORY_TYPE_MAP | ||
from learning_assistant.models import CoursePrompt | ||
from learning_assistant.platform_imports import ( | ||
block_get_children, | ||
block_leaf_filter, | ||
get_single_block, | ||
get_text_transcript, | ||
traverse_block_pre_order, | ||
) | ||
from learning_assistant.text_utils import html_to_text | ||
|
||
|
||
def get_deserialized_prompt_content_by_course_id(course_id): | ||
|
@@ -23,3 +32,69 @@ def get_setup_messages(course_id): | |
setup_messages = [{'role': 'system', 'content': x} for x in message_content] | ||
return setup_messages | ||
return None | ||
|
||
|
||
def _extract_block_contents(child, category): | ||
""" | ||
Process the child contents based on its category. | ||
|
||
Returns a string or None if there are no contents available. | ||
""" | ||
if category == 'html': | ||
content_html = child.get_html() | ||
text = html_to_text(content_html) | ||
return text | ||
|
||
if category == 'video': | ||
transcript = get_text_transcript(child) # may be None | ||
return transcript | ||
|
||
return None | ||
|
||
|
||
def _leaf_filter(block): | ||
""" | ||
Return only leaf nodes of a particular category. | ||
""" | ||
is_leaf = block_leaf_filter(block) | ||
category = block.category | ||
|
||
return is_leaf and category in ACCEPTED_CATEGORY_TYPES | ||
|
||
|
||
def _get_children_contents(block): | ||
""" | ||
Given a specific block, return the content type and text of a pre-order traversal of the blocks children. | ||
""" | ||
leaf_nodes = traverse_block_pre_order(block, block_get_children, _leaf_filter) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is slick! |
||
|
||
length = 0 | ||
items = [] | ||
|
||
for node in leaf_nodes: | ||
category = node.category | ||
content = _extract_block_contents(node, category) | ||
|
||
if content: | ||
length += len(content) | ||
items.append({ | ||
'content_type': CATEGORY_TYPE_MAP.get(category), | ||
'content_text': content, | ||
}) | ||
|
||
return length, items | ||
|
||
|
||
def get_block_content(request, user_id, course_id, unit_usage_key): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for my own understanding, what calls this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function will be used as part of https://2u-internal.atlassian.net/browse/COSMO-73 - essentially whatever code is putting together the prompt will call it. |
||
""" | ||
Public wrapper for retrieving the content of a given block's children. | ||
|
||
Returns | ||
length - the cummulative length of a block's children's content | ||
items - a list of dictionaries containing the content type and text for each child | ||
""" | ||
block = get_single_block(request, user_id, course_id, unit_usage_key) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm only using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You shouldn't need to, no. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Got it, thank you! |
||
|
||
length, items = _get_children_contents(block) | ||
|
||
return length, items |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
""" | ||
Contain all imported functions coming out of the platform. | ||
|
||
We know these functions will be available at run time, but they | ||
cannot be imported normally. | ||
""" | ||
|
||
|
||
def get_text_transcript(video_block): | ||
"""Get the transcript for a video block in text format, or None.""" | ||
# pylint: disable=import-error, import-outside-toplevel | ||
from xmodule.exceptions import NotFoundError | ||
from xmodule.video_block.transcripts_utils import get_transcript | ||
try: | ||
transcript, _, _ = get_transcript(video_block, output_format='txt') | ||
except NotFoundError: | ||
# some old videos have no transcripts, just accept that reality | ||
return None | ||
return transcript | ||
|
||
|
||
def get_single_block(request, user_id, course_id, usage_key_string, course=None): | ||
"""Load a single xblock.""" | ||
# pylint: disable=import-error, import-outside-toplevel | ||
from lms.djangoapps.courseware.block_renderer import load_single_xblock | ||
return load_single_xblock(request, user_id, course_id, usage_key_string, course) | ||
|
||
|
||
def traverse_block_pre_order(start_node, get_children, filter_func=None): | ||
"""Traverse a DAG or tree in pre-order.""" | ||
# pylint: disable=import-error, import-outside-toplevel | ||
from openedx.core.lib.graph_traversals import traverse_pre_order | ||
return traverse_pre_order(start_node, get_children, filter_func) | ||
|
||
|
||
def block_leaf_filter(block): | ||
"""Return only leaf nodes.""" | ||
# pylint: disable=import-error, import-outside-toplevel | ||
from openedx.core.lib.graph_traversals import leaf_filter | ||
return leaf_filter(block) | ||
|
||
|
||
def block_get_children(block): | ||
"""Return children of a given block.""" | ||
# pylint: disable=import-error, import-outside-toplevel | ||
from openedx.core.lib.graph_traversals import get_children | ||
return get_children(block) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
""" | ||
Text manipulation utils. This has been copied from the ai-aside repository. | ||
""" | ||
|
||
from html.parser import HTMLParser | ||
from re import sub | ||
|
||
from django.conf import settings | ||
|
||
|
||
def cleanup_text(text): | ||
""" | ||
Remove litter from replacing or manipulating text. | ||
""" | ||
stripped = sub(r'[^\S\r\n]+', ' ', text) # Removing extra spaces | ||
stripped = sub(r'\n{2,}', '\n', stripped) # Removing extra new lines | ||
stripped = sub(r'(\s+)?\n(\s+)?', '\n', stripped) # Removing starting extra spacesbetween new lines | ||
stripped = sub(r'(^(\s+)\n?)|(\n(\s+)?$)', '', stripped) # Trim | ||
|
||
return stripped | ||
|
||
|
||
class _HTMLToTextHelper(HTMLParser): # lint-amnesty, pylint: disable=abstract-method | ||
""" | ||
Helper function for html_to_text below. | ||
""" | ||
|
||
_is_content = True | ||
|
||
def __init__(self): | ||
HTMLParser.__init__(self) | ||
self.reset() | ||
self.fed = [] | ||
|
||
def handle_starttag(self, tag, _): | ||
"""On each tag, check whether this is a tag we think is content.""" | ||
tags_to_filter = getattr(settings, 'HTML_TAGS_TO_REMOVE', None) | ||
self._is_content = not (tags_to_filter and tag in tags_to_filter) | ||
|
||
def handle_data(self, data): | ||
"""Handle tag data by appending text we think is content.""" | ||
if self._is_content: | ||
self.fed.append(data) | ||
|
||
def handle_entityref(self, name): | ||
"""If there is an entity, append the reference to the text.""" | ||
if self._is_content: | ||
self.fed.append('&%s;' % name) | ||
|
||
def get_data(self): | ||
"""Join together the separate data chunks into one cohesive string.""" | ||
return ''.join(self.fed) | ||
|
||
|
||
def html_to_text(html): | ||
"""Strip the html tags off of the text to return plaintext.""" | ||
htmlstripper = _HTMLToTextHelper() | ||
htmlstripper.feed(html) | ||
text = htmlstripper.get_data() | ||
text = cleanup_text(text) | ||
|
||
return text |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,55 @@ | ||
""" | ||
Test cases for the learning-assistant api module. | ||
""" | ||
from unittest.mock import MagicMock, patch | ||
|
||
import ddt | ||
from django.test import TestCase | ||
from opaque_keys.edx.keys import UsageKey | ||
|
||
from learning_assistant.api import get_deserialized_prompt_content_by_course_id, get_setup_messages | ||
from learning_assistant.api import ( | ||
_extract_block_contents, | ||
_get_children_contents, | ||
_leaf_filter, | ||
get_block_content, | ||
get_deserialized_prompt_content_by_course_id, | ||
get_setup_messages, | ||
) | ||
from learning_assistant.models import CoursePrompt | ||
|
||
fake_transcript = 'This is the text version from the transcript' | ||
|
||
|
||
class FakeChild: | ||
"""Fake child block for testing""" | ||
transcript_download_format = 'txt' | ||
|
||
def __init__(self, category, test_id='test-id', test_html='<div>This is a test</div>'): | ||
self.category = category | ||
self.published_on = 'published-on-{}'.format(test_id) | ||
self.edited_on = 'edited-on-{}'.format(test_id) | ||
self.scope_ids = lambda: None | ||
self.scope_ids.def_id = 'def-id-{}'.format(test_id) | ||
self.html = test_html | ||
self.transcript = fake_transcript | ||
|
||
def get_html(self): | ||
if self.category == 'html': | ||
return self.html | ||
|
||
return None | ||
|
||
|
||
class FakeBlock: | ||
"Fake block for testing, returns given children" | ||
def __init__(self, children): | ||
self.children = children | ||
self.scope_ids = lambda: None | ||
self.scope_ids.usage_id = UsageKey.from_string('block-v1:edX+A+B+type@vertical+block@verticalD') | ||
|
||
def get_children(self): | ||
return self.children | ||
|
||
|
||
class LearningAssistantAPITests(TestCase): | ||
""" | ||
|
@@ -38,3 +82,112 @@ def test_get_setup_messages(self): | |
def test_get_setup_messages_invalid_course_id(self): | ||
setup_messages = get_setup_messages('course-v1:edx+fake+19') | ||
self.assertIsNone(setup_messages) | ||
|
||
|
||
@ddt.ddt | ||
class GetBlockContentAPITests(TestCase): | ||
""" | ||
Test suite for the get_block_content api function. | ||
""" | ||
|
||
def setUp(self): | ||
self.children = [ | ||
FakeChild('html', '01', ''' | ||
<p> | ||
Lorem ipsum dolor sit amet, consectetur adipiscing elit. | ||
Vivamus dapibus elit lacus, at vehicula arcu vehicula in. | ||
In id felis arcu. Maecenas elit quam, volutpat cursus pharetra vel, tempor at lorem. | ||
Fusce luctus orci quis tempor aliquet. | ||
</p>'''), | ||
FakeChild('html', '02', ''' | ||
<Everything on the content on this child is inside a tag, so parsing it would return almost> | ||
Nothing | ||
</The quick brown fox jumps over the lazy dog>'''), | ||
FakeChild('video', '03'), | ||
FakeChild('unknown', '04') | ||
] | ||
self.block = FakeBlock(self.children) | ||
|
||
self.course_id = 'course-v1:edx+test+23' | ||
|
||
@ddt.data( | ||
('video', True), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we include There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, thanks for catching that! |
||
('html', True), | ||
('unknown', False) | ||
) | ||
@ddt.unpack | ||
@patch('learning_assistant.api.block_leaf_filter') | ||
def test_block_leaf_filter(self, category, expected_value, mock_leaf_filter): | ||
mock_leaf_filter.return_value = True | ||
|
||
block = FakeChild(category) | ||
|
||
is_leaf = _leaf_filter(block) | ||
self.assertEqual(is_leaf, expected_value) | ||
|
||
@ddt.data( | ||
'video', | ||
'html', | ||
'unknown' | ||
) | ||
@patch('learning_assistant.api.html_to_text') | ||
@patch('learning_assistant.api.get_text_transcript') | ||
def test_extract_block_contents(self, category, mock_html, mock_transcript): | ||
mock_return = 'This is the block content' | ||
mock_html.return_value = mock_return | ||
mock_transcript.return_value = mock_return | ||
|
||
block = FakeChild(category) | ||
|
||
block_content = _extract_block_contents(block, category) | ||
|
||
if category in ['html', 'video']: | ||
self.assertEqual(block_content, mock_return) | ||
else: | ||
self.assertIsNone(block_content) | ||
|
||
@patch('learning_assistant.api.traverse_block_pre_order') | ||
@patch('learning_assistant.api.html_to_text') | ||
@patch('learning_assistant.api.get_text_transcript') | ||
def test_get_children_contents(self, mock_transcript, mock_html, mock_traversal): | ||
mock_traversal.return_value = self.children | ||
block_content = 'This is the block content' | ||
mock_html.return_value = block_content | ||
mock_transcript.return_value = block_content | ||
|
||
length, items = _get_children_contents(self.block) | ||
|
||
expected_items = [ | ||
{'content_type': 'TEXT', 'content_text': block_content}, | ||
{'content_type': 'TEXT', 'content_text': block_content}, | ||
{'content_type': 'VIDEO', 'content_text': block_content} | ||
] | ||
|
||
# expected length should be equivalent to the sum of the content length in each of the 3 child blocks | ||
# that are either video or html | ||
self.assertEqual(length, len(block_content) * 3) | ||
self.assertEqual(len(items), 3) | ||
self.assertEqual(items, expected_items) | ||
|
||
@patch('learning_assistant.api.get_single_block') | ||
@patch('learning_assistant.api._get_children_contents') | ||
def test_get_block_content(self, mock_get_children_contents, mock_get_single_block): | ||
mock_get_single_block.return_value = self.block | ||
|
||
block_content = 'This is the block content' | ||
content_items = [{'content_type': 'TEXT', 'content_text': block_content}] | ||
mock_get_children_contents.return_value = (len(block_content), content_items) | ||
|
||
# mock arguments that are passed through to `get_single_block` function. the value of these | ||
# args does not matter for this test right now, as the `get_single_block` function is entirely mocked. | ||
request = MagicMock() | ||
user_id = 1 | ||
course_id = self.course_id | ||
unit_usage_key = 'block-v1:edX+A+B+type@vertical+block@verticalD' | ||
|
||
length, items = get_block_content(request, user_id, course_id, unit_usage_key) | ||
|
||
mock_get_children_contents.assert_called_with(self.block) | ||
|
||
self.assertEqual(length, len(block_content)) | ||
self.assertEqual(items, content_items) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a custom leaf filter to exclude blocks that are not video or html, that way we limit any calculations/processing on nodes that aren't relevant to what we need.