diff --git a/learning_assistant/api.py b/learning_assistant/api.py
index fbc9bf3..17bb9f4 100644
--- a/learning_assistant/api.py
+++ b/learning_assistant/api.py
@@ -1,7 +1,16 @@
"""
Library for the learning_assistant app.
"""
+from learning_assistant.constants import ACCEPTED_CATEGORY_TYPES, CATEGORY_TYPE_MAP
from learning_assistant.models import CoursePrompt
+from learning_assistant.platform_imports import (
+ block_get_children,
+ block_leaf_filter,
+ get_single_block,
+ get_text_transcript,
+ traverse_block_pre_order,
+)
+from learning_assistant.text_utils import html_to_text
def get_deserialized_prompt_content_by_course_id(course_id):
@@ -23,3 +32,69 @@ def get_setup_messages(course_id):
setup_messages = [{'role': 'system', 'content': x} for x in message_content]
return setup_messages
return None
+
+
+def _extract_block_contents(child, category):
+    """
+    Pull the text out of a block, choosing the extraction strategy from its category.
+
+    Returns the extracted string, or None when nothing is available for the block.
+    """
+    if category == 'html':
+        markup = child.get_html()
+        return html_to_text(markup)
+
+    if category == 'video':
+        # may be None
+        return get_text_transcript(child)
+
+    # Categories other than 'html' and 'video' carry nothing to extract here.
+    return None
+
+
+def _leaf_filter(block):
+    """
+    Traversal predicate: accept leaf blocks whose category is in ACCEPTED_CATEGORY_TYPES.
+    """
+    leaf = block_leaf_filter(block)
+    block_category = block.category
+    accepted = block_category in ACCEPTED_CATEGORY_TYPES
+    return leaf and accepted
+
+
+def _get_children_contents(block):
+    """
+    Collect the content type and text of each accepted leaf found by a pre-order walk of the block's children.
+    """
+    total_length = 0
+    entries = []
+
+    for leaf in traverse_block_pre_order(block, block_get_children, _leaf_filter):
+        leaf_category = leaf.category
+        text = _extract_block_contents(leaf, leaf_category)
+        # Falsy content (None or an empty string) is left out of the result entirely.
+        if not text:
+            continue
+
+        total_length += len(text)
+        entries.append({
+            'content_type': CATEGORY_TYPE_MAP.get(leaf_category),
+            'content_text': text,
+        })
+
+    return total_length, entries
+
+
+def get_block_content(request, user_id, course_id, unit_usage_key):
+    """
+    Load the block identified by unit_usage_key and gather the content of its children.
+
+    Returns
+        length - the cumulative length of the content of the block's children
+        items - a list of dictionaries containing the content type and text for each child
+    """
+    unit_block = get_single_block(request, user_id, course_id, unit_usage_key)
+
+    total_length, content_items = _get_children_contents(unit_block)
+
+    return total_length, content_items
diff --git a/learning_assistant/constants.py b/learning_assistant/constants.py
index 580616c..06097ff 100644
--- a/learning_assistant/constants.py
+++ b/learning_assistant/constants.py
@@ -8,3 +8,9 @@
EXTERNAL_COURSE_KEY_PATTERN = r'([A-Za-z0-9-_:]+)'
COURSE_ID_PATTERN = rf'(?P<course_id>({INTERNAL_COURSE_KEY_PATTERN}|{EXTERNAL_COURSE_KEY_PATTERN}))'
+
+# Block categories whose content is extracted, mapped to the content type reported for them.
+CATEGORY_TYPE_MAP = {"html": "TEXT", "video": "VIDEO"}
+
+# The accepted categories are exactly the keys of the map above, in the same order.
+ACCEPTED_CATEGORY_TYPES = list(CATEGORY_TYPE_MAP)
diff --git a/learning_assistant/platform_imports.py b/learning_assistant/platform_imports.py
new file mode 100644
index 0000000..d29de31
--- /dev/null
+++ b/learning_assistant/platform_imports.py
@@ -0,0 +1,47 @@
+"""
+Contain all imported functions coming out of the platform.
+
+We know these functions will be available at run time, but they
+cannot be imported normally.
+"""
+
+
+def get_text_transcript(video_block):
+    """Return the video block's transcript in 'txt' format, or None when get_transcript raises NotFoundError."""
+    # pylint: disable=import-error, import-outside-toplevel
+    from xmodule.exceptions import NotFoundError
+    from xmodule.video_block.transcripts_utils import get_transcript
+    try:
+        transcript_text, _, _ = get_transcript(video_block, output_format='txt')
+    except NotFoundError:
+        # some old videos have no transcripts, just accept that reality
+        transcript_text = None
+    return transcript_text
+
+
+def get_single_block(request, user_id, course_id, usage_key_string, course=None):
+    """Load one xblock by delegating to the platform's load_single_xblock."""
+    # pylint: disable=import-error, import-outside-toplevel
+    from lms.djangoapps.courseware import block_renderer
+    return block_renderer.load_single_xblock(request, user_id, course_id, usage_key_string, course)
+
+
+def traverse_block_pre_order(start_node, get_children, filter_func=None):
+    """Walk a DAG or tree in pre-order by delegating to the platform's traverse_pre_order."""
+    # pylint: disable=import-error, import-outside-toplevel
+    from openedx.core.lib import graph_traversals
+    return graph_traversals.traverse_pre_order(start_node, get_children, filter_func)
+
+
+def block_leaf_filter(block):
+    """Apply the platform's leaf_filter to a block and return its result."""
+    # pylint: disable=import-error, import-outside-toplevel
+    from openedx.core.lib import graph_traversals
+    return graph_traversals.leaf_filter(block)
+
+
+def block_get_children(block):
+    """Return the children of a block as reported by the platform's get_children."""
+    # pylint: disable=import-error, import-outside-toplevel
+    from openedx.core.lib import graph_traversals
+    return graph_traversals.get_children(block)
diff --git a/learning_assistant/text_utils.py b/learning_assistant/text_utils.py
new file mode 100644
index 0000000..728fd14
--- /dev/null
+++ b/learning_assistant/text_utils.py
@@ -0,0 +1,62 @@
+"""
+Text manipulation utils. This has been copied from the ai-aside repository.
+"""
+
+from html.parser import HTMLParser
+from re import sub
+
+from django.conf import settings
+
+
+def cleanup_text(text):
+    """
+    Normalize whitespace: collapse space runs, squeeze blank lines, and trim both ends of the text.
+    """
+    stripped = sub(r'[^\S\r\n]+', ' ', text)  # Collapse runs of non-newline whitespace into one space
+    stripped = sub(r'\n{2,}', '\n', stripped)  # Collapse consecutive newlines into one
+    stripped = sub(r'\s*\n\s*', '\n', stripped)  # Drop whitespace on either side of a newline
+    # strip() trims both ends, including trailing whitespace that is not preceded by a newline.
+
+    return stripped.strip()
+
+
+class _HTMLToTextHelper(HTMLParser):  # lint-amnesty, pylint: disable=abstract-method
+    """
+    HTMLParser subclass that accumulates text chunks for html_to_text below.
+    """
+
+    # Keep-text flag, recomputed on start tags only. NOTE(review): no handle_endtag, so it never resets on close.
+    _is_content = True
+
+    def __init__(self):
+        super().__init__()  # HTMLParser.__init__ already calls reset()
+        self.fed = []
+
+    def handle_starttag(self, tag, _):
+        """Flag the text that follows as content unless this tag is in settings.HTML_TAGS_TO_REMOVE."""
+        excluded_tags = getattr(settings, 'HTML_TAGS_TO_REMOVE', None)
+        self._is_content = not (excluded_tags and tag in excluded_tags)
+
+    def handle_data(self, data):
+        """Keep a text chunk, but only while the content flag is set."""
+        if self._is_content:
+            self.fed.append(data)
+
+    def handle_entityref(self, name):
+        """Keep an entity reference verbatim ('&name;') while the content flag is set."""
+        if self._is_content:
+            self.fed.append(f'&{name};')
+
+    def get_data(self):
+        """Return every kept chunk concatenated into a single string."""
+        return ''.join(self.fed)
+
+
+def html_to_text(html):
+    """Feed html through _HTMLToTextHelper and return the collected text after cleanup_text."""
+    parser = _HTMLToTextHelper()
+    parser.feed(html)
+
+    # Tidy the whitespace left behind once the tags are gone.
+    plain_text = cleanup_text(parser.get_data())
+    return plain_text
diff --git a/tests/test_api.py b/tests/test_api.py
index 3316f98..bf198d7 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -1,11 +1,55 @@
"""
Test cases for the learning-assistant api module.
"""
+from unittest.mock import MagicMock, patch
+
+import ddt
from django.test import TestCase
+from opaque_keys.edx.keys import UsageKey
-from learning_assistant.api import get_deserialized_prompt_content_by_course_id, get_setup_messages
+from learning_assistant.api import (
+ _extract_block_contents,
+ _get_children_contents,
+ _leaf_filter,
+ get_block_content,
+ get_deserialized_prompt_content_by_course_id,
+ get_setup_messages,
+)
from learning_assistant.models import CoursePrompt
+fake_transcript = 'This is the text version from the transcript'  # stored as .transcript on every FakeChild
+
+
+class FakeChild:
+    """Fake child block for testing; get_html returns markup only for the 'html' category."""
+    transcript_download_format = 'txt'
+
+    # test_html is the markup handed back by get_html() when the category is 'html'.
+    def __init__(self, category, test_id='test-id', test_html='This is a test'):
+        self.category = category
+        self.published_on = f'published-on-{test_id}'
+        self.edited_on = f'edited-on-{test_id}'
+        self.scope_ids = lambda: None
+        self.scope_ids.def_id = f'def-id-{test_id}'
+        self.html = test_html
+        self.transcript = fake_transcript
+
+    def get_html(self):
+        """Return the stored markup for 'html' blocks and None for anything else."""
+        if self.category == 'html':
+            return self.html
+        return None
+
+
+class FakeBlock:
+ """Fake block for testing; get_children returns the children passed to __init__."""
+ def __init__(self, children):
+ self.children = children
+ self.scope_ids = lambda: None
+ self.scope_ids.usage_id = UsageKey.from_string('block-v1:edX+A+B+type@vertical+block@verticalD')
+
+ def get_children(self):
+ return self.children
+
class LearningAssistantAPITests(TestCase):
"""
@@ -38,3 +82,112 @@ def test_get_setup_messages(self):
def test_get_setup_messages_invalid_course_id(self):
setup_messages = get_setup_messages('course-v1:edx+fake+19')
self.assertIsNone(setup_messages)
+
+
+@ddt.ddt
+class GetBlockContentAPITests(TestCase):
+    """
+    Test suite for get_block_content and the private helpers it is built on.
+    """
+
+    def setUp(self):
+        self.children = [
+            FakeChild('html', '01', '''
+
+                Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+                Vivamus dapibus elit lacus, at vehicula arcu vehicula in.
+                In id felis arcu. Maecenas elit quam, volutpat cursus pharetra vel, tempor at lorem.
+                Fusce luctus orci quis tempor aliquet.
+
+            '''),
+            FakeChild('html', '02', '''
+
+                Nothing
+            '''),
+            FakeChild('video', '03'),
+            FakeChild('unknown', '04')
+        ]
+        self.block = FakeBlock(self.children)
+
+        self.course_id = 'course-v1:edx+test+23'
+
+    @ddt.data(
+        ('video', True),
+        ('html', True),
+        ('unknown', False)
+    )
+    @ddt.unpack
+    @patch('learning_assistant.api.block_leaf_filter')
+    def test_block_leaf_filter(self, category, expected_value, mock_leaf_filter):
+        # Every block is reported as a leaf, so only the category decides the outcome.
+        mock_leaf_filter.return_value = True
+
+        block = FakeChild(category)
+        is_accepted = _leaf_filter(block)
+        self.assertEqual(is_accepted, expected_value)
+
+    @ddt.data(
+        'video',
+        'html',
+        'unknown'
+    )
+    @patch('learning_assistant.api.html_to_text')
+    @patch('learning_assistant.api.get_text_transcript')
+    def test_extract_block_contents(self, category, mock_transcript, mock_html):
+        # Stacked patch decorators inject mocks bottom-up: get_text_transcript first, then html_to_text.
+        mock_return = 'This is the block content'
+        mock_html.return_value = mock_return
+        mock_transcript.return_value = mock_return
+
+        block = FakeChild(category)
+        block_content = _extract_block_contents(block, category)
+
+        if category in ['html', 'video']:
+            self.assertEqual(block_content, mock_return)
+        else:
+            self.assertIsNone(block_content)
+
+    @patch('learning_assistant.api.traverse_block_pre_order')
+    @patch('learning_assistant.api.html_to_text')
+    @patch('learning_assistant.api.get_text_transcript')
+    def test_get_children_contents(self, mock_transcript, mock_html, mock_traversal):
+        mock_traversal.return_value = self.children
+        block_content = 'This is the block content'
+        mock_html.return_value = block_content
+        mock_transcript.return_value = block_content
+
+        length, items = _get_children_contents(self.block)
+
+        expected_items = [
+            {'content_type': 'TEXT', 'content_text': block_content},
+            {'content_type': 'TEXT', 'content_text': block_content},
+            {'content_type': 'VIDEO', 'content_text': block_content}
+        ]
+
+        # the mocked traversal also yields the 'unknown' child, which produces no content and is left out,
+        # so the expected length is the sum of the content length of the 3 html or video child blocks
+        self.assertEqual(length, len(block_content) * 3)
+        self.assertEqual(len(items), 3)
+        self.assertEqual(items, expected_items)
+
+    @patch('learning_assistant.api.get_single_block')
+    @patch('learning_assistant.api._get_children_contents')
+    def test_get_block_content(self, mock_get_children_contents, mock_get_single_block):
+        mock_get_single_block.return_value = self.block
+
+        block_content = 'This is the block content'
+        content_items = [{'content_type': 'TEXT', 'content_text': block_content}]
+        mock_get_children_contents.return_value = (len(block_content), content_items)
+
+        # mock arguments that are passed through to the `get_single_block` function. their values do
+        # not matter for this test, because `get_single_block` is entirely mocked.
+        request = MagicMock()
+        user_id = 1
+        course_id = self.course_id
+        unit_usage_key = 'block-v1:edX+A+B+type@vertical+block@verticalD'
+
+        length, items = get_block_content(request, user_id, course_id, unit_usage_key)
+
+        mock_get_children_contents.assert_called_with(self.block)
+
+        self.assertEqual(length, len(block_content))
+        self.assertEqual(items, content_items)
diff --git a/tests/test_text_utils.py b/tests/test_text_utils.py
new file mode 100644
index 0000000..3fbb244
--- /dev/null
+++ b/tests/test_text_utils.py
@@ -0,0 +1,57 @@
+"""Tests for text utils used by the blocks"""
+import unittest
+from textwrap import dedent
+
+from learning_assistant.text_utils import html_to_text
+
+
+class TestSummaryHookAside(unittest.TestCase):
+ """Tests for html_to_text from learning_assistant.text_utils."""
+ def test_html_to_text(self):
+ html_content = '''\
+
+
Lorem Ipsum
+
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+
+
Sed volutpat velit sed dui fringilla fermentum.
+ Nullam quis velit at turpis lacinia convallis.
+ '''
+ expected_text = dedent('''\
+ Lorem Ipsum
+ Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+ Sed volutpat velit sed dui fringilla fermentum.
+ Nullam quis velit at turpis lacinia convallis.''')
+ text = html_to_text(html_content)
+ self.assertEqual(text, expected_text)
+
+ def test_html_to_text_messy(self):
+ html_content = '''\
+
+ Lorem Ipsum
+ > Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+ > Sed volutpat velit sed dui fringilla fermentum.42>
+ > Nullam quis velit at turpis lacinia convallis.
'''
+ expected_text = dedent('''\
+ Lorem Ipsum
+ > Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+ > Sed volutpat velit sed dui fringilla fermentum.
+ > Nullam quis velit at turpis lacinia convallis.''')
+ text = html_to_text(html_content)
+ self.assertEqual(text, expected_text)
+
+ def test_html_to_text_iframe(self):
+ html_content = '''\
+
+ '''
+ expected_text = dedent('')
+ text = html_to_text(html_content)
+ self.assertEqual(text, expected_text)
+
+
+if __name__ == '__main__':  # run the tests when this module is executed directly
+ unittest.main()