Skip to content

Commit

Permalink
Merge pull request #1245 from rdmorganiser/refactor_pandoc
Browse files Browse the repository at this point in the history
Refactor pandoc
  • Loading branch information
jochenklar authored Feb 20, 2025
2 parents 1417153 + 675078e commit 85e99c0
Show file tree
Hide file tree
Showing 6 changed files with 310 additions and 147 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ jobs:
path: dist
- name: Install Dependencies
run: |
sudo apt-get update && sudo apt-get install --yes pandoc texlive-xetex librsvg2-bin
sudo apt-get update && sudo apt-get install --yes pandoc texlive-latex-base texlive-latex-extra texlive-luatex librsvg2-bin
pandoc --version
python -m pip install --upgrade pip
python -m pip --version
Expand Down Expand Up @@ -172,7 +172,7 @@ jobs:
path: dist
- name: Install Dependencies
run: |
sudo apt-get update && sudo apt install --yes pandoc texlive-xetex librsvg2-bin
sudo apt-get update && sudo apt install --yes pandoc texlive-latex-base texlive-latex-extra texlive-luatex librsvg2-bin
python -m pip install --upgrade pip
- name: Install rdmo[postgres] from wheel and start postgresql
run: |
Expand Down
143 changes: 143 additions & 0 deletions rdmo/core/pandoc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import json
import logging
import os
import re
from pathlib import Path
from tempfile import mkstemp

from django.apps import apps
from django.conf import settings

import pypandoc
from packaging.version import Version
from packaging.version import parse as parse_version

log = logging.getLogger(__name__)


def get_pandoc_version():
    """Return the pandoc version reported by pypandoc as a comparable version object.

    The version string obtained from ``pypandoc.get_pandoc_version()`` is parsed
    with ``packaging.version.parse``, so the result can be compared against
    ``Version(...)`` instances.
    """
    version_string = pypandoc.get_pandoc_version()
    return parse_version(version_string)


def get_pandoc_content(html, metadata, export_format, context):
    """Convert an HTML document to ``export_format`` using pandoc and return the result.

    Args:
        html: The HTML source to convert.
        metadata: Metadata which is dumped as JSON into a temporary file and
            handed to pandoc via ``--metadata-file``; skipped when falsy.
        export_format: The pandoc output format, also used as suffix of the
            temporary output file.
        context: Passed on to ``get_pandoc_args`` to compute the pandoc arguments.

    Returns:
        The content of the document created by pandoc as ``bytes``.
    """
    pandoc_args = get_pandoc_args(export_format, context)

    # every temporary file is registered here and removed in the finally block,
    # so nothing is left behind when pandoc (or any other step) raises
    tmp_file_names = []

    try:
        if metadata:
            # create a temporary file for the metadata
            metadata_tmp_fd, metadata_tmp_file_name = mkstemp(suffix='.json')
            tmp_file_names.append(metadata_tmp_file_name)

            # save metadata, writing through the descriptor returned by mkstemp
            # makes sure that it is closed again
            log.info('Save metadata file %s %s', metadata_tmp_file_name, str(metadata))
            with os.fdopen(metadata_tmp_fd, 'w') as fp:
                json.dump(metadata, fp)

            # add metadata file to pandoc args
            pandoc_args.append('--metadata-file=' + metadata_tmp_file_name)

        # create a temporary file for the output, pandoc writes to it by name,
        # so the descriptor opened by mkstemp is closed right away
        tmp_fd, tmp_file_name = mkstemp(f'.{export_format}')
        tmp_file_names.append(tmp_file_name)
        os.close(tmp_fd)

        # convert the file using pandoc
        log.info('Export %s document using args %s.', export_format, pandoc_args)

        # point the src of images served from STATIC_URL to the files in STATIC_ROOT:
        # * STATIC_URL is escaped, since it is a literal string and not a pattern
        # * the match before src is non-greedy, so that every image on a line is rewritten
        # * a function is used as replacement, so that backslashes in STATIC_ROOT
        #   are not interpreted as escape sequences of a replacement template
        static_root = str(Path(settings.STATIC_ROOT))
        html = re.sub(
            r'(<img.+?src=["\'])' + re.escape(settings.STATIC_URL) + r'([\w\-\@?^=%&/~\+#]+)',
            lambda match: f'{match.group(1)}{static_root}/{match.group(2)}',
            html
        )
        pypandoc.convert_text(html, export_format, format='html', outputfile=tmp_file_name, extra_args=pandoc_args)

        # read the created temporary file
        with open(tmp_file_name, 'rb') as fp:
            return fp.read()
    finally:
        # delete temporary files
        for name in tmp_file_names:
            os.remove(name)


def get_pandoc_content_disposition(export_format, title):
    """Return the value of the Content-Disposition header for an exported document.

    Args:
        export_format: The export format, used as the file extension.
        title: The title of the document, used as the file name.

    Returns:
        ``filename="<title>.<export_format>"``, prefixed with ``attachment; ``
        for every format except pdf.
    """
    # escape backslashes and double quotes, otherwise a title containing a quote
    # would end the quoted file name early and yield a malformed header value
    safe_title = f'{title}'.replace('\\', '\\\\').replace('"', '\\"')
    filename = f'filename="{safe_title}.{export_format}"'

    if export_format == 'pdf':
        # display pdf in browser
        return filename

    return f'attachment; {filename}'


def get_pandoc_args(export_format, context):
    """Assemble the extra command line arguments handed to pandoc for ``export_format``.

    The arguments configured in ``settings.EXPORT_PANDOC_ARGS`` are the starting
    point and are adjusted depending on the pandoc version:

    * pdf: lualatex is replaced by xelatex as pdf engine before pandoc 3
    * docx/odt: the first existing reference document is added, using
      ``--reference-doc`` from pandoc 2 on and ``--reference-<format>`` before
    * from pandoc 2 on: ``settings.STATIC_ROOT`` and, if ``context`` contains
      ``resource_path``, that path below ``settings.MEDIA_ROOT`` are added as
      ``--resource-path``
    """
    version = get_pandoc_version()
    is_pandoc_2_or_later = version >= Version('2')

    # work on a copy, appending to the configured list would change the settings
    args = [*settings.EXPORT_PANDOC_ARGS.get(export_format, [])]

    if export_format == 'pdf':
        if version < Version('3'):
            # we used xelatex before pandoc 3
            args = [arg.replace('--pdf-engine=lualatex', '--pdf-engine=xelatex') for arg in args]

    elif export_format in ['docx', 'odt']:
        # find and add a possible reference document
        reference_document = get_pandoc_reference_document(export_format, context)
        if reference_document:
            option = '--reference-doc' if is_pandoc_2_or_later else f'--reference-{export_format}'
            args.append(f'{option}={reference_document}')

    if is_pandoc_2_or_later:
        # add STATIC_ROOT and possible additional resource paths
        args.append(f'--resource-path={settings.STATIC_ROOT}')
        if 'resource_path' in context:
            media_resource_path = Path(settings.MEDIA_ROOT) / context['resource_path']
            args.append(f'--resource-path={media_resource_path}')

    return args


def get_pandoc_reference_document(export_format, context):
    """Return the first configured reference document that exists on disk.

    Args:
        export_format: The export format, passed on to ``get_pandoc_reference_documents``.
        context: The context, passed on to ``get_pandoc_reference_documents``.

    Returns:
        The reference document as ``Path``, or ``None`` if none of the
        candidates exists.
    """
    # collect all configured reference documents
    reference_documents = get_pandoc_reference_documents(export_format, context)

    # return the first reference document that actually exists
    for reference_document in reference_documents:
        if not reference_document:
            continue

        # candidates taken from the settings may be plain strings, so they are
        # converted to a Path before (and not after) calling exists()
        reference_path = Path(reference_document)
        if reference_path.exists():
            return reference_path

    return None


def get_pandoc_reference_documents(export_format, context):
    """Return the candidate reference documents for ``export_format``, most specific first.

    For odt and docx the list contains, in this order: the document configured
    for the uri of the view in the context (``EXPORT_REFERENCE_<FORMAT>_VIEWS``),
    the generic custom document (``EXPORT_REFERENCE_<FORMAT>``) and the default
    document in the ``share`` directory of the rdmo app. For every other
    format the list is empty. The existence of the files is not checked here.
    """
    # try to get the uri of the view from the context, if it is not set,
    # the current url should be project_answers
    try:
        view_uri = context['view'].uri
    except (KeyError, AttributeError):
        view_uri = None

    if export_format not in ('odt', 'docx'):
        return []

    # EXPORT_REFERENCE_ODT or EXPORT_REFERENCE_DOCX
    setting_name = f'EXPORT_REFERENCE_{export_format.upper()}'
    candidates = []

    # view specific custom reference document, the setting is only
    # accessed if the context provided a view uri
    if view_uri and view_uri in getattr(settings, f'{setting_name}_VIEWS'):
        candidates.append(getattr(settings, f'{setting_name}_VIEWS')[view_uri])

    # generic custom reference document
    custom_document = getattr(settings, setting_name)
    if custom_document:
        candidates.append(custom_document)

    # default reference document
    candidates.append(Path(apps.get_app_config('rdmo').path) / 'share' / f'reference.{export_format}')

    return candidates
2 changes: 1 addition & 1 deletion rdmo/core/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@
EXPORT_REFERENCE_DOCX = None

EXPORT_PANDOC_ARGS = {
'pdf': ['-V', 'geometry:a4paper, margin=1in', '--pdf-engine=xelatex'],
'pdf': ['-V', 'geometry:a4paper, margin=1in', '--pdf-engine=lualatex'],
'rtf': ['--standalone']
}

Expand Down
156 changes: 156 additions & 0 deletions rdmo/core/tests/test_pandoc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
from pathlib import Path

import pytest

from django.apps import apps

from packaging.version import Version

from ..pandoc import get_pandoc_args, get_pandoc_reference_document, get_pandoc_reference_documents, get_pandoc_version

# path of the installed rdmo app and of the testing directory next to it
rdmo_path = Path(apps.get_app_config('rdmo').path)
testing_path = rdmo_path.parent / 'testing'

# pandoc versions which are mocked in the tests, one below 2, one below 3 and two from 3 on
pandoc_versions = [
    '1.9.0',
    '2.0.0',
    '3.0.0',
    '3.5.0'
]

# export formats used to parametrize the tests
export_formats = [
    'rtf',
    'odt',
    'docx',
    'html',
    'markdown',
    'tex',
    'pdf'
]

# expected result of get_pandoc_args for each mocked pandoc version and export format,
# formats without an own entry are expected to return the 'other' entry
# NOTE(review): {testing_path}/static_root presumably is the STATIC_ROOT of the test settings -- confirm
pandoc_args_map = {
    # before pandoc 2: xelatex, format specific --reference-<format> option, no --resource-path
    '1.9.0': {
        'pdf': ['-V', 'geometry:a4paper, margin=1in', '--pdf-engine=xelatex'],
        'rtf': ['--standalone'],
        'docx': [f'--reference-docx={rdmo_path}/share/reference.docx'],
        'odt': [f'--reference-odt={rdmo_path}/share/reference.odt'],
        'other': []
    },
    # pandoc 2: still xelatex, but --reference-doc and --resource-path
    '2.0.0': {
        'pdf': ['-V', 'geometry:a4paper, margin=1in', '--pdf-engine=xelatex',
                f'--resource-path={testing_path}/static_root'],
        'rtf': ['--standalone', f'--resource-path={testing_path}/static_root'],
        'docx': [f'--reference-doc={rdmo_path}/share/reference.docx', f'--resource-path={testing_path}/static_root'],
        'odt': [f'--reference-doc={rdmo_path}/share/reference.odt', f'--resource-path={testing_path}/static_root'],
        'other': [f'--resource-path={testing_path}/static_root']
    },
    # pandoc 3: same as pandoc 2, but with lualatex as pdf engine
    '3.0.0': {
        'pdf': ['-V', 'geometry:a4paper, margin=1in', '--pdf-engine=lualatex',
                f'--resource-path={testing_path}/static_root'],
        'rtf': ['--standalone', f'--resource-path={testing_path}/static_root'],
        'docx': [f'--reference-doc={rdmo_path}/share/reference.docx', f'--resource-path={testing_path}/static_root'],
        'odt': [f'--reference-doc={rdmo_path}/share/reference.odt', f'--resource-path={testing_path}/static_root'],
        'other': [f'--resource-path={testing_path}/static_root']
    },
    # later pandoc 3 release: identical to 3.0.0
    '3.5.0': {
        'pdf': ['-V', 'geometry:a4paper, margin=1in', '--pdf-engine=lualatex',
                f'--resource-path={testing_path}/static_root'],
        'rtf': ['--standalone', f'--resource-path={testing_path}/static_root'],
        'docx': [f'--reference-doc={rdmo_path}/share/reference.docx', f'--resource-path={testing_path}/static_root'],
        'odt': [f'--reference-doc={rdmo_path}/share/reference.odt', f'--resource-path={testing_path}/static_root'],
        'other': [f'--resource-path={testing_path}/static_root']
    }
}

class MockedView:
    """Minimal stand-in for a view, providing only a ``uri`` attribute."""

    # uri used as key of the EXPORT_REFERENCE_*_VIEWS settings in the tests of this module
    uri = 'http://example.com/terms/views/view'

@pytest.mark.parametrize('pandoc_version', pandoc_versions)
def test_get_pandoc_version(mocker, pandoc_version):
    """The version string reported by pypandoc is returned as a parsed ``Version``."""
    mocker.patch('pypandoc.get_pandoc_version', return_value=pandoc_version)

    expected_version = Version(pandoc_version)

    assert get_pandoc_version() == expected_version


@pytest.mark.parametrize('pandoc_version', pandoc_versions)
@pytest.mark.parametrize('export_format', export_formats)
def test_get_pandoc_args(settings, mocker, pandoc_version, export_format):
    """The pandoc args match ``pandoc_args_map`` for every mocked version and export format."""
    mocker.patch('pypandoc.get_pandoc_version', return_value=pandoc_version)

    # formats without an own entry fall back to the 'other' entry of the version
    expected_by_format = pandoc_args_map[pandoc_version]
    expected_args = expected_by_format.get(export_format, expected_by_format['other'])

    assert get_pandoc_args(export_format, {}) == expected_args


def test_get_pandoc_reference_document(mocker):
    """Missing candidates are skipped and the first existing file is returned."""
    share_path = rdmo_path / 'share'
    candidates = [
        share_path / 'missing.docx',
        share_path / 'reference.docx',
        share_path / 'reference.odt'
    ]
    mocker.patch('rdmo.core.pandoc.get_pandoc_reference_documents', return_value=candidates)

    reference_document = get_pandoc_reference_document('other', {})

    # return the first existing file
    assert reference_document == share_path / 'reference.docx'


def test_get_pandoc_reference_document_missing(mocker):
    """``None`` is returned if none of the candidates exists."""
    share_path = rdmo_path / 'share'
    candidates = [share_path / 'missing.docx', share_path / 'missing.odt']
    mocker.patch('rdmo.core.pandoc.get_pandoc_reference_documents', return_value=candidates)

    reference_document = get_pandoc_reference_document('other', {})

    assert reference_document is None


@pytest.mark.parametrize('export_format', export_formats)
def test_get_pandoc_reference_documents(export_format):
    """With an empty context only docx and odt yield the default reference document."""
    share_path = Path(apps.get_app_config('rdmo').path) / 'share'

    if export_format in ('docx', 'odt'):
        expected_documents = [share_path / f'reference.{export_format}']
    else:
        expected_documents = []

    assert get_pandoc_reference_documents(export_format, {}) == expected_documents


@pytest.mark.parametrize('export_format', export_formats)
def test_get_pandoc_reference_documents_view(export_format):
    """A view in the context alone yields only the default reference document for docx and odt."""
    if export_format in ('docx', 'odt'):
        expected_documents = [rdmo_path / 'share' / f'reference.{export_format}']
    else:
        expected_documents = []

    context = {'view': MockedView()}

    assert get_pandoc_reference_documents(export_format, context) == expected_documents


@pytest.mark.parametrize('export_format', export_formats)
def test_get_pandoc_reference_documents_view_settings(settings, export_format):
    """A reference document configured for the uri of the view precedes the default one."""
    mock_file = rdmo_path / 'share' / f'mock.{export_format}'

    # only docx and odt have a setting for view specific reference documents
    views_setting_names = {
        'docx': 'EXPORT_REFERENCE_DOCX_VIEWS',
        'odt': 'EXPORT_REFERENCE_ODT_VIEWS'
    }
    if export_format in views_setting_names:
        setattr(settings, views_setting_names[export_format], {'http://example.com/terms/views/view': mock_file})

    if export_format in ('docx', 'odt'):
        expected_documents = [mock_file, rdmo_path / 'share' / f'reference.{export_format}']
    else:
        expected_documents = []

    assert get_pandoc_reference_documents(export_format, {'view': MockedView()}) == expected_documents


@pytest.mark.parametrize('export_format', export_formats)
def test_get_pandoc_reference_documents_settings(settings, export_format):
    """A generic custom reference document from the settings precedes the default one."""
    mock_file = rdmo_path / 'share' / f'mock.{export_format}'

    # only docx and odt have a setting for a generic custom reference document
    setting_names = {
        'docx': 'EXPORT_REFERENCE_DOCX',
        'odt': 'EXPORT_REFERENCE_ODT'
    }
    if export_format in setting_names:
        setattr(settings, setting_names[export_format], mock_file)

    if export_format in ('docx', 'odt'):
        expected_documents = [mock_file, rdmo_path / 'share' / f'reference.{export_format}']
    else:
        expected_documents = []

    assert get_pandoc_reference_documents(export_format, {}) == expected_documents
Loading

0 comments on commit 85e99c0

Please sign in to comment.