def _get_reference_link_citekey_aliases(elem, doc):
    """
    Panflute action to collect citekey aliases (tags) that are defined using
    markdown's link reference syntax, for example:

        [@tag:issue]: url:https://github.com/manubot/manubot/pull/189

    Based on TypeScript implementation by phiresky at
    https://github.com/phiresky/pandoc-url2cite/blob/b28374a9a037a5ce1747b8567160d8dffd64177e/index.ts#L118-L152

    Uses markdown's link reference syntax to define citekey aliases (tags)
    https://spec.commonmark.org/0.29/#link-reference-definitions

    Only paragraphs (pf.Para) are inspected. Each leading definition of the
    form `Cite ":" [Space] Str` is recorded in
    global_variables["citekey_aliases"] (citation id -> destination text) and
    removed from the paragraph, together with a SoftBreak that directly
    follows it. A warning is logged when a citekey is defined more than once;
    the later definition wins.

    Parameters
    ----------
    elem : panflute element currently visited by doc.walk
    doc : panflute document (unused, required by the action signature)

    Returns
    -------
    None, so that panflute keeps the (possibly modified in place) element.
    """
    if not isinstance(elem, pf.Para):
        return
    while (
        len(elem.content) >= 3
        and isinstance(elem.content[0], pf.Cite)
        and len(elem.content[0].citations) == 1  # differs from pandoc-url2cite
        and isinstance(elem.content[1], pf.Str)
        and elem.content[1].text == ":"
    ):
        # the destination follows the colon, optionally separated by a space
        space_index = 3 if isinstance(elem.content[2], pf.Space) else 2
        if space_index >= len(elem.content):
            # paragraph ends right after "[@something]: " without a
            # destination: nothing to record, and indexing would fail
            break
        destination = elem.content[space_index]
        if not isinstance(destination, pf.Str):
            # not a link reference definition. The paragraph is left as is;
            # stopping here is required because nothing would be consumed and
            # the loop condition would otherwise stay true forever.
            break
        # paragraph starts with [@something]: something
        # save info to citekeys and remove from paragraph
        citekey = elem.content[0].citations[0].id  # differs from pandoc-url2cite
        citekey_aliases = global_variables["citekey_aliases"]
        if citekey in citekey_aliases:
            logging.warning(f"duplicate citekey {citekey}")
        citekey_aliases[citekey] = destination.text
        # found citation, add it to citekeys and remove it from document
        elem.content = elem.content[space_index + 1 :]
        # consecutive definitions on adjacent lines are separated by a
        # SoftBreak; drop it so the next definition starts the paragraph
        if len(elem.content) > 0 and isinstance(elem.content[0], pf.SoftBreak):
            elem.content.pop(0)
+ +[@tag:issue]: url:https://github.com/manubot/manubot/pull/189 +[@tag:bad-doi]: doi:10.1016/S0022-2836(05)80360-2 + +[@tag:bad-url]: url:https://openreview.net/forum?id=HkwoSDPgg + ## References ::: {#refs} diff --git a/manubot/pandoc/tests/output-with-cites.txt b/manubot/pandoc/tests/output-with-cites.txt index 00d5f215..364fe505 100644 --- a/manubot/pandoc/tests/output-with-cites.txt +++ b/manubot/pandoc/tests/output-with-cites.txt @@ -6,6 +6,8 @@ This is a sentence with many citations [1,2]. Citations in code blocks should not be modified [@pmid:20170387]. +Defining citekeys with the link reference syntax [3,4,5]. + References 1. Orthodontic treatment with tooth transplantation for patients with cleft lip and palate. @@ -17,3 +19,14 @@ DOI: 10.1597/08-134 · PMID: 20170387 Kamen P. Simeonov, Daniel S. Himmelstein PeerJ (2015-01-13) https://doi.org/98p DOI: 10.7717/peerj.705 · PMID: 25648772 · PMCID: PMC4304851 + +3. CSL_Item: add date IO functionality by dhimmel · Pull Request #189 · manubot/manubotGitHub https://github.com/manubot/manubot/pull/189 + +4. Basic local alignment search tool +Stephen F. Altschul, Warren Gish, Webb Miller, Eugene W. Myers, David J. Lipman +Journal of Molecular Biology (1990-10) https://doi.org/cnsjsz +DOI: 10.1016/s0022-2836(05)80360-2 + +5. Semi-supervised Knowledge Transfer for Deep Learning from Private Training Data +Nicolas Papernot, Martí, N Abadi, Ú, Lfar Erlingsson, Ian Goodfellow, Kunal Talwar +(2016-11-02) https://openreview.net/forum?id=HkwoSDPgg