# striptexcomments

#!/usr/bin/env python3
#
# Copyright 2019 Thomas Nyman <thomas.nyman@iki.fi>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Original version by Adam Merberger (https://github.com/amerberg) released
# under the WTFPL license (http://www.wtfpl.net/about/) at
# https://gist.github.com/amerberg/a273ca1e579ab573b499
#
# Modifications that preserve comments in \makeatletter and \makeatother blocks
# by dzhuang (https://github.com/dzhuang) released at
# https://gist.github.com/dzhuang/dc34cdd7efa43e5ecc1dc981cc906c85
#
# Usage:
# python striptexcomments input.tex > output.tex
# python striptexcomments input.tex -e encoding > output.tex
#
import ply.lex, argparse, io
import re
import sys

def strip_comments(source):
    r"""Return the LaTeX text ``source`` with its comments removed.

    The text is tokenized with a ply lexer whose rules are the nested
    ``t_*`` functions below.  ply reads each rule's regular expression from
    the function's docstring, so those docstrings must stay exactly the
    pattern and nothing else; rules are tried in the order they are defined.
    A rule that returns its token contributes the matched text to the
    output, a rule that returns nothing discards it.

    Handling per lexer state:

    * INITIAL: a ``%`` that is the first non-blank character of a line
      removes the whole line including its newline; a ``%`` later on a line
      removes the preceding same-line whitespace and the rest of the line
      but keeps the newline.  ``\\`` and ``\%`` are passed through.
    * commentenv: everything from ``\begin{comment}`` up to and including
      the rest of the line holding ``\end{comment}`` is removed.
    * verbatim: the contents of ``verbatim`` environments are copied as is.
    * makeatblock / group: inside ``\makeatletter``..``\makeatother`` and
      ``\begingroup``..``\endgroup`` (nesting tracked for the latter) the
      ``%`` character and the newline that ends the comment are kept.  The
      ``makeatlinecomment`` / ``grouplinecomment`` states only define a
      newline rule, so the characters between ``%`` and the newline match no
      rule and are skipped by ``t_ANY_error``.

    Args:
        source: The LaTeX document as a string.

    Returns:
        The concatenated values of all tokens the lexer emitted.
    """
    # `tokens` and `states` are looked up by name by ply; do not rename them.
    tokens = (
                'PERCENT', 'BEGINCOMMENT', 'ENDCOMMENT',
                'BACKSLASH', 'CHAR', 'BEGINVERBATIM',
                'ENDVERBATIM', 'NEWLINE', 'ESCPCT',
                'MAKEATLETTER', 'MAKEATOTHER', 'BOLPCT',
                'BEGINGROUP', 'ENDGROUP',
             )
    states = (
                ('makeatblock', 'exclusive'),
                ('makeatlinecomment', 'exclusive'),
                ('linecomment', 'exclusive'),
                ('midlinecomment', 'exclusive'),
                ('commentenv', 'exclusive'),
                ('verbatim', 'exclusive'),
                ('group', 'exclusive'),
                ('grouplinecomment', 'exclusive')
            )

    # ---- INITIAL state ---------------------------------------------------

    # Deal with escaped backslashes, so we don't think they're escaping %
    def t_BACKSLASH(t):
        r"\\\\"
        return t

    # Leaving all % in makeatblock
    def t_MAKEATLETTER(t):
        r"\\makeatletter"
        t.lexer.begin("makeatblock")
        return t

    # Leaving all % in \begingroup..\endgroup
    def t_BEGINGROUP(t):
        r"\\begingroup"
        t.lexer.group_depth += 1
        t.lexer.begin("group")
        return t

    # Comments from % at beginning of line to end of line.
    # Only same-line whitespace ([^\S\n]) may precede the %: plain \s also
    # matches newlines, which would let this rule swallow blank lines
    # (paragraph breaks) standing in front of the comment.
    def t_BOLPCT(t):
        r"^[^\S\n]*\%"
        t.lexer.begin("linecomment")

    # Comments from % mid-line to end of line.
    # (?<!^) keeps this rule from firing at the start of a line.  As above,
    # the whitespace prefix must not cross a newline; otherwise the match
    # could start at the newline ending a text line and run over any
    # following blank lines up to a % on a later line, deleting them.
    def t_PERCENT(t):
        r"(?<!^)[^\S\n]*\%"
        t.lexer.begin("midlinecomment")

    # Escaped percent signs
    def t_ESCPCT(t):
        r"\\\%"
        return t

    # Comment environment, as defined by verbatim package
    def t_BEGINCOMMENT(t):
        r"\\begin\s*{\s*comment\s*}"
        t.lexer.begin("commentenv")

    # Verbatim environment (different treatment of comments within)
    def t_BEGINVERBATIM(t):
        r"\\begin\s*{\s*verbatim\s*}"
        t.lexer.begin("verbatim")
        return t

    # Any other character in initial state we leave alone
    def t_CHAR(t):
        r"."
        return t

    def t_NEWLINE(t):
        r"\n"
        return t

    # ---- commentenv state ------------------------------------------------

    # End comment environment
    def t_commentenv_ENDCOMMENT(t):
        r"\\end\s*{\s*comment\s*}"

        # Anything after \end{comment} on a line is ignored!
        t.lexer.begin('linecomment')

    # Ignore contents of comment environment
    def t_commentenv_CHAR(t):
        r"."

    def t_commentenv_NEWLINE(t):
        r"\n"

    # ---- verbatim state --------------------------------------------------

    # End of verbatim environment
    def t_verbatim_ENDVERBATIM(t):
        r"\\end\s*{\s*verbatim\s*}"
        t.lexer.begin('INITIAL')
        return t

    # Leave contents of verbatim environment alone
    def t_verbatim_CHAR(t):
        r"."
        return t

    def t_verbatim_NEWLINE(t):
        r"\n"
        return t

    # ---- linecomment / midlinecomment states -----------------------------

    # End a % comment when we get to a new line.
    # The newline at the end of a whole-line comment is not preserved.
    def t_linecomment_ENDCOMMENT(t):
        r"\n"
        t.lexer.begin("INITIAL")

    # The newline at the end of an end-of-line comment is preserved.
    def t_midlinecomment_ENDCOMMENT(t):
        r"\n"
        t.lexer.begin("INITIAL")
        return t

    # Ignore anything after a % on a line
    def t_linecomment_CHAR(t):
        r"."

    def t_midlinecomment_CHAR(t):
        r"."

    # ---- makeatblock / makeatlinecomment states --------------------------

    def t_makeatblock_MAKEATOTHER(t):
        r"\\makeatother"
        t.lexer.begin('INITIAL')
        return t

    def t_makeatblock_BACKSLASH(t):
        r"\\\\"
        return t

    # Escaped percent signs in makeatblock
    def t_makeatblock_ESCPCT(t):
        r"\\\%"
        return t

    # Preserve % in makeatblock
    def t_makeatblock_PERCENT(t):
        r"\%"
        t.lexer.begin("makeatlinecomment")
        return t

    # The newline ending a comment inside a makeatblock is kept and
    # returns the lexer to the makeatblock state.
    def t_makeatlinecomment_NEWLINE(t):
        r"\n"
        t.lexer.begin('makeatblock')
        return t

    # Leave contents of makeatblock alone
    def t_makeatblock_CHAR(t):
        r"."
        return t

    def t_makeatblock_NEWLINE(t):
        r"\n"
        return t

    # ---- group / grouplinecomment states ---------------------------------

    # Nested \begingroup: only the depth counter changes, the state is
    # already "group".
    def t_group_BEGINGROUP(t):
        r"\\begingroup"
        t.lexer.group_depth += 1
        t.lexer.begin("group")
        return t

    # Leave the group state only when the outermost group is closed.
    def t_group_ENDGROUP(t):
        r"\\endgroup"
        t.lexer.group_depth -= 1
        if t.lexer.group_depth == 0:
            t.lexer.begin('INITIAL')
        return t

    def t_group_BACKSLASH(t):
        r"\\\\"
        return t

    # Escaped percent signs in group
    def t_group_ESCPCT(t):
        r"\\\%"
        return t

    # Preserve % in group
    def t_group_PERCENT(t):
        r"\%"
        t.lexer.begin("grouplinecomment")
        return t

    # The newline ending a comment inside a group is kept and returns the
    # lexer to the group state.
    def t_grouplinecomment_NEWLINE(t):
        r"\n"
        t.lexer.begin('group')
        return t

    # Leave contents of group alone
    def t_group_CHAR(t):
        r"."
        return t

    def t_group_NEWLINE(t):
        r"\n"
        return t

    # ---- all states ------------------------------------------------------

    # Characters no rule of the current state matches are skipped one at a
    # time; the token they would belong to is never emitted.
    def t_ANY_error(t):
        t.lexer.skip(1)

    # ply uses the re module internally. In the re module ^ is by default
    # interpreted only at the beginning of a string and NOT at beginning of
    # every line. The MULTILINE flag changes this behavior to make ^ match
    # the beginning of every line.
    lexer = ply.lex.lex(reflags=re.VERBOSE | re.MULTILINE)
    # Nesting depth of \begingroup..\endgroup, maintained by the group rules.
    lexer.group_depth = 0
    lexer.input(source)
    return "".join(tok.value for tok in lexer)


def main():
    """Command-line entry point.

    Reads the file named on the command line using the requested encoding
    (``--encoding`` / ``-e``, default ``utf-8``), strips its comments with
    ``strip_comments`` and writes the result, encoded with the same
    encoding, to standard output as raw bytes.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('filename', help='the file to strip comments from')
    cli.add_argument('--encoding', '-e', default='utf-8')
    options = cli.parse_args()

    encoding = options.encoding
    with io.open(options.filename, encoding=encoding) as tex_file:
        text = tex_file.read()

    # Write bytes directly so the output encoding is the one requested
    # rather than whatever sys.stdout happens to be configured with.
    stripped = strip_comments(text)
    sys.stdout.buffer.write(stripped.encode(encoding))

# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == '__main__':
    main()