-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_article.py
33 lines (25 loc) · 1.72 KB
/
extract_article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import argparse
import json
from .readabilipy import simple_json_from_html_string
def main():
parser = argparse.ArgumentParser(
description="Extract article data from a HTML file using either Mozilla's Readability.js package or a simplified python-only alternative.")
parser.add_argument('-i', '--input-file', required=True,
help="Path to input file containing HTML.")
parser.add_argument('-o', '--output-file', required=True,
help="Path to file to output the article data to as JSON.")
parser.add_argument('-c', '--content-digests', action='store_true',
help="Add a 'data-content-digest' attribute containing a SHA256-based digest of the element's contents to each HTML element in the plain_content output.")
parser.add_argument('-n', '--node-indexes', action='store_true',
help="Add a 'data-node-index' attribute containing a hierarchical representation of the element's position in the HTML structure each HTML element in the plain_content output.")
parser.add_argument('-p', '--use-python-parser', action='store_true',
help="Use the pure-python 'plain_html' parser included in this project rather than Mozilla's Readability.js.")
args = parser.parse_args()
with open(args.input_file) as h:
html = h.read()
article = simple_json_from_html_string(html, content_digests=args.content_digests,
node_indexes=args.node_indexes, use_readability=(not args.use_python_parser))
with open(args.output_file, "w") as j:
json.dump(article, j, ensure_ascii=False)
if __name__ == '__main__':
main()