-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwebparsing.py
57 lines (49 loc) · 2.46 KB
/
webparsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/env python
""" webparsing.py: Provides resolution table from VMware KBs as machine-readable json files.
VMware KBs provide release information only as a human-readable HTML table.
However, for automation it would be nice to have it in a machine-readable format.
This script takes the tables from a VMware KB page and provides a json-file as an output.
This program is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, either version 3 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""
# Module metadata.
__author__ = "Dominik Zorgnotti"
# NOTE(review): this value looks like a redacted/obfuscated placeholder rather
# than a real address — confirm against the original file.
__contact__ = "[email protected]"
__created__ = "2020-02-26"
__deprecated__ = False
__license__ = "GPLv3"
__status__ = "beta"
__version__ = "0.2.0"
import requests, re
from bs4 import BeautifulSoup
# Base URL of the VMware KB article endpoint; get_kb_webdata() appends the
# numeric article id directly after the trailing "docid=".
BASEURL_VMWARE_KB = "https://kb.vmware.com/services/apexrest/v1/article?docid="
def get_kb_webdata(kb_article_id: int, timeout: float = 30.0):
    """Fetch a VMware KB article and return the web response decoded from JSON.

    Args:
        kb_article_id: Id of the KB article; it is appended to
            BASEURL_VMWARE_KB to build the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests would block indefinitely on an unresponsive host.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    vmware_kb_url = f"{BASEURL_VMWARE_KB}{kb_article_id}"
    response = requests.get(vmware_kb_url, timeout=timeout)
    # Turn HTTP error statuses into exceptions instead of parsing an error page.
    response.raise_for_status()
    return response.json()
def parse_kb_article_ids(summary_kb_article: int):
    """Collect the KB article ids linked from a summary article's resolution table.

    Args:
        summary_kb_article: Id of the KB article whose "Resolution" section
            holds the table linking to the sub-pages with the release data.

    Returns:
        A list of int KB ids, one per table link whose href contains a
        "/<digits>" segment, in document order. Links without an href or
        without such a segment are skipped.

    Raises:
        ValueError: If the resolution section contains no table.
    """
    raw_data = get_kb_webdata(summary_kb_article)
    resolution = raw_data["content"][1]["Resolution"]
    # The resolution section is an HTML fragment; parse it and take its first table.
    soup = BeautifulSoup(resolution, "html.parser")
    table = soup.find("table")
    if table is None:
        raise ValueError(
            f"No table found in the resolution section of KB article {summary_kb_article}"
        )
    # A very crude way of extracting the KB id from the href since not all
    # links are in the same format: take the first "/<digits>" occurrence.
    kb_id_pattern = re.compile(r"/(\d+)(?:.*)")
    list_of_kb_ids = []
    # href=True restricts the search to anchors that actually carry an href.
    for anchor in table.find_all("a", href=True):
        match = kb_id_pattern.search(anchor["href"])
        if match is None:
            # The link does not point at a numeric KB id; nothing to extract.
            continue
        list_of_kb_ids.append(int(match.group(1)))
    return list_of_kb_ids