-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 1eb2a72
Showing
5 changed files
with
211 additions
and
0 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
import re | ||
|
||
# URL fragments that are deleted from every candidate line.
web = [
    "www.",
    "://",
    ".com",
    ".net",
    ".org",
    ".us",
    ".gov",
]
# Tag names whose nodes are collected as-is by find_good.
accepted_tags = ["p", "span", "article", "font", "blockquote"]
# Child tag names that extract leaves out of the text.
exclude = ["cite"]
# CSS classes that make a node collectable even if its tag is not accepted.
accepted_classes = {"paragraph", "text"}
# A line that equals one of these (lower-cased, stripped) is dropped.
bad_phrases = [
    "back to top",
    "home",
    "welcome",
    "you are here:",
    "itunes",
    "google",
    "facebook",
    "twitter",
    "comment",
]
# A line that contains one of these (lower-cased) is dropped.
bad_subphrases = [
    "powered by",
    "around the web",
    "around the internet",
    "et al",
    "ndl",
    "view source",
    "view history",
    "edit links",
    "last modified",
    "text is available under",
    "creative commons",
]
# Element texts that make find_good stop collecting.
bad_headers = [
    "References",
    "Citations",
    "Further Reading",
    "External Links",
    "Footnotes",
    "See Also",
]


def a(x):
    """Return True when *x* contains no upper-case characters."""
    return x.lower() == x
|
||
# Citation-like patterns that destroy_citations removes from a line.

# Two or more letters directly followed by two or more digits, plus any
# trailing spaces or dots.
A = re.compile(r"[a-zA-Z]{2,}[0-9]{2,}[ \.]*")
# One or more digits-then-letters groups followed by whitespace or dots.
B = re.compile(r"([0-9]+[a-zA-Z]+)+[\s\.]+")
# Everything from the first "[" or "{" up to the last "]" or "}" (greedy).
C = re.compile(r"[\[\{].*[\]\}]")
# Two or three capitals, a colon, up to two spaces, then three or more digits
# with an optional short tail.
D = re.compile("[A-Z]{2,3}: {0,2}[0-9]{3,}.{0,2}[0-9]*")
# Parenthesised words followed by a dotted number, e.g. "(Smith 2010)".
# The number part used to be "([0-9]+[.]*)+"; that nested quantifier accepts
# exactly the same strings but backtracks exponentially on a long digit run
# that is not closed by ")", so it is written here in linear form.
E = re.compile(r"\([a-zA-Z\s]+ ([0-9][0-9.]*)\)")
# A backslash followed by one to five alphanumerics.
F = re.compile(r"(\\[a-zA-Z0-9]{1,5})")
def add_item(goods, parent):
    """Append *parent* to the end of the *goods* list (mutates it in place)."""
    goods.append(parent)
    return None
|
||
|
||
def find_good(parent, goods, wiki_mode):
    """Walk *parent* depth-first and collect text-bearing nodes into *goods*.

    A node is collected when it is a ``NavigableString``, when its tag name
    is in ``accepted_tags``, or when one of its all-lower-case classes is in
    ``accepted_classes``. Any other node is descended into. When a non-link
    child's stripped text equals one of ``bad_headers``, ``None`` is
    appended to *goods* as a stop marker and the current call returns.

    Args:
        parent: node to inspect; ``None`` is ignored.
        goods: list that receives the collected nodes (mutated in place).
        wiki_mode: passed through to recursive calls; not otherwise read
            here.

    Returns:
        ``False`` when this call hit a bad header, otherwise ``None``.
    """
    # NOTE(review): the nesting below was reconstructed from a copy of the
    # source that had lost its indentation -- confirm against the real file.
    if parent is None:
        return
    kind = parent.__class__.__name__
    if kind == "NavigableString":
        add_item(goods, parent)
        return
    if kind == "Comment" or not hasattr(parent, "name"):
        return
    if parent.name in accepted_tags:
        add_item(goods, parent)
        return

    raw_classes = parent.get("class")
    # Only class names that are already lower-case survive the filter.
    classes = set() if raw_classes is None else {c for c in raw_classes if a(c)}
    # ids are looked up but not used yet.
    ids_proto = parent.get("id")
    if classes & accepted_classes:
        add_item(goods, parent)
        return
    if not hasattr(parent, "children"):
        return

    for child in parent.children:
        if hasattr(child, "get_text"):
            # Links are exempt from the header check.
            is_link = child.name == "a" or child.parent.name == "a"
            if not is_link and child.get_text().strip() in bad_headers:
                add_item(goods, None)
                return False
        find_good(child, goods, wiki_mode)
|
||
|
||
def decide(factors, threshold):
    """Return True when the weighted mean of *factors* exceeds *threshold*.

    Args:
        factors: iterable of ``(value, weight)`` pairs.
        threshold: number the weighted mean is compared against.

    Returns:
        ``True`` if ``sum(value * weight) / sum(weight) > threshold``.
        ``False`` when the weights sum to zero (including an empty
        *factors*), where the mean is undefined and previously raised
        ``ZeroDivisionError``.
    """
    total_weight = 0
    total_value = 0
    for value, weight in factors:
        total_value += value * weight
        total_weight += weight
    if total_weight == 0:
        return False
    return total_value / total_weight > threshold
|
||
|
||
def check(text):
    """Filter *text* line by line through ``checkIndividual``.

    The input is split on ``"\\n"``; each line's filtered result is followed
    by a newline, so rejected lines show up as empty lines in the output.
    """
    return "".join(checkIndividual(line) + "\n" for line in text.split("\n"))
|
||
|
||
def checkIndividual(text):
    """Return *text* cleaned up if it looks like prose, otherwise ``""``.

    Citations are removed with ``destroy_citations`` and URL fragments from
    ``web`` are deleted. The line is then rejected when it

    * equals a ``bad_phrases`` entry or contains a ``bad_subphrases`` entry,
    * is shorter than seven characters or becomes empty after URL removal,
    * starts with a character that is neither alphanumeric nor whitespace,
    * ends with something other than an alphanumeric, ``.``, ``?`` or space,
    * consists only of digits,
    * has fewer than two spaces, or
    * scores above 0.4 in the weighted ``decide`` heuristic.

    Args:
        text: a single line of extracted page text.

    Returns:
        The cleaned line, or an empty string when the line is rejected.
    """
    text = destroy_citations(text.replace("\r", "\n"))
    stripped = text.lower().strip("\n").strip("\t").strip(" ").strip("\r")
    if stripped in bad_phrases:
        return ""
    if any(phrase in stripped for phrase in bad_subphrases):
        return ""
    for fragment in web:
        text = text.replace(fragment, "")
    # `stripped` was computed before the URL fragments were removed, so
    # `text` can be empty here even though `stripped` is long enough.
    if len(stripped) < 7 or not text:
        return ""
    first = text[0]
    # The tab case was previously spelled "/t", which a single character can
    # never equal, so tab-indented lines were always rejected.
    if not first.isalnum() and first not in (" ", "\t", "\n"):
        return ""
    last = stripped[-1]
    if not last.isalnum() and last not in (".", "?", " "):
        return ""
    if stripped.isdigit():
        return ""
    # Every factor is a penalty in (0, 1]: higher means less prose-like.
    missing_period = 0 if last == "." else 1
    shortness = 1 / (len(stripped) - 6)  # len(stripped) >= 7 is guaranteed above
    sparseness = 1 / (stripped.count(" ") + 1)
    if sparseness > 1 / 3:
        # Fewer than two spaces in the line.
        return ""
    factors = [(missing_period, 2), (shortness, 1), (sparseness, 3)]
    if decide(factors, 0.4):
        return ""
    return text
|
||
|
||
def extract(item):
    """Return the text of *item* with every newline replaced by a space.

    An object with ``children`` is flattened recursively, skipping children
    whose ``name`` is listed in ``exclude``. An object with ``get_text`` but
    no ``children`` contributes ``get_text()``. Anything else is used as-is
    and must therefore support ``replace``.
    """
    if hasattr(item, "children"):
        pieces = [
            extract(child)
            for child in item.children
            if not (hasattr(child, "name") and child.name in exclude)
        ]
        flat = "".join(pieces)
    elif hasattr(item, "get_text"):
        flat = item.get_text()
    else:
        flat = item
    return flat.replace("\n", " ")
|
||
|
||
def check_spaces(text):
    """Normalise spacing in *text*.

    Runs of spaces shrink to one space, spaces touching a newline are
    removed, and runs of three or more CR/LF characters shrink to a single
    newline.
    """
    # NOTE(review): the trailing "]" makes this pattern match only
    # whitespace runs followed by a literal "]". It looks like a typo, but
    # dropping it would also collapse newlines, so it is kept as written.
    text = re.sub("[\\s \\t\\n]{2,}]", " ", text)
    text = re.sub("[ ]{2,}", " ", text)
    text = text.replace("\n ", "\n")
    text = text.replace(" \n", "\n")
    return re.sub("[\\r\\n]{3,}", "\n", text)
|
||
|
||
def destroy_citations(text):
    """Remove citation-like fragments from *text*.

    The module-level patterns are applied in the order F, E, D, C, B, A.
    Matches of C are deleted; matches of the others become a single space.
    """
    substitutions = (
        (F, " "),
        (E, " "),
        (D, " "),
        (C, ""),
        (B, " "),
        (A, " "),
    )
    for pattern, filler in substitutions:
        text = pattern.sub(filler, text)
    return text
|
||
|
||
def get_text(soup):
    """Return the filtered, whitespace-normalised text found under ``soup.html``.

    Nodes are gathered with ``find_good`` and flattened with ``extract``, one
    per line, stopping at the first ``None`` marker. The joined text then
    goes through ``check_spaces``, ``check`` and ``check_spaces`` again.

    Args:
        soup: parsed document exposing an ``html`` attribute.

    Returns:
        The cleaned text as one string.
    """
    root = soup.html
    wiki_mode = "wikipedia.org" in str(root)
    goods = []
    find_good(root, goods, wiki_mode)

    chunks = []
    for node in goods:
        if node is None:
            # find_good appends None when it hits a bad header.
            break
        extracted = extract(node)
        if extracted is not None:
            chunks.append(extracted + "\n")

    return check_spaces(check(check_spaces("".join(chunks))))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
import os | ||
|
||
from main import get_text_from_url | ||
|
||
|
||
def generate_file_from_url(url: str) -> bool:
    """Download *url*, extract its text and save it under ``./generated/``.

    Args:
        url: address of the page to fetch.

    Returns:
        ``False`` when fewer than 200 characters were extracted (nothing is
        written); ``True`` after the text has been written to the next free
        ``file_<n>.txt``.
    """
    text = get_text_from_url(url)
    if len(text) < 200:
        return False
    # The context manager closes the file even if the write fails.
    with open(find_free_filename("./generated/"), "w", encoding="utf-8") as out:
        out.write(text)
    return True
|
||
|
||
def find_free_filename(directory, iteration=0):
    """Return the first ``<directory>file_<n>.txt`` path that is not a file.

    Args:
        directory: path prefix, joined by plain concatenation, so it must
            already end with a path separator.
        iteration: first index to try.

    Returns:
        The candidate path with the lowest index ``>= iteration`` for which
        ``os.path.isfile`` is false.
    """
    # Iterative on purpose: one recursive call per existing file would hit
    # the interpreter's recursion limit once enough files exist.
    while True:
        candidate = directory + "file_" + str(iteration) + ".txt"
        if not os.path.isfile(candidate):
            return candidate
        iteration += 1
|
||
|
||
def get_apnotes_url(index: int):
    """Return the apnotes.net 12th-edition notes URL for chapter *index*."""
    template = "https://apnotes.net/notes-12e/ch%d-12e.html"
    return template % index
|
||
|
||
def generate_apnotes_files(count: int):
    """Print and download the apnotes pages for chapters 1 through *count*."""
    chapters = range(1, count + 1)
    # map() is lazy, so each URL is built, printed and fetched in turn.
    for url in map(get_apnotes_url, chapters):
        print(url)
        generate_file_from_url(url)
|
||
|
||
def generate_wiki_files(count: int):
    """Save random Wikipedia articles until *count* files have been written.

    Attempts for which ``generate_file_from_url`` returns ``False`` are not
    counted, so the loop runs until enough attempts succeed.
    """
    random_article = "https://en.wikipedia.org/wiki/Special:Random"
    remaining = count
    while remaining > 0:
        if generate_file_from_url(random_article):
            remaining -= 1
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
from bs4 import BeautifulSoup | ||
import urllib.request | ||
from urllib.request import Request | ||
import functions | ||
|
||
|
||
def getInp(url):
    """Fetch *url* and return its body as text.

    The request carries a desktop-browser ``User-Agent`` header. The body is
    decoded as UTF-8 with undecodable bytes dropped, stripped, and has
    literal backslash escapes rewritten: ``\\n`` becomes a newline, ``\\'``
    becomes ``'``, and ``\\r`` / ``\\t`` become a space.

    Args:
        url: address to download.

    Returns:
        The decoded, unescaped page source.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for an unreachable or
        failing URL; nothing is caught here.
    """
    req = Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        raw = response.read()
    html = raw.decode('utf-8', errors='ignore').strip()
    # The former .replace("\'", "'") step replaced "'" with itself and is
    # dropped; the remaining replacements are unchanged.
    html = html.replace("\\n", '\n').replace("\\'", "'").replace("\\r", " ").replace("\\t", " ")
    return html
|
||
|
||
def get_text_from_url(url):
    """Download *url*, parse it with ``html.parser`` and return its cleaned text."""
    return functions.get_text(BeautifulSoup(getInp(url), "html.parser"))