Initial commit
dlike230 committed Jan 21, 2019
0 parents commit 1eb2a72
Showing 5 changed files with 211 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default.

148 changes: 148 additions & 0 deletions functions.py
@@ -0,0 +1,148 @@
import re

# URL fragments that get stripped out of extracted text
web = ["www.", "://", ".com", ".net", ".org", ".us", ".gov"]
# tags whose contents are treated as article text
accepted_tags = ["p", "span", "article", "font", "blockquote"]
# tags whose subtrees are skipped entirely during extraction
exclude = ["cite"]
# class names that also mark a node as article text
accepted_classes = {"paragraph", "text"}
# lines equal to one of these (after lowercasing) are dropped
bad_phrases = ["back to top", "home", "welcome", "you are here:", "itunes", "google", "facebook", "twitter", "comment"]
# lines containing any of these substrings are dropped
bad_subphrases = ["powered by", "around the web", "around the internet", "et al", "ndl", "view source", "view history",
                  "edit links", "last modified", "text is available under", "creative commons"]
# section headers that mark the start of references/footer material
bad_headers = ["References", "Citations", "Further Reading", "External Links", "Footnotes", "See Also"]


def is_lowercase(x):
    # predicate used to keep only class/id strings that are already all-lowercase
    return x == x.lower()

# patterns removed by destroy_citations:
A = re.compile("[a-zA-Z]{2,}[0-9]{2,}[ \\.]*")  # word glued to digits, e.g. "fig12."
B = re.compile("([0-9]+[a-zA-Z]+)+[\\s\\.]+")  # digits glued to a word, e.g. "12th "
C = re.compile("[\\[\\{].*[\\]\\}]")  # bracketed citations, e.g. "[12]"
D = re.compile("[A-Z]{2,3}: {0,2}[0-9]{3,}.{0,2}[0-9]*")  # labeled numbers, e.g. "DOI: 12345"
E = re.compile("\\([a-zA-Z\\s]+ ([0-9]+[.]*)+\\)")  # author-year parentheticals, e.g. "(Smith 2004)"
F = re.compile("(\\\\[a-zA-Z0-9]{1,5})")  # stray backslash escape sequences
def add_item(goods, parent):
    # None acts as a sentinel: get_text stops collecting when it sees it
    goods.append(parent)


def find_good(parent, goods, wiki_mode):
    # Recursively walk the parse tree, collecting nodes likely to hold article text.
    if parent is None:
        return
    if parent.__class__.__name__ not in ("NavigableString", "Comment"):
        if hasattr(parent, "name"):
            if parent.name in accepted_tags:
                add_item(goods, parent)
            else:
                # keep only class names that are already lowercase; ids are
                # parsed the same way but not currently used
                classes_proto = parent.get("class")
                classes = set() if classes_proto is None else set(filter(is_lowercase, classes_proto))
                ids_proto = parent.get("id")
                # ids = set() if ids_proto is None else set(filter(is_lowercase, ids_proto))
                if classes & accepted_classes:
                    # a whitelisted class also marks the node as article text
                    add_item(goods, parent)
                elif hasattr(parent, "children"):
                    for item in parent.children:
                        if hasattr(item, "get_text") and not (item.name == "a" or item.parent.name == "a"):
                            t = item.get_text().strip()
                            if t in bad_headers:
                                # hit a references/footer header: append the None
                                # sentinel and abandon the rest of this subtree
                                add_item(goods, None)
                                return False
                        # searches through the child's child nodes
                        find_good(item, goods, wiki_mode)
    elif parent.__class__.__name__ == "NavigableString":
        add_item(goods, parent)


def decide(factors, threshold):
    # weighted average of (value, weight) pairs; True means "reject this line"
    totalWeight = 0
    totalValue = 0
    for value, weight in factors:
        totalValue += value * weight
        totalWeight += weight
    adjusted = totalValue / totalWeight
    return adjusted > threshold
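
# Worked example (illustrative): decide([(1, 2), (0.5, 1)], 0.4) averages to
# (1*2 + 0.5*1) / (2 + 1) ~= 0.83, which exceeds the 0.4 threshold -> True.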


def check(text):
    # filter each line independently, rebuilding the text as we go
    texts = text.split("\n")
    result = ""
    for item in texts:
        result += checkIndividual(item) + "\n"
    return result


def checkIndividual(text):
    # heuristically decide whether a single line is real prose; return "" to drop it
    text = destroy_citations(text.replace("\r", "\n"))
    stripped = text.lower().strip()
    if stripped in bad_phrases:
        return ""
    for item in bad_subphrases:
        if item in stripped:
            return ""
    for item in web:
        if item in text:
            text = text.replace(item, "")
    if len(stripped) < 7 or not text:
        return ""
    # drop lines that open with punctuation
    if not text[0].isalnum() and text[0] not in (" ", "\t", "\n"):
        return ""
    # drop lines that end in anything other than a letter, digit, period,
    # question mark, or space
    lastchr = stripped[-1]
    if not lastchr.isalnum() and lastchr not in (".", "?", " "):
        return ""
    if stripped.isdigit():
        return ""
    # weighted heuristics: no terminal period, short length, and few spaces
    # all push a line toward rejection
    endsWithPunc = 0 if stripped[-1] == "." else 1
    length = 1 / (len(stripped) - 6)
    numSpaces = 1 / (stripped.count(" ") + 1)
    if numSpaces > 1 / 3:
        return ""
    factors = [(endsWithPunc, 2), (length, 1), (numSpaces, 3)]
    if decide(factors, 0.4):
        return ""
    return text
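
# Illustrative: checkIndividual("Home") returns "" (it matches bad_phrases and
# is shorter than seven characters), while an ordinary sentence ending in a
# period passes through unchanged.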


def extract(item):
    # pull the text out of a node, skipping excluded subtrees (e.g. <cite>)
    result = ""
    if hasattr(item, "children"):
        for text in item.children:
            if not hasattr(text, "name") or text.name not in exclude:
                result += extract(text)
    elif hasattr(item, "get_text"):
        result = item.get_text()
    else:
        result = item
    return result.replace("\n", " ")


def check_spaces(text):
    # collapse runs of horizontal whitespace but keep newlines, which the
    # line-based filtering in check() relies on
    text = re.compile("[ \\t]{2,}").sub(" ", text)
    text = text.replace("\n ", "\n").replace(" \n", "\n")
    text = re.compile("[\\r\\n]{3,}").sub("\n", text)
    return text


def destroy_citations(text):
    # apply the citation/markup patterns innermost-first: F, E, D, C, B, A
    return A.sub(" ", B.sub(" ", C.sub("", D.sub(" ", E.sub(" ", F.sub(" ", text))))))
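
# Illustrative (invented input): destroy_citations("Alpha decay [12] was
# measured (Smith 2004) in fig12. detail") leaves
# "Alpha decay  was measured   in  detail" -- the bracketed citation, the
# author-year parenthetical, and the word-digit marker are all removed, and
# check_spaces later collapses the leftover runs of spaces.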


def get_text(soup):
    # entry point: walk the soup, collect candidate nodes, then filter the text
    soup = soup.html
    wiki_mode = "wikipedia.org" in str(soup)
    goods = []
    find_good(soup, goods, wiki_mode)
    text = ""
    for item in goods:
        if item is None:
            # None is the sentinel appended when a references/footer header
            # was reached; everything after it is discarded
            break
        extraction = extract(item)
        if extraction is not None:
            text += extraction + "\n"
    text = check_spaces(text)
    text = check(text)
    text = check_spaces(text)
    # return text.split(BAD_STUFF)[0]
    return text
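
A minimal usage sketch, not part of the commit: it feeds a hand-written HTML snippet through get_text, assuming BeautifulSoup 4 is installed. The snippet and the expected behavior are illustrative.

from bs4 import BeautifulSoup

import functions

html = ("<html><body>"
        "<p>Alpha particles were characterized by Rutherford [12].</p>"
        "<span>Back to top</span>"
        "<p>Radioactive decay follows an exponential law over many half-lives.</p>"
        "</body></html>")

soup = BeautifulSoup(html, "html.parser")
print(functions.get_text(soup))
# destroy_citations strips the bracketed "[12]", and checkIndividual drops
# the "Back to top" navigation text, so only the two sentences survive.
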
38 changes: 38 additions & 0 deletions generator.py
@@ -0,0 +1,38 @@
import os

from main import get_text_from_url


def generate_file_from_url(url: str):
    # save the extracted text, skipping pages that yield fewer than 200 characters
    text = get_text_from_url(url)
    if len(text) < 200:
        return False
    with open(find_free_filename("./generated/"), "w", encoding="utf-8") as opened_file:
        opened_file.write(text)
    return True


def find_free_filename(directory, iteration=0):
    # find the first unused file_<n>.txt name; the directory itself must
    # already exist
    desired_name = directory + "file_" + str(iteration) + ".txt"
    if os.path.isfile(desired_name):
        return find_free_filename(directory, iteration=iteration + 1)
    return desired_name


def get_apnotes_url(index: int):
    # chapter URLs on apnotes.net follow a ch<N>-12e.html pattern
    return "https://apnotes.net/notes-12e/ch%d-12e.html" % index


def generate_apnotes_files(count: int):
    # chapters are numbered from 1
    for i in range(1, count + 1):
        url = get_apnotes_url(i)
        print(url)
        generate_file_from_url(url)


def generate_wiki_files(count: int):
    # keep fetching random articles until `count` of them pass the length filter
    generated = 0
    while generated < count:
        if generate_file_from_url("https://en.wikipedia.org/wiki/Special:Random"):
            generated += 1
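
A possible driver, not in the commit, assuming this module layout. find_free_filename expects the ./generated/ directory to already exist, so the sketch creates it first.

import os

from generator import generate_apnotes_files, generate_wiki_files

os.makedirs("./generated", exist_ok=True)  # the module assumes this directory exists
generate_apnotes_files(3)  # chapters 1 through 3 from apnotes.net
generate_wiki_files(5)  # five random Wikipedia articles that pass the length filter
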
Empty file added grab_links.py
Empty file.
19 changes: 19 additions & 0 deletions main.py
@@ -0,0 +1,19 @@
from bs4 import BeautifulSoup
import urllib.request
from urllib.request import Request
import functions


def getInp(url):
    # fetch the raw HTML, spoofing a desktop browser user agent
    req = Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36')
    response = urllib.request.urlopen(req).read()
    html = response.decode('utf-8', errors='ignore').strip()
    # unescape literal backslash sequences that sometimes appear in page source
    html = html.replace("\\n", '\n').replace("\\'", "'").replace("\\r", " ").replace("\\t", " ")
    return html


def get_text_from_url(url):
    # download a page and run it through the extraction pipeline in functions.py
    htext = getInp(url)
    soup = BeautifulSoup(htext, "html.parser")
    return functions.get_text(soup)
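
A quick end-to-end run, illustrative only (the URL is an example, and the call requires network access):

from main import get_text_from_url

text = get_text_from_url("https://en.wikipedia.org/wiki/Alpha_particle")
print(text[:500])  # first 500 characters of the filtered article text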
