Commit cd64363: Configurable output path

schulzetenberg committed Jan 6, 2023
1 parent 02c5da7 commit cd64363

Showing 5 changed files with 63 additions and 55 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -1,4 +1,4 @@
-name: release-test
+name: release

on:
  # push
2 changes: 1 addition & 1 deletion Dockerfile
@@ -10,4 +10,4 @@ ENV QTWEBENGINE_CHROMIUM_FLAGS="--no-sandbox"
RUN apt-get update && apt-get install -y python3-pip && apt-get install -y calibre
COPY requirements.txt generate-ebooks.py Cover.png ./
RUN pip install --no-cache-dir -r requirements.txt
-CMD ["python3", "./generate-ebooks.py"]
+CMD ["python3", "./generate-ebooks.py", "Ebooks"]
2 changes: 2 additions & 0 deletions Ebooks/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
2 changes: 1 addition & 1 deletion README.md
@@ -28,7 +28,7 @@ This project depends on:

- **Calibre**. You will need to install this [manually](https://calibre-ebook.com/download) or via package manager. If this is not installed on your computer the script should still generate an HTML copy of the MMM blog but will not update the ebooks included in the repo.

-In the repo root run ```pip3 install -r requirements.txt``` to install python dependencies via pip, then run **generate-ebooks.py** in the repo root. When the script completes pdf, mobi, epub, and azw3 ebooks in the Ebooks dir will be updated with the latest posts. If you would like to generate ebooks yourself in Calibre you can import the file ```import_index.html_in_this_folder_in_calibre_to_create_ebook/index.html```, which will let you convert it to the format of your choice. Note: You will want to set Calibre to import HTML files in breadth-first order by going to Preferences → Advanced → Plugins → File type → HTML to ZIP and checking **Add linked files in breadth first order**.
+In the repo root run ```pip3 install -r requirements.txt``` to install python dependencies via pip, then run `generate-ebooks.py Ebooks` in the repo root. When the script completes pdf, mobi, epub, and azw3 ebooks in the Ebooks dir will be updated with the latest posts. If you would like to generate ebooks yourself in Calibre you can import the file ```import_index.html_in_this_folder_in_calibre_to_create_ebook/index.html```, which will let you convert it to the format of your choice. Note: You will want to set Calibre to import HTML files in breadth-first order by going to Preferences → Advanced → Plugins → File type → HTML to ZIP and checking **Add linked files in breadth first order**.

## MMM Approved!
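The README notes that the script degrades gracefully when Calibre is missing; that check can be sketched up front. This is a hedged sketch only: the actual detection logic in generate-ebooks.py is not shown in this diff, so `shutil.which` here is an assumption, not the script's method.

```python
# Sketch: detect Calibre's CLI converter before attempting ebook generation.
# shutil.which() is an assumed approach; the script's real check is not in this diff.
import shutil

if shutil.which("ebook-convert") is None:
    print("Calibre not found: HTML output only, ebooks will not be updated")
else:
    print("Calibre found, ebooks will be regenerated")
```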
110 changes: 58 additions & 52 deletions generate-ebooks.py
@@ -32,62 +32,62 @@
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Book data (use data here to construct ebook
-BOOK_DATA = os.path.join(os.path.dirname(__file__),
+BOOK_DATA = os.path.join(os.path.dirname(__file__),
                         "import_index.html_in_this_folder_in_calibre_to_create_ebook")
MEDIA = os.path.join(BOOK_DATA, "media")

class RSSParser(object):
    """Downloads (or reads from local file cache) RSS data of MMM feed"""
-    def __init__(self, url, pageNo=None):
+
+    def __init__(self, url, pageNo=None):
        self.url = url  # Confusing design - URL doubles as an actual URL or a cached local file
        self.pageNo = pageNo

        url = "file://" + self.url if Path(self.url).exists() else self.url
        print("Trying to open and parse RSS feed @ <" + url + ">...")
        doc = ET.parse(urlopen(url))
        self.root = doc.getroot()

-        # Cache the page
+        # Cache the page
        if self.pageNo is not None:
-            self.url = os.path.join(CACHED_DATA, CACHED_RSS_PREFIX +
+            self.url = os.path.join(CACHED_DATA, CACHED_RSS_PREFIX +
                                    CACHED_RSS_PAGENO % (self.pageNo, ) + CACHED_RSS_SUFFIX)
            ET.ElementTree(self.root).write(open(self.url, "wb"))
-    def parse(self):
-        """Extract useful data from the RSS posting"""
+
+    def parse(self):
+        """Extract useful data from the RSS posting"""
        for item in self.root.find('channel').findall('item'):
            title = item.find('title').text
-            url = item.find('link').text
+            url = item.find('link').text
            text = item.find('.//content:encoded', namespaces=self.root.nsmap).text
            date = item.find('pubDate').text
            author = item.find('.//dc:creator', namespaces=self.root.nsmap).text

            yield (
-                title.encode('utf-8'),
-                text.encode('utf-8'),
+                title.encode('utf-8'),
+                text.encode('utf-8'),
                url.encode('utf-8'),
                date.encode('utf-8'),
                author.encode('utf-8'))

def getCachedPostings():
    """Get a list of all the cached RSS data on disk"""
-    filePaths = glob.glob(os.path.join(CACHED_DATA,
+    filePaths = glob.glob(os.path.join(CACHED_DATA,
                          CACHED_RSS_PREFIX + '*' + CACHED_RSS_SUFFIX))
    filePaths.sort()
    return filePaths

def getLastPostPageNo():
    """Get the last RSS page number downloaded"""
    downloadedPages = getCachedPostings()
    if not downloadedPages or not len(downloadedPages):
        return 1  # Pages start at 1
    downloadedPages.sort()
    lastPage = downloadedPages[-1]
-    return int(re.findall(os.path.join(CACHED_DATA, CACHED_RSS_PREFIX
+
+    return int(re.findall(os.path.join(CACHED_DATA, CACHED_RSS_PREFIX
                          + r'(\d+)' + CACHED_RSS_SUFFIX), lastPage)[0])


def getLatestRssDataFromMMM():
    """Download newest RSS pages - always redownloads last page as it may
@@ -97,36 +97,36 @@ def getLatestRssDataFromMMM():

    parsers = []
    pageNo = getLastPostPageNo()

    print("Downloading pages %d and newer" % (pageNo, ))

    while True:
        try:
            print(MMM_RSS_URL)
-            parser = RSSParser(MMM_RSS_URL % (pageNo), pageNo)
+            parser = RSSParser(MMM_RSS_URL % (pageNo), pageNo)
            parsers.append(parser)
            pageNo += 1
        except IOError as e:
            print(f'Failed to open last (end of detected RSS pages), error: {e}')
            break

    return parsers


def getRssData():
-    """Gets a list of all RSS data from cache and downloads"""
+    """Gets a list of all RSS data from cache and downloads"""
    parsers = []

    print("Parsing cached pages from disk")

    # First parse our cached pages
    for cachedPageFilePath in getCachedPostings():
        parsers.append(RSSParser(cachedPageFilePath))  # No page number necessary since cached

    parsers.extend(getLatestRssDataFromMMM())

    return parsers

class Post(object):
    """Once we have the RSS data and have started parsing it, we can break
    it down into posts"""
@@ -137,7 +137,7 @@ def __init__(self, title, text, date, author, num=None):
        self.text = text
        self.date = date
        self.author = author
-
+
        if num is None:
            num = Post.next
            Post.next = Post.next + 1
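The hunk above auto-numbers posts with a class-level counter. A toy illustration of that pattern, simplified to drop the other constructor fields:

```python
# Toy version of the class-level counter used by Post above.
class Post:
    next = 0  # shared across all instances

    def __init__(self, num=None):
        if num is None:
            num = Post.next
            Post.next = Post.next + 1
        self.num = num

first, second = Post(), Post()
print(first.num, second.num)  # 0 1
```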
@@ -147,46 +147,46 @@ def createPostingsFromParsedRss(parsers):
    """Create a list of all the posts from the RSS data"""
    postsInOrder = []
    posts = {}

    for parser in parsers:
        for (title, text, url, date, author) in parser.parse():
            postsInOrder.append(url)
-            posts[url] = Post(title, text, date, author)
+            posts[url] = Post(title, text, date, author)

    return (posts, postsInOrder)


def getCachedUrlMaps():
    if not os.path.isdir(CACHED_DATA):
        os.mkdir(CACHED_DATA)

    if not os.path.isfile(CACHED_URL_MAP):
        return ({}, {})

    remoteToLocal, localToRemote = pickle.load(open(CACHED_URL_MAP, 'rb'))

    return (remoteToLocal, localToRemote)

def saveUrlMaps(remoteToLocal, localToRemote):
    if not os.path.isdir(CACHED_DATA):
        os.mkdir(CACHED_DATA)

    pickle.dump((remoteToLocal, localToRemote), open(CACHED_URL_MAP, 'wb'))

def rewritePostLinks(posts, postsInOrder):
    """We do this once we have all the posts since sometimes MMM goes back
    and edits earlier posts to include a link to a later posting"""

    print("Rewriting post links...")

    for url in postsInOrder:
        post = posts[url]
        text = post.text if isinstance(post.text, str) else post.text.decode('utf-8')

        for url2 in postsInOrder:
            regex = re.compile('<a\\s(.*href=")%s(".*)>(.*)</a>' % url2)
            post.text = regex.sub('<a \\1' + posts[url2].localUrl + '\\2>\\3</a>', text)

def rewriteImageLinks(posts):
    print("Rewriting image links...")

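The rewritePostLinks() hunk above swaps each remote post URL for its local filename inside anchor tags. A toy demonstration with illustrative values (the URL and local filename below are made up, and `re.escape` is added here for safety where the original interpolates the URL unescaped):

```python
# Toy demonstration of the anchor-rewriting regex in rewritePostLinks() above.
# The URL and local filename are illustrative, not taken from the repo.
import re

url2 = "https://www.mrmoneymustache.com/example-post/"
localUrl = "post0001.html"
text = '<p>See <a class="x" href="https://www.mrmoneymustache.com/example-post/">this post</a>.</p>'

regex = re.compile('<a\\s(.*href=")%s(".*)>(.*)</a>' % re.escape(url2))
print(regex.sub('<a \\1' + localUrl + '\\2>\\3</a>', text))
# -> <p>See <a class="x" href="post0001.html">this post</a>.</p>
```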
@@ -236,14 +236,14 @@ def rewriteImageLinks(posts):
        text = re.sub(r'srcset=".*"', "", text)
        text = text.replace(imageurl, outputImageRelativePath)
        post.text = text
-
+
def createBookData(posts, postsInOrder):
    print("Creating book data...")

    shutil.copyfile(COVER_PATH, os.path.join(BOOK_DATA, 'Cover.png'))

    index = open(os.path.join(BOOK_DATA, 'index.html'), 'w')

    index.write(f'''<!DOCTYPE html>
<html lang="en">
<head>
@@ -261,7 +261,7 @@ def createBookData(posts, postsInOrder):
    for url in postsInOrder:
        post = posts[url]
        text = post.text if isinstance(post.text, str) else post.text.decode('utf-8')
-
+
        open(os.path.join(BOOK_DATA, post.localUrl), 'w').write(
            '<!DOCTYPE html>\n' + \
            '<html lang="en">\n' + \
@@ -278,18 +278,24 @@ def createBookData(posts, postsInOrder):
            '</html>')
        chapter += 1
        index.write(f'{chapter}. <a href=%s>%s</a><br/>\n' % (post.localUrl, post.title.decode('utf-8')))

    index.write(''' </p>
</body>
</html>''')

def generateEbooks():
    print("Generating eBooks...")
+    outputDir = ""
+
+    if len(sys.argv) > 1:
+        outputDir = sys.argv[1] + "/"

-    subprocess.run(["ebook-convert", "import_index.html_in_this_folder_in_calibre_to_create_ebook/index.html", "Ebooks/mmm.azw3", "--title", "Financial Freedom Through Badassity", "--authors", "Mr. Money Mustache", "--pubdate", f"{date.today()}", "--cover", "Cover.png"])
-    subprocess.run(["ebook-convert", "import_index.html_in_this_folder_in_calibre_to_create_ebook/index.html", "Ebooks/mmm.epub", "--title", "Financial Freedom Through Badassity", "--authors", "Mr. Money Mustache", "--pubdate", f"{date.today()}", "--cover", "Cover.png"])
-    subprocess.run(["ebook-convert", "import_index.html_in_this_folder_in_calibre_to_create_ebook/index.html", "Ebooks/mmm.mobi", "--title", "Financial Freedom Through Badassity", "--authors", "Mr. Money Mustache", "--pubdate", f"{date.today()}", "--cover", "Cover.png"])
-    subprocess.run(["ebook-convert", "import_index.html_in_this_folder_in_calibre_to_create_ebook/index.html", "Ebooks/mmm.pdf", "--title", "Financial Freedom Through Badassity", "--authors", "Mr. Money Mustache", "--cover", "Cover.png"])
+    print("Output directory: " + outputDir)
+
+    subprocess.run(["ebook-convert", "import_index.html_in_this_folder_in_calibre_to_create_ebook/index.html", outputDir + "mmm.azw3", "--title", "Financial Freedom Through Badassity", "--authors", "Mr. Money Mustache", "--pubdate", f"{date.today()}", "--cover", "Cover.png"])
+    subprocess.run(["ebook-convert", "import_index.html_in_this_folder_in_calibre_to_create_ebook/index.html", outputDir + "mmm.epub", "--title", "Financial Freedom Through Badassity", "--authors", "Mr. Money Mustache", "--pubdate", f"{date.today()}", "--cover", "Cover.png"])
+    subprocess.run(["ebook-convert", "import_index.html_in_this_folder_in_calibre_to_create_ebook/index.html", outputDir + "mmm.mobi", "--title", "Financial Freedom Through Badassity", "--authors", "Mr. Money Mustache", "--pubdate", f"{date.today()}", "--cover", "Cover.png"])
+    subprocess.run(["ebook-convert", "import_index.html_in_this_folder_in_calibre_to_create_ebook/index.html", outputDir + "mmm.pdf", "--title", "Financial Freedom Through Badassity", "--authors", "Mr. Money Mustache", "--cover", "Cover.png"])

    print("Finished generating Ebooks")
@@ -306,6 +312,6 @@ def main():
    rewriteImageLinks(posts)
    createBookData(posts, postsInOrder)
    generateEbooks()
-
+
if __name__=="__main__":
    main()

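The four ebook-convert invocations in the new generateEbooks() differ only in output format. A sketch of the same calls driven by a loop, behaviour-preserving under the flags shown in the diff (including the omission of --pubdate for the PDF build):

```python
# Sketch: the four conversions from generateEbooks() above, expressed as a loop.
import subprocess
import sys
from datetime import date

INDEX = "import_index.html_in_this_folder_in_calibre_to_create_ebook/index.html"
COMMON = ["--title", "Financial Freedom Through Badassity",
          "--authors", "Mr. Money Mustache", "--cover", "Cover.png"]

outputDir = sys.argv[1] + "/" if len(sys.argv) > 1 else ""

for fmt in ("azw3", "epub", "mobi", "pdf"):
    args = ["ebook-convert", INDEX, outputDir + "mmm." + fmt] + COMMON
    if fmt != "pdf":  # the diff passes --pubdate for every format except pdf
        args += ["--pubdate", f"{date.today()}"]
    subprocess.run(args)
```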