import logging
import os
import praw
import re
import random
import requests
import sqlite3
import time
import traceback
import warnings

from bs4 import BeautifulSoup
from html import unescape
from urllib.parse import urlencode
from praw.exceptions import APIException, ClientException, PRAWException
from prawcore.exceptions import PrawcoreException
from requests.exceptions import ConnectionError, HTTPError

USER_AGENT = "Archives to archive.is and archive.org (/r/SnapshillBot) v1.4"
INFO = "/r/SnapshillBot"
CONTACT = r"/message/compose?to=\/r\/SnapshillBot"
ARCHIVE_ORG_FORMAT = "%Y%m%d%H%M%S"
MEGALODON_JP_FORMAT = "%Y-%m%d-%H%M-%S"
DB_FILE = os.environ.get("DATABASE", "snapshill.sqlite3")
LEN_MAX = 35
REDDIT_API_WAIT = 2
WARN_TIME = 300  # warn after spending 5 minutes on a post

REDDIT_PATTERN = re.compile(
    r"https?://(([A-Za-z]{2})(-[A-Za-z]{2})?|beta|i|m|pay|ssl|www|old|new|alpha)\.?reddit\.com"
)
SUBREDDIT_OR_USER = re.compile(r"/(u|user|r)/[^/]+/?$")
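# REDDIT_PATTERN matches reddit domains such as "https://www.reddit.com",
# "http://old.reddit.com" and language-code subdomains like
# "https://de.reddit.com"; SUBREDDIT_OR_USER matches bare subreddit/user
# paths such as "/r/example/" or "/u/example".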
# We have to do some manual ratelimiting because we are tunnelling through
# some other websites.

RECOVERABLE_EXC = (
    APIException,
    ClientException,
    PRAWException,
    PrawcoreException,
    ConnectionError,
)

loglevel = logging.DEBUG if os.environ.get("DEBUG") == "true" else logging.INFO
TESTING = os.environ.get("TEST") == "true"
logging.basicConfig(level=loglevel, format="[%(asctime)s] [%(levelname)s] %(message)s")
log = logging.getLogger("snapshill")
logging.getLogger("requests").setLevel(loglevel)
warnings.simplefilter("ignore") # Ignore ResourceWarnings (because screw them)
def get_footer():
    return (
        "\n\n*I am just a simple bot, __not__ a moderator of this subreddit* | "
        "[*bot subreddit*]({info}) | [*contact the maintainers*]({contact})"
    ).format(info=INFO, contact=CONTACT)

def should_notify(submission):
    """
    Checks whether we have seen this link before.

    :param submission: Submission to check.
    :return: Whether we should comment on it.
    """
    cur.execute("SELECT * FROM links WHERE id=?", (submission.name,))
    return not cur.fetchone()

def ratelimit(url):
    if not REDDIT_PATTERN.search(url):
        return
    time.sleep(REDDIT_API_WAIT)

def fix_url(url):
    """
    Rewrites language-code links, mobile links, beta links, SSL links and
    bare username/subreddit mentions to old.reddit.com URLs.

    :param url: URL to change.
    :return: The fixed URL.
    """
    if url.startswith("r/") or url.startswith("u/"):
        url = "http://old.reddit.com/" + url
    if url.startswith("/r/") or url.startswith("/u/"):
        url = "http://old.reddit.com" + url
    return re.sub(REDDIT_PATTERN, "http://old.reddit.com", url)
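# For example:
#   fix_url("https://m.reddit.com/r/example")  -> "http://old.reddit.com/r/example"
#   fix_url("/r/example")                      -> "http://old.reddit.com/r/example"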
def skip_url(url):
    """
    Skip naked username mentions and bare subreddit links.
    """
    return REDDIT_PATTERN.match(url) and SUBREDDIT_OR_USER.search(url)
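# e.g. skip_url("http://old.reddit.com/r/example/") is truthy (a bare
# subreddit link), while a link to a specific comment thread is not.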
def log_error(e):
    log.error("Unexpected {}:\n{}".format(e.__class__.__name__, traceback.format_exc()))

class NameMixin:
    site_name = None

    @property
    def name(self):
        if self.archived:
            return self.site_name
        else:
            return r"_{}\*_".format(self.site_name)
class ArchiveIsArchive(NameMixin):
    site_name = "archive.today"

    def __init__(self, url):
        self.url = url
        self.archived = self.archive()
        pairs = {"url": self.url, "run": 1}
        self.error_link = "https://archive.today/?" + urlencode(pairs)

    def archive(self):
        """
        Archives to archive.today. The site returns a 200, so we have to find
        the JavaScript redirect through a regex in the response text.

        :return: URL of the archive, or False if an error occurred.
        """
        pairs = {"url": self.url}
        try:
            res = requests.post("https://archive.today/submit/", pairs, verify=False)
        except RECOVERABLE_EXC:
            return False

        # Note: findall returns a list of tuples: [(url, tld)].
        found = re.findall(
            r"(https?://archive\.(fo|vn|today|is|li|md|ph)/[0-9A-Za-z]{1,6})", res.text
        )
        if not found:
            return False
        return found[0][0]
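# The snapshot URL appears in the response's JavaScript redirect; the regex
# above extracts links such as "https://archive.today/AbC12" (illustrative)
# across the service's mirror TLDs.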
class ArchiveOrgArchive(NameMixin):
    site_name = "archive.org"

    def __init__(self, url):
        self.url = url
        self.archived = self.archive()
        self.error_link = "https://web.archive.org/save/" + self.url

    def archive(self):
        """
        Archives to archive.org. The website gives a 403 Forbidden when the
        archive cannot be generated (because it follows robots.txt rules).

        :return: URL of the archive, False if an error occurred, or None if
                 we cannot archive this page.
        """
        try:
            res = requests.get("https://web.archive.org/save/" + self.url)
            # raise_for_status() turns a 4xx/5xx response into an HTTPError,
            # which the bare GET never raises on its own.
            res.raise_for_status()
        except HTTPError as e:
            if e.response is not None and e.response.status_code == 403:
                return None
            return False
        except RECOVERABLE_EXC:
            return False

        date = time.strftime(ARCHIVE_ORG_FORMAT, time.gmtime())
        return "https://web.archive.org/" + date + "/" + self.url
class MegalodonJPArchive(NameMixin):
    site_name = "megalodon.jp"

    def __init__(self, url):
        self.url = url
        self.archived = self.archive()
        self.error_link = "http://megalodon.jp/pc/get_simple/decide?url={}".format(
            self.url
        )

    def archive(self):
        """
        Archives to megalodon.jp. The website gives a 302 redirect when we
        POST to the webpage. We can't guess the link because a one-second
        discrepancy will give an error when trying to view it.

        :return: URL of the archive, or False if an error occurred.
        """
        pairs = {"url": self.url}
        try:
            res = requests.post("http://megalodon.jp/pc/get_simple/decide", pairs)
        except RECOVERABLE_EXC:
            return False
        if res.url == "http://megalodon.jp/pc/get_simple/decide":
            return False
        return res.url
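# Presumably MEGALODON_JP_FORMAT above mirrors the timestamp embedded in
# megalodon.jp archive URLs; it is unused here because we take the redirect
# URL as-is rather than reconstructing it (see the docstring's note about
# one-second discrepancies).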
class GoldfishArchive(NameMixin):
    site_name = "snew.github.io"

    def __init__(self, url):
        self.url = url
        self.archived = re.sub(REDDIT_PATTERN, "https://snew.github.io", url)
        self.error_link = "https://snew.github.io/"


class RemovedditArchive(NameMixin):
    site_name = "removeddit.com"

    def __init__(self, url):
        self.url = url
        self.archived = re.sub(REDDIT_PATTERN, "https://www.removeddit.com", url)
        self.error_link = "https://www.removeddit.com/"

class ArchiveContainer:
    def __init__(self, url, text):
        log.debug("Creating ArchiveContainer")
        self.url = url
        self.text = (text[:LEN_MAX] + "...") if len(text) > LEN_MAX else text
        self.archives = [ArchiveOrgArchive(url), ArchiveIsArchive(url)]

        if re.match(REDDIT_PATTERN, url):
            self.archives.append(RemovedditArchive(url))

class Notification:
    def __init__(self, reddit, post, header, links):
        self.reddit = reddit
        self.post = post
        self.header = header
        self.links = links

    def notify(self):
        """
        Replies with a comment containing the archives, or, if there are too
        many links to fit in a comment, posts a submission to
        /r/SnapshillBotEx and then makes a comment linking to it.

        :return: Nothing.
        """
        try:
            comment = self._build()

            if TESTING:
                print(comment)
                return

            if len(comment) > 9999:
                # The comment would exceed reddit's 10,000 character comment
                # limit, so post the snapshots as a self post instead (capped
                # at the 40,000 character selftext limit).
                link = self.post.permalink
                submission = self.reddit.subreddit("SnapshillBotEx").submit(
                    "Archives for " + link, selftext=comment[:39999]
                )
                submission.reply("The original submission can be found here:\n\n" + link)
                comment = self.post.reply(
                    "Wow, that's a lot of links! The snapshots can be "
                    "[found here.](" + submission.url + ")\n\n" + get_footer()
                )
                log.info("Posted a comment and new submission")
            else:
                comment = self.post.reply(comment)
        except RECOVERABLE_EXC as e:
            log_error(e)
            return

        cur.execute(
            "INSERT INTO links (id, reply) VALUES (?, ?)",
            (self.post.name, comment.name),
        )
    def _build(self):
        parts = [self.header.get(), "Snapshots:"]
        link_format = "[{name}]({archive})"

        for i, link in enumerate(self.links, 1):
            subparts = []
            log.debug("Found link")

            for archive in link.archives:
                if archive.archived is None:
                    continue

                archive_link = archive.archived

                if not archive_link:
                    log.debug("Not found, using error link")
                    archive_link = (
                        archive.error_link
                        + ' "could not auto-archive; click to resubmit it!"'
                    )
                else:
                    log.debug("Found archive")

                subparts.append(
                    link_format.format(name=archive.name, archive=archive_link)
                )

            # An "is not" identity check on a string literal is unreliable and
            # always succeeded here; compare the subreddit's name instead so
            # the escaping actually happens on /r/TheseFuckingAccounts.
            link_text = (
                link.text.replace("u/", "u\\/")
                if str(self.post.subreddit) == "TheseFuckingAccounts"
                else link.text
            )
            parts.append("{}. {} - {}".format(i, link_text, ", ".join(subparts)))

        parts.append(get_footer())
        return "\n\n".join(parts)
class Header:
    def __init__(self, reddit, settings_wiki, subreddit):
        self.subreddit = subreddit
        self.texts = []
        self._settings = reddit.subreddit(settings_wiki)

        try:
            content = self._get_wiki_content()
            if not content.startswith("!ignore"):
                self.texts = self._parse_quotes(content)
        except RECOVERABLE_EXC:
            pass

    def __len__(self):
        return len(self.texts)

    def get(self):
        """
        Gets a random message from the extra texts, or an empty string if
        there are none.

        :return: Random message, or "" if "texts" is empty.
        """
        return "" if not self.texts else random.choice(self.texts)

    def _get_wiki_content(self):
        try:
            return self._settings.wiki["extxt/" + self.subreddit.lower()].content_md
        except TypeError as err:
            log.debug(
                "could not get wiki content for {} in {} ({})".format(
                    self.subreddit, self._settings, err
                )
            )
            return ""

    def _parse_quotes(self, quotes_str):
        # Non-capturing groups keep re.split from injecting the matched "\r"
        # groups into the result list.
        return [
            q.strip()
            for q in re.split(r"(?:\r)?\n-{3,}(?:\r)?\n", quotes_str)
            if q and q.strip()
        ]
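# The "extxt/<subreddit>" wiki page holds header quotes separated by rules of
# three or more dashes, e.g.:
#
#   First header quote
#   ---
#   Second header quote
#
# A page whose content starts with "!ignore" disables custom headers there.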
class Snapshill:
    def __init__(
        self, username, password, client_id, client_secret, settings_wiki, limit=25
    ):
        self.username = username
        self.password = password
        self.client_id = client_id
        self.client_secret = client_secret
        self.limit = limit
        self.settings_wiki = settings_wiki
        self.headers = {}
        self._setup = False
        self.reddit = None

    def run(self):
        """
        Checks through the submissions, archives them and posts comments.
        """
        if not self._setup:
            raise Exception("Snapshill not ready yet!")

        submissions = self.reddit.front.new(limit=self.limit)

        for submission in submissions:
            debug_time = time.time()
            warned = False
            log.debug("Found submission.\n" + submission.permalink)

            if not should_notify(submission):
                log.debug("Skipping.")
                continue

            archives = [ArchiveContainer(fix_url(submission.url), submission.title)]

            if submission.is_self and submission.selftext_html is not None:
                log.debug("Found text post...")

                links = BeautifulSoup(
                    unescape(submission.selftext_html), "html.parser"
                ).find_all("a")
                finished_urls = []

                for anchor in links:
                    if time.time() > debug_time + WARN_TIME and not warned:
                        log.warning(
                            "Spent over {} seconds on post (ID: {})".format(
                                WARN_TIME, submission.name
                            )
                        )
                        warned = True

                    log.debug("Found link in text post...")

                    url = fix_url(anchor["href"])

                    if skip_url(url):
                        continue

                    if url in finished_urls:
                        continue  # skip for sanity

                    archives.append(ArchiveContainer(url, anchor.contents[0]))
                    finished_urls.append(url)
                    ratelimit(url)

            Notification(
                self.reddit,
                submission,
                self._get_header(submission.subreddit),
                archives,
            ).notify()
            db.commit()
    def setup(self):
        """
        Logs into reddit and refreshes the header text.
        """
        self._login()
        self.refresh_headers()
        self._setup = True

    def quit(self):
        self.headers = {}
        self._setup = False

    def refresh_headers(self):
        """
        Refreshes the header text for all subreddits.
        """
        self.headers = {"all": Header(self.reddit, self.settings_wiki, "all")}
        for subreddit in self.reddit.user.subreddits():
            name = subreddit.display_name.lower()
            log.debug("get header name: {}".format(name))
            self.headers[name] = Header(self.reddit, self.settings_wiki, name)

    def _login(self):
        self.reddit = praw.Reddit(
            client_id=self.client_id,
            client_secret=self.client_secret,
            username=self.username,
            password=self.password,
            user_agent=USER_AGENT,
        )

    def _get_header(self, subreddit):
        """
        Gets the correct Header object for this subreddit. If the header for
        "all" is non-empty, it is always returned (for announcements).

        :param subreddit: Subreddit object to look up.
        :return: The matching Header, or the one for "all" if none is found.
        """
        all_header = self.headers["all"]
        if len(all_header):
            return all_header  # return the "all" header for announcements
        return self.headers.get(subreddit.display_name.lower(), all_header)
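# Runtime configuration comes from the environment: DATABASE (sqlite file),
# DEBUG and TEST ("true" to enable), REDDIT_USER, REDDIT_PASS,
# REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, LIMIT (submissions per run),
# WAIT (seconds between runs) and REFRESH (seconds between header reloads).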
db = sqlite3.connect(DB_FILE)
cur = db.cursor()

if __name__ == "__main__":
    username = os.environ.get("REDDIT_USER")
    password = os.environ.get("REDDIT_PASS")
    client_id = os.environ.get("REDDIT_CLIENT_ID")
    client_secret = os.environ.get("REDDIT_CLIENT_SECRET")
    limit = int(os.environ.get("LIMIT", 25))
    wait = int(os.environ.get("WAIT", 5))
    refresh = int(os.environ.get("REFRESH", 1800))

    log.info("Starting...")
    snapshill = Snapshill(
        username,
        password,
        client_id,
        client_secret,
        settings_wiki="SnapshillBot",
        limit=limit,
    )
    snapshill.setup()
    log.info("Started.")

    try:
        cycles = 0
        while True:
            try:
                cycles += 1
                log.info("Running")
                snapshill.run()
                log.info("Done")
                # By default this refreshes roughly every ~30 minutes
                # (depending on delays).
                if cycles > (refresh / wait) / 2:
                    log.info("Reloading header text and ignore list...")
                    snapshill.refresh_headers()
                    cycles = 0
            except RECOVERABLE_EXC as e:
                log_error(e)
            time.sleep(wait)
    except KeyboardInterrupt:
        pass

    snapshill.quit()
    db.close()
    exit(0)