import logging
import os
import praw
import re
import random
import requests
import sqlite3
import time
import traceback
import warnings

from bs4 import BeautifulSoup
from html import unescape
from urllib.parse import urlencode

from praw.helpers import flatten_tree
from praw.errors import APIException, ClientException, HTTPException
from requests.exceptions import HTTPError

USER_AGENT = "Archives to archive.is and archive.org (/r/SnapshillBot) v1.3"
INFO = "/r/SnapshillBot"
CONTACT = "/message/compose?to=\/r\/SnapshillBot"
ARCHIVE_ORG_FORMAT = "%Y%m%d%H%M%S"
MEGALODON_JP_FORMAT = "%Y-%m%d-%H%M-%S"
DB_FILE = os.environ.get("DATABASE", "snapshill.sqlite3")
LEN_MAX = 35
REDDIT_API_WAIT = 2
WARN_TIME = 300  # warn after spending 5 minutes on a post
REDDIT_PATTERN = re.compile(r"https?://(([A-Za-z]{2})(-[A-Za-z]{2})"
                            r"?|beta|i|m|pay|ssl|www)\.?reddit\.com")
SUBREDDIT_OR_USER = re.compile(r"/(u|user|r)/[^/]+/?$")

# we have to do some manual ratelimiting because we are tunnelling through
# some other websites.

RECOVERABLE_EXC = (APIException,
                   ClientException,
                   HTTPException)

loglevel = logging.DEBUG if os.environ.get("DEBUG") == "true" else logging.INFO

logging.basicConfig(level=loglevel,
                    format="[%(asctime)s] [%(levelname)s] %(message)s")
log = logging.getLogger("snapshill")
logging.getLogger("requests").setLevel(loglevel)
warnings.simplefilter("ignore")  # Ignore ResourceWarnings (because screw them)

r = praw.Reddit(USER_AGENT)
ignorelist = set()


def get_footer():
    return "*^(I am a bot.) ^\([*Info*]({info}) ^/ ^[*Contact*]({" \
           "contact}))*".format(info=INFO, contact=CONTACT)


def should_notify(submission):
    """
    Looks for other snapshot bot comments in the comment chain and doesn't
    post if one is found.

    :param submission: Submission to check
    :return: True if we should comment, False otherwise
    """
    cur.execute("SELECT * FROM links WHERE id=?", (submission.name,))
    if cur.fetchone():
        return False

    submission.replace_more_comments()
    for comment in flatten_tree(submission.comments):
        if comment.author and comment.author.name in ignorelist:
            return False

    return True


def ratelimit(url):
    if len(re.findall(REDDIT_PATTERN, url)) == 0:
        return
    time.sleep(REDDIT_API_WAIT)


def refresh_ignore_list():
    ignorelist.clear()
    ignorelist.add(r.user.name)
    for friend in r.user.get_friends():
        ignorelist.add(friend.name)


def fix_url(url):
    """
    Change language-code links, mobile links, beta links, SSL links and
    username/subreddit mentions to the canonical www.reddit.com form.

    :param url: URL to change.
    :return: Returns a fixed URL
    """
    if url.startswith("r/") or url.startswith("u/"):
        url = "http://www.reddit.com/" + url
    return re.sub(REDDIT_PATTERN, "http://www.reddit.com", url)


def skip_url(url):
    """
    Skip naked username mentions and subreddit links.
    """
    if REDDIT_PATTERN.match(url) and SUBREDDIT_OR_USER.search(url):
        return True

    return False

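# Illustrative behaviour of the URL helpers above (hypothetical URLs, not
# taken from the original source):
#   fix_url("https://m.reddit.com/r/test/comments/abc/")
#       -> "http://www.reddit.com/r/test/comments/abc/"
#   skip_url("https://www.reddit.com/r/test/")              -> True  (bare subreddit)
#   skip_url("https://www.reddit.com/r/test/comments/abc/") -> False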

def log_error(e):
    log.error("Unexpected {}:\n{}".format(e.__class__.__name__,
                                          traceback.format_exc()))


class NameMixin:
    site_name = None

    @property
    def name(self):
        if self.archived:
            return self.site_name
        else:
            return "_{}\*_".format(self.site_name)


class ArchiveIsArchive(NameMixin):
    site_name = "archive.is"

    def __init__(self, url):
        self.url = url
        self.archived = self.archive()
        pairs = {"url": self.url, "run": 1}
        self.error_link = "https://archive.is/?" + urlencode(pairs)

    def archive(self):
        """
        Archives to archive.is. The site returns a 200, and we have to find
        the JavaScript redirect through a regex in the response text.

        :return: URL of the archive or False if an error occurred
        """
        pairs = {"url": self.url}

        try:
            res = requests.post("https://archive.is/submit/", pairs,
                                verify=False)
        except RECOVERABLE_EXC:
            return False

        found = re.findall("http[s]?://archive.is/[0-z]{1,6}", res.text)

        if len(found) < 1:
            return False

        return found[0]


class ArchiveOrgArchive(NameMixin):
    site_name = "archive.org"

    def __init__(self, url):
        self.url = url
        self.archived = self.archive()
        self.error_link = "https://web.archive.org/save/" + self.url

    def archive(self):
        """
        Archives to archive.org. The website gives a 403 Forbidden when the
        archive cannot be generated (because it follows robots.txt rules).

        :return: URL of the archive, False if an error occurred, or None if
        we cannot archive this page.
        """
        try:
            res = requests.get("https://web.archive.org/save/" + self.url)
            # raise_for_status() turns the 403 "cannot archive" response into
            # an HTTPError we can inspect below.
            res.raise_for_status()
        except RECOVERABLE_EXC + (HTTPError,) as e:
            if isinstance(e, HTTPError) and e.response.status_code == 403:
                return None
            return False

        date = time.strftime(ARCHIVE_ORG_FORMAT, time.gmtime())
        return "https://web.archive.org/" + date + "/" + self.url


class MegalodonJPArchive(NameMixin):
    site_name = "megalodon.jp"

    def __init__(self, url):
        self.url = url
        self.archived = self.archive()
        self.error_link = \
            "http://megalodon.jp/pc/get_simple/decide?url={}".format(self.url)

    def archive(self):
        """
        Archives to megalodon.jp. The website gives a 302 redirect when we
        POST to the webpage. We can't guess the link because a 1 second
        discrepancy will give an error when trying to view it.

        :return: URL of the archive, or False if an error occurred.
        """
        pairs = {"url": self.url}

        try:
            res = requests.post("http://megalodon.jp/pc/get_simple/decide",
                                pairs)
        except RECOVERABLE_EXC:
            return False

        if res.url == "http://megalodon.jp/pc/get_simple/decide":
            return False

        return res.url


class GoldfishArchive(NameMixin):
    site_name = "snew.github.io"

    def __init__(self, url):
        self.url = url
        self.archived = re.sub(REDDIT_PATTERN, "https://snew.github.io", url)
        self.error_link = "https://snew.github.io/"


class RemovedditArchive(NameMixin):
    site_name = "removeddit.com"

    def __init__(self, url):
        self.url = url
        self.archived = re.sub(REDDIT_PATTERN, "https://www.removeddit.com",
                               url)
        self.error_link = "https://www.removeddit.com/"


class ArchiveContainer:
    def __init__(self, url, text):
        log.debug("Creating ArchiveContainer")
        self.url = url
        self.text = (text[:LEN_MAX] + "...") if len(text) > LEN_MAX else text
        self.archives = [ArchiveOrgArchive(url), MegalodonJPArchive(url)]

        if re.match(REDDIT_PATTERN, url):
            self.archives.append(RemovedditArchive(url))

        self.archives.append(ArchiveIsArchive(url))

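# Illustrative only (hypothetical values): a reddit link such as
#   ArchiveContainer("https://www.reddit.com/r/test/comments/abc/", "title")
# gets archive.org, megalodon.jp, removeddit and archive.is entries, while a
# non-reddit URL skips the removeddit entry; link text longer than LEN_MAX
# characters is truncated with "...".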

class Notification:
    def __init__(self, post, header, links):
        self.post = post
        self.header = header
        self.links = links

    def notify(self):
        """
        Replies with a comment containing the archives or, if there are too
        many links to fit in a comment, posts a submission to
        /r/SnapshillBotEx and then makes a comment linking to it.

        :return: Nothing
        """
        try:
            comment = self._build()

            # reddit comments are limited to 10,000 characters; longer bodies
            # are posted as a submission to the overflow subreddit instead.
            if len(comment) > 9999:
                link = self.post.permalink
                submission = r.submit("SnapshillBotEx",
                                      "Archives for " + link,
                                      text=comment[:39999],
                                      raise_captcha_exception=True)
                submission.add_comment("The original submission can be found "
                                       "here:\n\n" + link)
                comment = self.post.add_comment("Wow, that's a lot of links! "
                                                "The snapshots can be [found "
                                                "here.](" + submission.url +
                                                ")\n\n" + get_footer())
                log.info("Posted a comment and new submission")
            else:
                comment = self.post.add_comment(comment)
        except RECOVERABLE_EXC as e:
            log_error(e)
            return

        cur.execute("INSERT INTO links (id, reply) VALUES (?, ?)",
                    (self.post.name, comment.name))

    def _build(self):
        parts = [self.header.get(), "Snapshots:"]
        format = "[{name}]({archive})"

        for i, link in enumerate(self.links, 1):
            subparts = []
            log.debug("Found link")

            for archive in link.archives:
                if archive.archived is None:
                    continue

                archive_link = archive.archived

                if not archive_link:
                    log.debug("Not found, using error link")
                    archive_link = archive.error_link + ' "could not ' \
                                   'auto-archive; click to resubmit it!"'
                else:
                    log.debug("Found archive")

                subparts.append(format.format(name=archive.name,
                                              archive=archive_link))

            parts.append("{}. {} - {}".format(i, link.text,
                                              ", ".join(subparts)))

        parts.append(get_footer())

        return "\n\n".join(parts)


class Header:
    def __init__(self, settings_wiki, subreddit):
        self.subreddit = subreddit
        self.texts = []
        self._settings = r.get_subreddit(settings_wiki)

        try:
            content = self._get_wiki_content()
            if not content.startswith("!ignore"):
                self.texts = self._parse_quotes(content)
        except RECOVERABLE_EXC:
            pass

    def __len__(self):
        return len(self.texts)

    def get(self):
        """
        Gets a random message from the extra text or nothing if there are no
        messages.

        :return: Random message or an empty string if "texts" is empty.
        """
        return "" if not self.texts else random.choice(self.texts)

    def _get_wiki_content(self):
        return self._settings.get_wiki_page(
            "extxt/" + self.subreddit.lower()).content_md

    def _parse_quotes(self, quotes_str):
        return [q.strip() for q in re.split('\r\n-{3,}\r\n', quotes_str)
                if q.strip()]


class Snapshill:
    def __init__(self, username, password, settings_wiki, limit=25):
        self.username = username
        self.password = password
        self.limit = limit
        self.settings_wiki = settings_wiki
        self.headers = {}
        self._setup = False

    def run(self):
        """
        Checks through the submissions and archives and posts comments.
        """
        if not self._setup:
            raise Exception("Snapshiller not ready yet!")

        submissions = r.get_new(limit=self.limit)

        for submission in submissions:
            debugTime = time.time()
            warned = False

            log.debug("Found submission.\n" + submission.permalink)

            if not should_notify(submission):
                log.debug("Skipping.")
                continue

            archives = [ArchiveContainer(fix_url(submission.url),
                                         "*This Post*")]

            if submission.is_self and submission.selftext_html is not None:
                log.debug("Found text post...")

                links = BeautifulSoup(unescape(submission.selftext_html),
                                      "html.parser").find_all("a")

                if not len(links):
                    continue

                finishedURLs = []

                for anchor in links:
                    if time.time() > debugTime + WARN_TIME and not warned:
                        log.warning("Spent over {} seconds on post "
                                    "(ID: {})".format(WARN_TIME,
                                                      submission.name))
                        warned = True

                    log.debug("Found link in text post...")

                    url = fix_url(anchor['href'])

                    if skip_url(url):
                        continue

                    if url in finishedURLs:
                        continue  # skip for sanity

                    archives.append(ArchiveContainer(url,
                                                     anchor.contents[0]))
                    finishedURLs.append(url)
                    ratelimit(url)

            Notification(submission,
                         self._get_header(submission.subreddit),
                         archives).notify()
            db.commit()

    def setup(self):
        """
        Logs into reddit and refreshes the header text and ignore list.
        """
        self._login()
        self.refresh_headers()
        refresh_ignore_list()

        self._setup = True

    def quit(self):
        self.headers = {}
        self._setup = False

    def refresh_headers(self):
        """
        Refreshes the header text for all subreddits.
""" self.headers = {"all": Header(self.settings_wiki, "all")} for subreddit in r.get_my_subreddits(): name = subreddit.display_name.lower() self.headers[name] = Header(self.settings_wiki, name) def _login(self): r.login(self.username, self.password) def _get_header(self, subreddit): """ Gets the correct Header object for this subreddit. If the one for 'all' is not "!ignore", then this one will always be returned. :param subreddit: Subreddit object to get. :return: Extra text object found or the one for "all" if we can't find it or if not empty. """ all = self.headers["all"] if len(all): return all # return 'all' one for announcements return self.headers.get(subreddit.display_name.lower(), all) db = sqlite3.connect(DB_FILE) cur = db.cursor() if __name__ == "__main__": username = os.environ.get("REDDIT_USER") password = os.environ.get("REDDIT_PASS") limit = int(os.environ.get("LIMIT", 25)) wait = int(os.environ.get("WAIT", 5)) refresh = int(os.environ.get("REFRESH", 1800)) log.info("Starting...") snapshill = Snapshill(username, password, "SnapshillBot", limit) snapshill.setup() log.info("Started.") try: cycles = 0 while True: try: cycles += 1 log.info("Running") snapshill.run() log.info("Done") # This will refresh by default around ~30 minutes (depending # on delays). if cycles > (refresh / wait) / 2: log.info("Reloading header text and ignore list...") refresh_ignore_list() snapshill.refresh_headers() cycles = 0 except RECOVERABLE_EXC as e: log_error(e) time.sleep(wait) except KeyboardInterrupt: pass snapshill.quit() db.close() exit(0)