Feature/redgif (#364)
* working redgifs ingest.  Proxy not enabled

* Cleanup and final redgif support

* Cleanup and final redgif support

* failed remove post test

* added redgifs package
barrycarey authored Feb 18, 2024
1 parent 61dff2f commit 3dbb15c
Showing 11 changed files with 261 additions and 39 deletions.
75 changes: 68 additions & 7 deletions redditrepostsleuth/core/celery/task_logic/ingest_task_logic.py
@@ -1,23 +1,75 @@
import logging
import os
from hashlib import md5
from typing import Optional
from urllib.parse import urlparse

import imagehash
import redgifs
from redgifs import HTTPException

from redditrepostsleuth.core.db.databasemodels import Post, PostHash
from redditrepostsleuth.core.exception import ImageRemovedException, ImageConversionException, InvalidImageUrlException, \
GalleryNotProcessed
from redditrepostsleuth.core.util.imagehashing import log, generate_img_by_url_requests
from redditrepostsleuth.core.proxy_manager import ProxyManager
from redditrepostsleuth.core.services.redgifs_token_manager import RedGifsTokenManager
from redditrepostsleuth.core.util.constants import GENERIC_USER_AGENT
from redditrepostsleuth.core.util.imagehashing import generate_img_by_url_requests
from redditrepostsleuth.core.util.objectmapping import reddit_submission_to_post

log = logging.getLogger(__name__)

def pre_process_post(submission: dict) -> Optional[Post]:

def get_redgif_id_from_url(url: str) -> Optional[str]:
parsed_url = urlparse(url)
id, _ = os.path.splitext(parsed_url.path.replace('/i/', ''))
return id

def get_redgif_image_url(reddit_url: str, token: str, proxy: str = None) -> Optional[str]:

id = get_redgif_id_from_url(reddit_url)
if not id:
log.error('Failed to parse RedGifs ID from %s', reddit_url)
return

api = redgifs.API()
api.http._proxy = {'http': proxy, 'https': proxy}
api.http.headers.update({'User-Agent': GENERIC_USER_AGENT, 'authorization': f'Bearer {token}'})
try:
gif = api.get_gif(id)
except Exception as e:
log.error('Failed to fetch RedGifs data for %s: %s', id, e)
return None
return gif.urls.hd


def pre_process_post(
submission: dict,
proxy_manager: ProxyManager,
redgif_manager: RedGifsTokenManager,
domains_to_proxy: list[str]
) -> Optional[Post]:

post = reddit_submission_to_post(submission)

proxy = None
parsed_url = urlparse(post.url)
if parsed_url.netloc in domains_to_proxy:
proxy = proxy_manager.get_proxy().address

if post.post_type_id == 2: # image
process_image_post(post)

# Hacky RedGif support. Will need to be refactored if we have to do similar for other sites
redgif_url = None
if 'redgif' in post.url:
token = redgif_manager.get_redgifs_token()
try:
redgif_url = get_redgif_image_url(submission['url'], token)
except HTTPException as e:
if 'code' in e.error and e.error['code'] == 'TokenDecodeError':
redgif_manager.remove_redgifs_token(proxy or 'localhost')
raise e

process_image_post(post, url=redgif_url, proxy=proxy)
elif post.post_type_id == 6: # gallery
process_gallery(post, submission)

@@ -28,12 +80,21 @@ def pre_process_post(submission: dict) -> Optional[Post]:
return post


def process_image_post(post: Post, hash_size: int = 16) -> Post:

log.info('Hashing image with URL: %s', post.url)
def process_image_post(post: Post, url: str = None, proxy: str = None, hash_size: int = 16) -> Post:
"""
Process an image post to generate the required hashes
:param proxy: Proxy to request image with
:param post: post object
:param url: Alternate URL to use
:param hash_size: Size of hash
:return: Post object with hashes
"""
log.debug('Hashing image with URL: %s', post.url)
if url:
log.info('Hashing alternate URL %s', url)

try:
img = generate_img_by_url_requests(post.url)
img = generate_img_by_url_requests(url or post.url, proxy=proxy)
except ImageConversionException as e:
log.warning('Image conversion error: %s', e)
raise
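For context, a standalone sketch of the ID extraction performed by the new `get_redgif_id_from_url` helper, using a hypothetical RedGifs media URL:

```python
import os
from urllib.parse import urlparse

# Hypothetical media URL following the /i/<id>.<ext> shape the helper expects
url = 'https://media.redgifs.com/i/ExampleClipName.mp4'

path = urlparse(url).path                        # '/i/ExampleClipName.mp4'
gif_id, ext = os.path.splitext(path.replace('/i/', ''))
print(gif_id, ext)                               # ExampleClipName .mp4
```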
48 changes: 42 additions & 6 deletions redditrepostsleuth/core/celery/tasks/ingest_tasks.py
@@ -1,17 +1,47 @@
import json
import random
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Optional

import requests
from celery import Task
from redgifs import HTTPException
from sqlalchemy.exc import IntegrityError

from redditrepostsleuth.core.celery import celery
from redditrepostsleuth.core.celery.basetasks import SqlAlchemyTask
from redditrepostsleuth.core.celery.task_logic.ingest_task_logic import pre_process_post
from redditrepostsleuth.core.celery.task_logic.ingest_task_logic import pre_process_post, get_redgif_image_url
from redditrepostsleuth.core.config import Config
from redditrepostsleuth.core.db.db_utils import get_db_engine
from redditrepostsleuth.core.db.uow.unitofworkmanager import UnitOfWorkManager
from redditrepostsleuth.core.exception import InvalidImageUrlException, GalleryNotProcessed, ImageConversionException, \
ImageRemovedException
ImageRemovedException, RedGifsTokenException
from redditrepostsleuth.core.logging import get_configured_logger
from redditrepostsleuth.core.proxy_manager import ProxyManager
from redditrepostsleuth.core.services.eventlogging import EventLogging
from redditrepostsleuth.core.services.redgifs_token_manager import RedGifsTokenManager
from redditrepostsleuth.core.util.constants import GENERIC_USER_AGENT
from redditrepostsleuth.core.util.objectmapping import reddit_submission_to_post

log = get_configured_logger('redditrepostsleuth')


@celery.task(bind=True, base=SqlAlchemyTask, ignore_result=True, serializer='pickle', autoretry_for=(ConnectionError,ImageConversionException,GalleryNotProcessed), retry_kwargs={'max_retries': 10, 'countdown': 300})
@dataclass
class RedGifsToken:
token: str
expires_at: datetime
proxy: str

class IngestTask(Task):
def __init__(self):
self.config = Config()
self.uowm = UnitOfWorkManager(get_db_engine(self.config))
self.event_logger = EventLogging()
self._redgifs_token_manager = RedGifsTokenManager()
self._proxy_manager = ProxyManager(self.uowm, 1000)
self.domains_to_proxy = []

@celery.task(bind=True, base=IngestTask, ignore_result=True, serializer='pickle', autoretry_for=(ConnectionError,ImageConversionException,GalleryNotProcessed, HTTPException), retry_kwargs={'max_retries': 10, 'countdown': 300})
def save_new_post(self, submission: dict, repost_check: bool = True):

# TODO: temp fix until I can fix imgur gifs
@@ -24,16 +54,22 @@ def save_new_post(self, submission: dict, repost_check: bool = True):
return

try:
post = pre_process_post(submission)
post = pre_process_post(submission, self._proxy_manager, self._redgifs_token_manager, [])
except (ImageRemovedException, InvalidImageUrlException) as e:
return
except GalleryNotProcessed as e:
log.warning('Gallery not finished processing')
raise e
except Exception as e:
log.exception('Failed during post pre-process')
return

if not post:
return

monitored_sub = uow.monitored_sub.get_by_sub(post.subreddit)
if monitored_sub and monitored_sub.active:
log.info('Sending ingested post to monitored sub queue')
log.info('Sending ingested post to monitored sub queue for %s', monitored_sub.name)
celery.send_task('redditrepostsleuth.core.celery.tasks.monitored_sub_tasks.sub_monitor_check_post',
args=[post.post_id, monitored_sub],
queue='submonitor', countdown=20)
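The switch from the generic `SqlAlchemyTask` to a dedicated `IngestTask` base relies on a standard Celery pattern: a custom `Task` subclass is instantiated once per worker process, so heavyweight resources (config, DB unit-of-work manager, proxy and token managers) are built once and reused across task invocations. A minimal sketch of the pattern, with illustrative names rather than the commit's own classes:

```python
from celery import Celery, Task

app = Celery('sketch', broker='memory://')  # hypothetical broker for illustration

class ResourceTask(Task):
    """Instantiated once per worker process; expensive clients live here."""
    def __init__(self):
        self.expensive_client = {}  # stand-in for Config/UnitOfWorkManager/etc.

@app.task(bind=True, base=ResourceTask)
def ingest(self, item: str) -> str:
    # bind=True makes self the shared ResourceTask instance, not a per-call object
    self.expensive_client[item] = True
    return item
```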
6 changes: 5 additions & 1 deletion redditrepostsleuth/core/exception.py
@@ -70,4 +70,8 @@ def __init__(self, message):

class UserNotFound(RepostSleuthException):
def __init__(self, message):
super(UserNotFound, self).__init__(message)
super(UserNotFound, self).__init__(message)

class RedGifsTokenException(RepostSleuthException):
def __init__(self, message):
super(RedGifsTokenException, self).__init__(message)
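`RedGifsTokenException` is raised by the token manager (next file) when the temporary-auth request fails; a caller would guard a token fetch roughly like this sketch:

```python
import logging

from redditrepostsleuth.core.exception import RedGifsTokenException
from redditrepostsleuth.core.services.redgifs_token_manager import RedGifsTokenManager

log = logging.getLogger(__name__)

try:
    token = RedGifsTokenManager().get_redgifs_token()
except RedGifsTokenException as e:
    # Token endpoint returned a non-200; proceed without RedGifs support
    log.error('Unable to obtain RedGifs token: %s', e)
    token = None
```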
84 changes: 84 additions & 0 deletions redditrepostsleuth/core/services/redgifs_token_manager.py
@@ -0,0 +1,84 @@
import json
import logging

import requests
from redis import Redis

from redditrepostsleuth.core.config import Config
from redditrepostsleuth.core.exception import RedGifsTokenException
from redditrepostsleuth.core.util.constants import GENERIC_USER_AGENT

log = logging.getLogger(__name__)

"""
Class for managing and caching RedGifs API tokens. Currently overkill but if we need to backfill the database or
API rate limits get tight this will support caching a token for each proxy to Redis
"""
class RedGifsTokenManager:
def __init__(self):
config = Config()
self.redis = Redis(
host=config.redis_host,
port=config.redis_port,
db=config.redis_database,
password=config.redis_password,
decode_responses=True
)


def _cache_token(self, key: str, token: str) -> None:
"""
Take a given token and cache it to Redis
:param key: key of the token
:param token: API token
"""
log.info('Caching token for %s', key)
self.redis.set(f'redgifs-token:{key}', token, ex=82800)

def remove_redgifs_token(self, key: str) -> None:
"""
Remove a cached token from Redis with a given key
:param key: key to remove
"""
log.info('Removing token for %s', key)
self.redis.delete(f'redgifs-token:{key}')


def get_redgifs_token(self, address: str = 'localhost') -> str:
"""
Either return an existing cached token or create a new one
:param address: address of the proxy being used
:return: Token
"""
cached_token = self.redis.get(f'redgifs-token:{address}')
if not cached_token:
return self._request_and_cache_token(address)

log.debug('Found cached token for %s', address)
return cached_token


def _request_and_cache_token(self, proxy_address: str = 'localhost') -> str:
"""
Hit the RedGifs API and request a new auth token, then cache it to Redis
:param proxy_address: Proxy to use, if any
:return: Token
"""
proxies = None
if proxy_address != 'localhost':
proxies = {'http': f'http://{proxy_address}', 'https': f'https://{proxy_address}'}

token_res = requests.get(
'https://api.redgifs.com/v2/auth/temporary',
headers={'User-Agent': GENERIC_USER_AGENT},
proxies=proxies
)

if token_res.status_code != 200:
log.error('Failed to get RedGif token. Status Code %s', token_res.status_code)
raise RedGifsTokenException(f'Failed to get RedGif token. Status Code {token_res.status_code}')

token_data = json.loads(token_res.text)

self._cache_token(proxy_address or 'localhost', token_data['token'])
return token_data['token']
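A usage sketch of the manager as defined above; the proxy address is hypothetical, and the 82800-second TTL (23 hours) presumably rotates tokens ahead of their expiry:

```python
from redditrepostsleuth.core.services.redgifs_token_manager import RedGifsTokenManager

manager = RedGifsTokenManager()

# First call hits api.redgifs.com/v2/auth/temporary and caches the token in
# Redis under redgifs-token:localhost; later calls return the cached value.
token = manager.get_redgifs_token()

# Per-proxy tokens key off the proxy address (hypothetical address shown).
proxied = manager.get_redgifs_token('10.0.0.5:8080')

# On a TokenDecodeError the ingest logic evicts the cached token so the
# next fetch mints a fresh one.
manager.remove_redgifs_token('10.0.0.5:8080')
```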
2 changes: 2 additions & 0 deletions redditrepostsleuth/core/util/constants.py
@@ -15,6 +15,8 @@
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
}

GENERIC_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'

REDDIT_REMOVAL_REASONS = ['deleted', 'author', 'reddit', 'copyright_takedown', 'content_takedown']

EXCLUDE_FROM_TOP_REPOSTERS = [
49 changes: 34 additions & 15 deletions redditrepostsleuth/core/util/helpers.py
@@ -250,33 +250,52 @@ def get_default_image_search_settings(config: Config) -> ImageSearchSettings:
)

def get_image_search_settings_from_request(req, config: Config) -> ImageSearchSettings:
return ImageSearchSettings(
search_settings = ImageSearchSettings(
req.get_param_as_int('target_match_percent', required=True, default=None) or config.default_image_target_match,
config.default_image_target_annoy_distance,
target_title_match=req.get_param_as_int('target_title_match', required=False,
default=None) or config.default_image_target_title_match,
filter_dead_matches=req.get_param_as_bool('filter_dead_matches', required=False,
default=None) or config.default_image_dead_matches_filter,
filter_removed_matches=req.get_param_as_bool('filter_removed_matches', required=False,
default=None) or config.default_image_removed_match_filter,
only_older_matches=req.get_param_as_bool('only_older_matches', required=False,
default=None) or config.default_image_only_older_matches,
filter_same_author=req.get_param_as_bool('filter_same_author', required=False,
default=None) or config.default_image_same_author_filter,
filter_crossposts=req.get_param_as_bool('filter_crossposts', required=False,
default=None) or config.default_image_crosspost_filter,
filter_dead_matches=req.get_param_as_bool('filter_dead_matches', required=False, default=None),
filter_removed_matches=req.get_param_as_bool('filter_removed_matches', required=False, default=None),
only_older_matches=req.get_param_as_bool('only_older_matches', required=False, default=None),
filter_same_author=req.get_param_as_bool('filter_same_author', required=False, default=None),
filter_crossposts=req.get_param_as_bool('include_crossposts', required=False, default=None),
target_meme_match_percent=req.get_param_as_int('target_meme_match_percent', required=False,
default=None) or config.default_image_target_meme_match,
meme_filter=req.get_param_as_bool('meme_filter', required=False,
default=None) or config.default_image_meme_filter,
same_sub=req.get_param_as_bool('same_sub', required=False,
default=None) or config.default_image_same_sub_filter,
meme_filter=req.get_param_as_bool('meme_filter', required=False, default=None),
same_sub=req.get_param_as_bool('same_sub', required=False, default=None),
max_days_old=req.get_param_as_int('max_days_old', required=False,
default=None) or config.default_link_max_days_old_filter,
max_depth=10000

)

if search_settings.filter_dead_matches is None:
search_settings.filter_dead_matches = config.default_image_dead_matches_filter

if search_settings.filter_removed_matches is None:
search_settings.filter_removed_matches = config.default_image_removed_match_filter

if search_settings.only_older_matches is None:
search_settings.only_older_matches = config.default_image_only_older_matches

if search_settings.filter_same_author is None:
search_settings.filter_same_author = config.default_image_same_author_filter

if search_settings.meme_filter is None:
search_settings.meme_filter = config.default_image_meme_filter

if search_settings.filter_crossposts is None:
search_settings.filter_crossposts = config.default_image_crosspost_filter
else:
search_settings.filter_crossposts = not search_settings.filter_crossposts

if search_settings.same_sub is None:
search_settings.same_sub = config.default_image_same_sub_filter


return search_settings


def get_default_link_search_settings(config: Config) -> SearchSettings:
return SearchSettings(
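The restructuring above addresses a real pitfall: `or`-defaulting is wrong for boolean parameters, because an explicit `False` from the request is falsy and silently collapses into the config default. The explicit `None` checks preserve a user's opt-out, as this minimal illustration shows:

```python
default_filter = True

# Old pattern: an explicit False from the request is lost.
param = False                      # user explicitly disabled the filter
print(param or default_filter)     # True  -- user intent discarded

# New pattern: fall back to the default only when the parameter was absent.
param = False
value = default_filter if param is None else param
print(value)                       # False -- user intent preserved
```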
12 changes: 9 additions & 3 deletions redditrepostsleuth/core/util/imagehashing.py
Expand Up @@ -12,6 +12,7 @@

from redditrepostsleuth.core.db.databasemodels import Post
from redditrepostsleuth.core.exception import ImageConversionException, ImageRemovedException, InvalidImageUrlException
from redditrepostsleuth.core.util.constants import GENERIC_USER_AGENT

log = logging.getLogger(__name__)

@@ -51,23 +52,28 @@ def generate_img_by_url(url: str) -> Image:

return img if img else None

def generate_img_by_url_requests(url: str) -> Optional[Image]:
def generate_img_by_url_requests(url: str, proxy: str = None) -> Optional[Image]:
"""
Take a URL and generate a PIL image
:param proxy: Optional proxy to use with request
:param url: URL to get
:return: PIL image
"""
if 'redd.it' in url:
useragent = 'repostsleuthbot:v1.0.3 Image Hasher (by /u/barrycarey)'
else:
useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
useragent = GENERIC_USER_AGENT

headers = {
'User-Agent': useragent
}

proxies = None
if proxy:
proxies = {'http': proxy, 'https': proxy}

try:
res = requests.get(url, headers=headers, timeout=7)
res = requests.get(url, headers=headers, timeout=7, proxies=proxies)
except (ConnectionError, Timeout) as e:
raise ImageConversionException(str(e))

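For reference, the `proxies` mapping built above is the standard requests form, routing both plain and TLS traffic through one proxy; a sketch with a hypothetical address and image URL:

```python
import requests

proxy = 'http://10.0.0.5:8080'     # hypothetical proxy address
proxies = {'http': proxy, 'https': proxy}

res = requests.get(
    'https://i.redd.it/example.jpg',  # hypothetical image URL
    headers={'User-Agent': 'repostsleuthbot:v1.0.3 Image Hasher (by /u/barrycarey)'},
    timeout=7,
    proxies=proxies,
)
```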