From 3dbb15ce4bea875268a43ffb35d024ca05db9c9d Mon Sep 17 00:00:00 2001
From: Matthew Carey
Date: Sun, 18 Feb 2024 17:42:40 -0500
Subject: [PATCH] Feature/redgif (#364)

* working redgifs ingest. Proxy not enabled

* Cleanup and final redgif support

* Cleanup and final redgif support

* failed remove post test

* added redgifs package

---
 .../celery/task_logic/ingest_task_logic.py   | 75 +++++++++++++++--
 .../core/celery/tasks/ingest_tasks.py        | 48 +++++++++--
 redditrepostsleuth/core/exception.py         |  6 +-
 .../core/services/redgifs_token_manager.py   | 84 +++++++++++++++++++
 redditrepostsleuth/core/util/constants.py    |  2 +
 redditrepostsleuth/core/util/helpers.py      | 49 +++++++----
 redditrepostsleuth/core/util/imagehashing.py | 12 ++-
 redditrepostsleuth/ingestsvc/ingestsvc.py    |  2 +-
 requirements.txt                             |  3 +-
 tests/submonitorsvc/test_subMonitor.py       | 16 +++-
 worker-requirements.txt                      |  3 +-
 11 files changed, 261 insertions(+), 39 deletions(-)
 create mode 100644 redditrepostsleuth/core/services/redgifs_token_manager.py

diff --git a/redditrepostsleuth/core/celery/task_logic/ingest_task_logic.py b/redditrepostsleuth/core/celery/task_logic/ingest_task_logic.py
index 9a9fef0..23a2bee 100644
--- a/redditrepostsleuth/core/celery/task_logic/ingest_task_logic.py
+++ b/redditrepostsleuth/core/celery/task_logic/ingest_task_logic.py
@@ -1,23 +1,75 @@
 import logging
+import os
 from hashlib import md5
 from typing import Optional
+from urllib.parse import urlparse
 
 import imagehash
+import redgifs
+from redgifs import HTTPException
 
 from redditrepostsleuth.core.db.databasemodels import Post, PostHash
 from redditrepostsleuth.core.exception import ImageRemovedException, ImageConversionException, InvalidImageUrlException, \
     GalleryNotProcessed
-from redditrepostsleuth.core.util.imagehashing import log, generate_img_by_url_requests
+from redditrepostsleuth.core.proxy_manager import ProxyManager
+from redditrepostsleuth.core.services.redgifs_token_manager import RedGifsTokenManager
+from redditrepostsleuth.core.util.constants import GENERIC_USER_AGENT
+from redditrepostsleuth.core.util.imagehashing import generate_img_by_url_requests
 from redditrepostsleuth.core.util.objectmapping import reddit_submission_to_post
 
 log = logging.getLogger(__name__)
 
-def pre_process_post(submission: dict) -> Optional[Post]:
+
+def get_redgif_id_from_url(url: str) -> Optional[str]:
+    parsed_url = urlparse(url)
+    id, _ = os.path.splitext(parsed_url.path.replace('/i/', ''))
+    return id
+
+def get_redgif_image_url(reddit_url: str, token: str, proxy: str = None) -> Optional[str]:
+
+    id = get_redgif_id_from_url(reddit_url)
+    if not id:
+        log.error('Failed to parse RedGifs ID from %s', reddit_url)
+        return
+
+    api = redgifs.API()
+    api.http._proxy = {'http': proxy, 'https': proxy}
+    api.http.headers.update({'User-Agent': GENERIC_USER_AGENT, 'authorization': f'Bearer {token}'})
+    try:
+        gif = api.get_gif(id)
+        return gif.urls.hd
+    except Exception as e:
+        log.error('Failed to fetch RedGifs data for %s: %s', reddit_url, e)
+
+
+def pre_process_post(
+        submission: dict,
+        proxy_manager: ProxyManager,
+        redgif_manager: RedGifsTokenManager,
+        domains_to_proxy: list[str]
+) -> Optional[Post]:
     post = reddit_submission_to_post(submission)
 
+    proxy = None
+    parsed_url = urlparse(post.url)
+    if parsed_url.netloc in domains_to_proxy:
+        proxy = proxy_manager.get_proxy().address
+
     if post.post_type_id == 2: # image
-        process_image_post(post)
+
+        # Hacky RedGif support. Will need to be refactored if we have to do similar for other sites
+        redgif_url = None
+        if 'redgif' in post.url:
+            token = redgif_manager.get_redgifs_token(proxy or 'localhost')
+            try:
+                redgif_url = get_redgif_image_url(submission['url'], token, proxy=proxy)
+            except HTTPException as e:
+                if 'code' in e.error and e.error['code'] == 'TokenDecodeError':
+                    redgif_manager.remove_redgifs_token(proxy or 'localhost')
+                raise e
+
+        process_image_post(post, url=redgif_url, proxy=proxy)
     elif post.post_type_id == 6: # gallery
         process_gallery(post, submission)
@@ -28,12 +80,21 @@ def pre_process_post(submission: dict) -> Optional[Post]:
     return post
 
-def process_image_post(post: Post, hash_size: int = 16) -> Post:
-
-    log.info('Hashing image with URL: %s', post.url)
+def process_image_post(post: Post, url: str = None, proxy: str = None, hash_size: int = 16) -> Post:
+    """
+    Process an image post to generate the required hashes
+    :param proxy: Proxy to request image with
+    :param post: post object
+    :param url: Alternate URL to use
+    :param hash_size: Size of hash
+    :return: Post object with hashes
+    """
+    log.debug('Hashing image with URL: %s', post.url)
 
+    if url:
+        log.info('Using alternate URL %s', url)
 
     try:
-        img = generate_img_by_url_requests(post.url)
+        img = generate_img_by_url_requests(url or post.url, proxy=proxy)
     except ImageConversionException as e:
         log.warning('Image conversion error: %s', e)
         raise
diff --git a/redditrepostsleuth/core/celery/tasks/ingest_tasks.py b/redditrepostsleuth/core/celery/tasks/ingest_tasks.py
index a2b86bb..4bb4726 100644
--- a/redditrepostsleuth/core/celery/tasks/ingest_tasks.py
+++ b/redditrepostsleuth/core/celery/tasks/ingest_tasks.py
@@ -1,17 +1,47 @@
+import json
+import random
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from typing import Optional
+
+import requests
+from celery import Task
+from redgifs import HTTPException
 from sqlalchemy.exc import IntegrityError
 
 from redditrepostsleuth.core.celery import celery
 from redditrepostsleuth.core.celery.basetasks import SqlAlchemyTask
-from redditrepostsleuth.core.celery.task_logic.ingest_task_logic import pre_process_post
+from redditrepostsleuth.core.celery.task_logic.ingest_task_logic import pre_process_post, get_redgif_image_url
+from redditrepostsleuth.core.config import Config
+from redditrepostsleuth.core.db.db_utils import get_db_engine
+from redditrepostsleuth.core.db.uow.unitofworkmanager import UnitOfWorkManager
 from redditrepostsleuth.core.exception import InvalidImageUrlException, GalleryNotProcessed, ImageConversionException, \
-    ImageRemovedException
+    ImageRemovedException, RedGifsTokenException
 from redditrepostsleuth.core.logging import get_configured_logger
+from redditrepostsleuth.core.proxy_manager import ProxyManager
+from redditrepostsleuth.core.services.eventlogging import EventLogging
+from redditrepostsleuth.core.services.redgifs_token_manager import RedGifsTokenManager
+from redditrepostsleuth.core.util.constants import GENERIC_USER_AGENT
 from redditrepostsleuth.core.util.objectmapping import reddit_submission_to_post
 
 log = get_configured_logger('redditrepostsleuth')
 
-
-@celery.task(bind=True, base=SqlAlchemyTask, ignore_reseults=True, serializer='pickle', autoretry_for=(ConnectionError,ImageConversionException,GalleryNotProcessed), retry_kwargs={'max_retries': 10, 'countdown': 300})
+@dataclass
+class RedGifsToken:
+    token: str
+    expires_at: datetime
+    proxy: str
+
+class IngestTask(Task):
+    def __init__(self):
+        self.config = Config()
+        self.uowm = UnitOfWorkManager(get_db_engine(self.config))
+        self.event_logger = EventLogging()
+        self._redgifs_token_manager = RedGifsTokenManager()
+        self._proxy_manager = ProxyManager(self.uowm, 1000)
+        self.domains_to_proxy = []
+
+@celery.task(bind=True, base=IngestTask, ignore_result=True, serializer='pickle', autoretry_for=(ConnectionError, ImageConversionException, GalleryNotProcessed, HTTPException), retry_kwargs={'max_retries': 10, 'countdown': 300})
 def save_new_post(self, submission: dict, repost_check: bool = True):
 
     # TODO: temp fix until I can fix imgur gifs
@@ -24,16 +54,22 @@ def save_new_post(self, submission: dict, repost_check: bool = True):
         return
 
     try:
-        post = pre_process_post(submission)
+        post = pre_process_post(submission, self._proxy_manager, self._redgifs_token_manager, [])
     except (ImageRemovedException, InvalidImageUrlException) as e:
         return
+    except GalleryNotProcessed as e:
+        log.warning('Gallery not finished processing')
+        raise e
+    except Exception as e:
+        log.exception('Failed during post pre-process')
+        return
 
     if not post:
         return
 
     monitored_sub = uow.monitored_sub.get_by_sub(post.subreddit)
     if monitored_sub and monitored_sub.active:
-        log.info('Sending ingested post to monitored sub queue')
+        log.info('Sending ingested post to monitored sub queue for %s', monitored_sub.name)
         celery.send_task('redditrepostsleuth.core.celery.tasks.monitored_sub_tasks.sub_monitor_check_post',
                          args=[post.post_id, monitored_sub],
                          queue='submonitor', countdown=20)
diff --git a/redditrepostsleuth/core/exception.py b/redditrepostsleuth/core/exception.py
index b3e8099..2e8d32a 100644
--- a/redditrepostsleuth/core/exception.py
+++ b/redditrepostsleuth/core/exception.py
@@ -70,4 +70,8 @@ def __init__(self, message):
 
 class UserNotFound(RepostSleuthException):
     def __init__(self, message):
-        super(UserNotFound, self).__init__(message)
\ No newline at end of file
+        super(UserNotFound, self).__init__(message)
+
+class RedGifsTokenException(RepostSleuthException):
+    def __init__(self, message):
+        super(RedGifsTokenException, self).__init__(message)
diff --git a/redditrepostsleuth/core/services/redgifs_token_manager.py b/redditrepostsleuth/core/services/redgifs_token_manager.py
new file mode 100644
index 0000000..ff9a20d
--- /dev/null
+++ b/redditrepostsleuth/core/services/redgifs_token_manager.py
@@ -0,0 +1,84 @@
+import json
+import logging
+
+import requests
+from redis import Redis
+
+from redditrepostsleuth.core.config import Config
+from redditrepostsleuth.core.exception import RedGifsTokenException
+from redditrepostsleuth.core.util.constants import GENERIC_USER_AGENT
+
+log = logging.getLogger(__name__)
+
+class RedGifsTokenManager:
+    """
+    Class for managing and caching RedGifs API tokens. Currently overkill, but if we need to backfill the database or
+    API rate limits get tight, this will support caching a token for each proxy to Redis
+    """
+    def __init__(self):
+        config = Config()
+        self.redis = Redis(
+            host=config.redis_host,
+            port=config.redis_port,
+            db=config.redis_database,
+            password=config.redis_password,
+            decode_responses=True
+        )
+
+
+    def _cache_token(self, key: str, token: str) -> None:
+        """
+        Take a given token and cache it to Redis
+        :param key: key of the token
+        :param token: API token
+        """
+        log.info('Caching token for %s', key)
+        self.redis.set(f'redgifs-token:{key}', token, ex=82800)
+
+    def remove_redgifs_token(self, key: str) -> None:
+        """
+        Remove a cached token from Redis with a given key
+        :param key: key to remove
+        """
+        log.info('Removing token for %s', key)
+        self.redis.delete(f'redgifs-token:{key}')
+
+
+    def get_redgifs_token(self, address: str = 'localhost') -> str:
+        """
+        Either return an existing cached token or create a new one
+        :param address: address of the proxy being used
+        :return: Token
+        """
+        cached_token = self.redis.get(f'redgifs-token:{address}')
+        if not cached_token:
+            return self._request_and_cache_token(address)
+
+        log.debug('Found cached token for %s', address)
+        return cached_token
+
+
+    def _request_and_cache_token(self, proxy_address: str = 'localhost') -> str:
+        """
+        Hit the RedGifs API and request a new auth token. Cache it to Redis
+        :param proxy_address: Proxy to use, if any
+        :return: Token
+        """
+        proxies = None
+        if proxy_address != 'localhost':
+            proxies = {'http': f'http://{proxy_address}', 'https': f'http://{proxy_address}'}
+
+        token_res = requests.get(
+            'https://api.redgifs.com/v2/auth/temporary',
+            headers={'User-Agent': GENERIC_USER_AGENT},
+            proxies=proxies
+        )
+
+        if token_res.status_code != 200:
+            log.error('Failed to get RedGifs token. Status Code %s', token_res.status_code)
+            raise RedGifsTokenException(f'Failed to get RedGifs token. Status Code {token_res.status_code}')
+
+        token_data = json.loads(token_res.text)
+
+        self._cache_token(proxy_address or 'localhost', token_data['token'])
+        return token_data['token']
\ No newline at end of file
diff --git a/redditrepostsleuth/core/util/constants.py b/redditrepostsleuth/core/util/constants.py
index cd2f3d5..e808a9c 100644
--- a/redditrepostsleuth/core/util/constants.py
+++ b/redditrepostsleuth/core/util/constants.py
@@ -15,6 +15,8 @@
     'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
 }
 
+GENERIC_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
+
 REDDIT_REMOVAL_REASONS = ['deleted', 'author', 'reddit', 'copyright_takedown', 'content_takedown']
 
 EXCLUDE_FROM_TOP_REPOSTERS = [
diff --git a/redditrepostsleuth/core/util/helpers.py b/redditrepostsleuth/core/util/helpers.py
index 06347d3..d3c9d03 100644
--- a/redditrepostsleuth/core/util/helpers.py
+++ b/redditrepostsleuth/core/util/helpers.py
@@ -250,33 +250,52 @@ def get_default_image_search_settings(config: Config) -> ImageSearchSettings:
     )
 
 def get_image_search_settings_from_request(req, config: Config) -> ImageSearchSettings:
-    return ImageSearchSettings(
+    search_settings = ImageSearchSettings(
         req.get_param_as_int('target_match_percent', required=True, default=None) or config.default_image_target_match,
         config.default_image_target_annoy_distance,
         target_title_match=req.get_param_as_int('target_title_match', required=False,
                                                 default=None) or config.default_image_target_title_match,
-        filter_dead_matches=req.get_param_as_bool('filter_dead_matches', required=False,
-                                                  default=None) or config.default_image_dead_matches_filter,
-        filter_removed_matches=req.get_param_as_bool('filter_removed_matches', required=False,
-                                                     default=None) or config.default_image_removed_match_filter,
-        only_older_matches=req.get_param_as_bool('only_older_matches', required=False,
-                                                 default=None) or config.default_image_only_older_matches,
-        filter_same_author=req.get_param_as_bool('filter_same_author', required=False,
-                                                 default=None) or config.default_image_same_author_filter,
-        filter_crossposts=req.get_param_as_bool('filter_crossposts', required=False,
-                                                default=None) or config.default_image_crosspost_filter,
+        filter_dead_matches=req.get_param_as_bool('filter_dead_matches', required=False, default=None),
+        filter_removed_matches=req.get_param_as_bool('filter_removed_matches', required=False, default=None),
+        only_older_matches=req.get_param_as_bool('only_older_matches', required=False, default=None),
+        filter_same_author=req.get_param_as_bool('filter_same_author', required=False, default=None),
+        filter_crossposts=req.get_param_as_bool('include_crossposts', required=False, default=None),
         target_meme_match_percent=req.get_param_as_int('target_meme_match_percent', required=False,
                                                        default=None) or config.default_image_target_meme_match,
-        meme_filter=req.get_param_as_bool('meme_filter', required=False,
-                                          default=None) or config.default_image_meme_filter,
-        same_sub=req.get_param_as_bool('same_sub', required=False,
-                                       default=None) or config.default_image_same_sub_filter,
+        meme_filter=req.get_param_as_bool('meme_filter', required=False, default=None),
+        same_sub=req.get_param_as_bool('same_sub', required=False, default=None),
         max_days_old=req.get_param_as_int('max_days_old', required=False,
                                           default=None) or config.default_link_max_days_old_filter,
         max_depth=10000
     )
 
+    if search_settings.filter_dead_matches is None:
+        search_settings.filter_dead_matches = config.default_image_dead_matches_filter
+
+    if search_settings.filter_removed_matches is None:
+        search_settings.filter_removed_matches = config.default_image_removed_match_filter
+
+    if search_settings.only_older_matches is None:
+        search_settings.only_older_matches = config.default_image_only_older_matches
+
+    if search_settings.filter_same_author is None:
+        search_settings.filter_same_author = config.default_image_same_author_filter
+
+    if search_settings.meme_filter is None:
+        search_settings.meme_filter = config.default_image_meme_filter
+
+    if search_settings.filter_crossposts is None:
+        search_settings.filter_crossposts = config.default_image_crosspost_filter
+    else:
+        search_settings.filter_crossposts = not search_settings.filter_crossposts
+
+    if search_settings.same_sub is None:
+        search_settings.same_sub = config.default_image_same_sub_filter
+
+
+    return search_settings
+
 def get_default_link_search_settings(config: Config) -> SearchSettings:
     return SearchSettings(
diff --git a/redditrepostsleuth/core/util/imagehashing.py b/redditrepostsleuth/core/util/imagehashing.py
index fe010b6..c7caab6 100644
--- a/redditrepostsleuth/core/util/imagehashing.py
+++ b/redditrepostsleuth/core/util/imagehashing.py
@@ -12,6 +12,7 @@
 
 from redditrepostsleuth.core.db.databasemodels import Post
 from redditrepostsleuth.core.exception import ImageConversionException, ImageRemovedException, InvalidImageUrlException
+from redditrepostsleuth.core.util.constants import GENERIC_USER_AGENT
 
 log = logging.getLogger(__name__)
 
@@ -51,23 +52,28 @@ def generate_img_by_url(url: str) -> Image:
     return img if img else None
 
-def generate_img_by_url_requests(url: str) -> Optional[Image]:
+def generate_img_by_url_requests(url: str, proxy: str = None) -> Optional[Image]:
     """
     Take a URL and generate a PIL image
+    :param proxy: Optional proxy to use with request
     :param url: URL to get
     :return: PIL image
     """
     if 'redd.it' in url:
         useragent = 'repostsleuthbot:v1.0.3 Image Hasher (by /u/barrycarey)'
     else:
-        useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
+        useragent = GENERIC_USER_AGENT
 
     headers = {
         'User-Agent': useragent
     }
 
+    proxies = None
+    if proxy:
+        proxies = {'http': proxy, 'https': proxy}
+
     try:
-        res = requests.get(url, headers=headers, timeout=7)
+        res = requests.get(url, headers=headers, timeout=7, proxies=proxies)
     except (ConnectionError, Timeout) as e:
         raise ImageConversionException(str(e))
 
diff --git a/redditrepostsleuth/ingestsvc/ingestsvc.py b/redditrepostsleuth/ingestsvc/ingestsvc.py
index 8f17965..28aa4b1 100644
--- a/redditrepostsleuth/ingestsvc/ingestsvc.py
+++ b/redditrepostsleuth/ingestsvc/ingestsvc.py
@@ -190,7 +190,7 @@ async def main() -> None:
         oldest_post = uow.posts.get_newest_post()
         oldest_id = oldest_post.post_id
 
-    #await ingest_range(newest_id, oldest_id)
+    await ingest_range(newest_id, oldest_id)
 
     delay = 0
     while True:
diff --git a/requirements.txt b/requirements.txt
index 1dfe0aa..d271954 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,4 +14,5 @@ aiohttp==3.9.0
 pyjwt==2.8.0
 gunicorn==21.2.0
 falcon==3.1.1
-cryptography==41.0.6
\ No newline at end of file
+cryptography==41.0.6
+redgifs==1.9.0
\ No newline at end of file
diff --git a/tests/submonitorsvc/test_subMonitor.py b/tests/submonitorsvc/test_subMonitor.py
index fe41e6b..c745db9 100644
--- a/tests/submonitorsvc/test_subMonitor.py
+++ b/tests/submonitorsvc/test_subMonitor.py
@@ -1,6 +1,8 @@
 from unittest import TestCase
 from unittest.mock import MagicMock, Mock, patch, ANY
 
+from praw.models import Submission
+
 from redditrepostsleuth.core.config import Config
 from redditrepostsleuth.core.db.databasemodels import Post, MonitoredSub, PostType, UserReview, UserWhitelist
 from redditrepostsleuth.submonitorsvc.monitored_sub_service import MonitoredSubService
@@ -170,7 +172,10 @@ def test__handle_high_volume_reposter_check_over_threshold_remove(self, mock_ban
             user_whitelist=MagicMock(get_by_username_and_subreddit=MagicMock(return_value=None))
         )
         mock_response_handler = Mock(send_mod_mail=Mock())
-        sub_monitor = MonitoredSubService(MagicMock(), MagicMock(), MagicMock(), MagicMock(), mock_response_handler,
+        submission = Submission(MagicMock(), id='11')
+        sub_monitor = MonitoredSubService(MagicMock(), MagicMock(),
+                                          MagicMock(submission=MagicMock(return_value=submission)), MagicMock(),
+                                          mock_response_handler,
                                           config=MagicMock())
         monitored_sub = MonitoredSub(
             name='test_subreddit',
@@ -183,7 +188,7 @@ def test__handle_high_volume_reposter_check_over_threshold_remove(self, mock_ban
         post = Post(subreddit='test_subreddit', author='test_user')
         sub_monitor.handle_high_volume_reposter_check(post, mock_uow, monitored_sub)
         mock_ban_user.assert_not_called()
-        mock_remove_post.assert_called_once_with('Removed', ANY)
+        mock_remove_post.assert_called_once_with('Removed', submission, mod_note=ANY)
         mock_response_handler.send_mod_mail.assert_not_called()
 
     @patch.object(MonitoredSubService, '_remove_post')
@@ -194,7 +199,10 @@ def test__handle_high_volume_reposter_check_over_threshold_remove_and_ban(self,
             user_whitelist=MagicMock(get_by_username_and_subreddit=MagicMock(return_value=None))
         )
         mock_response_handler = Mock(send_mod_mail=Mock())
-        sub_monitor = MonitoredSubService(MagicMock(), MagicMock(), MagicMock(), MagicMock(), mock_response_handler,
+        submission = Submission(MagicMock(), id='11')
+        sub_monitor = MonitoredSubService(MagicMock(), MagicMock(),
+                                          MagicMock(submission=MagicMock(return_value=submission)), MagicMock(),
+                                          mock_response_handler,
                                           config=MagicMock())
         monitored_sub = MonitoredSub(
             name='test_subreddit',
@@ -207,7 +215,7 @@ def test__handle_high_volume_reposter_check_over_threshold_remove_and_ban(self,
         post = Post(subreddit='test_subreddit', author='test_user')
         sub_monitor.handle_high_volume_reposter_check(post, mock_uow, monitored_sub)
         mock_ban_user.assert_called_once_with('test_user', 'test_subreddit', 'High volume of reposts detected by Repost Sleuth')
-        mock_remove_post.assert_called_once_with('Removed', ANY)
+        mock_remove_post.assert_called_once_with('Removed', submission, mod_note=ANY)
         mock_response_handler.send_mod_mail.assert_not_called()
 
     @patch.object(MonitoredSubService, '_remove_post')
diff --git a/worker-requirements.txt b/worker-requirements.txt
index 7548efc..d2232e1 100644
--- a/worker-requirements.txt
+++ b/worker-requirements.txt
@@ -11,4 +11,5 @@ distance==0.1.3
 pydantic==1.10.9
 sentry-sdk==1.29.2
 pyjwt==2.8.0
-cryptography==41.0.6
\ No newline at end of file
+cryptography==41.0.6
+redgifs==1.9.0
\ No newline at end of file
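
Usage sketch (not part of the patch): a minimal end-to-end example of how the pieces above fit together — fetch or reuse a cached RedGifs token, resolve a RedGifs link to its direct HD media URL, then hash the result the way process_image_post does. This assumes a reachable Redis instance behind RedGifsTokenManager; the media URL and the dhash call are illustrative stand-ins, since the patch does not show which hash imagehashing ultimately generates.

    import imagehash

    from redditrepostsleuth.core.celery.task_logic.ingest_task_logic import get_redgif_image_url
    from redditrepostsleuth.core.services.redgifs_token_manager import RedGifsTokenManager
    from redditrepostsleuth.core.util.imagehashing import generate_img_by_url_requests

    # Fetch (or reuse) a cached API token. 'localhost' keys the token for
    # un-proxied requests; a proxy address would key a per-proxy token.
    token_manager = RedGifsTokenManager()
    token = token_manager.get_redgifs_token('localhost')

    # Resolve a RedGifs media URL to its direct HD URL. The URL below is
    # hypothetical; get_redgif_id_from_url expects the /i/<id> path format.
    media_url = get_redgif_image_url('https://i.redgifs.com/i/example.mp4', token)

    if media_url:
        img = generate_img_by_url_requests(media_url)
        # Hash size 16 matches the process_image_post default.
        print(imagehash.dhash(img, hash_size=16))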
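
A note on the helpers.py change: replacing the `... or config.default` pattern with explicit None checks matters because get_param_as_bool returns None when the parameter is absent, and `or` collapses an explicit False from the caller into the config default. A minimal illustration with stand-in values:

    config_default = True

    # Old pattern: a caller explicitly passing false still gets the default.
    requested = False
    old_value = requested or config_default                              # -> True, caller's False is lost

    # New pattern: only a missing parameter falls back to the default.
    new_value = requested if requested is not None else config_default   # -> False, as the caller asked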