2024 revisions #36

Open · wants to merge 17 commits into `master`
5 changes: 5 additions & 0 deletions .env.example
@@ -0,0 +1,5 @@
BRIGHTDATA_API_TOKEN="put-your-api-token-here"
BRIGHTDATA_DATASET_ID="put-your-dataset-id-here"
AWS_BUCKET_NAME="put-your-bucket-name-here"
BUCKET_ACCESS_KEY="put-your-bucket-access-key-here"
BUCKET_SECRET_KEY="put-your-bucket-secret-key-here"
9 changes: 8 additions & 1 deletion .gitignore
@@ -109,4 +109,11 @@ venv.bak/
extracts.csv
keyword_extracts.csv
neg_extracts.csv
subject_extracts.csv
subject_extracts.csv

# VSCode
.vscode
.vscode/*

# ENV
.env
10 changes: 8 additions & 2 deletions README.md
@@ -13,7 +13,7 @@ Note that there is a `requirements.txt` file, so running this program requires a
usage: `python vopd.py [-h] [--window WINDOW] [--context CONTEXT] [--subjectfile SUBJECTFILE] [--keywordfile KEYWORDFILE] [--normalizefile NORMALIZEFILE] [--mode MODE] transcript`

```positional arguments:
transcript filepath to transcript pdf or directory, or (where `mode==tweets`) path to SFM extract Excel file
transcript filepath to transcript pdf or directory, or (where `mode==tweets`) path to SFM extract Excel file, or (where `mode==bdtweets`) path to CSV generated by `brightdata-service.py`

optional arguments:
-h, --help show this help message and exit
@@ -43,6 +43,13 @@ where:
* Sender
* Message

## Generating BrightData Twitter data sets via an AWS Lambda function

Requires an API key and credits for the [BrightData web data APIs](https://docs.brightdata.com/scraping-automation/web-data-apis/web-scraper-api/overview).

Code in `brightdata-aws-lambda.py` can be used as an AWS Lambda function to trigger a collection of tweets via the BrightData API. The resulting tweet data set is deposited into the S3 bucket configured in the environment variables. The function can be scheduled to run at an interval of your choice via AWS CloudWatch.

You can also trigger a collection manually by running the `brightdata-aws-lambda.py` script and passing start and end dates in `YYYY-MM-DD` format to the `collect_tweets` function, as in the sketch below.
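
A minimal sketch of a one-off backfill, assuming you append the call at the bottom of the script; the dates are illustrative and not part of the committed file:

```python
# Appended to brightdata-aws-lambda.py for a one-off run (illustrative dates).
# `handles` is the list already loaded from handles.csv at module level.
collect_tweets(start_date="2024-10-01", end_date="2024-10-07", handles_list=handles)
```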

## Output files

@@ -61,4 +68,3 @@ The `recycle_keywords.py` utility takes:

It scans through the coding file for keyword severity scores assigned by the human coder, as well as for new keywords added by the coder. It then updates the scores of existing keywords in the keywords file (using the mode of the human-assigned severity scores) and adds the new keywords to it.
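
The score-update step amounts to taking the statistical mode of the coder-assigned severity scores for each keyword. Here is a minimal sketch of that step, assuming the coding file is loaded into a DataFrame; the `keyword` and `severity` column names are assumptions, not the utility's actual schema:

```python
# Sketch of the score-update step; the column names are assumptions.
import pandas as pd

def modal_scores(coding_df: pd.DataFrame) -> dict:
    """Return {keyword: modal human-assigned severity score}."""
    scored = coding_df.dropna(subset=["severity"])
    # Series.mode() can return several values on ties; take the first.
    return (
        scored.groupby("keyword")["severity"]
        .agg(lambda s: s.mode().iloc[0])
        .to_dict()
    )
```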


97 changes: 97 additions & 0 deletions brightdata-aws-lambda.py
@@ -0,0 +1,97 @@
import json
import datetime as DT
import os
import urllib3

http = urllib3.PoolManager()

BRIGHTDATA_RESULT_ADDRESS = "https://api.brightdata.com/datasets/v3/snapshot"
BRIGHTDATA_API_KEY = os.environ["BRIGHTDATA_API_TOKEN"]
BRIGHTDATA_DATASET_ID = os.environ["BRIGHTDATA_DATASET_ID"]
BRIGHTDATA_TRIGGER_API_ADDRESS = f"https://api.brightdata.com/datasets/v3/trigger?dataset_id={BRIGHTDATA_DATASET_ID}&type=discover_new&discover_by=profile_url"
AWS_BUCKET_NAME = os.environ["AWS_BUCKET_NAME"]
AWS_ACCESS_KEY = os.environ["BUCKET_ACCESS_KEY"]
AWS_SECRET_KEY = os.environ["BUCKET_SECRET_KEY"]

def clean_handle_list(input_list):
"""
Helper function for removing white space and @s from handles
"""
stripped_list = [handle.rstrip() for handle in input_list]
no_ats_list = [handle.replace("@", "") for handle in stripped_list]
return no_ats_list


def todays_date():
"""
Helper function to return today's date in YYYY-MM-DD format
"""
return DT.date.today().strftime("%Y-%m-%d")


def yesterdays_date():
"""
Helper function to return yesterday's date in YYYY-MM-DD format
"""
return (DT.date.today() - DT.timedelta(days=1)).strftime("%Y-%m-%d")


def week_ago_date():
"""
Helper function to return the date one week ago in YYYY-MM-DD format
"""
return (DT.date.today() - DT.timedelta(days=7)).strftime("%Y-%m-%d")

def aws_delivery_block():
return {"type":"s3","filename":{"template":"{[datetime]}_{[snapshot_id]}","extension":"csv"},"bucket":AWS_BUCKET_NAME,"credentials":{"aws-access-key":AWS_ACCESS_KEY,"aws-secret-key":AWS_SECRET_KEY},"directory":"daily-tweets"}

def date_range_request_body(start_date, end_date, handle_list):
"""
Generate the body of the API request for triggering collection through
BrightData API.
start_date and end_date format should be YYYY-MM-DD (e.g. "2024-10-15")
handle_list should be a list of twitter handles
"""

request_block = []
for handle in handle_list:
request_block.append(
{
"url": f"https://x.com/{handle}",
"start_date": start_date,
"end_date": end_date,
}
)
return request_block

def trigger_brightdata_snapshot(start_date, end_date, handles_list):
"""
Pass in a start_date and end_date in "YYYY-MM-DD" format and a list of twitter handles
"""
response = http.request(
'POST',
url=BRIGHTDATA_TRIGGER_API_ADDRESS,
headers={
"Authorization": f"Bearer {BRIGHTDATA_API_KEY}",
"Content-Type": "application/json",
},
body=json.dumps({"deliver": aws_delivery_block(),
"input": date_range_request_body(start_date,end_date, handles_list)})
)
print(response)
return response

def collect_tweets(start_date, end_date, handles_list):
trigger_brightdata_snapshot(start_date, end_date, handles_list)

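# Handles are read at module import time, so a warm Lambda container reuses the same list across invocations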
with open("handles.csv", 'r') as file:
handles = clean_handle_list(file.read().split('\n'))

print(f"start date: {yesterdays_date()}, end date: {todays_date()}, handles_list: {handles}")

def lambda_handler(event, context):
collect_tweets(start_date=yesterdays_date(), end_date=todays_date(), handles_list=handles)
print(f"Started collection of tweets from {len(handles)} accounts, starting {yesterdays_date()} and ending {todays_date()}")

85 changes: 56 additions & 29 deletions document.py
@@ -2,8 +2,10 @@
import os
import pandas as pd
import sys
import dateutil.parser
from pdfminer.high_level import extract_text_to_fp


class DocumentSet:
def __init__(self):
pass
@@ -24,23 +26,22 @@ def __init__(self, transcripts_filepath):

# Compose self.transcript_filepaths list
if not os.path.exists(transcripts_filepath):
print('{} does not exist'.format(transcripts_filepath))
print("{} does not exist".format(transcripts_filepath))
sys.exit(1)
self.transcript_filepaths = []
if os.path.isdir(transcripts_filepath):
for filename in os.listdir(transcripts_filepath):
filepath = os.path.join(transcripts_filepath, filename)
if os.path.isfile(filepath) and filename.lower().endswith('.pdf'):
if os.path.isfile(filepath) and filename.lower().endswith(".pdf"):
self.transcript_filepaths.append(filepath)
else:
self.transcript_filepaths.append(transcripts_filepath)

# finally, make it an iterable
self.transcript_filepaths = iter(self.transcript_filepaths)


def __next__(self):
""" Iterator to yield Document, where each Transcript is a Document """
"""Iterator to yield Document, where each Transcript is a Document"""

# get the path to the next file
pdfFile = self.transcript_filepaths.__next__()
@@ -51,15 +52,13 @@ def __next__(self):

return doc


def _extract_text(self, pdf_filepath):
""" Internal utility function to extract text from a single PDF file """
"""Internal utility function to extract text from a single PDF file"""
with open(pdf_filepath, "rb") as fp:
text_fp = io.StringIO()
extract_text_to_fp(fp, text_fp)
return text_fp.getvalue()


def _show_data(self, show_file_path):
show_file_name = os.path.split(show_file_path)[1]

@@ -68,64 +67,92 @@ def _show_data(self, show_file_path):
day = show_file_name[3:5]
year = show_file_name[6:10]

show_info['show_file_path'] = show_file_path
show_info['show_date'] = month+'/'+day+'/'+year
show_info['show_id'] = show_file_name[11:14]
show_info['show_name'] = show_file_name[15:-4] # leave off .PDF
show_info["show_file_path"] = show_file_path
show_info["show_date"] = month + "/" + day + "/" + year
show_info["show_id"] = show_file_name[11:14]
show_info["show_name"] = show_file_name[15:-4] # leave off .PDF
return show_info


class SFMExtractDocumentSet(DocumentSet):
def __init__(self, sfmfilepath):
""" Initialize with the path to a (single) SFM extract Excel file"""
"""Initialize with the path to a (single) SFM extract Excel file"""
# must be an .xlsx file!
df = pd.read_excel(sfmfilepath, dtype=str, keep_default_na=False)
self.df_iterrows = df.iterrows()


def __next__(self):
""" Iterator to yield Documents, where each Tweet is a Document """
"""Iterator to yield Documents, where each Tweet is a Document"""

index, line = self.df_iterrows.__next__()
text = line['text']
text = line["text"]
md = self._tweet_data(line)
doc = Document(text=text, metadata=md)
return doc

def _tweet_data(self, tweet):
tweet_info = {}
tweet_info["id"] = "'" + tweet["id"] + "'"
tweet_info["tweet_url"] = tweet["tweet_url"]
tweet_info["created_at"] = tweet["created_at"]
tweet_info["user_screen_name"] = tweet["user_screen_name"]
tweet_info["tweet_type"] = tweet["tweet_type"]
return tweet_info


class BDTwitterDocumentSet(DocumentSet):
def __init__(self, bdfilepath):
"""Initialize with the path to a CSV with brightdata API results"""
df = pd.read_csv(bdfilepath)
self.df_iterrows = df.iterrows()

def __next__(self):
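"""Iterator to yield Documents, where each tweet is a Document"""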
index, line = self.df_iterrows.__next__()
text = line["description"]
md = self._tweet_data(line)
doc = Document(text=text, metadata=md)
return doc

def _tweet_data(self, tweet):
tweet_info = {}
tweet_info['id'] = "'"+tweet['id']+"'"
tweet_info['tweet_url'] = tweet['tweet_url']
tweet_info['created_at'] = tweet['created_at']
tweet_info['user_screen_name'] = tweet['user_screen_name']
tweet_info['tweet_type'] = tweet['tweet_type']
tweet_info["id"] = tweet["id"]
tweet_info["user_screen_name"] = tweet["user_posted"]
tweet_info["name"] = tweet["name"]
tweet_info["description"] = tweet["description"]
tweet_info["date_posted"] = tweet["date_posted"]
tweet_info["photos"] = tweet["photos"]
tweet_info["url"] = tweet["url"]
tweet_info["replies"] = tweet["replies"]
tweet_info["reposts"] = tweet["reposts"]
tweet_info["likes"] = tweet["likes"]
tweet_info["views"] = tweet["views"]
tweet_info["hashtags"] = tweet["hashtags"]
tweet_info["followers"] = tweet["followers"]
tweet_info["biography"] = tweet["biography"]
tweet_info["timestamp"] = tweet["timestamp"]
return tweet_info


class EmailExtractDocumentSet(DocumentSet):
def __init__(self, emailfilepath):
""" Initialize with the path to a (single) SFM extract Excel file"""
"""Initialize with the path to a (single) SFM extract Excel file"""
# must be an .xlsx file!

df = pd.read_excel(emailfilepath, dtype=str, keep_default_na=False)
self.df_iterrows = df.iterrows()


def __next__(self):
""" Iterator to yield Documents, where each Tweet is a Document """
"""Iterator to yield Documents, where each Tweet is a Document"""

index, line = next(self.df_iterrows)
text = line['Message']
text = line["Message"]
md = self._email_data(line)
doc = Document(text=text, metadata=md)
return doc


def _email_data(self, email):
email_info = {}
email_info['Date'] = email['Date']
email_info['From'] = email['From']
email_info['Subject'] = email['Subject']
email_info["Date"] = email["Date"]
email_info["From"] = email["From"]
email_info["Subject"] = email["Subject"]
return email_info
