-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnews_scrape.py
183 lines (157 loc) · 7.57 KB
/
news_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import os
import re
import time
from datetime import datetime

import openai
import requests
from dotenv import load_dotenv
from newspaper import Article
# Load environment variables from a local .env file (supplies
# CRYPTONEWS_API_TOKEN; presumably also the OpenAI API key picked up
# implicitly by the openai client -- confirm against .env).
load_dotenv()
def fetch_news_articles():
    """Fetch up to 30 BTC-related news items from the Cryptonews API.

    Reads CRYPTONEWS_API_TOKEN from the environment and returns the API's
    'data' list on success, or an empty list on any failure.

    Returns:
        list[dict]: article records as provided by the API, possibly empty.
    """
    api_token = os.getenv('CRYPTONEWS_API_TOKEN')
    url = f"https://cryptonews-api.com/api/v1?tickers=BTC&items=30&page=1&token={api_token}"
    try:
        # Timeout added: the original call had none and could hang forever
        # on a stalled connection; a raised RequestException previously
        # crashed the whole script at import time.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch news articles: {exc}")
        return []
    if response.status_code == 200:
        return response.json()['data']
    print("Failed to fetch news articles")
    return []
# Fetch the 30-article batch once at import time.
# NOTE(review): main() performs this same fetch again, so running the script
# downloads everything twice -- confirm whether these top-level calls should
# be removed in favor of main().
articles = fetch_news_articles()
print(articles)
def scrape_article(url):
    """Download and parse one article with Newspaper3k.

    Returns the article's body text, or None (after logging the error)
    when the download or parse step fails.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        return page.text
    except Exception as err:
        # Any scraping failure is logged and skipped; callers treat None
        # as "no content".
        print(f"Failed to scrape {url}: {str(err)}")
        return None
# Combine API metadata with scraped full text for every article record.
def scrape_all_articles(articles):
    """Scrape the full text of each article, skipping any that fail.

    Each successful scrape yields a dict merging the API metadata with
    the downloaded body text.
    """
    collected = []
    for item in articles:
        body = scrape_article(item['news_url'])
        if not body:
            continue
        collected.append({
            'title': item['title'],
            'text': body,
            'source_name': item['source_name'],
            'date': item['date'],
            'sentiment': item['sentiment'],  # Provided by Cryptonews API
            'url': item['news_url'],
        })
    return collected
# Scrape all 30 articles using Newspaper3k
scraped_articles = scrape_all_articles(articles)

# Show a short preview (first 200 characters) of every scraped article.
for item in scraped_articles:
    body = item['text']
    snippet = body[:200] + '...' if len(body) > 200 else body
    print(f"Title: {item['title']}")
    print(f"Source: {item['source_name']}")
    print(f"Date: {item['date']}")
    print(f"Sentiment: {item['sentiment']}")
    print(f"URL: {item['url']}")
    print(f"Content Snippet:\n{snippet}\n")
    print("------------------------------------------------\n")
# Function to analyze an article with GPT-4
def _parse_analysis(response_content):
    """Extract the summary and integer ratings from a GPT reply.

    Expects the 'Summary: ... Sentiment: N, Relevance: N, Importance: N.'
    shape requested in the prompt, but tolerates newlines between sections,
    bracketed numbers, and stray trailing text. Raises ValueError when a
    required field is missing.
    """
    summary_match = re.search(r'Summary:\s*(.*?)\s*Sentiment:',
                              response_content, re.DOTALL)
    if summary_match is None:
        raise ValueError("response missing 'Summary:'/'Sentiment:' sections")
    result = {'summary': summary_match.group(1).strip()}
    for field in ('sentiment', 'relevance', 'importance'):
        # Optional '[' tolerates replies like 'Sentiment: [85]'.
        rating_match = re.search(field.capitalize() + r':\s*\[?(\d+)',
                                 response_content)
        if rating_match is None:
            raise ValueError(f"response missing '{field.capitalize()}:' rating")
        result[field] = int(rating_match.group(1))
    return result


def analyze_article_with_gpt(article_text):
    """Summarize and rate a Bitcoin news article with GPT-4.

    Returns a dict with keys 'summary', 'sentiment', 'relevance' and
    'importance' (ratings as ints on a 0-100 scale), or None when the
    API call or response parsing fails.
    """
    # System prompt. Adds the space the original concatenation dropped
    # between "Importance." and "Avoid speculation".
    system_prompt = (
        "You are a highly knowledgeable assistant with expertise in Bitcoin and cryptocurrency markets. "
        "Your task is to analyze news articles related to Bitcoin. For each article, you will provide a concise summary. "
        "After summarizing, you will rate the sentiment of the article, its relevance to the Bitcoin and cryptocurrency markets, "
        "and its overall importance. Rate each of these three aspects on a scale from 0 to 100, where 0 is the lowest and 100 is the highest. "
        "Your responses should be factual, unbiased, and based solely on the content of the article. "
        "Respond in a structured format that includes the summary followed by the ratings for sentiment, relevance, and importance. "
        "For example: 'Summary: [Your summary here]. Sentiment: [0-100], Relevance: [0-100], Importance: [0-100].' "
        "Do not have any text following the integer rating for Sentiment, Relevance, nor Importance. "
        "Avoid speculation and provide analysis based on the information available in the article."
    )
    # Structured prompt for analysis. Adds the missing spaces/period the
    # original concatenation produced ("...the formatRespond...").
    analysis_prompt = (
        "Analyze the following Bitcoin-related article and provide a summary, "
        "then rate its sentiment, market relevance, and importance on a scale from 0 to 100. "
        "Do not have any text following the integer rating for Sentiment, Relevance, nor Importance. "
        "We will be parsing the response with a predetermined format, STRICTLY follow the format. "
        "Respond in the exact following format: "
        "Summary: [Your summary here] "
        f"Sentiment: [0-100], Relevance: [0-100], Importance: [0-100].\n\n{article_text}"
    )
    try:
        response = openai.chat.completions.create(
            model="gpt-4-1106-preview",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": analysis_prompt},
            ],
            # Low temperature keeps the reply close to the requested format.
            temperature=0.3,
            max_tokens=1200,
        )
        response_content = response.choices[0].message.content
        # Regex parsing replaces the original chain of str.split calls,
        # which raised IndexError (losing the whole analysis) whenever the
        # model used a newline instead of a single space between sections.
        return _parse_analysis(response_content)
    except Exception as e:  # Broad on purpose: any failure just skips this article
        print(f"An error occurred: {str(e)}")
        return None
# Run the GPT analysis over every scraped article, merging results in place.
def analyze_and_store_articles(articles):
    """Annotate each article dict with GPT analysis fields, in place.

    Articles whose analysis fails (analyze_article_with_gpt returns None)
    are left unmodified.
    """
    for item in articles:
        outcome = analyze_article_with_gpt(item['text'])
        if outcome:
            item.update(outcome)
# Analyze all articles once and store the results
analyze_and_store_articles(scraped_articles)
# Print the summaries and ratings from the stored results
for article in scraped_articles:
    print(f"Title: {article['title']}")
    # Bug fix: articles whose GPT analysis failed never receive the
    # 'summary'/'relevance'/'importance' keys (analyze_and_store_articles
    # only updates on success), so the original loop raised KeyError here.
    if 'summary' not in article:
        print("Analysis unavailable for this article.")
        print("------------------------------------------------\n")
        continue
    print(f"Summary: {article['summary']}")
    print(f"Sentiment Rating: {article['sentiment']}")
    print(f"Market Relevance Rating: {article['relevance']}")
    print(f"Importance Rating: {article['importance']}")
    print("------------------------------------------------\n")
# Function to save results to a file with a timestamp in the filename
def save_results_to_file(articles, results_directory='results'):
    """Write each article's analysis to a timestamped text file.

    Creates *results_directory* if needed. Articles that lack analysis
    fields (e.g. because the GPT call failed) are written with 'N/A'
    placeholders instead of raising KeyError mid-write.

    Args:
        articles: list of article dicts, ideally carrying 'summary',
            'sentiment', 'relevance' and 'importance' keys.
        results_directory: output directory, created on demand.
    """
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists + os.makedirs pair.
    os.makedirs(results_directory, exist_ok=True)
    # Generate a timestamped filename
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    filename = f"{results_directory}/news_analysis_{timestamp}.txt"
    with open(filename, 'w') as file:
        for article in articles:
            file.write(f"Title: {article['title']}\n")
            file.write(f"Summary: {article.get('summary', 'N/A')}\n")
            file.write(f"Sentiment Rating: {article.get('sentiment', 'N/A')}\n")
            file.write(f"Market Relevance Rating: {article.get('relevance', 'N/A')}\n")
            file.write(f"Importance Rating: {article.get('importance', 'N/A')}\n")
            file.write("------------------------------------------------\n\n")
    # Bug fix: the original printed the literal text "(unknown)" instead of
    # the actual saved path.
    print(f"Analysis results saved to {filename}")
# Persist the analyzed articles to a timestamped file under ./results.
# NOTE(review): main() saves again when run as a script -- confirm whether
# this top-level call should be removed.
save_results_to_file(scraped_articles)
def main():
    """Run the full pipeline: fetch, scrape, GPT-analyze, and save."""
    # NOTE(review): the module top level already executes this exact
    # pipeline at import time, so running the script does all the work
    # twice -- confirm whether the top-level calls should be removed.
    news_items = fetch_news_articles()
    print(news_items)
    enriched = scrape_all_articles(news_items)
    analyze_and_store_articles(enriched)
    save_results_to_file(enriched)


if __name__ == '__main__':
    main()