-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhasdata.py
72 lines (65 loc) · 2.36 KB
/
hasdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import json
import requests
from newspaper import Article
def extract_html_via_api(url, hasdata_api_key, timeout=120):
    """Fetch the HTML of ``url`` through the HasData web-scraping API.

    The page is requested with JavaScript rendering enabled through a US
    residential proxy; screenshots and e-mail extraction are disabled.

    Args:
        url: Address of the page to scrape.
        hasdata_api_key: API key sent in the ``x-api-key`` header.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The value of the ``content`` field of the JSON response (falsy when
        the API returned no HTML), or ``None`` when the request fails, the
        API answers with a non-200 status, or the body is not a JSON object.
    """
    api_url = "https://api.hasdata.com/scrape/web"
    headers = {
        'Content-Type': 'application/json',
        'x-api-key': hasdata_api_key,
    }
    payload = {
        "url": url,
        "proxyType": "residential",
        "proxyCountry": "US",
        "blockResources": False,
        "blockAds": False,
        "screenshot": False,
        "jsRendering": True,
        "excludeHtml": False,
        "extractEmails": False,
    }

    try:
        response = requests.post(
            api_url,
            headers=headers,
            data=json.dumps(payload),
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as HTTP
        # failures: log and return None instead of raising.
        print(f"[ERROR] Request failed for URL: {url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"[ERROR] Failed to extract HTML content for URL: {url}, Status Code: {response.status_code}")
        return None

    try:
        response_json = response.json()
    except ValueError as exc:
        # A 200 response is not guaranteed to carry a valid JSON body.
        print(f"[ERROR] Invalid JSON in response for URL: {url}. Error: {exc}")
        return None

    if not isinstance(response_json, dict):
        print(f"[ERROR] Unexpected JSON payload for URL: {url}")
        return None

    html_content = response_json.get('content')
    if html_content:
        print(f"[INFO] HTML content successfully extracted for URL: {url}")
    else:
        print(f"[WARNING] No HTML content extracted for URL: {url}")
    return html_content
def extract_text_from_html(html_content):
    """Extract the main article text from an HTML document.

    Args:
        html_content: HTML markup to parse. May be ``None`` or empty, for
            example when the fetch step produced no content.

    Returns:
        The article text reported by ``newspaper``, or ``None`` when there
        is no HTML to parse or parsing fails.
    """
    if not html_content:
        # Nothing to parse; report it clearly rather than handing empty
        # input to the parser.
        print("[WARNING] No HTML content provided; skipping article parsing.")
        return None

    try:
        # No source URL is available here, so the article is created with an
        # empty one and fed the HTML directly.
        article = Article('')
        article.set_html(html_content)
        article.parse()
    except Exception as e:
        # Boundary handler: any parser failure is logged and reported as None.
        print(f"[ERROR] Failed to parse article text. Error: {e}")
        return None
    else:
        print("[INFO] Successfully parsed article text.")
        return article.text
def extract_serp_via_api(keyword, hasdata_api_key, timeout=120):
    """Fetch Google organic search results for ``keyword`` via the HasData API.

    Args:
        keyword: Search query, sent as the ``q`` field of the request.
        hasdata_api_key: API key sent in the ``x-api-key`` header.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The value of the ``organicResults`` field of the JSON response
        (falsy when the API returned no results), or ``None`` when the
        request fails, the API answers with a non-200 status, or the body is
        not a JSON object.
    """
    # NOTE(review): the query is sent as a JSON POST body; confirm against
    # the HasData SERP API documentation that this endpoint expects POST
    # rather than GET query parameters.
    api_url = "https://api.hasdata.com/scrape/google/serp"
    headers = {
        'Content-Type': 'application/json',
        'x-api-key': hasdata_api_key,
    }
    payload = {"q": keyword}

    try:
        response = requests.post(
            api_url,
            headers=headers,
            data=json.dumps(payload),
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as HTTP
        # failures: log and return None instead of raising.
        print(f"[ERROR] Request failed for keyword: {keyword}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"[ERROR] Failed to extract SERP results for keyword: {keyword}, Status Code: {response.status_code}")
        return None

    try:
        response_json = response.json()
    except ValueError as exc:
        # A 200 response is not guaranteed to carry a valid JSON body.
        print(f"[ERROR] Invalid JSON in response for keyword: {keyword}. Error: {exc}")
        return None

    if not isinstance(response_json, dict):
        print(f"[ERROR] Unexpected JSON payload for keyword: {keyword}")
        return None

    organic_results = response_json.get('organicResults')
    if organic_results:
        print(f"[INFO] SERP results successfully extracted for keyword: {keyword}")
    else:
        print(f"[WARNING] No SERP results for keyword: {keyword}")
    return organic_results