-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget100ids.py
51 lines (46 loc) · 2.13 KB
/
get100ids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from bs4 import BeautifulSoup
from subprocess import CalledProcessError
import requests
import time
import json
import sys
import subprocess
import urllib.request
search_term = input("Enter a search term: ")
search_term.replace(" ", "+")
# Make a get request to PubMed
response = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=" + search_term + "&retmax=100&tool=mosquito_map_tool&[email protected]")
soup = BeautifulSoup(response.content, 'html.parser')
# Print the information of the first 100 articles
for id in soup.find_all('id'):
uid = id.get_text()
paper = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=" + id.get_text() + "&retmode=json&tool=mosquito_map_tool&[email protected]")
json_data = paper.json()
# Print the information of the given article
print("Title: " + json_data['result'][uid]['title'])
print("Authors: ")
for author in json_data['result'][uid]['authors']:
print(author['name'])
print("Publisher: " + json_data['result'][uid]['fulljournalname'])
print("Publish Date: " + json_data['result'][uid]['pubdate'])
print(uid)
# Find the location of the pdf from the file list
# Updated oa_file_list.txt can be found at: ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_file_list.txt
try:
output = subprocess.run("grep " + uid + " oa_file_list.txt", capture_output=True, check=True, shell=True)
path = output.stdout.decode('Latin-1').split("\t",1)[0]
print(path)
path_1 = path.split("/", 4)[0]
path_2 = path.split("/", 4)[1]
path_3 = path.split("/", 4)[2]
path_4 = path.split("/", 4)[3]
print('Beginning file download')
url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/" + path_1 + path_2 + path_3
# This currently is not working - research ways of downloading from FTP server
response = urllib.request.urlretrieve(url)
except CalledProcessError as ex:
# The file was not found in the oa_file_list.txt
print('Error: ', ex)
# print(output)
# API yells at you if you don't wait a bit between requests
# time.sleep(1)