Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/OHDSI/KnowledgeBase
Browse files Browse the repository at this point in the history
  • Loading branch information
rkboyce committed Jul 20, 2017
2 parents 331b5fa + 1d9c74a commit 1f86055
Show file tree
Hide file tree
Showing 7 changed files with 470 additions and 107 deletions.
251 changes: 207 additions & 44 deletions LAERTES/PubMed/README.md

Large diffs are not rendered by default.

94 changes: 66 additions & 28 deletions LAERTES/PubMed/pmSearch2rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
import psycopg2 # for postgres connection to Medline

## The result of the query in queryDrugHOIAssociations.psql
#SEARCH_RESULTS = "drug-hoi-associations-from-mesh-September-2016.tsv"
SEARCH_RESULTS = "drug-hoi-associations-from-mesh-September-2016-55831-to-end.tsv"
SEARCH_RESULTS = "drug-hoi-associations-from-mesh-September-2016.tsv"
#SEARCH_RESULTS = "test-drug-hoi-dataset.tsv"

## Set up the db connection to the MEDLINE DB. This is used to collect
## a bit more data and metadata on the MEDLINE entries
Expand Down Expand Up @@ -221,6 +221,7 @@

annotatedCache = {} # indexes annotation ids by pmid
abstractCache = {} # cache for abstract text
titleCache = {} # cache for title text
pubTypeCache = {} # used because some PMIDs have multiple publication type assignments TODO: determine pub types should be assigned to a Collection under the target's graph
drugHoiPMIDCache = {} # used to avoid duplicating PMID - drug - HOI combos for PMIDs that have multiple publication type assignments TODO: determine if a more robust source query is needed
mshPharmIdToUMLSCache = {} # Used to store mappings from MeSH pharmacological grouping IDs to a list of UMLS MetaThesaurus CUIs for the drug concepts that belong in that group
Expand All @@ -239,8 +240,8 @@
s = graph.serialize(format="n3",encoding="utf8", errors="replace")
f.write(s)

for elt in recL[0:1000]: # Debugging
#for elt in recL: # Full run
#for elt in recL[0:1000]: # Debugging
for elt in recL: # Full run
## For now, only process papers tagged as for humans
## TODO: expand the evidence types to include non-human studies
try:
Expand All @@ -255,8 +256,24 @@
print "WARNING: PMID %s not tagged as involving humans. Skipping this record." % elt[PMID]
continue

# get title if not already cached
if not titleCache.has_key(elt[PMID]):
try:
print "INFO: Attempting to retrieve the title for PMID %s from the MEDLINE DB" % elt[PMID]
cur.execute("""select art_arttitle from medcit where pmid = %s and pmid_version = 1""" % elt[PMID])
except Exception as e:
print "ERROR: Attempt to get the title for PMID failed. Error string: %s" % e
sys.exit(1)

rows = cur.fetchall()
if len(rows) == 0:
titleCache[elt[PMID]] = ""
print "INFO: No title found for PMID %s." % elt[PMID]
else:
print "INFO: Title found for PMID %s : %s " % (elt[PMID], rows[0][0])
titleCache[elt[PMID]] = rows[0][0]

# get abstract if not already cached
# TODO: retrieve the title too
if not abstractCache.has_key(elt[PMID]):
try:
print "INFO: Attempting to retrieve the abstract for PMID %s from the MEDLINE DB" % elt[PMID]
Expand Down Expand Up @@ -347,12 +364,14 @@
tplL.append((currentAnnotTargetUuid, ohdsi["MeshStudyType"], Literal("other (publication type)")))


# add the text quote selector but just put the abstract in an oa:exact
# TODO: add the title to oa:exact when there is no abstract (or concatenate both)
# add the text quote selector but just put the title and
# abstract in an oa:exact using the pipe delimiter to separate
# them
textConstraintUuid = URIRef("urn:uuid:%s" % uuid.uuid4())
tplL.append((currentAnnotTargetUuid, oa["hasSelector"], textConstraintUuid))
tplL.append((textConstraintUuid, RDF.type, oa["TextQuoteSelector"]))
abstractTxt = unicode(abstractCache[elt[PMID]], 'utf-8', 'replace')
tiab = "|".join(["TITLE: " + titleCache[elt[PMID]], "ABSTRACT: " + abstractCache[elt[PMID]]])
abstractTxt = unicode(tiab, 'utf-8', 'replace')
tplL.append((textConstraintUuid, oa["exact"], Literal(abstractTxt)))

s = u""
Expand All @@ -365,13 +384,28 @@
# body contains the MESH drug and condition as a semantic tag
print "INFO: working on the body for %s" % elt

# to begin with, avoid duplicating PMID - drug - HOI combos for PMIDs that have multiple publication type assignments
# to begin with, avoid duplicating PMID - drug - HOI combos for
# PMIDs that have multiple publication type assignments or that
# are already picked up because the drug that belonged to a MESH
# pharm action group mentioned in MESH tags for a TIAB was
# specifically mentioned in the text of the TIAB. If a drug
# that belongs to a MESH pharm action group WAS mentioned in MESH
# tags BUT NOT specifically mentioned in the TIAB text, then this
# is probably a specific MESH mention of the drug that requires a
# new body
concat = "%s-%s-%s" % (elt[PMID], elt[ADR_DRUG_UI], elt[ADR_HOI_UI])
if drugHoiPMIDCache.has_key(concat):
print "INFO: skipping generation of a new body graph because the PMID, drug, and HOI (%s) have already been processed. Probably a MEDLINE record with multiple pub type assignments" % concat
if drugHoiPMIDCache.has_key(concat) and (drugHoiPMIDCache[concat] == "MeshTaggedAgent" or drugHoiPMIDCache[concat] == "FilteredAdeAgent") :
print "INFO: skipping generation of a new body graph because the PMID, drug, and HOI (%s) have already been processed. drugHoiPMIDCache[concat]: %s" % (concat, drugHoiPMIDCache[concat])
continue
else:
drugHoiPMIDCache[concat] = None
# This should have two effects: 1) both individual drugs and
# drug groupings will be tagged as MESH mentions, and 2)
# individual drugs that were previously part of a drug
# grouping will be noted as also being MESH tagged. The latter
# case creates a duplication, but one that can be addressed
# using SPARQL by distinguishing UnfilteredAdeAgent set
# members from direct ingredient resources.
drugHoiPMIDCache[concat] = "MeshTaggedAgent"

currentAnnotationBody = "ohdsi-pubmed-mesh-annotation-annotation-body-%s" % annotationBodyCntr
annotationBodyCntr += 1
Expand Down Expand Up @@ -425,11 +459,11 @@
if len(cuiRsltL) == 0:
print "ERROR: very strange that none of the drug concepts in the MeSH pharmacological grouping is able to map to any UMLS MetaThesaurus CUI: %s -- %s" % (descriptorName,mshIdSet)
else:
# query Semmeddb to get the CUIs for tagged pharmacologic
# substances and organic chemicals. NOTE: the IN clause is
# limited by the MySQL max_allowed_packet configuration
# variable so set it to be large (e.g., several megabytes)
print "INFO: checking Semmeddb to see if the title or abstract of PMID %s mentions any of the %s individual drugs" % (elt[PMID], len(cuiRsltL))
# query Semmeddb to get the CUIs tagged for this
# TIAB. NOTE: the IN clause is limited by the MySQL
# max_allowed_packet configuration variable so set it
# to be large (e.g., several megabytes)
#print "INFO: checking Semmeddb to see if the title or abstract of PMID %s mentions any of the %s individual drugs" % (elt[PMID], len(cuiRsltL))

# 1. Get the CUIs associated with the PMID in semmeddb
pmidCuis = pmidToCuiCache.get(elt[PMID])
Expand All @@ -447,16 +481,20 @@
print q

smdb_cur.execute(q)
pmidCuis = fetchall() # all results (even null ones) need to be retrieved to prevent "Unread result found" when the curser is used in a later iteraction
pmidCuis = smdb_cur.fetchall() # all results (even null ones) need to be retrieved to prevent "Unread result found" when the cursor is used in a later iteration

if pmidCuis != None and pmidCuis != []:
pmidToCuiCache[elt[PMID]] = [x[0] for x in pmidCuis]
#print "INFO: cuiRsltL - %s " % cuiRsltL
print "INFO: Found individual Semmeddb concept mentions (including non-drugs) for PMID %s (from query) -- pmidCuis: %s" % (elt[PMID], pmidToCuiCache[elt[PMID]])

if pmidCuis != None:
pmidToCuiCache[elt[PMID]] = pmidCuis
print "INFO: Found individual Semmeddb drug mentions for PMID -- pmidCuis: %s" % ",".join(pmidCuis)
# 2. get the MESH identifiers for the returned CUIs that are in the MESH Pharm group cuiRsltL
mshSubstOfInterestL = [x[1] for x in filter(lambda x: x[0] in pmidCuis, cuiRsltL)]
mshSubstOfInterestLCache[elt[PMID] + elt[ADR_DRUG_UI]] = mshSubstOfInterestL
mshSubstOfInterestL = [x[1] for x in filter(lambda x: x[0] in pmidToCuiCache[elt[PMID]], cuiRsltL)]
if len(mshSubstOfInterestL) > 0:
print "INFO: Mesh drug identifiers being added to cache: %s " % mshSubstOfInterestL
mshSubstOfInterestLCache[elt[PMID] + elt[ADR_DRUG_UI]] = mshSubstOfInterestL

# add each specific substance found in the title or abstract to a 'adeAgents' collection in the body
# add each *specific* substance found in the title or abstract to an 'adeAgents' collection in the body
if len(mshSubstOfInterestL) > 0:
# first check for duplication, it happens a bunch
keepers = []
Expand All @@ -466,7 +504,7 @@
print "INFO: skipping addition of a PMID, drug, and HOI (%s) that have already been processed (probably duplication of pharmacologic entity mapping because of drug groupings)." % concat
continue
else:
drugHoiPMIDCache[concat] = None
drugHoiPMIDCache[concat] = "FilteredAdeAgent"
keepers.append(substanceMshUI)

if len(keepers) > 0:
Expand All @@ -485,8 +523,8 @@

# Now, add the rest of the entities in the pharmacologic group to an 'adeAgentsUnfiltered' collection, which is useful for two reasons: 1) SemMedDB is a few months behind the current MEDLINE (so more recent titles and abstracts will not benefit from the processing steps above), and 2) although noisy for inferring positive drug-HOI associations, having all drugs in a class is helpful for inferring negative controls.
keepers = [] # NOTE: we deliberately do not add those triples that have been added to the 'adeAgents' collection to the 'adeAgentsUnfiltered' collection
if pmidCuis != None:
substOfInterestL = [x[1] for x in filter(lambda x: x[0] not in pmidCuis, cuiRsltL)] # the drug concepts NOT mentioned in the abstract
if pmidToCuiCache.get(elt[PMID]) != None:
substOfInterestL = [x[1] for x in filter(lambda x: x[0] not in pmidToCuiCache[elt[PMID]], cuiRsltL)] # the drug concepts NOT mentioned in the abstract
else:
substOfInterestL = [x[1] for x in cuiRsltL] # the MESH identifiers all drugs in the MESH pharm grouping

Expand All @@ -496,7 +534,7 @@
print "INFO: skipping addition of a PMID, drug, and HOI (%s) that have already been processed (probably duplication of pharmacologic entity mapping because of drug groupings)." % concat
continue
else:
drugHoiPMIDCache[concat] = None
drugHoiPMIDCache[concat] = "UnfilteredAdeAgent"
keepers.append(substanceMshUI)

if len(keepers) > 0:
Expand Down
2 changes: 1 addition & 1 deletion LAERTES/PubMed/queryDrugHOIAssociations.psql
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ inner join
(
select pmid,
CASE
WHEN value IN ('Clinical Trial', 'Controlled Clinical Trial', 'Clinical Trial, Phase I', 'Clinical Trial, Phase II', 'Clinical Trial, Phase III','Clinical Trial, Phase IV', 'Randomized Controlled Trial','Observational Study','Multicenter Study') THEN 'Clinical Trial'
WHEN value IN ('Clinical Trial', 'Controlled Clinical Trial', 'Clinical Trial, Phase I', 'Clinical Trial, Phase II', 'Clinical Trial, Phase III','Clinical Trial, Phase IV', 'Randomized Controlled Trial','Multicenter Study') THEN 'Clinical Trial'
WHEN value IN ('Observational Study') THEN 'Comparative Study'
ELSE value
END AS pub_type_value, /*This case statement is to allow us to add in new types without needing to update downstream processes*/
Expand Down
99 changes: 99 additions & 0 deletions LAERTES/PubMed/test-drug-hoi-dataset.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
22695300 Anti-HIV Agents D019380 Craniofacial Dysostosis D003394 Case Reports D002363
22695300 Anti-HIV Agents D019380 Craniofacial Dysostosis D003394 Journal Article D016428
6402356 Anticonvulsants D000927 Craniofacial Dysostosis D003394 Journal Article D016428
6402356 Anticonvulsants D000927 Craniofacial Dysostosis D003394 Clinical Trial D016430
6402356 Anticonvulsants D000927 Craniofacial Dysostosis D003394 Clinical Trial D018848
7252657 Anticonvulsants D000927 Craniofacial Dysostosis D003394 Case Reports D002363
22695300 Benzoxazines D048588 Craniofacial Dysostosis D003394 Case Reports D002363
22695300 Benzoxazines D048588 Craniofacial Dysostosis D003394 Journal Article D016428
6263127 Hydantoins D006827 Craniofacial Dysostosis D003394 Journal Article D016428
6263127 Hydantoins D006827 Craniofacial Dysostosis D003394 Case Reports D002363
7252656 Primidone D011324 Craniofacial Dysostosis D003394 Journal Article D016428
4736229 Salicylates D012459 Craniofacial Dysostosis D003394 Journal Article D016428
4974318 Vitamin D D014807 Craniofacial Dysostosis D003394 Journal Article D016428
5812643 Vitamin D D014807 Craniofacial Dysostosis D003394 Journal Article D016428
355 Trifluoperazine D014268 Parkinson Disease, Secondary D010302 Journal Article D016428
355 Trifluoperazine D014268 Parkinson Disease, Secondary D010302 Case Reports D002363
3930 Indoles D007211 Parkinson Disease, Secondary D010302 Journal Article D016428
3930 Indoles D007211 Parkinson Disease, Secondary D010302 Comparative Study D003160
706 Levodopa D007980 Basal Ganglia Diseases D001480 Clinical Trial D016430
706 Levodopa D007980 Basal Ganglia Diseases D001480 Journal Article D016428
706 Levodopa D007980 Basal Ganglia Diseases D001480 Clinical Trial D018848
4858 Antipsychotic Agents D014150 Basal Ganglia Diseases D001480 Clinical Trial D016430
4858 Antipsychotic Agents D014150 Basal Ganglia Diseases D001480 Journal Article D016428
855 Indomethacin D007213 Peptic Ulcer D010437 Journal Article D016428
855 Phenylbutazone D010653 Peptic Ulcer D010437 Journal Article D016428
855 Pilocarpine D010862 Peptic Ulcer D010437 Journal Article D016428
855 Quinolines D011804 Peptic Ulcer D010437 Journal Article D016428
11321 Anti-Inflammatory Agents, Non-Steroidal D000894 Peptic Ulcer D010437 Journal Article D016428
11321 Anti-Inflammatory Agents, Non-Steroidal D000894 Stomach Ulcer D013276 Journal Article D016428
311 Enflurane D004737 Jaundice D007565 Case Reports D002363
311 Enflurane D004737 Jaundice D007565 Journal Article D016428
311 Methyl Ethers D008738 Jaundice D007565 Case Reports D002363
311 Methyl Ethers D008738 Jaundice D007565 Journal Article D016428
658 Adrenergic beta-Agonists D000318 Depression D003863 Journal Article D016428
658 Adrenergic beta-Agonists D000318 Depression D003863 Case Reports D002363
658 Adrenergic beta-Agonists D000318 Hallucinations D006212 Journal Article D016428
658 Adrenergic beta-Agonists D000318 Hallucinations D006212 Case Reports D002363
658 Albuterol D000420 Depression D003863 Journal Article D016428
658 Albuterol D000420 Depression D003863 Case Reports D002363
658 Albuterol D000420 Hallucinations D006212 Journal Article D016428
658 Albuterol D000420 Hallucinations D006212 Case Reports D002363
658 Isoxsuprine D007556 Depression D003863 Journal Article D016428
658 Isoxsuprine D007556 Depression D003863 Case Reports D002363
658 Isoxsuprine D007556 Hallucinations D006212 Journal Article D016428
658 Isoxsuprine D007556 Hallucinations D006212 Case Reports D002363
658 Phenethylamines D010627 Depression D003863 Journal Article D016428
658 Phenethylamines D010627 Depression D003863 Case Reports D002363
658 Phenethylamines D010627 Hallucinations D006212 Journal Article D016428
658 Phenethylamines D010627 Hallucinations D006212 Case Reports D002363
5066 Amantadine D000547 Basal Ganglia Diseases D001480 Clinical Trial D016430
5066 Amantadine D000547 Basal Ganglia Diseases D001480 Comparative Study D003160
5066 Amantadine D000547 Basal Ganglia Diseases D001480 Journal Article D016428
5066 Antipsychotic Agents D014150 Basal Ganglia Diseases D001480 Clinical Trial D016430
5066 Antipsychotic Agents D014150 Basal Ganglia Diseases D001480 Comparative Study D003160
5066 Antipsychotic Agents D014150 Basal Ganglia Diseases D001480 Journal Article D016428
5066 Benztropine D001590 Basal Ganglia Diseases D001480 Clinical Trial D016430
5066 Benztropine D001590 Basal Ganglia Diseases D001480 Comparative Study D003160
5066 Benztropine D001590 Basal Ganglia Diseases D001480 Journal Article D016428
17766 Adrenergic beta-Antagonists D000319 Arrhythmias, Cardiac D001145 Journal Article D016428
17766 Adrenergic beta-Antagonists D000319 Diarrhea D003967 Journal Article D016428
17766 Adrenergic beta-Antagonists D000319 Heart Failure D006333 Journal Article D016428
17766 Adrenergic beta-Antagonists D000319 Hypoglycemia D007003 Journal Article D016428
17766 Adrenergic beta-Antagonists D000319 Hypotension D007022 Journal Article D016428
17766 Adrenergic beta-Antagonists D000319 Muscle Cramp D009120 Journal Article D016428
17766 Adrenergic beta-Antagonists D000319 Skin Diseases D012871 Journal Article D016428
17766 Practolol D011217 Arrhythmias, Cardiac D001145 Journal Article D016428
17766 Practolol D011217 Diarrhea D003967 Journal Article D016428
17766 Practolol D011217 Heart Failure D006333 Journal Article D016428
17766 Practolol D011217 Hypoglycemia D007003 Journal Article D016428
17766 Practolol D011217 Hypotension D007022 Journal Article D016428
17766 Practolol D011217 Muscle Cramp D009120 Journal Article D016428
17766 Practolol D011217 Skin Diseases D012871 Journal Article D016428
17162 Adrenergic beta-Antagonists D000319 Asthma D001249 Journal Article D016428
17162 Adrenergic beta-Antagonists D000319 Hypercalcemia D006934 Journal Article D016428
17162 Adrenergic beta-Antagonists D000319 Hyperglycemia D006943 Journal Article D016428
17162 Adrenergic beta-Antagonists D000319 Hypoglycemia D007003 Journal Article D016428
17162 Adrenergic beta-Antagonists D000319 Kidney Failure, Chronic D007676 Journal Article D016428
17162 Adrenergic beta-Antagonists D000319 Nephrotic Syndrome D009404 Journal Article D016428
17162 Adrenergic beta-Antagonists D000319 Neurasthenia D009440 Journal Article D016428
17162 Adrenergic beta-Antagonists D000319 Placental Insufficiency D010927 Journal Article D016428
17162 Adrenergic beta-Antagonists D000319 Vomiting D014839 Journal Article D016428
17162 Practolol D011217 Asthma D001249 Journal Article D016428
17162 Practolol D011217 Hypercalcemia D006934 Journal Article D016428
17162 Practolol D011217 Hyperglycemia D006943 Journal Article D016428
17162 Practolol D011217 Hypoglycemia D007003 Journal Article D016428
17162 Practolol D011217 Kidney Failure, Chronic D007676 Journal Article D016428
17162 Practolol D011217 Nephrotic Syndrome D009404 Journal Article D016428
17162 Practolol D011217 Neurasthenia D009440 Journal Article D016428
17162 Practolol D011217 Placental Insufficiency D010927 Journal Article D016428
17162 Practolol D011217 Vomiting D014839 Journal Article D016428
17162 Propranolol D011433 Asthma D001249 Journal Article D016428
17162 Propranolol D011433 Hypercalcemia D006934 Journal Article D016428
17162 Propranolol D011433 Hyperglycemia D006943 Journal Article D016428
17162 Propranolol D011433 Hypoglycemia D007003 Journal Article D016428
17162 Propranolol D011433 Kidney Failure, Chronic D007676 Journal Article D016428
17162 Propranolol D011433 Nephrotic Syndrome D009404 Journal Article D016428
17162 Propranolol D011433 Neurasthenia D009440 Journal Article D016428
17162 Propranolol D011433 Placental Insufficiency D010927 Journal Article D016428
17162 Propranolol D011433 Vomiting D014839 Journal Article D016428
Loading

0 comments on commit 1f86055

Please sign in to comment.