- name of fulltext index: `fragmentGeneSymbol`
- custom Lucene analyzer from Stefan Armbruster: https://github.com/covidgraph/neo4j-additional-analyzers
// Create the full-text index "fragmentGeneSymbol" over the `text` property of
// :Fragment nodes, using the analyzer registered under the name "synonym".
// NOTE(review): later Neo4j releases deprecate this procedure in favour of
// CREATE FULLTEXT INDEX - confirm against the target server version.
CALL db.index.fulltext.createNodeIndex(
  "fragmentGeneSymbol",
  ["Fragment"],
  ["text"],
  {analyzer: "synonym"}
);
- skip gene symbols with special characters in search
- set an additional label to filter them
// Add the label :OmitSpecialChar to every :GeneSymbol whose sid contains at
// least one of the characters listed below, and return how many were labelled.
// A null sid matches none of the characters and is left unlabelled.
// The statement is terminated with ';' so it does not run into the next one.
MATCH (gs:GeneSymbol)
WHERE any(ch IN ['(', ')', '/', '*', ' ', '[', ']', ':'] WHERE gs.sid CONTAINS ch)
SET gs:OmitSpecialChar
RETURN count(distinct gs);
// Add the label :OmitLength to every :GeneSymbol whose sid has size 1.
MATCH (symbol:GeneSymbol)
  WHERE 1 = size(symbol.sid)
SET symbol:OmitLength
- match gene symbols against word list to exclude symbols that are common words
- set an additional label to filter them
// Add the label :OmitWord to every :GeneSymbol whose sid equals,
// case-insensitively, the value of a :Word node with match11 = true.
// The lower-cased word values are collected once and each symbol is tested
// against that list, instead of comparing every (:GeneSymbol, :Word) pair
// through a cartesian product of the two disconnected patterns.
// Null sids and null word values never match, as with the plain equality test.
MATCH (w:Word)
WHERE w.match11 = true
WITH collect(DISTINCT toLower(w.value)) AS commonWords
MATCH (gs:GeneSymbol)
WHERE toLower(gs.sid) IN commonWords
SET gs:OmitWord;
- match gene symbols against the `:Fragment` fulltext index - use `MERGE` to be able to rerun the query
// Link gene symbols to the fragments that mention them.
// Outer query: every :GeneSymbol that has none of the labels :OmitWord,
// :OmitSpecialChar or :OmitLength.
// Inner query: look the symbol's sid up in the 'fragmentGeneSymbol' full-text
// index and MERGE a (node)-[:MENTIONS]->(gs) relationship per hit; r.score is
// (re)written with the hit's score on every run.
// Executed via apoc.periodic.iterate with batchSize 10 and parallel: false.
// NOTE(review): gs.sid is passed to queryNodes verbatim as the query string;
// characters not covered by :OmitSpecialChar (e.g. '-', '+', '~', '?') may be
// interpreted as query syntax - confirm this is intended.
CALL apoc.periodic.iterate(
"MATCH (gs:GeneSymbol) WHERE NOT gs:OmitWord AND NOT gs:OmitSpecialChar AND NOT gs:OmitLength RETURN gs",
"CALL db.index.fulltext.queryNodes('fragmentGeneSymbol', gs.sid) YIELD node, score
MERGE (gs)<-[r:MENTIONS]-(node) SET r.score = score",
{batchSize: 10, parallel: false, iterateList: true});
// Count the MENTIONS relationships that point from a :Fragment to a :GeneSymbol.
MATCH (:Fragment)-[r:MENTIONS]->(:GeneSymbol)
RETURN count(r)