Skip to content

Commit

Permalink
Refs #616: adds Makefile with sentence embedding pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
janehmueller committed Nov 29, 2017
1 parent 6700557 commit 5922290
Showing 1 changed file with 18 additions and 0 deletions.
18 changes: 18 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# --- Locations ---------------------------------------------------------------
# Base directory that holds the wrapper script, the assembly jar and the
# PySpark sources used by the targets in this file.
# NOTE(review): HOME is normally also an environment variable. GNU Make
# passes makefile-assigned values of environment-inherited variables on to
# recipe shells, so every job started from here presumably sees this value
# as its HOME -- confirm that this is intended.
HOME := /home/jan.ehmueller
SPARK_SH := $(HOME)/scripts/spark.sh
JAR := $(HOME)/ingestion-assembly-1.0.jar
SPACY_DIR := $(HOME)/pyspark/spacy

# --- Submit commands ---------------------------------------------------------
# Environment prefix shared by both submit commands.
hadoop_env := HADOOP_USER_NAME="ingestion"
# Plain spark-submit, run with the Hadoop user name set above.
SPARK_SUBMIT := $(hadoop_env) spark-submit
# spark-submit for PySpark jobs; PYSPARK_PYTHON is a relative path into the
# "SPACY" directory (see the --archives option of the tokenize target).
PYSPARK_SUBMIT := $(hadoop_env) PYSPARK_PYTHON=./SPACY/spacy_env/bin/python spark-submit

# Full sentence-embedding pipeline: runs the three stages below in the order
# they are listed (sentence-split, then tokenize, then embeddings).
#
# The target names a command, not a file, so it is declared phony; otherwise a
# file called "sentence-embeddings" in this directory would make it a no-op.
.PHONY: sentence-embeddings

# The stages are only listed as sibling prerequisites, so "make -j" would be
# free to launch them at the same time. They are presumably meant to run one
# after another (each stage consuming the previous stage's output -- TODO
# confirm), so parallel execution is disabled for this makefile. This keeps
# the individual stage targets runnable on their own, unlike chaining them as
# prerequisites of each other.
.NOTPARALLEL:

sentence-embeddings: sentence-split tokenize embeddings

# First pipeline stage: submits the SentenceSplitter class from $(JAR) through
# the spark.sh wrapper script.
# NOTE(review): the meaning of the -m/-e/-d/-c options is defined by spark.sh,
# which is not part of this file -- check the script before changing them.
#
# Declared phony because the target is a command and never creates a file
# named "sentence-split".
.PHONY: sentence-split
sentence-split:
	$(SPARK_SH) -m yarn -e 5 -d 1 \
	    -c de.hpi.ingestion.sentenceembedding.SentenceSplitter \
	    $(JAR)

# Second pipeline stage: submits pyspark_spacy.py to YARN from inside
# $(SPACY_DIR). The script name and spacy_env.zip are given as relative paths,
# hence the cd. spacy_env.zip is shipped as an archive under the alias
# "SPACY", which is the directory the PYSPARK_PYTHON settings point into.
#
# The cd and the submit are joined with "&&" so that nothing is submitted from
# the wrong directory when the cd fails, and so that the exit status of
# spark-submit is the exit status of the recipe. There is deliberately no
# trailing "cd" back: every recipe line runs in its own shell, so it would
# have no effect other than hiding a failed job behind a successful cd.
#
# Declared phony because the target is a command and never creates a file
# named "tokenize".
.PHONY: tokenize
tokenize:
	cd "$(SPACY_DIR)" && $(PYSPARK_SUBMIT) \
	    --num-executors 23 --executor-cores 4 --executor-memory 10G \
	    --packages com.datastax.spark:spark-cassandra-connector_2.11:2.0.1 \
	    --conf spark.cassandra.connection.host=odin02,odin03,odin04,odin05,odin06,odin07,odin08 \
	    --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=./SPACY/spacy_env/bin/python \
	    --master yarn \
	    --archives spacy_env.zip#SPACY \
	    pyspark_spacy.py

# Third pipeline stage: submits $(JAR) with JobRunner as the main class and
# passes it the SentenceEmbeddings class name as its argument.
# NOTE(review): JobRunner presumably instantiates and runs the named job --
# confirm against the ingestion framework.
#
# Declared phony because the target is a command and never creates a file
# named "embeddings".
.PHONY: embeddings
embeddings:
	$(SPARK_SUBMIT) --class de.hpi.ingestion.framework.JobRunner \
	    --num-executors 8 --executor-cores 2 --executor-memory 28G --driver-memory 8G \
	    $(JAR) de.hpi.ingestion.sentenceembedding.SentenceEmbeddings

0 comments on commit 5922290

Please sign in to comment.