forked from apache/spark
Commit
Added examples for MLlib book chapter, plus fake spam/ham datasets.
Showing 6 changed files with 298 additions and 0 deletions.
data/mllib/ham.txt
@@ -0,0 +1,7 @@
Dear Spark Learner, Thanks so much for attending the Spark Summit 2014! Check out videos of talks from the summit at ...
Hi Mom, Apologies for being late about emailing and forgetting to send you the package. I hope you and bro have been ...
Wow, hey Fred, just heard about the Spark petabyte sort. I think we need to take time to try it out immediately ...
Hi Spark user list, This is my first question to this list, so thanks in advance for your help! I tried running ...
Thanks Tom for your email. I need to refer you to Alice for this one. I haven't yet figured out that part either ...
Good job yesterday! I was attending your talk, and really enjoyed it. I want to try out GraphX ...
Summit demo got whoops from audience! Had to let you know. --Joe
data/mllib/spam.txt
@@ -0,0 +1,5 @@
Dear sir, I am a Prince in a far kingdom you have not heard of. I want to send you money via wire transfer so please ...
Get Viagra real cheap! Send money right away to ...
Oh my gosh you can be really strong too with these drugs found in the rainforest. Get them cheap right now ...
YOUR COMPUTER HAS BEEN INFECTED! YOU MUST RESET YOUR PASSWORD. Reply to this email with your password and SSN ...
THIS IS NOT A SCAM! Send money and get access to awesome stuff really cheap and never have to ...
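
All four examples in this commit featurize emails the same way: split each email on spaces, then use MLlib's HashingTF to hash every word into one of a fixed number of buckets (100 here) and count term frequencies per bucket. A minimal Scala sketch of that mapping (illustrative values only; distinct words can collide into the same bucket, which is the price of fixed-size features):

import org.apache.spark.mllib.feature.HashingTF

// Hash each word into one of 100 buckets; the resulting vector holds
// per-bucket term counts, so a repeated word raises its bucket's value.
val tf = new HashingTF(numFeatures = 100)
val vector = tf.transform("get cheap stuff cheap".split(" ").toSeq)
println(vector)  // "cheap" appears twice, so its bucket maps to 2.0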
examples/src/main/java/org/apache/spark/examples/mllib/JavaBookExample.java
81 changes: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS;
import org.apache.spark.mllib.classification.LogisticRegressionModel;
import org.apache.spark.mllib.feature.HashingTF;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.regression.LabeledPoint;

public final class JavaBookExample {

  public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("JavaBookExample");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    // Load 2 types of emails from text files: spam and ham (non-spam).
    // Each line has text from one email.
    JavaRDD<String> spam = sc.textFile("data/mllib/spam.txt");
    JavaRDD<String> ham = sc.textFile("data/mllib/ham.txt");

    // Create a HashingTF instance to map email text to vectors of 100 features.
    final HashingTF tf = new HashingTF(100);

    // Each email is split into words, and each word is mapped to one feature.
    // Create LabeledPoint datasets for positive (spam) and negative (ham) examples.
    JavaRDD<LabeledPoint> positiveExamples = spam.map(new Function<String, LabeledPoint>() {
      @Override public LabeledPoint call(String email) {
        return new LabeledPoint(1, tf.transform(Arrays.asList(email.split(" "))));
      }
    });
    JavaRDD<LabeledPoint> negativeExamples = ham.map(new Function<String, LabeledPoint>() {
      @Override public LabeledPoint call(String email) {
        return new LabeledPoint(0, tf.transform(Arrays.asList(email.split(" "))));
      }
    });
    JavaRDD<LabeledPoint> trainingData = positiveExamples.union(negativeExamples);
    trainingData.cache(); // Cache data since Logistic Regression is an iterative algorithm.

    // Create a Logistic Regression learner which uses the LBFGS optimizer.
    LogisticRegressionWithLBFGS lrLearner = new LogisticRegressionWithLBFGS();
    // Set model regularization. A well-chosen regParam can make models more robust.
    lrLearner.optimizer().setRegParam(1.0);
    // Run the actual learning algorithm on the training data.
    LogisticRegressionModel model = lrLearner.run(trainingData.rdd());

    // Test on a positive example (spam) and a negative one (ham).
    // First apply the same HashingTF feature transformation used on the training data.
    Vector posTestExample =
      tf.transform(Arrays.asList("O M G GET cheap stuff by sending money to ...".split(" ")));
    Vector negTestExample =
      tf.transform(Arrays.asList("Hi Dad, I started studying Spark the other ...".split(" ")));
    // Now use the learned model to predict spam/ham for new emails.
    System.out.println("Prediction for positive test example: " + model.predict(posTestExample));
    System.out.println("Prediction for negative test example: " + model.predict(negTestExample));

    sc.stop();
  }
}
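
Both this Java example and the Scala version below print hard 0.0/1.0 predictions because LogisticRegressionModel applies a default 0.5 decision threshold. If raw scores are wanted instead (for ranking or a custom cutoff), the threshold can be cleared — a sketch in Scala for brevity, reusing the model and posTestExample names from BookExample.scala below; the Java API is analogous:

// Sketch: make predict() return a probability-like score instead of a 0/1 label.
model.clearThreshold()
println(model.predict(posTestExample))  // e.g. 0.97 rather than 1.0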
@@ -0,0 +1,57 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.feature import HashingTF


if __name__ == "__main__":
    sc = SparkContext(appName="PythonBookExample")

    # Load 2 types of emails from text files: spam and ham (non-spam).
    # Each line has text from one email.
    spam = sc.textFile("data/mllib/spam.txt")
    ham = sc.textFile("data/mllib/ham.txt")

    # Create a HashingTF instance to map email text to vectors of 100 features.
    tf = HashingTF(numFeatures=100)
    # Each email is split into words, and each word is mapped to one feature.
    spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
    hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

    # Create LabeledPoint datasets for positive (spam) and negative (ham) examples.
    positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
    negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))
    training_data = positiveExamples.union(negativeExamples)
    training_data.cache()  # Cache data since Logistic Regression is an iterative algorithm.

    # Run Logistic Regression using the SGD optimizer.
    # regParam is model regularization, which can make models more robust.
    model = LogisticRegressionWithSGD.train(training_data, regParam=1.0)

    # Test on a positive example (spam) and a negative one (ham).
    # First apply the same HashingTF feature transformation used on the training data.
    posTestExample = tf.transform("O M G GET cheap stuff by sending money to ...".split(" "))
    negTestExample = tf.transform("Hi Dad, I started studying Spark the other ...".split(" "))

    # Now use the learned model to predict spam/ham for new emails.
    print "Prediction for positive test example: %g" % model.predict(posTestExample)
    print "Prediction for negative test example: %g" % model.predict(negTestExample)

    sc.stop()
examples/src/main/scala/org/apache/spark/examples/ml/BookExamplePipeline.scala
83 changes: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

object BookExamplePipeline {

  case class Document(id: Long, text: String)

  case class LabeledDocument(id: Long, text: String, label: Double)

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("BookExamplePipeline")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext._

    // Load 2 types of emails from text files: spam and ham (non-spam).
    // Each line has text from one email.
    val spam = sc.textFile("data/mllib/spam.txt")
    val ham = sc.textFile("data/mllib/ham.txt")

    // Create LabeledDocument datasets for positive (spam) and negative (ham) examples.
    val positiveExamples = spam.zipWithIndex().map { case (email, index) =>
      LabeledDocument(index, email, 1.0)
    }
    val negativeExamples = ham.zipWithIndex().map { case (email, index) =>
      LabeledDocument(index, email, 0.0)
    }
    val trainingData = positiveExamples ++ negativeExamples

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    // Each stage outputs a column in a SchemaRDD and feeds it to the next stage's input column.
    val tokenizer = new Tokenizer() // Splits each email into words.
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF() // Maps email words to vectors of 100 features.
      .setNumFeatures(100)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression() // LogisticRegression uses inputCol "features" by default.
      .setRegParam(1.0) // model regularization
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to the training documents.
    // RDDs of case classes work well with Pipelines since Spark SQL can infer a schema from
    // case classes and convert the data into a SchemaRDD.
    val model = pipeline.fit(trainingData)

    // Make predictions on test documents.
    // The fitted model automatically transforms features using Tokenizer and HashingTF.
    val testData = sc.parallelize(Seq(
      Document(0, "O M G GET cheap stuff by sending money to ..."), // positive example (spam)
      Document(1, "Hi Dad, I started studying Spark the other ...")  // negative example (ham)
    ))
    val predictions = model.transform(testData)
      .select('id, 'prediction).collect()
      .map { case Row(id, prediction) => (id, prediction) }.toMap
    println(s"Prediction for positive test example: ${predictions(0)}")
    println(s"Prediction for negative test example: ${predictions(1)}")

    sc.stop()
  }
}
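
The pipeline above hard-codes numFeatures = 100 and regParam = 1.0. A hedged sketch of how the same pipeline could be tuned instead with spark.ml's CrossValidator and ParamGridBuilder (assuming this Spark era's tuning API; the grid values and fold count are illustrative, and tokenizer, hashingTF, lr, pipeline, and trainingData are the values defined above):

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

// Grid-search over feature dimensionality and regularization strength,
// scoring each combination with 2-fold cross-validation.
val paramGrid = new ParamGridBuilder()
  .addGrid(hashingTF.numFeatures, Array(100, 1000))
  .addGrid(lr.regParam, Array(0.1, 1.0))
  .build()
val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(new BinaryClassificationEvaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(2)
val cvModel = cv.fit(trainingData)  // best model found on the grid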
examples/src/main/scala/org/apache/spark/examples/mllib/BookExample.scala
65 changes: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.feature.HashingTF
import org.apache.spark.mllib.regression.LabeledPoint

object BookExample {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Book example: Scala")
    val sc = new SparkContext(conf)

    // Load 2 types of emails from text files: spam and ham (non-spam).
    // Each line has text from one email.
    val spam = sc.textFile("data/mllib/spam.txt")
    val ham = sc.textFile("data/mllib/ham.txt")

    // Create a HashingTF instance to map email text to vectors of 100 features.
    val tf = new HashingTF(numFeatures = 100)
    // Each email is split into words, and each word is mapped to one feature.
    val spamFeatures = spam.map(email => tf.transform(email.split(" ")))
    val hamFeatures = ham.map(email => tf.transform(email.split(" ")))

    // Create LabeledPoint datasets for positive (spam) and negative (ham) examples.
    val positiveExamples = spamFeatures.map(features => LabeledPoint(1, features))
    val negativeExamples = hamFeatures.map(features => LabeledPoint(0, features))
    val trainingData = positiveExamples ++ negativeExamples
    trainingData.cache() // Cache data since Logistic Regression is an iterative algorithm.

    // Create a Logistic Regression learner which uses the LBFGS optimizer.
    val lrLearner = new LogisticRegressionWithLBFGS()
    // Set model regularization. A well-chosen regParam can make models more robust.
    lrLearner.optimizer.setRegParam(1.0)
    // Run the actual learning algorithm on the training data.
    val model = lrLearner.run(trainingData)

    // Test on a positive example (spam) and a negative one (ham).
    // First apply the same HashingTF feature transformation used on the training data.
    val posTestExample = tf.transform("O M G GET cheap stuff by sending money to ...".split(" "))
    val negTestExample = tf.transform("Hi Dad, I started studying Spark the other ...".split(" "))
    // Now use the learned model to predict spam/ham for new emails.
    println(s"Prediction for positive test example: ${model.predict(posTestExample)}")
    println(s"Prediction for negative test example: ${model.predict(negTestExample)}")

    sc.stop()
  }
}
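
With no held-out split in these tiny datasets, one rough sanity check is to score the training set itself — a sketch reusing the trainingData and model values from BookExample.scala above (training accuracy is optimistic, so treat it only as a smoke test):

// Sketch: training-set accuracy for the fitted LBFGS model.
val predictionAndLabel = trainingData.map(p => (model.predict(p.features), p.label))
val correct = predictionAndLabel.filter { case (pred, label) => pred == label }.count()
println(s"Training accuracy: ${correct.toDouble / trainingData.count()}")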