Dependency: spark-core and spark-mllib
<dependencies>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.2.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-mllib_2.11</artifactId>
    <version>2.2.0</version>
  </dependency>
</dependencies>
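If you build with sbt instead of Maven, the equivalent declarations would look like the sketch below (same versions assumed; the %% operator appends the Scala suffix _2.11 to the artifact names automatically):

// sbt equivalent of the Maven dependencies above (a sketch, same versions assumed)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"  % "2.2.0",
  "org.apache.spark" %% "spark-mllib" % "2.2.0"
)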
Correlation
Calculating the correlation between two series of data is a common operation in statistics. In spark.ml we provide the flexibility to calculate pairwise correlations among many series. The supported correlation methods are currently Pearson's and Spearman's correlation. Correlation computes the correlation matrix for the input Dataset of Vectors using the specified method. The output will be a DataFrame that contains the correlation matrix of the column of vectors.
JavaCorrelationsExample:
import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.Matrix;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import org.apache.spark.mllib.stat.Statistics;

public class JavaCorrelationsExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaCorrelationsExample");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    JavaDoubleRDD seriesX = jsc.parallelizeDoubles(
      Arrays.asList(1.0, 2.0, 3.0, 3.0, 5.0));  // a series

    // must have the same number of partitions and cardinality as seriesX
    JavaDoubleRDD seriesY = jsc.parallelizeDoubles(
      Arrays.asList(11.0, 22.0, 33.0, 33.0, 555.0));

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Double correlation = Statistics.corr(seriesX.srdd(), seriesY.srdd(), "pearson");
    System.out.println("Correlation is: " + correlation);

    // note that each Vector is a row and not a column
    JavaRDD<Vector> data = jsc.parallelize(
      Arrays.asList(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(5.0, 33.0, 366.0)
      )
    );

    // calculate the correlation matrix using Pearson's method.
    // Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    Matrix correlMatrix = Statistics.corr(data.rdd(), "pearson");
    System.out.println(correlMatrix.toString());

    jsc.stop();
  }
}
Find full example code at "examples/src/main/java/org/apache/spark/examples/mllib/JavaCorrelationsExample.java" in the Spark repo.
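Note that the example above uses the RDD-based spark.mllib Statistics API, while the prose at the start of this section describes the DataFrame-based spark.ml Correlation class. For completeness, here is a minimal sketch of the spark.ml version, reusing the same row vectors and assuming a SparkSession named spark has already been created:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.ml.stat.Correlation;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.*;

// ... inside main(), with a SparkSession named `spark` already available ...
List<Row> data = Arrays.asList(
  RowFactory.create(Vectors.dense(1.0, 10.0, 100.0)),
  RowFactory.create(Vectors.dense(2.0, 20.0, 200.0)),
  RowFactory.create(Vectors.dense(5.0, 33.0, 366.0))
);
StructType schema = new StructType(new StructField[]{
  new StructField("features", new VectorUDT(), false, Metadata.empty()),
});
Dataset<Row> df = spark.createDataFrame(data, schema);

// Pearson is the default; pass "spearman" as a third argument for Spearman's method
Row r = Correlation.corr(df, "features").head();
System.out.println("Pearson correlation matrix:\n" + r.get(0).toString());

The DataFrame result holds the whole correlation Matrix in a single cell, which is why the example reads it back with head() and get(0).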
Hypothesis testing
Hypothesis testing is a powerful tool in statistics to determine whether a result is statistically significant, i.e., whether it occurred by chance. spark.ml currently supports Pearson's Chi-squared tests for independence. ChiSquareTest conducts Pearson's independence test for every feature against the label. For each feature, the (feature, label) pairs are converted into a contingency matrix for which the Chi-squared statistic is computed. All label and feature values must be categorical.
JavaChiSquareTestExample:
import java.util.Arrays;
import java.util.List;

import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.ml.stat.ChiSquareTest;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.*;

/**
 * An example for Chi-square hypothesis testing.
 * Run with
 * <pre>
 * bin/run-example ml.JavaChiSquareTestExample
 * </pre>
 */
public class JavaChiSquareTestExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession
      .builder()
      .appName("JavaChiSquareTestExample")
      .getOrCreate();

    List<Row> data = Arrays.asList(
      RowFactory.create(0.0, Vectors.dense(0.5, 10.0)),
      RowFactory.create(0.0, Vectors.dense(1.5, 20.0)),
      RowFactory.create(1.0, Vectors.dense(1.5, 30.0)),
      RowFactory.create(0.0, Vectors.dense(3.5, 30.0)),
      RowFactory.create(0.0, Vectors.dense(3.5, 40.0)),
      RowFactory.create(1.0, Vectors.dense(3.5, 40.0))
    );

    StructType schema = new StructType(new StructField[]{
      new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
      new StructField("features", new VectorUDT(), false, Metadata.empty()),
    });

    Dataset<Row> df = spark.createDataFrame(data, schema);
    Row r = ChiSquareTest.test(df, "features", "label").head();
    System.out.println("pValues: " + r.get(0).toString());
    System.out.println("degreesOfFreedom: " + r.getList(1).toString());
    System.out.println("statistics: " + r.get(2).toString());

    spark.stop();
  }
}
Find full example code at "examples/src/main/java/org/apache/spark/examples/ml/JavaChiSquareTestExample.java" in the Spark repo.
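The returned Row packs one p-value, one degrees-of-freedom count, and one test statistic per feature. As a follow-up sketch (the loop and the 0.05 significance threshold below are illustrative additions, not part of the original example), the p-values Vector can be unpacked to flag features that appear dependent on the label:

// illustrative follow-up, reusing the Row `r` from the example above;
// the 0.05 significance level is an arbitrary choice for demonstration
org.apache.spark.ml.linalg.Vector pValues = r.getAs(0);
for (int i = 0; i < pValues.size(); i++) {
  if (pValues.apply(i) < 0.05) {
    System.out.println("Feature " + i + " looks dependent on the label"
        + " (p = " + pValues.apply(i) + ")");
  } else {
    System.out.println("Feature " + i + ": no evidence against independence"
        + " (p = " + pValues.apply(i) + ")");
  }
}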