public class RDDConverterUtils extends Object
| Modifier and Type | Class and Description |
|---|---|
static class |
RDDConverterUtils.DataFrameExtractIDFunction |
| Modifier and Type | Field and Description |
|---|---|
static String |
DF_ID_COLUMN |
| Constructor and Description |
|---|
RDDConverterUtils() |
| Modifier and Type | Method and Description |
|---|---|
static org.apache.spark.api.java.JavaRDD<String> |
binaryBlockToCsv(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in,
MatrixCharacteristics mcIn,
CSVFileFormatProperties props,
boolean strict) |
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
binaryBlockToDataFrame(org.apache.spark.sql.SparkSession sparkSession,
org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in,
MatrixCharacteristics mc,
boolean toVector) |
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
binaryBlockToDataFrame(org.apache.spark.sql.SQLContext sqlContext,
org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in,
MatrixCharacteristics mc,
boolean toVector)
Deprecated.
|
static org.apache.spark.api.java.JavaRDD<org.apache.spark.ml.feature.LabeledPoint> |
binaryBlockToLabeledPoints(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in)
Converter from binary block rdd to rdd of labeled points.
|
static org.apache.spark.api.java.JavaRDD<String> |
binaryBlockToTextCell(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in,
MatrixCharacteristics mc) |
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> |
binaryCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixCell> input,
MatrixCharacteristics mcOut,
boolean outputEmptyBlocks) |
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> |
csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input,
MatrixCharacteristics mc,
boolean hasHeader,
String delim,
boolean fill,
double fillValue) |
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> |
csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
org.apache.spark.api.java.JavaRDD<String> input,
MatrixCharacteristics mcOut,
boolean hasHeader,
String delim,
boolean fill,
double fillValue)
Example usage:
|
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> |
dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df,
MatrixCharacteristics mc,
boolean containsID,
boolean isVector) |
static void |
libsvmToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
String pathIn,
String pathX,
String pathY,
MatrixCharacteristics mcOutX)
Converts a libsvm text input file into two binary block matrices for features
and labels, and saves these to the specified output files.
|
static org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> |
stringToSerializableText(org.apache.spark.api.java.JavaPairRDD<Long,String> in) |
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> |
textCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input,
MatrixCharacteristics mcOut,
boolean outputEmptyBlocks) |
public static final String DF_ID_COLUMN
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> textCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, MatrixCharacteristics mcOut, boolean outputEmptyBlocks) throws DMLRuntimeException
Throws: DMLRuntimeException
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> binaryCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixCell> input, MatrixCharacteristics mcOut, boolean outputEmptyBlocks) throws DMLRuntimeException
Throws: DMLRuntimeException
public static org.apache.spark.api.java.JavaRDD<org.apache.spark.ml.feature.LabeledPoint> binaryBlockToLabeledPoints(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in)
Parameters:
in - matrix as JavaPairRDD<MatrixIndexes, MatrixBlock>
public static org.apache.spark.api.java.JavaRDD<String> binaryBlockToTextCell(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, MatrixCharacteristics mc)
public static org.apache.spark.api.java.JavaRDD<String> binaryBlockToCsv(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, MatrixCharacteristics mcIn, CSVFileFormatProperties props, boolean strict)
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, MatrixCharacteristics mc, boolean hasHeader, String delim, boolean fill, double fillValue) throws DMLRuntimeException
Throws: DMLRuntimeException
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaRDD<String> input, MatrixCharacteristics mcOut, boolean hasHeader, String delim, boolean fill, double fillValue) throws DMLRuntimeException
import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils
import org.apache.sysml.runtime.matrix.MatrixCharacteristics
import org.apache.spark.api.java.JavaSparkContext
val A = sc.textFile("ranA.csv")
val Amc = new MatrixCharacteristics
val Abin = RDDConverterUtils.csvToBinaryBlock(new JavaSparkContext(sc), A, Amc, false, ",", false, 0)
Parameters:
sc - java spark context
input - rdd of strings
mcOut - matrix characteristics
hasHeader - if true, has header
delim - delimiter as a string
fill - if true, fill in empty values with fillValue
fillValue - fill value used to fill empty values
Returns: JavaPairRDD<MatrixIndexes, MatrixBlock>
Throws: DMLRuntimeException - if DMLRuntimeException occurs
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, MatrixCharacteristics mc, boolean containsID, boolean isVector)
public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SparkSession sparkSession,
org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in,
MatrixCharacteristics mc,
boolean toVector)
@Deprecated public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SQLContext sqlContext, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, MatrixCharacteristics mc, boolean toVector)
public static void libsvmToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
String pathIn,
String pathX,
String pathY,
MatrixCharacteristics mcOutX)
throws DMLRuntimeException
Note: We use org.apache.spark.mllib.util.MLUtils.loadLibSVMFile for parsing
the libsvm input files in order to ensure consistency with Spark.
Parameters:
sc - java spark context
pathIn - path to libsvm input file
pathX - path to binary block output file of features
pathY - path to binary block output file of labels
mcOutX - matrix characteristics of output matrix X
Throws: DMLRuntimeException - if output path not writable or conversion failure

Copyright © 2017 The Apache Software Foundation. All rights reserved.