public class RDDConverterUtils extends Object
Modifier and Type | Class and Description |
---|---|
static class |
RDDConverterUtils.BinaryCellToBinaryBlockFunction |
static class |
RDDConverterUtils.DataFrameExtractIDFunction |
Modifier and Type | Field and Description |
---|---|
static String |
DF_ID_COLUMN |
Constructor and Description |
---|
RDDConverterUtils() |
Modifier and Type | Method and Description |
---|---|
static org.apache.spark.api.java.JavaRDD<String> |
binaryBlockToCsv(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in,
MatrixCharacteristics mcIn,
org.apache.sysml.runtime.io.FileFormatPropertiesCSV props,
boolean strict) |
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
binaryBlockToDataFrame(org.apache.spark.sql.SparkSession sparkSession,
org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in,
MatrixCharacteristics mc,
boolean toVector) |
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> |
binaryBlockToDataFrame(org.apache.spark.sql.SQLContext sqlContext,
org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in,
MatrixCharacteristics mc,
boolean toVector)
Deprecated.
|
static org.apache.spark.api.java.JavaRDD<org.apache.spark.ml.feature.LabeledPoint> |
binaryBlockToLabeledPoints(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in)
Converter from binary block rdd to rdd of labeled points.
|
static org.apache.spark.api.java.JavaRDD<String> |
binaryBlockToTextCell(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in,
MatrixCharacteristics mc) |
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> |
binaryCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixCell> input,
MatrixCharacteristics mcOut,
boolean outputEmptyBlocks) |
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> |
csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input,
MatrixCharacteristics mc,
boolean hasHeader,
String delim,
boolean fill,
double fillValue) |
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> |
csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
org.apache.spark.api.java.JavaRDD<String> input,
MatrixCharacteristics mcOut,
boolean hasHeader,
String delim,
boolean fill,
double fillValue)
Example usage:
|
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> |
dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df,
MatrixCharacteristics mc,
boolean containsID,
boolean isVector) |
static void |
libsvmToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
String pathIn,
String pathX,
String pathY,
MatrixCharacteristics mcOutX)
Converts a libsvm text input file into two binary block matrices for features
and labels, and saves these to the specified output files.
|
static org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> |
stringToSerializableText(org.apache.spark.api.java.JavaPairRDD<Long,String> in) |
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> |
textCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc,
org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input,
MatrixCharacteristics mcOut,
boolean outputEmptyBlocks,
org.apache.sysml.runtime.io.FileFormatPropertiesMM mmProps) |
public static final String DF_ID_COLUMN
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> textCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, MatrixCharacteristics mcOut, boolean outputEmptyBlocks, org.apache.sysml.runtime.io.FileFormatPropertiesMM mmProps)
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> binaryCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixCell> input, MatrixCharacteristics mcOut, boolean outputEmptyBlocks)
public static org.apache.spark.api.java.JavaRDD<org.apache.spark.ml.feature.LabeledPoint> binaryBlockToLabeledPoints(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in)
in
- matrix as JavaPairRDD<MatrixIndexes, MatrixBlock>
public static org.apache.spark.api.java.JavaRDD<String> binaryBlockToTextCell(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, MatrixCharacteristics mc)
public static org.apache.spark.api.java.JavaRDD<String> binaryBlockToCsv(org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, MatrixCharacteristics mcIn, org.apache.sysml.runtime.io.FileFormatPropertiesCSV props, boolean strict)
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, MatrixCharacteristics mc, boolean hasHeader, String delim, boolean fill, double fillValue)
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaRDD<String> input, MatrixCharacteristics mcOut, boolean hasHeader, String delim, boolean fill, double fillValue)
import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils
import org.apache.sysml.runtime.matrix.MatrixCharacteristics
import org.apache.spark.api.java.JavaSparkContext
val A = sc.textFile("ranA.csv")
val Amc = new MatrixCharacteristics
val Abin = RDDConverterUtils.csvToBinaryBlock(new JavaSparkContext(sc), A, Amc, false, ",", false, 0)
sc
- java spark context
input
- rdd of strings
mcOut
- matrix characteristics
hasHeader
- if true, has header
delim
- delimiter as a string
fill
- if true, fill in empty values with fillValue
fillValue
- fill value used to fill empty values
Returns: JavaPairRDD<MatrixIndexes, MatrixBlock>
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, MatrixCharacteristics mc, boolean containsID, boolean isVector)
public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SparkSession sparkSession, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, MatrixCharacteristics mc, boolean toVector)
@Deprecated public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SQLContext sqlContext, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> in, MatrixCharacteristics mc, boolean toVector)
public static void libsvmToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, String pathIn, String pathX, String pathY, MatrixCharacteristics mcOutX)
Note: We use org.apache.spark.mllib.util.MLUtils.loadLibSVMFile for parsing the libsvm input files in order to ensure consistency with Spark.
sc
- java spark context
pathIn
- path to libsvm input file
pathX
- path to binary block output file of features
pathY
- path to binary block output file of labels
mcOutX
- matrix characteristics of output matrix X

Copyright © 2018 The Apache Software Foundation. All rights reserved.