Class FrameRDDConverterUtils
- java.lang.Object
-
- org.apache.sysds.runtime.instructions.spark.utils.FrameRDDConverterUtils
-
public class FrameRDDConverterUtils extends Object
-
-
Nested Class Summary
Nested Classes Modifier and Type Class Description static class
FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction
static class
FrameRDDConverterUtils.LongWritableFrameToLongFrameFunction
static class
FrameRDDConverterUtils.LongWritableTextToLongTextFunction
static class
FrameRDDConverterUtils.LongWritableToSerFunction
-
Constructor Summary
Constructors Constructor Description FrameRDDConverterUtils()
-
Method Summary
All Methods Static Methods Concrete Methods Deprecated Methods Modifier and Type Method Description static org.apache.spark.api.java.JavaRDD<String>
binaryBlockToCsv(org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row>
binaryBlockToDataFrame(org.apache.spark.sql.SparkSession sparkSession, org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mc, Types.ValueType[] schema)
static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row>
binaryBlockToDataFrame(org.apache.spark.sql.SQLContext sqlContext, org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mc, Types.ValueType[] schema)
Deprecated.
static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock>
binaryBlockToMatrixBlock(org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> input, DataCharacteristics mcIn, DataCharacteristics mcOut)
static org.apache.spark.api.java.JavaRDD<String>
binaryBlockToTextCell(org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> input, DataCharacteristics mcIn)
static int
convertDFSchemaToFrameSchema(org.apache.spark.sql.types.StructType dfschema, String[] colnames, Types.ValueType[] fschema, boolean containsID)
NOTE: regarding the support of vector columns, we make the following schema restriction: single vector column, which allows inference of the vector length without data access and covers the common case.
static org.apache.spark.sql.types.StructType
convertFrameSchemaToDFSchema(Types.ValueType[] fschema, boolean containsID)
This function will convert Frame schema into DataFrame schema.
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock>
csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, DataCharacteristics mc, Types.ValueType[] schema, boolean hasHeader, String delim, boolean fill, double fillValue, Set<String> naStrings)
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock>
csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaRDD<String> input, DataCharacteristics mcOut, Types.ValueType[] schema, boolean hasHeader, String delim, boolean fill, double fillValue, Set<String> naStrings)
static org.apache.spark.api.java.JavaRDD<org.apache.spark.sql.Row>
csvToRowRDD(org.apache.spark.api.java.JavaSparkContext sc, String fnameIn, String delim, Types.ValueType[] schema)
static org.apache.spark.api.java.JavaRDD<org.apache.spark.sql.Row>
csvToRowRDD(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaRDD<String> dataRdd, String delim, Types.ValueType[] schema)
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock>
dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, DataCharacteristics mc, boolean containsID)
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock>
dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, DataCharacteristics mc, boolean containsID, Pair<String[],Types.ValueType[]> out)
static org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,FrameBlock>
matrixBlockToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> input, DataCharacteristics mcIn)
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock>
matrixBlockToBinaryBlockLongIndex(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> input, DataCharacteristics dcIn)
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock>
textCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> in, DataCharacteristics mcOut, Types.ValueType[] schema)
static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock>
textCellToBinaryBlockLongIndex(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<Long,org.apache.hadoop.io.Text> input, DataCharacteristics mc, Types.ValueType[] schema)
-
-
-
Method Detail
-
csvToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> input, DataCharacteristics mc, Types.ValueType[] schema, boolean hasHeader, String delim, boolean fill, double fillValue, Set<String> naStrings)
-
csvToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> csvToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaRDD<String> input, DataCharacteristics mcOut, Types.ValueType[] schema, boolean hasHeader, String delim, boolean fill, double fillValue, Set<String> naStrings)
-
binaryBlockToCsv
public static org.apache.spark.api.java.JavaRDD<String> binaryBlockToCsv(org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mcIn, FileFormatPropertiesCSV props, boolean strict)
-
textCellToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> textCellToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,org.apache.hadoop.io.Text> in, DataCharacteristics mcOut, Types.ValueType[] schema)
-
textCellToBinaryBlockLongIndex
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> textCellToBinaryBlockLongIndex(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<Long,org.apache.hadoop.io.Text> input, DataCharacteristics mc, Types.ValueType[] schema)
-
binaryBlockToTextCell
public static org.apache.spark.api.java.JavaRDD<String> binaryBlockToTextCell(org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> input, DataCharacteristics mcIn)
-
matrixBlockToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<org.apache.hadoop.io.LongWritable,FrameBlock> matrixBlockToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> input, DataCharacteristics mcIn)
-
matrixBlockToBinaryBlockLongIndex
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> matrixBlockToBinaryBlockLongIndex(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> input, DataCharacteristics dcIn)
-
binaryBlockToMatrixBlock
public static org.apache.spark.api.java.JavaPairRDD<MatrixIndexes,MatrixBlock> binaryBlockToMatrixBlock(org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> input, DataCharacteristics mcIn, DataCharacteristics mcOut)
-
dataFrameToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, DataCharacteristics mc, boolean containsID)
-
dataFrameToBinaryBlock
public static org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> dataFrameToBinaryBlock(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, DataCharacteristics mc, boolean containsID, Pair<String[],Types.ValueType[]> out)
-
binaryBlockToDataFrame
public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SparkSession sparkSession, org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mc, Types.ValueType[] schema)
-
binaryBlockToDataFrame
@Deprecated public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> binaryBlockToDataFrame(org.apache.spark.sql.SQLContext sqlContext, org.apache.spark.api.java.JavaPairRDD<Long,FrameBlock> in, DataCharacteristics mc, Types.ValueType[] schema)
Deprecated.
-
convertFrameSchemaToDFSchema
public static org.apache.spark.sql.types.StructType convertFrameSchemaToDFSchema(Types.ValueType[] fschema, boolean containsID)
This function will convert Frame schema into DataFrame schema.
Parameters:
fschema - frame schema
containsID - true if contains ID column
Returns:
Spark StructType of StructFields representing schema
-
convertDFSchemaToFrameSchema
public static int convertDFSchemaToFrameSchema(org.apache.spark.sql.types.StructType dfschema, String[] colnames, Types.ValueType[] fschema, boolean containsID)
NOTE: regarding the support of vector columns, we make the following schema restriction: single vector column, which allows inference of the vector length without data access and covers the common case.
Parameters:
dfschema - schema as StructType
colnames - column names
fschema - array of SystemDS ValueTypes
containsID - if true, contains ID column
Returns:
0-based column index of vector column, -1 if no vector.
-
csvToRowRDD
public static org.apache.spark.api.java.JavaRDD<org.apache.spark.sql.Row> csvToRowRDD(org.apache.spark.api.java.JavaSparkContext sc, String fnameIn, String delim, Types.ValueType[] schema)
-
csvToRowRDD
public static org.apache.spark.api.java.JavaRDD<org.apache.spark.sql.Row> csvToRowRDD(org.apache.spark.api.java.JavaSparkContext sc, org.apache.spark.api.java.JavaRDD<String> dataRdd, String delim, Types.ValueType[] schema)
-
-