-
Notifications
You must be signed in to change notification settings - Fork 31
Open
Description
I wish we could support pyspark.ml.feature.OneHotEncoder and pyspark.ml.feature.StringIndexer.
Mini Repro:
// Repro 1: fit a StringIndexer on a string column and show the indexed result.
import org.apache.spark.ml.feature.StringIndexer

// Six (id, category) rows using three distinct category labels.
val categoryRows = Seq((0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c"))
val categoryDf = spark.createDataFrame(categoryRows).toDF("id", "category")

val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

// Fit and transform against the same DataFrame, then print the result.
val indexerModel = indexer.fit(categoryDf)
val indexedDf = indexerModel.transform(categoryDf)
indexedDf.show()
// Repro 2: fit a multi-column OneHotEncoder on two index columns and show the result.
import org.apache.spark.ml.feature.OneHotEncoder

// Each row carries one category index per input column.
val indexPairs = Seq(
  (0.0, 1.0),
  (1.0, 0.0),
  (2.0, 1.0),
  (0.0, 2.0),
  (0.0, 1.0),
  (2.0, 0.0)
)
val indexDf = spark.createDataFrame(indexPairs).toDF("categoryIndex1", "categoryIndex2")

val encoderInputCols = Array("categoryIndex1", "categoryIndex2")
val encoderOutputCols = Array("categoryVec1", "categoryVec2")
val oneHotEncoder = new OneHotEncoder()
  .setInputCols(encoderInputCols)
  .setOutputCols(encoderOutputCols)

// Fit and transform against the same DataFrame, then print the result.
val encoderModel = oneHotEncoder.fit(indexDf)
val encodedDf = encoderModel.transform(indexDf)
encodedDf.show()
Messages reporting the unsupported operators:
!Exec <ObjectHashAggregateExec> cannot run on GPU because not all expressions can be replaced
!Expression <AggregateExpression> finalmerge_stringindexeraggregator(merge buf#144) AS StringIndexerAggregator(org.apache.spark.sql.Row)#138 cannot run on GPU because aggFunc expression ComplexTypedAggregateExpression StringIndexerAggregator(org.apache.spark.sql.Row) (BinaryType is not supported); expression AggregateExpression finalmerge_stringindexeraggregator(merge buf#144) AS StringIndexerAggregator(org.apache.spark.sql.Row)#138 produces an unsupported type BinaryType
! <ComplexTypedAggregateExpression> StringIndexerAggregator(org.apache.spark.sql.Row) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression
! <CreateExternalRow> createexternalrow(category#125.toString, StructField(category,StringType,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow
! <Invoke> category#125.toString cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke
@Expression <AttributeReference> category#125 could run on GPU
! <EncodeUsingSerializer> encodeusingserializer(input[0, java.lang.Object, true], true) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.EncodeUsingSerializer
! <BoundReference> input[0, java.lang.Object, true] cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.BoundReference
! <DecodeUsingSerializer> decodeusingserializer(input[0, binary, true], Array[org.apache.spark.util.collection.OpenHashMap], true) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.DecodeUsingSerializer
! <BoundReference> input[0, binary, true] cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.BoundReference
! <EncodeUsingSerializer> encodeusingserializer(input[0, java.lang.Object, true], true) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.EncodeUsingSerializer
! <BoundReference> input[0, java.lang.Object, true] cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.BoundReference
!Expression <AttributeReference> StringIndexerAggregator(org.apache.spark.sql.Row)#138 cannot run on GPU because expression AttributeReference StringIndexerAggregator(org.apache.spark.sql.Row)#138 produces an unsupported type BinaryType
!Expression <Alias> StringIndexerAggregator(org.apache.spark.sql.Row)#138 AS StringIndexerAggregator(org.apache.spark.sql.Row)#139 cannot run on GPU because expression Alias StringIndexerAggregator(org.apache.spark.sql.Row)#138 AS StringIndexerAggregator(org.apache.spark.sql.Row)#139 produces an unsupported type BinaryType; input expression AttributeReference StringIndexerAggregator(org.apache.spark.sql.Row)#138 (BinaryType is not supported)
!Expression <AttributeReference> StringIndexerAggregator(org.apache.spark.sql.Row)#138 cannot run on GPU because expression AttributeReference StringIndexerAggregator(org.apache.spark.sql.Row)#138 produces an unsupported type BinaryType
!Exec <ShuffleExchangeExec> cannot run on GPU because unsupported data types in output: BinaryType [buf#144]; unsupported data types in input: BinaryType [buf#144]; Columnar exchange without columnar children is inefficient
@Partitioning <SinglePartition$> could run on GPU
!Exec <ObjectHashAggregateExec> cannot run on GPU because not all expressions can be replaced
!Expression <AggregateExpression> partial_stringindexeraggregator(org.apache.spark.ml.feature.StringIndexerAggregator@7b35e0b8, Some(createexternalrow(category#125.toString, StructField(category,StringType,true))), Some(interface org.apache.spark.sql.Row), Some(StructType(StructField(category,StringType,true))), encodeusingserializer(input[0, java.lang.Object, true], true), decodeusingserializer(input[0, binary, true], Array[org.apache.spark.util.collection.OpenHashMap], true), encodeusingserializer(input[0, java.lang.Object, true], true), BinaryType, true, 0, 0) AS buf#144 cannot run on GPU because expression AggregateExpression partial_stringindexeraggregator(org.apache.spark.ml.feature.StringIndexerAggregator@7b35e0b8, Some(createexternalrow(category#125.toString, StructField(category,StringType,true))), Some(interface org.apache.spark.sql.Row), Some(StructType(StructField(category,StringType,true))), encodeusingserializer(input[0, java.lang.Object, true], true), decodeusingserializer(input[0, binary, true], Array[org.apache.spark.util.collection.OpenHashMap], true), encodeusingserializer(input[0, java.lang.Object, true], true), BinaryType, true, 0, 0) AS buf#144 produces an unsupported type BinaryType; aggFunc expression ComplexTypedAggregateExpression StringIndexerAggregator(org.apache.spark.sql.Row) (BinaryType is not supported)
! <ComplexTypedAggregateExpression> StringIndexerAggregator(org.apache.spark.sql.Row) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.aggregate.ComplexTypedAggregateExpression
! <CreateExternalRow> createexternalrow(category#125.toString, StructField(category,StringType,true)) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.CreateExternalRow
! <Invoke> category#125.toString cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.Invoke
@Expression <AttributeReference> category#125 could run on GPU
! <EncodeUsingSerializer> encodeusingserializer(input[0, java.lang.Object, true], true) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.EncodeUsingSerializer
! <BoundReference> input[0, java.lang.Object, true] cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.BoundReference
! <DecodeUsingSerializer> decodeusingserializer(input[0, binary, true], Array[org.apache.spark.util.collection.OpenHashMap], true) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.DecodeUsingSerializer
! <BoundReference> input[0, binary, true] cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.BoundReference
! <EncodeUsingSerializer> encodeusingserializer(input[0, java.lang.Object, true], true) cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.objects.EncodeUsingSerializer
! <BoundReference> input[0, java.lang.Object, true] cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.catalyst.expressions.BoundReference
!Expression <AttributeReference> buf#143 cannot run on GPU because expression AttributeReference buf#143 produces an unsupported type BinaryType
!Expression <AttributeReference> buf#144 cannot run on GPU because expression AttributeReference buf#144 produces an unsupported type BinaryType
! <LocalTableScanExec> cannot run on GPU because GPU does not currently support the operator class org.apache.spark.sql.execution.LocalTableScanExec
@Expression <AttributeReference> category#125 could run on GPU
Metadata
Metadata
Assignees
Labels
No labels