Skip to content

FeatureHasher #652

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 29 commits into from
Oct 2, 2020
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
c933914
FeatureHasher
Sep 3, 2020
0801108
tidy
Sep 4, 2020
ac26205
tidying comments
Sep 4, 2020
7e9e8cc
Merge branch 'master' into ml/FeatureHasher
imback82 Sep 8, 2020
3652b55
Merge branch 'master' of github.com:dotnet/spark into ml/FeatureHasher
Sep 8, 2020
d6094e3
adding extra test for FeatureBase
Sep 8, 2020
19626f7
Merge branch 'ml/FeatureHasher' of github.com:GoEddie/spark into ml/F…
Sep 8, 2020
ffd0cfc
Trigger Build
Sep 8, 2020
a0356cc
missing file header
Sep 8, 2020
7921e04
comments
Sep 8, 2020
b5fcee2
naming better
Sep 8, 2020
01d40f8
indentation
Sep 8, 2020
89694dc
changes after feedback
Sep 9, 2020
2d8eaa1
test summary
Sep 9, 2020
bad829a
trigger build
Sep 9, 2020
012bd6b
Merge branch 'master' into ml/FeatureHasher
GoEddie Sep 11, 2020
cae76fc
Merge branch 'master' into ml/FeatureHasher
imback82 Sep 12, 2020
105d690
Merge branch 'master' into ml/FeatureHasher
GoEddie Sep 14, 2020
dd75d78
Merge branch 'master' of github.com:dotnet/spark into ml/FeatureHasher
Sep 29, 2020
c2926b3
Merge branch 'master' into ml/FeatureHasher
suhsteve Sep 29, 2020
02f06a9
changes after review
Sep 29, 2020
72741fe
Merge branch 'ml/FeatureHasher' of github.com:GoEddie/spark into ml/F…
Sep 29, 2020
ecb9e5f
formatting
GoEddie Sep 29, 2020
b28c1a7
formatting
GoEddie Sep 29, 2020
e94a601
Reverting change
GoEddie Sep 29, 2020
88eec00
Merge branch 'master' into ml/FeatureHasher
GoEddie Oct 1, 2020
752e48d
Merge branch 'master' into ml/FeatureHasher
GoEddie Oct 2, 2020
54d55de
retrigger build
Oct 2, 2020
e2aeab7
Merge branch 'master' into ml/FeatureHasher
imback82 Oct 2, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public BucketizerTests(SparkFixture fixture)
[Fact]
public void TestBucketizer()
{
var expectedSplits = new double[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };
var expectedSplits = new double[]{ double.MinValue, 0.0, 10.0, 50.0, double.MaxValue };

string expectedHandle = "skip";
string expectedUid = "uid";
Expand Down Expand Up @@ -60,18 +60,7 @@ public void TestBucketizer()
Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid());
}

Assert.NotEmpty(bucketizer.ExplainParams());

Param handleInvalidParam = bucketizer.GetParam("handleInvalid");
Assert.NotEmpty(handleInvalidParam.Doc);
Assert.NotEmpty(handleInvalidParam.Name);
Assert.Equal(handleInvalidParam.Parent, bucketizer.Uid());

Assert.NotEmpty(bucketizer.ExplainParam(handleInvalidParam));
bucketizer.Set(handleInvalidParam, "keep");
Assert.Equal("keep", bucketizer.GetHandleInvalid());

Assert.Equal("error", bucketizer.Clear(handleInvalidParam).GetHandleInvalid());
FeatureBaseTests<Bucketizer>.TestBase(bucketizer, "handleInvalid", "keep");
}

[Fact]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ public CountVectorizerModelTests(SparkFixture fixture)
}

/// <summary>
/// Test that we can create a CountVectorizerModel, pass in a specifc vocabulary to use
/// Test that we can create a CountVectorizerModel, pass in a specific vocabulary to use
/// when creating the model. Verify the standard features methods as well as load/save.
/// </summary>
[Fact]
Expand Down Expand Up @@ -68,6 +68,8 @@ public void TestCountVectorizerModel()
Assert.IsType<int>(countVectorizerModel.GetVocabSize());
Assert.NotEmpty(countVectorizerModel.ExplainParams());
Assert.NotEmpty(countVectorizerModel.ToString());

FeatureBaseTests<CountVectorizerModel>.TestBase(countVectorizerModel, "minDF", 100);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ public void TestCountVectorizer()

Assert.NotEmpty(countVectorizer.ExplainParams());
Assert.NotEmpty(countVectorizer.ToString());

FeatureBaseTests<CountVectorizer>.TestBase(countVectorizer, "minDF", 0.4);
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.ML.Feature.Param;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
/// <summary>
/// Shared checks for any object deriving from <see cref="FeatureBase{T}"/>.
/// </summary>
public static class FeatureBaseTests<T>
{
    /// <summary>
    /// Exercises the behavior common to every feature object: params can be
    /// explained and looked up by name, a param can be set and cleared, and the
    /// object exposes a string UID.
    /// </summary>
    /// <param name="testObject">The feature object under test.</param>
    /// <param name="paramName">Name of a param the object exposes.</param>
    /// <param name="paramValue">A valid value to set that param to.</param>
    internal static void TestBase(
        FeatureBase<T> testObject,
        string paramName,
        object paramValue)
    {
        string explained = testObject.ExplainParams();
        Assert.NotEmpty(explained);

        Param namedParam = testObject.GetParam(paramName);
        Assert.NotEmpty(namedParam.Doc);
        Assert.NotEmpty(namedParam.Name);
        Assert.Equal(namedParam.Parent, testObject.Uid());

        Assert.NotEmpty(testObject.ExplainParam(namedParam));
        testObject.Set(namedParam, paramValue);
        Assert.IsAssignableFrom<Identifiable>(testObject.Clear(namedParam));

        Assert.IsType<string>(testObject.Uid());
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class FeatureHasherTests
{
    private readonly SparkSession _spark;

    public FeatureHasherTests(SparkFixture fixture)
    {
        _spark = fixture.Spark;
    }

    /// <summary>
    /// Creates a DataFrame mixing numeric, boolean and string columns, runs it
    /// through a configured <see cref="FeatureHasher"/> and verifies that the
    /// getters, schema transformation and transform return the expected types,
    /// then runs the shared FeatureBase checks.
    /// </summary>
    [Fact]
    public void TestFeatureHasher()
    {
        DataFrame dataFrame = _spark.CreateDataFrame(
            new List<GenericRow>
            {
                new GenericRow(new object[] { 2.0D, true, "1", "foo" }),
                new GenericRow(new object[] { 3.0D, false, "2", "bar" })
            },
            new StructType(new List<StructField>
            {
                new StructField("real", new DoubleType()),
                new StructField("bool", new BooleanType()),
                new StructField("stringNum", new StringType()),
                new StructField("string", new StringType())
            }));

        FeatureHasher hasher = new FeatureHasher()
            .SetInputCols(new List<string>() { "real", "bool", "stringNum", "string" })
            .SetOutputCol("features")
            .SetCategoricalCols(new List<string>() { "real", "string" })
            .SetNumFeatures(10);

        Assert.IsType<string>(hasher.GetOutputCol());
        // Fixed: no space between IsType and the generic argument, matching the
        // surrounding assertions.
        Assert.IsType<string[]>(hasher.GetInputCols());
        Assert.IsType<List<string>>(hasher.GetCategoricalCols());
        Assert.IsType<int>(hasher.GetNumFeatures());
        Assert.IsType<StructType>(hasher.TransformSchema(dataFrame.Schema()));
        Assert.IsType<DataFrame>(hasher.Transform(dataFrame));

        FeatureBaseTests<FeatureHasher>.TestBase(hasher, "numFeatures", 1000);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ public void TestHashingTF()

hashingTf.SetBinary(true);
Assert.True(hashingTf.GetBinary());

FeatureBaseTests<HashingTF>.TestBase(hashingTf, "numFeatures", 1000);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ public void TestIDFModel()
IDFModel loadedModel = IDFModel.Load(modelPath);
Assert.Equal(idfModel.Uid(), loadedModel.Uid());
}

FeatureBaseTests<IDFModel>.TestBase(idfModel, "minDocFreq", 1000);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ public void TestIDFModel()
IDF loadedIdf = IDF.Load(savePath);
Assert.Equal(idf.Uid(), loadedIdf.Uid());
}

FeatureBaseTests<IDF>.TestBase(idf, "minDocFreq", 1000);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ public void TestTokenizer()
}

Assert.Equal(expectedUid, tokenizer.Uid());

FeatureBaseTests<Tokenizer>.TestBase(tokenizer, "inputCol", "input_col");
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ public void TestWord2VecModel()
Word2VecModel loadedModel = Word2VecModel.Load(savePath);
Assert.Equal(model.Uid(), loadedModel.Uid());
}

FeatureBaseTests<Word2VecModel>.TestBase(model, "maxIter", 2);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ public void TestWord2Vec()
Word2Vec loadedWord2Vec = Word2Vec.Load(savePath);
Assert.Equal(word2vec.Uid(), loadedWord2Vec.Uid());
}

FeatureBaseTests<Word2Vec>.TestBase(word2vec, "maxIter", 2);
}
}
}
145 changes: 145 additions & 0 deletions src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Collections.Generic;
using System.Linq;
using Microsoft.Spark.Interop;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;

namespace Microsoft.Spark.ML.Feature
{
public class FeatureHasher : FeatureBase<FeatureHasher>, IJvmObjectReferenceProvider
{
    private static readonly string s_featureHasherClassName =
        "org.apache.spark.ml.feature.FeatureHasher";

    /// <summary>
    /// Creates a <see cref="FeatureHasher"/> without any parameters.
    /// </summary>
    internal FeatureHasher() : base(s_featureHasherClassName)
    {
    }

    /// <summary>
    /// Creates a <see cref="FeatureHasher"/> with a UID that is used to give the
    /// <see cref="FeatureHasher"/> a unique ID.
    /// </summary>
    /// <param name="uid">An immutable unique ID for the object and its derivatives.</param>
    internal FeatureHasher(string uid) : base(s_featureHasherClassName, uid)
    {
    }

    /// <summary>
    /// Wraps an existing JVM-side FeatureHasher object.
    /// </summary>
    /// <param name="jvmObject">Reference to the JVM object to wrap.</param>
    internal FeatureHasher(JvmObjectReference jvmObject) : base(jvmObject)
    {
    }

    JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

    /// <summary>
    /// Loads the <see cref="FeatureHasher"/> that was previously saved using Save.
    /// </summary>
    /// <param name="path">
    /// The path the previous <see cref="FeatureHasher"/> was saved to.
    /// </param>
    /// <returns>New <see cref="FeatureHasher"/> object</returns>
    public static FeatureHasher Load(string path) =>
        WrapAsFeatureHasher(
            SparkEnvironment.JvmBridge.CallStaticJavaMethod(
                s_featureHasherClassName, "load", path));

    /// <summary>
    /// Gets a list of the columns which have been specified as categorical columns.
    /// </summary>
    /// <returns>List of categorical columns, set by SetCategoricalCols</returns>
    public IEnumerable<string> GetCategoricalCols() =>
        ((string[])_jvmObject.Invoke("getCategoricalCols")).ToList();

    /// <summary>
    /// Marks columns as categorical columns.
    /// </summary>
    /// <param name="value">List of column names to mark as categorical columns</param>
    /// <returns>New <see cref="FeatureHasher"/> object</returns>
    public FeatureHasher SetCategoricalCols(IEnumerable<string> value) =>
        WrapAsFeatureHasher(_jvmObject.Invoke("setCategoricalCols", value));

    /// <summary>
    /// Gets the columns that the <see cref="FeatureHasher"/> should read from and convert into
    /// hashes. This would have been set by SetInputCol.
    /// </summary>
    /// <returns>IEnumerable&lt;string&gt;, the input columns</returns>
    public IEnumerable<string> GetInputCols() => (string[])_jvmObject.Invoke("getInputCols");

    /// <summary>
    /// Sets the columns that the <see cref="FeatureHasher"/> should read from and convert into
    /// hashes.
    /// </summary>
    /// <param name="value">The name of the column to as use the source of the hash</param>
    /// <returns>New <see cref="FeatureHasher"/> object</returns>
    public FeatureHasher SetInputCols(IEnumerable<string> value) =>
        WrapAsFeatureHasher(_jvmObject.Invoke("setInputCols", value));

    /// <summary>
    /// Gets the number of features that should be used. Since a simple modulo is used to
    /// transform the hash function to a column index, it is advisable to use a power of two
    /// as the numFeatures parameter; otherwise the features will not be mapped evenly to the
    /// columns.
    /// </summary>
    /// <returns>The number of features to be used</returns>
    public int GetNumFeatures() => (int)_jvmObject.Invoke("getNumFeatures");

    /// <summary>
    /// Sets the number of features that should be used. Since a simple modulo is used to
    /// transform the hash function to a column index, it is advisable to use a power of two as
    /// the numFeatures parameter; otherwise the features will not be mapped evenly to the
    /// columns.
    /// </summary>
    /// <param name="value">The number of features to use, ideally a power of two</param>
    /// <returns>New <see cref="FeatureHasher"/> object</returns>
    public FeatureHasher SetNumFeatures(int value) =>
        WrapAsFeatureHasher(_jvmObject.Invoke("setNumFeatures", value));

    /// <summary>
    /// Gets the name of the column the output data will be written to. This is set by
    /// SetOutputCol.
    /// </summary>
    /// <returns>string, the output column</returns>
    public string GetOutputCol() => (string)_jvmObject.Invoke("getOutputCol");

    /// <summary>
    /// The <see cref="FeatureHasher"/> will create a new column in the DataFrame, this is the
    /// name of the new column.
    /// </summary>
    /// <param name="value">The name of the new column which will contain the hash</param>
    /// <returns>New <see cref="FeatureHasher"/> object</returns>
    public FeatureHasher SetOutputCol(string value) =>
        WrapAsFeatureHasher(_jvmObject.Invoke("setOutputCol", value));

    /// <summary>
    /// Transforms the input <see cref="DataFrame"/>. It is recommended that you validate that
    /// the transform will succeed by calling TransformSchema.
    /// </summary>
    /// <param name="value">Input <see cref="DataFrame"/> to transform</param>
    /// <returns>Transformed <see cref="DataFrame"/></returns>
    public DataFrame Transform(DataFrame value) =>
        new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", value));

    /// <summary>
    /// Check transform validity and derive the output schema from the input schema.
    ///
    /// This checks for validity of interactions between parameters during Transform and
    /// raises an exception if any parameter value is invalid.
    ///
    /// Typical implementation should first conduct verification on schema change and parameter
    /// validity, including complex parameter interaction checks.
    /// </summary>
    /// <param name="value">
    /// The <see cref="StructType"/> of the <see cref="DataFrame"/> which will be transformed.
    /// </param>
    /// <returns>
    /// The <see cref="StructType"/> of the output schema that would have been derived from the
    /// input schema, if Transform had been called.
    /// </returns>
    public StructType TransformSchema(StructType value) =>
        new StructType(
            (JvmObjectReference)_jvmObject.Invoke("transformSchema",
                DataType.FromJson(_jvmObject.Jvm, value.Json)));

    // Centralizes the cast from the raw JVM invocation result to a typed wrapper.
    private static FeatureHasher WrapAsFeatureHasher(object obj) =>
        new FeatureHasher((JvmObjectReference)obj);
}
}