From c9339144161b3fdffc8734a41a798e0a28f788f2 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Thu, 3 Sep 2020 21:34:49 +0100 Subject: [PATCH 01/17] FeatureHasher --- .../IpcTests/ML/Feature/FeatureHasherTests.cs | 50 ++++++ .../ML/Feature/FeatureHasher.cs | 145 ++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs new file mode 100644 index 000000000..42c224877 --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs @@ -0,0 +1,50 @@ +using System; +using System.Collections.Generic; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Microsoft.Spark.Sql.Types; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class FeatureHasherTests + { + private readonly SparkSession _spark; + + public FeatureHasherTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestFeatureHasher() + { + DataFrame dataFrame = _spark.CreateDataFrame( + new List() + { + new GenericRow(new object[] {2.0D, true, "1", "foo"}), + new GenericRow(new object[] {3.0D, false, "2", "bar"}) + }, + new StructType(new List() + { + new StructField("real", new DoubleType()), + new StructField("bool", new BooleanType()), + new StructField("stringNum", new StringType()), + new StructField("string", new StringType()) + })); + + FeatureHasher hasher = new FeatureHasher() + .SetInputCols(new List() {"real", "bool", "stringNum", "string"}) + .SetOutputCol("features") + .SetCategoricalCols(new List() {"real", "string"}) + .SetNumFeatures(10); + + Assert.IsType>(hasher.GetCategoricalCols()); + Assert.IsType(hasher.GetNumFeatures()); + + Assert.IsType(hasher.TransformSchema(dataFrame.Schema())); + Assert.IsType(hasher.Transform(dataFrame)); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs new file mode 100644 index 000000000..f836fd860 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs @@ -0,0 +1,145 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using System.Linq; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; +using Microsoft.Spark.Sql.Types; + +namespace Microsoft.Spark.ML.Feature +{ + public class FeatureHasher: FeatureBase, IJvmObjectReferenceProvider + { + private static readonly string s_featureHasherClassName = + "org.apache.spark.ml.feature.FeatureHasher"; + + internal FeatureHasher() : base(s_featureHasherClassName) + { + } + + internal FeatureHasher(string uid) : base(s_featureHasherClassName, uid) + { + } + + internal FeatureHasher(JvmObjectReference jvmObject) : base(jvmObject) + { + } + + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Loads the that was previously saved using Save. + /// + /// + /// The path the previous was saved to. + /// + /// New object + public static FeatureHasher Load(string path) => + WrapAsFeatureHasher( + SparkEnvironment.JvmBridge.CallStaticJavaMethod( + s_featureHasherClassName,"load", path)); + + /// + /// Gets a list of the columns which have been specified as categorical columns. + /// + /// List of categorical columns, set by `SetCategoricalCols` + public IEnumerable GetCategoricalCols() => + ((string[])_jvmObject.Invoke("getCategoricalCols")).ToList(); + + /// + /// Marks columns as categorical columns. + /// + /// List of column names to mark as a categorical column + /// New object + public FeatureHasher SetCategoricalCols(IEnumerable value) => + WrapAsFeatureHasher(_jvmObject.Invoke("setCategoricalCols", value)); + + /// + /// Gets the columns that the should read from and convert into + /// hashes. This would have been set by SetInputCol. + /// + /// string, the input column + public string GetInputCols() => (string)_jvmObject.Invoke("getInputCols"); + + /// + /// Sets the column that the should read from and convert into + /// hashes. + /// + /// The name of the column to as the source of the buckets + /// New object + public FeatureHasher SetInputCols(IEnumerable value) => + WrapAsFeatureHasher(_jvmObject.Invoke("setInputCols", value)); + + /// + /// Gets the number of features that should be used. Since a simple modulo is used to + /// transform the hash function to a column index, it is advisable to use a power of two + /// as the numFeatures parameter; otherwise the features will not be mapped evenly to the + /// columns. + /// + /// The number of features to be used + public int GetNumFeatures() => (int)_jvmObject.Invoke("getNumFeatures"); + + /// + /// Sets the number of features that should be used. Since a simple modulo is used to + /// transform the hash function to a column index, it is advisable to use a power of two as + /// the numFeatures parameter; otherwise the features will not be mapped evenly to the + /// columns. + /// + /// int + /// New object + public FeatureHasher SetNumFeatures(int value) => + WrapAsFeatureHasher(_jvmObject.Invoke("setNumFeatures", value)); + + /// + /// Gets the name of the column the output data will be written to. This is set by + /// SetInputCol + /// + /// string, the output column + public string GetOutputCol() => (string)_jvmObject.Invoke("getOutputCol"); + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// The name of the new column which contains the hash + /// New object + public FeatureHasher SetOutputCol(string value) => + WrapAsFeatureHasher(_jvmObject.Invoke("setOutputCol", value)); + + /// + /// Transforms the input . It is recommended that you validate that + /// the transform will succeed by calling `TransformSchema`. + /// + /// Input to transform + /// Transformed + public DataFrame Transform(DataFrame value) => + new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", value)); + + /// + /// Check transform validity and derive the output schema from the input schema. + /// We check validity for interactions between parameters during transformSchema and + /// raise an exception if any parameter value is invalid. Parameter value checks which do + /// not depend on other parameters are handled by Param.validate(). + /// + /// Typical implementation should first conduct verification on schema change and parameter + /// validity, including complex parameter interaction checks. + /// + /// + /// The of the which will be transformed + /// + /// + /// The of the output schema that would have been derived form the + /// input schema, if Transform had been called + /// + public StructType TransformSchema(StructType value) => + new StructType( + (JvmObjectReference)_jvmObject.Invoke("transformSchema", + DataType.FromJson(_jvmObject.Jvm, value.Json))); + + private static FeatureHasher WrapAsFeatureHasher(object obj) => + new FeatureHasher((JvmObjectReference)obj); + } +} From 08011080339fab8aeb1a8d9e189f90e92f15659e Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 4 Sep 2020 08:32:52 +0100 Subject: [PATCH 02/17] tidy --- .../IpcTests/ML/Feature/FeatureHasherTests.cs | 11 ++++++++--- .../Microsoft.Spark/ML/Feature/FeatureHasher.cs | 15 +++++++-------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs index 42c224877..24b0b22ee 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs @@ -1,3 +1,7 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + using System; using System.Collections.Generic; using Microsoft.Spark.ML.Feature; @@ -21,12 +25,12 @@ public FeatureHasherTests(SparkFixture fixture) public void TestFeatureHasher() { DataFrame dataFrame = _spark.CreateDataFrame( - new List() + new List { new GenericRow(new object[] {2.0D, true, "1", "foo"}), new GenericRow(new object[] {3.0D, false, "2", "bar"}) }, - new StructType(new List() + new StructType(new List { new StructField("real", new DoubleType()), new StructField("bool", new BooleanType()), @@ -40,9 +44,10 @@ public void TestFeatureHasher() .SetCategoricalCols(new List() {"real", "string"}) .SetNumFeatures(10); + Assert.IsType(hasher.GetOutputCol()); + Assert.IsType (hasher.GetInputCols()); Assert.IsType>(hasher.GetCategoricalCols()); Assert.IsType(hasher.GetNumFeatures()); - Assert.IsType(hasher.TransformSchema(dataFrame.Schema())); Assert.IsType(hasher.Transform(dataFrame)); } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs index f836fd860..2e31ba6b6 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs @@ -62,7 +62,7 @@ public FeatureHasher SetCategoricalCols(IEnumerable value) => /// hashes. This would have been set by SetInputCol. /// /// string, the input column - public string GetInputCols() => (string)_jvmObject.Invoke("getInputCols"); + public IEnumerable GetInputCols() => (string[])_jvmObject.Invoke("getInputCols"); /// /// Sets the column that the should read from and convert into @@ -95,7 +95,7 @@ public FeatureHasher SetNumFeatures(int value) => /// /// Gets the name of the column the output data will be written to. This is set by - /// SetInputCol + /// SetInputCol. /// /// string, the output column public string GetOutputCol() => (string)_jvmObject.Invoke("getOutputCol"); @@ -120,19 +120,18 @@ public DataFrame Transform(DataFrame value) => /// /// Check transform validity and derive the output schema from the input schema. - /// We check validity for interactions between parameters during transformSchema and - /// raise an exception if any parameter value is invalid. Parameter value checks which do - /// not depend on other parameters are handled by Param.validate(). + /// This checks for validity of interactions between parameters during transformSchema and + /// raises an exception if any parameter value is invalid. /// /// Typical implementation should first conduct verification on schema change and parameter /// validity, including complex parameter interaction checks. /// /// - /// The of the which will be transformed + /// The of the which will be transformed. /// /// - /// The of the output schema that would have been derived form the - /// input schema, if Transform had been called + /// The of the output schema that would have been derived from the + /// input schema, if Transform had been called. /// public StructType TransformSchema(StructType value) => new StructType( From ac262058bf34f8f0b4b9eebebc3f5dc4eb389139 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 4 Sep 2020 08:45:53 +0100 Subject: [PATCH 03/17] tidying comments --- .../Microsoft.Spark/ML/Feature/FeatureHasher.cs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs index 2e31ba6b6..bfcec18d0 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs @@ -45,14 +45,14 @@ public static FeatureHasher Load(string path) => /// /// Gets a list of the columns which have been specified as categorical columns. /// - /// List of categorical columns, set by `SetCategoricalCols` + /// List of categorical columns, set by SetCategoricalCols public IEnumerable GetCategoricalCols() => ((string[])_jvmObject.Invoke("getCategoricalCols")).ToList(); /// /// Marks columns as categorical columns. /// - /// List of column names to mark as a categorical column + /// List of column names to mark as categorical columns /// New object public FeatureHasher SetCategoricalCols(IEnumerable value) => WrapAsFeatureHasher(_jvmObject.Invoke("setCategoricalCols", value)); @@ -68,7 +68,7 @@ public FeatureHasher SetCategoricalCols(IEnumerable value) => /// Sets the column that the should read from and convert into /// hashes. /// - /// The name of the column to as the source of the buckets + /// The name of the column to as use the source of the hash /// New object public FeatureHasher SetInputCols(IEnumerable value) => WrapAsFeatureHasher(_jvmObject.Invoke("setInputCols", value)); @@ -104,14 +104,14 @@ public FeatureHasher SetNumFeatures(int value) => /// The will create a new column in the DataFrame, this is the /// name of the new column. /// - /// The name of the new column which contains the hash + /// The name of the new column which will contain the hash /// New object public FeatureHasher SetOutputCol(string value) => WrapAsFeatureHasher(_jvmObject.Invoke("setOutputCol", value)); /// /// Transforms the input . It is recommended that you validate that - /// the transform will succeed by calling `TransformSchema`. + /// the transform will succeed by calling TransformSchema. /// /// Input to transform /// Transformed @@ -120,7 +120,8 @@ public DataFrame Transform(DataFrame value) => /// /// Check transform validity and derive the output schema from the input schema. - /// This checks for validity of interactions between parameters during transformSchema and + /// + /// This checks for validity of interactions between parameters during Transform and /// raises an exception if any parameter value is invalid. /// /// Typical implementation should first conduct verification on schema change and parameter From d6094e31c49349ebe2f33c3ea57c69d089ed73a0 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Tue, 8 Sep 2020 17:22:12 +0100 Subject: [PATCH 04/17] adding extra test for FeatureBase --- .../IpcTests/ML/Feature/BucketizerTests.cs | 15 ++-------- .../ML/Feature/CountVectorizerModelTests.cs | 4 ++- .../ML/Feature/CountVectorizerTests.cs | 2 ++ .../IpcTests/ML/Feature/FeatureBaseTests.cs | 29 +++++++++++++++++++ .../IpcTests/ML/Feature/FeatureHasherTests.cs | 2 ++ .../IpcTests/ML/Feature/HashingTFTests.cs | 2 ++ .../IpcTests/ML/Feature/IDFModelTests.cs | 2 ++ .../IpcTests/ML/Feature/IDFTests.cs | 2 ++ .../IpcTests/ML/Feature/TokenizerTests.cs | 2 ++ .../IpcTests/ML/Feature/Word2VecModelTests.cs | 2 ++ .../IpcTests/ML/Feature/Word2VecTests.cs | 2 ++ 11 files changed, 50 insertions(+), 14 deletions(-) create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index e9193fd0b..b2d75355a 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -25,7 +25,7 @@ public BucketizerTests(SparkFixture fixture) [Fact] public void TestBucketizer() { - var expectedSplits = new double[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue }; + var expectedSplits = new double[]{ double.MinValue, 0.0, 10.0, 50.0, double.MaxValue }; string expectedHandle = "skip"; string expectedUid = "uid"; @@ -60,18 +60,7 @@ public void TestBucketizer() Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid()); } - Assert.NotEmpty(bucketizer.ExplainParams()); - - Param handleInvalidParam = bucketizer.GetParam("handleInvalid"); - Assert.NotEmpty(handleInvalidParam.Doc); - Assert.NotEmpty(handleInvalidParam.Name); - Assert.Equal(handleInvalidParam.Parent, bucketizer.Uid()); - - Assert.NotEmpty(bucketizer.ExplainParam(handleInvalidParam)); - bucketizer.Set(handleInvalidParam, "keep"); - Assert.Equal("keep", bucketizer.GetHandleInvalid()); - - Assert.Equal("error", bucketizer.Clear(handleInvalidParam).GetHandleInvalid()); + FeatureBaseTests.TestBase(bucketizer, "handleInvalid", "keep"); } [Fact] diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs index 97458d173..a52b13658 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs @@ -22,7 +22,7 @@ public CountVectorizerModelTests(SparkFixture fixture) } /// - /// Test that we can create a CountVectorizerModel, pass in a specifc vocabulary to use + /// Test that we can create a CountVectorizerModel, pass in a specific vocabulary to use /// when creating the model. Verify the standard features methods as well as load/save. /// [Fact] @@ -68,6 +68,8 @@ public void TestCountVectorizerModel() Assert.IsType(countVectorizerModel.GetVocabSize()); Assert.NotEmpty(countVectorizerModel.ExplainParams()); Assert.NotEmpty(countVectorizerModel.ToString()); + + FeatureBaseTests.TestBase(countVectorizerModel, "minDF", 100); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs index 9e022ba69..e9651280d 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs @@ -67,6 +67,8 @@ public void TestCountVectorizer() Assert.NotEmpty(countVectorizer.ExplainParams()); Assert.NotEmpty(countVectorizer.ToString()); + + FeatureBaseTests.TestBase(countVectorizer, "minDF", 0.4); } /// diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs new file mode 100644 index 000000000..566efd0c7 --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs @@ -0,0 +1,29 @@ + +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.ML.Feature.Param; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + public class FeatureBaseTests + { + internal static void TestBase( + FeatureBase testObject, + string paramName, + object paramValue) + { + Assert.NotEmpty(testObject.ExplainParams()); + + Param handleInvalidParam = testObject.GetParam(paramName); + Assert.NotEmpty(handleInvalidParam.Doc); + Assert.NotEmpty(handleInvalidParam.Name); + Assert.Equal(handleInvalidParam.Parent, testObject.Uid()); + + Assert.NotEmpty(testObject.ExplainParam(handleInvalidParam)); + testObject.Set(handleInvalidParam, paramValue); + Assert.IsAssignableFrom(testObject.Clear(handleInvalidParam)); + + Assert.IsType(testObject.Uid()); + } + } +} diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs index 24b0b22ee..8a1c397cc 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs @@ -50,6 +50,8 @@ public void TestFeatureHasher() Assert.IsType(hasher.GetNumFeatures()); Assert.IsType(hasher.TransformSchema(dataFrame.Schema())); Assert.IsType(hasher.Transform(dataFrame)); + + FeatureBaseTests.TestBase(hasher, "numFeatures", 1000); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index df459ed7a..9d3fdf4a8 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -57,6 +57,8 @@ public void TestHashingTF() hashingTf.SetBinary(true); Assert.True(hashingTf.GetBinary()); + + FeatureBaseTests.TestBase(hashingTf, "numFeatures", 1000); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 202187809..3467dea0c 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -65,6 +65,8 @@ public void TestIDFModel() IDFModel loadedModel = IDFModel.Load(modelPath); Assert.Equal(idfModel.Uid(), loadedModel.Uid()); } + + FeatureBaseTests.TestBase(idfModel, "minDocFreq", 1000); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs index 72da97887..6c635fd7d 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs @@ -44,6 +44,8 @@ public void TestIDFModel() IDF loadedIdf = IDF.Load(savePath); Assert.Equal(idf.Uid(), loadedIdf.Uid()); } + + FeatureBaseTests.TestBase(idf, "minDocFreq", 1000); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index 4b1998f50..5c8ab9a8c 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -50,6 +50,8 @@ public void TestTokenizer() } Assert.Equal(expectedUid, tokenizer.Uid()); + + FeatureBaseTests.TestBase(tokenizer, "inputCol", "input_col"); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecModelTests.cs index a5227149b..a64a70512 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecModelTests.cs @@ -47,6 +47,8 @@ public void TestWord2VecModel() Word2VecModel loadedModel = Word2VecModel.Load(savePath); Assert.Equal(model.Uid(), loadedModel.Uid()); } + + FeatureBaseTests.TestBase(model, "maxIter", 2); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecTests.cs index 1d5da5335..a117d3426 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecTests.cs @@ -67,6 +67,8 @@ public void TestWord2Vec() Word2Vec loadedWord2Vec = Word2Vec.Load(savePath); Assert.Equal(word2vec.Uid(), loadedWord2Vec.Uid()); } + + FeatureBaseTests.TestBase(word2vec, "maxIter", 2); } } } From ffd0cfc00f0cc484269ff434000cae3d49583c80 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Tue, 8 Sep 2020 19:47:29 +0100 Subject: [PATCH 05/17] Trigger Build From a0356cc836b8f47976900d37042e32337a50db64 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Tue, 8 Sep 2020 20:15:18 +0100 Subject: [PATCH 06/17] missing file header --- .../IpcTests/ML/Feature/FeatureBaseTests.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs index 566efd0c7..3daae39cc 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs @@ -1,3 +1,6 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. using Microsoft.Spark.ML.Feature; using Microsoft.Spark.ML.Feature.Param; @@ -5,7 +8,7 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { - public class FeatureBaseTests + public static class FeatureBaseTests { internal static void TestBase( FeatureBase testObject, From 7921e049d8393229ac82b12dba304aac9eaaf88a Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Tue, 8 Sep 2020 20:19:23 +0100 Subject: [PATCH 07/17] comments --- src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs index bfcec18d0..a312318fa 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs @@ -61,11 +61,11 @@ public FeatureHasher SetCategoricalCols(IEnumerable value) => /// Gets the columns that the should read from and convert into /// hashes. This would have been set by SetInputCol. /// - /// string, the input column + /// IEnumerable<string>, the input columns public IEnumerable GetInputCols() => (string[])_jvmObject.Invoke("getInputCols"); /// - /// Sets the column that the should read from and convert into + /// Sets the columns that the should read from and convert into /// hashes. /// /// The name of the column to as use the source of the hash From b5fcee270221d4560b4ded8b73454d3a8f319e2c Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Tue, 8 Sep 2020 20:21:21 +0100 Subject: [PATCH 08/17] naming better --- .../IpcTests/ML/Feature/FeatureBaseTests.cs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs index 3daae39cc..413b84a98 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs @@ -17,14 +17,14 @@ internal static void TestBase( { Assert.NotEmpty(testObject.ExplainParams()); - Param handleInvalidParam = testObject.GetParam(paramName); - Assert.NotEmpty(handleInvalidParam.Doc); - Assert.NotEmpty(handleInvalidParam.Name); - Assert.Equal(handleInvalidParam.Parent, testObject.Uid()); + Param param = testObject.GetParam(paramName); + Assert.NotEmpty(param.Doc); + Assert.NotEmpty(param.Name); + Assert.Equal(param.Parent, testObject.Uid()); - Assert.NotEmpty(testObject.ExplainParam(handleInvalidParam)); - testObject.Set(handleInvalidParam, paramValue); - Assert.IsAssignableFrom(testObject.Clear(handleInvalidParam)); + Assert.NotEmpty(testObject.ExplainParam(param)); + testObject.Set(param, paramValue); + Assert.IsAssignableFrom(testObject.Clear(param)); Assert.IsType(testObject.Uid()); } From 01d40f8299806e6a891c2eeb346c58c8548b3e8d Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Tue, 8 Sep 2020 22:08:54 +0100 Subject: [PATCH 09/17] indentation --- .../IpcTests/ML/Feature/BucketizerTests.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index b2d75355a..d915742e8 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -25,7 +25,8 @@ public BucketizerTests(SparkFixture fixture) [Fact] public void TestBucketizer() { - var expectedSplits = new double[]{ double.MinValue, 0.0, 10.0, 50.0, double.MaxValue }; + var expectedSplits = + new double[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue }; string expectedHandle = "skip"; string expectedUid = "uid"; From 89694dcf9616bebe900e3984a18da193deb252cd Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 9 Sep 2020 06:59:12 +0100 Subject: [PATCH 10/17] changes after feedback --- .../IpcTests/ML/Feature/BucketizerTests.cs | 8 ++++---- .../IpcTests/ML/Feature/FeatureBaseTests.cs | 6 ++++++ .../IpcTests/ML/Feature/FeatureHasherTests.cs | 6 +++++- src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs | 9 ++++----- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index d915742e8..30d7f1824 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -28,10 +28,10 @@ public void TestBucketizer() var expectedSplits = new double[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue }; - string expectedHandle = "skip"; - string expectedUid = "uid"; - string expectedInputCol = "input_col"; - string expectedOutputCol = "output_col"; + const string expectedHandle = "skip"; + const string expectedUid = "uid"; + const string expectedInputCol = "input_col"; + const string expectedOutputCol = "output_col"; var bucketizer = new Bucketizer(expectedUid); bucketizer.SetInputCol(expectedInputCol) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs index 413b84a98..cc1732c18 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs @@ -10,6 +10,12 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { public static class FeatureBaseTests { + /// + /// Tests the common functionality across all ML.Feature classes. + /// + /// The object that implemented FeatureBase + /// The name of a parameter that can be set on this object + /// A parameter value that can be set on this object internal static void TestBase( FeatureBase testObject, string paramName, diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs index 8a1c397cc..39cfe4f14 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs @@ -20,7 +20,11 @@ public FeatureHasherTests(SparkFixture fixture) { _spark = fixture.Spark; } - + + /// + /// Create a , create a and test the + /// available methods. Test the FeatureBase methods using . + /// [Fact] public void TestFeatureHasher() { diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs index a312318fa..f36dc0854 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs @@ -61,7 +61,7 @@ public FeatureHasher SetCategoricalCols(IEnumerable value) => /// Gets the columns that the should read from and convert into /// hashes. This would have been set by SetInputCol. /// - /// IEnumerable<string>, the input columns + /// List of the input columns, set by SetInputCols public IEnumerable GetInputCols() => (string[])_jvmObject.Invoke("getInputCols"); /// @@ -88,7 +88,7 @@ public FeatureHasher SetInputCols(IEnumerable value) => /// the numFeatures parameter; otherwise the features will not be mapped evenly to the /// columns. /// - /// int + /// int value of number of features /// New object public FeatureHasher SetNumFeatures(int value) => WrapAsFeatureHasher(_jvmObject.Invoke("setNumFeatures", value)); @@ -101,8 +101,7 @@ public FeatureHasher SetNumFeatures(int value) => public string GetOutputCol() => (string)_jvmObject.Invoke("getOutputCol"); /// - /// The will create a new column in the DataFrame, this is the - /// name of the new column. + /// Sets the name of the new column in the created by Transform. /// /// The name of the new column which will contain the hash /// New object @@ -122,7 +121,7 @@ public DataFrame Transform(DataFrame value) => /// Check transform validity and derive the output schema from the input schema. /// /// This checks for validity of interactions between parameters during Transform and - /// raises an exception if any parameter value is invalid. + /// raises an exception if any parameter value is invalid. /// /// Typical implementation should first conduct verification on schema change and parameter /// validity, including complex parameter interaction checks. From 2d8eaa1bbab8fb3753f1408e76097054c88cd3f5 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 9 Sep 2020 07:00:51 +0100 Subject: [PATCH 11/17] test summary --- .../IpcTests/ML/Feature/BucketizerTests.cs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 30d7f1824..bb7ae410a 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -22,6 +22,10 @@ public BucketizerTests(SparkFixture fixture) _spark = fixture.Spark; } + /// + /// Create a , create a and test the + /// available methods. Test the FeatureBase methods using . + /// [Fact] public void TestBucketizer() { From bad829a95743bd42a3be00d42972919efa59761f Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 9 Sep 2020 07:50:16 +0100 Subject: [PATCH 12/17] trigger build From 02f06a94b718c03af209156fa0bc8271950e878d Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Tue, 29 Sep 2020 20:47:41 +0100 Subject: [PATCH 13/17] changes after review --- .../IpcTests/ML/Feature/BucketizerTests.cs | 6 +++--- .../IpcTests/ML/Feature/CountVectorizerModelTests.cs | 6 +++--- .../IpcTests/ML/Feature/CountVectorizerTests.cs | 6 +++--- .../IpcTests/ML/Feature/FeatureBaseTests.cs | 12 ++++++++++-- .../IpcTests/ML/Feature/FeatureHasherTests.cs | 12 ++++++------ .../IpcTests/ML/Feature/HashingTFTests.cs | 6 +++--- .../IpcTests/ML/Feature/IDFModelTests.cs | 6 +++--- .../IpcTests/ML/Feature/IDFTests.cs | 6 +++--- .../IpcTests/ML/Feature/TokenizerTests.cs | 6 +++--- .../IpcTests/ML/Feature/Word2VecModelTests.cs | 6 +++--- .../IpcTests/ML/Feature/Word2VecTests.cs | 6 +++--- .../Microsoft.Spark/ML/Feature/FeatureHasher.cs | 9 ++++++--- 12 files changed, 49 insertions(+), 38 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index bb7ae410a..6f281c1ca 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -13,11 +13,11 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { [Collection("Spark E2E Tests")] - public class BucketizerTests + public class BucketizerTests : FeatureBaseTests { private readonly SparkSession _spark; - public BucketizerTests(SparkFixture fixture) + public BucketizerTests(SparkFixture fixture) : base(fixture) { _spark = fixture.Spark; } @@ -65,7 +65,7 @@ public void TestBucketizer() Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid()); } - FeatureBaseTests.TestBase(bucketizer, "handleInvalid", "keep"); + TestFeatureBase(bucketizer, "handleInvalid", "keep"); } [Fact] diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs index a52b13658..e8ea1ade4 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs @@ -12,11 +12,11 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { [Collection("Spark E2E Tests")] - public class CountVectorizerModelTests + public class CountVectorizerModelTests : FeatureBaseTests { private readonly SparkSession _spark; - public CountVectorizerModelTests(SparkFixture fixture) + public CountVectorizerModelTests(SparkFixture fixture) : base(fixture) { _spark = fixture.Spark; } @@ -69,7 +69,7 @@ public void TestCountVectorizerModel() Assert.NotEmpty(countVectorizerModel.ExplainParams()); Assert.NotEmpty(countVectorizerModel.ToString()); - FeatureBaseTests.TestBase(countVectorizerModel, "minDF", 100); + TestFeatureBase(countVectorizerModel, "minDF", 100); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs index e9651280d..5d046dc87 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs @@ -13,11 +13,11 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { [Collection("Spark E2E Tests")] - public class CountVectorizerTests + public class CountVectorizerTests : FeatureBaseTests { private readonly SparkSession _spark; - public CountVectorizerTests(SparkFixture fixture) + public CountVectorizerTests(SparkFixture fixture) : base(fixture) { _spark = fixture.Spark; } @@ -68,7 +68,7 @@ public void TestCountVectorizer() Assert.NotEmpty(countVectorizer.ExplainParams()); Assert.NotEmpty(countVectorizer.ToString()); - FeatureBaseTests.TestBase(countVectorizer, "minDF", 0.4); + TestFeatureBase(countVectorizer, "minDF", 0.4); } /// diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs index cc1732c18..01903e510 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureBaseTests.cs @@ -4,19 +4,27 @@ using Microsoft.Spark.ML.Feature; using Microsoft.Spark.ML.Feature.Param; +using Microsoft.Spark.Sql; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { - public static class FeatureBaseTests + public class FeatureBaseTests { + private readonly SparkSession _spark; + + protected FeatureBaseTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + /// /// Tests the common functionality across all ML.Feature classes. /// /// The object that implemented FeatureBase /// The name of a parameter that can be set on this object /// A parameter value that can be set on this object - internal static void TestBase( + public void TestFeatureBase( FeatureBase testObject, string paramName, object paramValue) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs index 39cfe4f14..2faac108d 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs @@ -12,11 +12,11 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { [Collection("Spark E2E Tests")] - public class FeatureHasherTests + public class FeatureHasherTests : FeatureBaseTests { private readonly SparkSession _spark; - public FeatureHasherTests(SparkFixture fixture) + public FeatureHasherTests(SparkFixture fixture) : base(fixture) { _spark = fixture.Spark; } @@ -43,19 +43,19 @@ public void TestFeatureHasher() })); FeatureHasher hasher = new FeatureHasher() - .SetInputCols(new List() {"real", "bool", "stringNum", "string"}) + .SetInputCols(new List() { "real", "bool", "stringNum", "string" }) .SetOutputCol("features") .SetCategoricalCols(new List() {"real", "string"}) .SetNumFeatures(10); Assert.IsType(hasher.GetOutputCol()); - Assert.IsType (hasher.GetInputCols()); - Assert.IsType>(hasher.GetCategoricalCols()); + Assert.IsType(hasher.GetInputCols()); + Assert.IsType(hasher.GetCategoricalCols()); Assert.IsType(hasher.GetNumFeatures()); Assert.IsType(hasher.TransformSchema(dataFrame.Schema())); Assert.IsType(hasher.Transform(dataFrame)); - FeatureBaseTests.TestBase(hasher, "numFeatures", 1000); + TestFeatureBase(hasher, "numFeatures", 1000); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 9d3fdf4a8..246b4516e 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -11,11 +11,11 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { [Collection("Spark E2E Tests")] - public class HashingTFTests + public class HashingTFTests : FeatureBaseTests { private readonly SparkSession _spark; - public HashingTFTests(SparkFixture fixture) + public HashingTFTests(SparkFixture fixture) : base(fixture) { _spark = fixture.Spark; } @@ -58,7 +58,7 @@ public void TestHashingTF() hashingTf.SetBinary(true); Assert.True(hashingTf.GetBinary()); - FeatureBaseTests.TestBase(hashingTf, "numFeatures", 1000); + TestFeatureBase(hashingTf, "numFeatures", 1000); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 3467dea0c..1894373a6 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -11,11 +11,11 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { [Collection("Spark E2E Tests")] - public class IDFModelTests + public class IDFModelTests : FeatureBaseTests { private readonly SparkSession _spark; - public IDFModelTests(SparkFixture fixture) + public IDFModelTests(SparkFixture fixture) : base(fixture) { _spark = fixture.Spark; } @@ -66,7 +66,7 @@ public void TestIDFModel() Assert.Equal(idfModel.Uid(), loadedModel.Uid()); } - FeatureBaseTests.TestBase(idfModel, "minDocFreq", 1000); + TestFeatureBase(idfModel, "minDocFreq", 1000); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs index 6c635fd7d..64698ac9a 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs @@ -11,11 +11,11 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { [Collection("Spark E2E Tests")] - public class IDFTests + public class IDFTests : FeatureBaseTests { private readonly SparkSession _spark; - public IDFTests(SparkFixture fixture) + public IDFTests(SparkFixture fixture) : base(fixture) { _spark = fixture.Spark; } @@ -45,7 +45,7 @@ public void TestIDFModel() Assert.Equal(idf.Uid(), loadedIdf.Uid()); } - FeatureBaseTests.TestBase(idf, "minDocFreq", 1000); + TestFeatureBase(idf, "minDocFreq", 1000); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index 5c8ab9a8c..af76ac523 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -11,11 +11,11 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { [Collection("Spark E2E Tests")] - public class TokenizerTests + public class TokenizerTests : FeatureBaseTests { private readonly SparkSession _spark; - public TokenizerTests(SparkFixture fixture) + public TokenizerTests(SparkFixture fixture) : base(fixture) { _spark = fixture.Spark; } @@ -51,7 +51,7 @@ public void TestTokenizer() Assert.Equal(expectedUid, tokenizer.Uid()); - FeatureBaseTests.TestBase(tokenizer, "inputCol", "input_col"); + TestFeatureBase(tokenizer, "inputCol", "input_col"); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecModelTests.cs index a64a70512..04c7d7a79 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecModelTests.cs @@ -11,11 +11,11 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { [Collection("Spark E2E Tests")] - public class Word2VecModelTests + public class Word2VecModelTests : FeatureBaseTests { private readonly SparkSession _spark; - public Word2VecModelTests(SparkFixture fixture) + public Word2VecModelTests(SparkFixture fixture) : base(fixture) { _spark = fixture.Spark; } @@ -48,7 +48,7 @@ public void TestWord2VecModel() Assert.Equal(model.Uid(), loadedModel.Uid()); } - FeatureBaseTests.TestBase(model, "maxIter", 2); + TestFeatureBase(model, "maxIter", 2); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecTests.cs index a117d3426..1c36eb2c2 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/Word2VecTests.cs @@ -11,11 +11,11 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { [Collection("Spark E2E Tests")] - public class Word2VecTests + public class Word2VecTests : FeatureBaseTests { private readonly SparkSession _spark; - public Word2VecTests(SparkFixture fixture) + public Word2VecTests(SparkFixture fixture) : base(fixture) { _spark = fixture.Spark; } @@ -68,7 +68,7 @@ public void TestWord2Vec() Assert.Equal(word2vec.Uid(), loadedWord2Vec.Uid()); } - FeatureBaseTests.TestBase(word2vec, "maxIter", 2); + TestFeatureBase(word2vec, "maxIter", 2); } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs index f36dc0854..fb89b1051 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureHasher.cs @@ -40,14 +40,16 @@ internal FeatureHasher(JvmObjectReference jvmObject) : base(jvmObject) public static FeatureHasher Load(string path) => WrapAsFeatureHasher( SparkEnvironment.JvmBridge.CallStaticJavaMethod( - s_featureHasherClassName,"load", path)); + s_featureHasherClassName, + "load", + path)); /// /// Gets a list of the columns which have been specified as categorical columns. /// /// List of categorical columns, set by SetCategoricalCols public IEnumerable GetCategoricalCols() => - ((string[])_jvmObject.Invoke("getCategoricalCols")).ToList(); + (string[])_jvmObject.Invoke("getCategoricalCols"); /// /// Marks columns as categorical columns. @@ -135,7 +137,8 @@ public DataFrame Transform(DataFrame value) => /// public StructType TransformSchema(StructType value) => new StructType( - (JvmObjectReference)_jvmObject.Invoke("transformSchema", + (JvmObjectReference)_jvmObject.Invoke( + "transformSchema", DataType.FromJson(_jvmObject.Jvm, value.Json))); private static FeatureHasher WrapAsFeatureHasher(object obj) => From ecb9e5f66424d313b23a7ff79141b656c875d1fa Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Tue, 29 Sep 2020 20:23:35 +0000 Subject: [PATCH 14/17] formatting --- .../IpcTests/ML/Feature/FeatureHasherTests.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs index 2faac108d..50fecd630 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs @@ -15,12 +15,12 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature public class FeatureHasherTests : FeatureBaseTests { private readonly SparkSession _spark; - + public FeatureHasherTests(SparkFixture fixture) : base(fixture) { _spark = fixture.Spark; } - + /// /// Create a , create a and test the /// available methods. Test the FeatureBase methods using . @@ -45,7 +45,7 @@ public void TestFeatureHasher() FeatureHasher hasher = new FeatureHasher() .SetInputCols(new List() { "real", "bool", "stringNum", "string" }) .SetOutputCol("features") - .SetCategoricalCols(new List() {"real", "string"}) + .SetCategoricalCols(new List() { "real", "string" }) .SetNumFeatures(10); Assert.IsType(hasher.GetOutputCol()); @@ -54,7 +54,7 @@ public void TestFeatureHasher() Assert.IsType(hasher.GetNumFeatures()); Assert.IsType(hasher.TransformSchema(dataFrame.Schema())); Assert.IsType(hasher.Transform(dataFrame)); - + TestFeatureBase(hasher, "numFeatures", 1000); } } From b28c1a732f846281758bb44d3a04942d9c99f74e Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Tue, 29 Sep 2020 20:26:36 +0000 Subject: [PATCH 15/17] formatting --- .../IpcTests/ML/Feature/FeatureHasherTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs index 50fecd630..fe169a9f0 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/FeatureHasherTests.cs @@ -31,8 +31,8 @@ public void TestFeatureHasher() DataFrame dataFrame = _spark.CreateDataFrame( new List { - new GenericRow(new object[] {2.0D, true, "1", "foo"}), - new GenericRow(new object[] {3.0D, false, "2", "bar"}) + new GenericRow(new object[] { 2.0D, true, "1", "foo" }), + new GenericRow(new object[] { 3.0D, false, "2", "bar" }) }, new StructType(new List { From e94a601ed74cc56f6799e714a1840f9fec5f22a4 Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Tue, 29 Sep 2020 20:30:54 +0000 Subject: [PATCH 16/17] Reverting change --- .../IpcTests/ML/Feature/BucketizerTests.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 6f281c1ca..949ba06da 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -32,10 +32,10 @@ public void TestBucketizer() var expectedSplits = new double[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue }; - const string expectedHandle = "skip"; - const string expectedUid = "uid"; - const string expectedInputCol = "input_col"; - const string expectedOutputCol = "output_col"; + string expectedHandle = "skip"; + string expectedUid = "uid"; + string expectedInputCol = "input_col"; + string expectedOutputCol = "output_col"; var bucketizer = new Bucketizer(expectedUid); bucketizer.SetInputCol(expectedInputCol) From 54d55dec7105025505bbf1ae397d7b1d79cc8057 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 2 Oct 2020 08:59:53 +0100 Subject: [PATCH 17/17] retrigger build