ML TF-IDF #394

Merged
68 commits merged on Mar 25, 2020
Commits
45404bb
bare bones bucketizer
Dec 16, 2019
a56db1d
Merge branch 'master' of github.com:dotnet/spark
Dec 29, 2019
95d0014
implement bucketizer
Dec 29, 2019
fb2d019
first tests
Dec 29, 2019
d759e60
multi column tests
Dec 29, 2019
160fbf4
Merge branch 'master' into bucketizer-ml-313
GoEddie Jan 7, 2020
97ef668
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
4543974
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
fd18cf4
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
64551c9
Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
GoEddie Jan 8, 2020
fb70f40
tidying
Jan 8, 2020
119f14d
Merge branch 'bucketizer-ml-313' of github.com:GoEddie/spark into buc…
Jan 8, 2020
9891847
changes after review
Jan 8, 2020
e2ce736
TF-IDF
Jan 9, 2020
3cc3f8d
removing step
Jan 9, 2020
6cfd0e4
single test for IDF and IDFModel
Jan 9, 2020
d466ea2
Merge branch 'master' of github.com:dotnet/spark into bucketizer-ml-313
Jan 13, 2020
633a843
SerDe to handle double[][] for Bucketizer
Jan 13, 2020
f4ecbb0
remove DoubleArrayArrayParam
Jan 13, 2020
b3d4d0f
SerDe for double[][]
Jan 13, 2020
500e7ad
spacing as per other fields
Jan 13, 2020
298f4ec
formatting
Jan 13, 2020
72d36fd
adding getters to tests
Jan 13, 2020
696186c
rollback
Jan 13, 2020
33699ea
Apply suggestions from code review
GoEddie Jan 15, 2020
5b80606
Fixing comments after review
Jan 15, 2020
e771f86
Merge branch 'master' of github.com:dotnet/spark into bucketizer-ml-313
Jan 15, 2020
2f2827a
Merge branch 'bucketizer-ml-313' into ml/HashingTF
Jan 15, 2020
6c12e6a
fixes after review
Jan 15, 2020
a03ddc6
Merge branch 'bucketizer-ml-313' into ml/HashingTF
Jan 15, 2020
dc7bf4b
wip
Jan 15, 2020
283f8ea
Hashing TF from ml not mllib
Jan 15, 2020
9d0f7ea
tests for HashingTF
Jan 16, 2020
107e01b
adding tests
Jan 16, 2020
33e50f2
formatting
Jan 18, 2020
d85ca33
removing project, in spark main project
Jan 18, 2020
3af69f9
merge
Jan 18, 2020
c15ad6b
merge
Jan 18, 2020
5c358d1
testing
Jan 18, 2020
9234dba
formatting
Jan 18, 2020
a524396
tidying:
Jan 18, 2020
fa9c065
removing change
Jan 18, 2020
13adf7b
removing change
Jan 18, 2020
9147c12
docs
Jan 18, 2020
90937a8
formatting
Jan 18, 2020
c5b604a
Merge branch 'master' of github.com:dotnet/spark into ml/HashingTF
Jan 27, 2020
c28ac3f
Merge branch 'master' into ml/HashingTF
GoEddie Jan 29, 2020
6675137
Merge branch 'master' into ml/HashingTF
imback82 Feb 3, 2020
adca1d6
Apply suggestions from code review
GoEddie Feb 5, 2020
44a4bb5
adding datatype udf where sqlType is available
Feb 7, 2020
f52f6eb
changes from code review
Feb 7, 2020
f425f29
Merge branch 'master' into ml/HashingTF
GoEddie Feb 9, 2020
15bae3e
feedback from review
Mar 2, 2020
30e95ad
Merge branch 'ml/HashingTF' of github.com:GoEddie/spark into ml/Hashi…
Mar 2, 2020
d55140d
Merge branch 'master' of github.com:dotnet/spark into ml/HashingTF
Mar 2, 2020
64066a5
fixes from feedback
Mar 2, 2020
37cf616
reverting fix for ArrayType
Mar 3, 2020
cd07e56
params comments
Mar 3, 2020
d12f348
Merge branch 'master' of github.com:dotnet/spark into ml/HashingTF
Mar 5, 2020
f36fc12
Merge branch 'master' into ml/HashingTF
imback82 Mar 8, 2020
5c9c2b6
Merge branch 'master' of github.com:dotnet/spark into ml/HashingTF
Mar 23, 2020
573fc1a
formatting and comments from feedback
Mar 23, 2020
4cd86e3
typo ideModel and not idfModel
Mar 23, 2020
57729ee
cant use var here
Mar 23, 2020
da7660e
cant use var here
Mar 23, 2020
22ff5e5
formatting from feedback
Mar 25, 2020
1ecb215
Merge branch 'master' of github.com:dotnet/spark into ml/HashingTF
Mar 25, 2020
8e665a3
Merge branch 'master' into ml/HashingTF
imback82 Mar 25, 2020
Files changed
BucketizerTests.cs
@@ -3,6 +3,8 @@
// See the LICENSE file in the project root for more information.

using System.Collections.Generic;
using System.IO;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;
@@ -47,6 +49,15 @@ public void TestBucketizer()
Assert.Equal(expectedInputCol, bucketizer.GetInputCol());
Assert.Equal(expectedOutputCol, bucketizer.GetOutputCol());
Assert.Equal(expectedSplits, bucketizer.GetSplits());

using (var tempDirectory = new TemporaryDirectory())
{
string savePath = Path.Join(tempDirectory.Path, "bucket");
bucketizer.Save(savePath);

Bucketizer loadedBucketizer = Bucketizer.Load(savePath);
Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid());
}
}

[Fact]
HashingTFTests.cs
@@ -0,0 +1,65 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class HashingTFTests
{
private readonly SparkSession _spark;

public HashingTFTests(SparkFixture fixture)
{
_spark = fixture.Spark;
}

[Fact]
public void TestHashingTF()
{
string expectedInputCol = "input_col";
string expectedOutputCol = "output_col";
int expectedFeatures = 10;

Assert.IsType<HashingTF>(new HashingTF());

HashingTF hashingTf = new HashingTF("my-unique-id")
.SetNumFeatures(expectedFeatures)
.SetInputCol(expectedInputCol)
.SetOutputCol(expectedOutputCol);

Assert.Equal(expectedFeatures, hashingTf.GetNumFeatures());
Assert.Equal(expectedInputCol, hashingTf.GetInputCol());
Assert.Equal(expectedOutputCol, hashingTf.GetOutputCol());

DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" +
" as input_col");

DataFrame output = hashingTf.Transform(input);
DataFrame outputVector = output.Select(expectedOutputCol);

Assert.Contains(expectedOutputCol, outputVector.Columns());

using (var tempDirectory = new TemporaryDirectory())
{
string savePath = Path.Join(tempDirectory.Path, "hashingTF");
hashingTf.Save(savePath);

HashingTF loadedHashingTf = HashingTF.Load(savePath);
Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid());
}

hashingTf.SetBinary(true);
Assert.True(hashingTf.GetBinary());
}
}
}
IDFModelTests.cs
@@ -0,0 +1,70 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.IO;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class IDFModelTests
{
private readonly SparkSession _spark;

public IDFModelTests(SparkFixture fixture)
{
_spark = fixture.Spark;
}

[Fact]
public void TestIDFModel()
{
int expectedDocFrequency = 1980;
string expectedInputCol = "rawFeatures";
string expectedOutputCol = "features";

DataFrame sentenceData =
_spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

Tokenizer tokenizer = new Tokenizer()
.SetInputCol("sentence")
.SetOutputCol("words");

DataFrame wordsData = tokenizer.Transform(sentenceData);

HashingTF hashingTF = new HashingTF()
.SetInputCol("words")
.SetOutputCol(expectedInputCol)
.SetNumFeatures(20);

DataFrame featurizedData = hashingTF.Transform(wordsData);

IDF idf = new IDF()
.SetInputCol(expectedInputCol)
.SetOutputCol(expectedOutputCol)
.SetMinDocFreq(expectedDocFrequency);

IDFModel idfModel = idf.Fit(featurizedData);

DataFrame rescaledData = idfModel.Transform(featurizedData);
Assert.Contains(expectedOutputCol, rescaledData.Columns());

Assert.Equal(expectedInputCol, idfModel.GetInputCol());
Assert.Equal(expectedOutputCol, idfModel.GetOutputCol());
Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq());

using (var tempDirectory = new TemporaryDirectory())
{
string modelPath = Path.Join(tempDirectory.Path, "idfModel");
idfModel.Save(modelPath);

IDFModel loadedModel = IDFModel.Load(modelPath);
Assert.Equal(idfModel.Uid(), loadedModel.Uid());
}
}
}
}
49 changes: 49 additions & 0 deletions src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
@@ -0,0 +1,49 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.IO;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class IDFTests
{
private readonly SparkSession _spark;

public IDFTests(SparkFixture fixture)
{
_spark = fixture.Spark;
}

[Fact]
public void TestIDFModel()
{
string expectedInputCol = "rawFeatures";
string expectedOutputCol = "features";
int expectedDocFrequency = 100;

IDF idf = new IDF()
.SetInputCol(expectedInputCol)
.SetOutputCol(expectedOutputCol)
.SetMinDocFreq(expectedDocFrequency);

Assert.Equal(expectedInputCol, idf.GetInputCol());
Assert.Equal(expectedOutputCol, idf.GetOutputCol());
Assert.Equal(expectedDocFrequency, idf.GetMinDocFreq());

using (var tempDirectory = new TemporaryDirectory())
{
string savePath = Path.Join(tempDirectory.Path, "IDF");
idf.Save(savePath);

IDF loadedIdf = IDF.Load(savePath);
Assert.Equal(idf.Uid(), loadedIdf.Uid());
}
}
}
}
TokenizerTests.cs
@@ -0,0 +1,55 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.IO;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature
{
[Collection("Spark E2E Tests")]
public class TokenizerTests
{
private readonly SparkSession _spark;

public TokenizerTests(SparkFixture fixture)
{
_spark = fixture.Spark;
}

[Fact]
public void TestTokenizer()
{
string expectedUid = "theUid";
string expectedInputCol = "input_col";
string expectedOutputCol = "output_col";

DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" +
" from range(100)");

Tokenizer tokenizer = new Tokenizer(expectedUid)
.SetInputCol(expectedInputCol)
.SetOutputCol(expectedOutputCol);

DataFrame output = tokenizer.Transform(input);

Assert.Contains(output.Schema().Fields, (f => f.Name == expectedOutputCol));
Assert.Equal(expectedInputCol, tokenizer.GetInputCol());
Assert.Equal(expectedOutputCol, tokenizer.GetOutputCol());

using (var tempDirectory = new TemporaryDirectory())
{
string savePath = Path.Join(tempDirectory.Path, "Tokenizer");
tokenizer.Save(savePath);

Tokenizer loadedTokenizer = Tokenizer.Load(savePath);
Assert.Equal(tokenizer.Uid(), loadedTokenizer.Uid());
}

Assert.Equal(expectedUid, tokenizer.Uid());
}
}
}
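
For reference, the pieces above compose into a full TF-IDF pipeline. The following is a minimal, hypothetical sketch of a standalone .NET for Apache Spark program using the new classes; it assumes only the fluent API exercised by the tests in this PR plus SparkSession.Builder(), and the app name, column names, and sample data are illustrative rather than taken from the PR.

// Hypothetical end-to-end TF-IDF sketch using the feature classes added in this PR.
// App name, column names, and sample data are illustrative only.
using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;

namespace TfIdfExample
{
    class Program
    {
        static void Main()
        {
            SparkSession spark = SparkSession
                .Builder()
                .AppName("tf-idf-example")
                .GetOrCreate();

            DataFrame sentences = spark.Sql(
                "SELECT 'Hi I heard about Spark' AS sentence " +
                "UNION ALL SELECT 'Spark is written in Scala' AS sentence");

            // Split each sentence into lower-cased words.
            DataFrame words = new Tokenizer()
                .SetInputCol("sentence")
                .SetOutputCol("words")
                .Transform(sentences);

            // Hash the words into a fixed-size term-frequency vector.
            DataFrame rawFeatures = new HashingTF()
                .SetInputCol("words")
                .SetOutputCol("rawFeatures")
                .SetNumFeatures(1024)
                .Transform(words);

            // Fit the IDF model on the term frequencies and rescale them.
            IDFModel idfModel = new IDF()
                .SetInputCol("rawFeatures")
                .SetOutputCol("features")
                .SetMinDocFreq(1)
                .Fit(rawFeatures);

            DataFrame rescaled = idfModel.Transform(rawFeatures);
            rescaled.Select("features").Show();

            spark.Stop();
        }
    }
}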