From 45404bb0b3f7a784370c1a85bb598ab5fd15924e Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 16 Dec 2019 23:25:30 +0000 Subject: [PATCH 01/47] bare bones bucketizer --- .../Microsoft.Spark.Extensions.ML/Class1.cs | 12 +++ .../Microsoft.Spark.Extensions.ML.csproj | 7 ++ src/csharp/Microsoft.Spark.sln | 7 ++ .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 96 +++++++++++++++++++ .../Microsoft.Spark/Microsoft.Spark.csproj | 5 +- src/csharp/Microsoft.Spark/RDD.cs | 2 +- 6 files changed, 124 insertions(+), 5 deletions(-) create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs new file mode 100644 index 000000000..5874db8d0 --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs @@ -0,0 +1,12 @@ +using System; + +namespace Microsoft.Spark.Extensions.ML +{ + public class Pipeline<T> where T : new() + { + public T Load(string path) + { + return new T(); + } + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj b/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj new file mode 100644 index 000000000..27560206d --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj @@ -0,0 +1,7 @@ +<Project Sdk="Microsoft.NET.Sdk"> + + <PropertyGroup> + <TargetFramework>netstandard2.0</TargetFramework> + </PropertyGroup> + +</Project> diff --git a/src/csharp/Microsoft.Spark.sln b/src/csharp/Microsoft.Spark.sln index b31c377c7..4b76eb777 100644 --- a/src/csharp/Microsoft.Spark.sln +++ b/src/csharp/Microsoft.Spark.sln @@ -33,6 +33,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions. 
EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions.Delta.E2ETest", "Extensions\Microsoft.Spark.Extensions.Delta.E2ETest\Microsoft.Spark.Extensions.Delta.E2ETest.csproj", "{206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.Spark.Extensions.ML", "Extensions\Microsoft.Spark.Extensions.ML\Microsoft.Spark.Extensions.ML.csproj", "{38672397-3BC7-4818-A84A-7EE1618311CA}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -83,6 +85,10 @@ Global {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}.Debug|Any CPU.Build.0 = Debug|Any CPU {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}.Release|Any CPU.ActiveCfg = Release|Any CPU {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}.Release|Any CPU.Build.0 = Release|Any CPU + {38672397-3BC7-4818-A84A-7EE1618311CA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {38672397-3BC7-4818-A84A-7EE1618311CA}.Debug|Any CPU.Build.0 = Debug|Any CPU + {38672397-3BC7-4818-A84A-7EE1618311CA}.Release|Any CPU.ActiveCfg = Release|Any CPU + {38672397-3BC7-4818-A84A-7EE1618311CA}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -92,6 +98,7 @@ Global {4E379DB3-7741-43C2-B32D-17AD96FEA7D0} = {C8C53525-4FEB-4B5B-91A2-619566C72F3E} {2048446B-45AB-4304-B230-50EDF6E8E6A4} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} + {38672397-3BC7-4818-A84A-7EE1618311CA} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {FD15FFDB-EA1B-436F-841D-3386DDF94538} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs new file mode 100644 index 000000000..28a2c768a --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -0,0 +1,96 @@ +using System; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; +using Microsoft.Spark.Sql.Types; + +namespace Microsoft.Spark.ML.Feature +{ + public class Bucketizer : IJvmObjectReferenceProvider + { + + internal Bucketizer(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + public Bucketizer() + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor("org.apache.spark.ml.feature.Bucketizer"); + } + + public Bucketizer(string uid) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor("org.apache.spark.ml.feature.Bucketizer", uid); + } + + public static Bucketizer Load(string path) + { + return + WrapAsBucketizer( + SparkEnvironment.JvmBridge.CallStaticJavaMethod("org.apache.spark.ml.feature.Bucketizer", "load", + path)); + } + + public void Save(string path) + { + _jvmObject.Invoke("save", path); + } + + private readonly JvmObjectReference _jvmObject = null; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + public Bucketizer SetSplits(double[] value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value)); + } + + public double[] GetSplits() + { + return (double[])_jvmObject.Invoke("getSplits"); + } + + public string GetInputCol() + { + return (string)_jvmObject.Invoke("getInputCol"); + } + + public Bucketizer SetInputCol(string value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value)); + } + + public Bucketizer SetOutputCol(string value) + { + return 
WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value)); + } + + public DataFrame Transform(DataFrame source) + { + return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); + } + private static Bucketizer WrapAsBucketizer(object obj) + { + return new Bucketizer((JvmObjectReference)obj); + } + + public string Uid() + { + return (string)_jvmObject.Invoke("uid"); + } + + public string GetHandleInvalid() + { + return (string)_jvmObject.Invoke("getHandleInvalid"); + } + + public Bucketizer SetHandleInvalid(string value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value)); + } + + public StructType TransformSchema(StructType schema) + { + return (StructType)_jvmObject.Invoke("transformSchema", schema); + } + } +} diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index 297e3eb41..778862e3c 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -32,10 +32,7 @@ - + diff --git a/src/csharp/Microsoft.Spark/RDD.cs b/src/csharp/Microsoft.Spark/RDD.cs index 7eda57c61..556884560 100644 --- a/src/csharp/Microsoft.Spark/RDD.cs +++ b/src/csharp/Microsoft.Spark/RDD.cs @@ -102,7 +102,7 @@ internal RDD( _prevSerializedMode = prevSerializedMode; } - JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// /// Persist this RDD with the default storage level (MEMORY_ONLY). From 95d0014b6971ec267939e8ae6de8a7f43cb3fa4f Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sun, 29 Dec 2019 12:55:56 +0000 Subject: [PATCH 02/47] implement bucketizer --- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 206 ++++++++++++++++++ .../ML/Param/DoubleArrayArrayParam.cs | 38 ++++ 2 files changed, 244 insertions(+) create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs new file mode 100644 index 000000000..260febd5e --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -0,0 +1,206 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.ML.Param; +using Microsoft.Spark.Sql; +using Microsoft.Spark.Sql.Types; + +namespace Microsoft.Spark.ML.Feature +{ + /// + /// `Bucketizer` maps a column of continuous features to a column of feature buckets. + /// + /// `Bucketizer` can map multiple columns at once by setting the `inputCols` parameter. Note + /// that when both the `inputCol` and `inputCols` parameters are set, an Exception will be + /// thrown. The `splits` parameter is only used for single column usage, and `splitsArray` is + /// for multiple columns. 
+ /// + public class Bucketizer : IJvmObjectReferenceProvider + { + private readonly JvmObjectReference _jvmObject = null; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + internal Bucketizer(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + /// + /// Create a `Bucketizer` without any parameters + /// + public Bucketizer() + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.Bucketizer"); + } + + /// + /// Create a `Bucketizer` with a UID that is used to give the `Bucketizer` a unique ID + /// + /// An immutable unique ID for the object and its derivatives. + public Bucketizer(string uid) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.Bucketizer", uid); + } + + /// + /// Split points for splitting a single column into buckets. To split multiple columns use + /// `SetSplitsArray`. You cannot use both `SetSplits` and `SetSplitsArray` at the same time. + /// + /// + /// Split points for mapping continuous features into buckets. With n+1 splits, there are n + /// buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last + /// bucket, which also includes y. The splits should be of length >= 3 and strictly + /// increasing. Values outside the splits specified will be treated as errors. + /// + /// `Bucketizer` + public Bucketizer SetSplits(double[] value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value)); + } + + /// + /// Split points for splitting multiple columns into buckets. To split a single column use + /// `SetSplits`. You cannot use both `SetSplits` and `SetSplitsArray` at the same time. + /// + /// + /// The array of split points for mapping continuous features into buckets for multiple + /// columns. For each input column, with n+1 splits, there are n buckets. A bucket defined + /// by splits x,y holds values in the range [x,y) except the last bucket, which also + /// includes y. The splits should be of length >= 3 and strictly increasing. + /// Values outside the splits specified will be treated as errors. + /// `Bucketizer` + public Bucketizer SetSplitsArray(double[][] value) + { + DoubleArrayArrayParam doubleArrayArray = new DoubleArrayArrayParam(_jvmObject, + "setSplitsArray", + "wrapper for double[][] from csharp", value); + + return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", + doubleArrayArray.ReferenceValue)); + } + + /// + /// Sets the column that the `Bucketizer` should read from and convert into buckets + /// + /// The name of the column to as the source of the buckets + /// `Bucketizer` + public Bucketizer SetInputCol(string value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value)); + } + + /// + /// Sets the columns that `Bucketizer` should read from and convert into buckets. + /// + /// Each column is one set of buckets so if you have two input columns you can have two + /// sets of buckets and two output columns. + /// + /// List of input columns to use as sources for buckets + /// `Bucketizer` + public Bucketizer SetInputCols(List<string> value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value)); + } + + /// + /// The `Bucketizer` will create a new column in the DataFrame, this is the name of the + /// new column. 
+ /// + /// The name of the new column which contains the bucket ID + /// `Bucketizer` + public Bucketizer SetOutputCol(string value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value)); + } + + /// + /// The list of columns that the `Bucketizer` will create in the DataFrame. + /// + /// List of column names which will contain the bucket ID + /// `Bucketizer` + public Bucketizer SetOutputCols(List<string> value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value)); + } + + /// + /// Executes the `Bucketizer` and transforms the DataFrame to include the new column or + /// columns with the bucketed data. + /// + /// The DataFrame to add the bucketed data to + /// `DataFrame` containing the original data and the new bucketed columns + public DataFrame Transform(DataFrame source) + { + return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform" + , source)); + } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// `Bucketizer` + /// + /// The `JvmObjectReference` to convert into a dotnet `Bucketizer` + /// `Bucketizer` + private static Bucketizer WrapAsBucketizer(object obj) + { + return new Bucketizer((JvmObjectReference)obj); + } + + /// + /// The uid that was used to create the `Bucketizer`. If no `UID` is passed in when creating + /// the `Bucketizer` then a random `UID` is created when the `Bucketizer` is created. + /// + /// string `UID` identifying the `Bucketizer` + public string Uid() + { + return (string)_jvmObject.Invoke("uid"); + } + + /// + /// How should the `Bucketizer` handle invalid data, choices are "skip", "error" or "keep" + /// + /// `BucketizerInvalidOptions` + public BucketizerInvalidOptions GetHandleInvalid() + { + string handleInvalid = (string)_jvmObject.Invoke("getHandleInvalid"); + if (BucketizerInvalidOptions.TryParse(handleInvalid, true, + out BucketizerInvalidOptions result)) + { + return result; + } + + return result; + } + + /// + /// Tells the `Bucketizer` what to do with invalid data. + /// + /// Choices are "skip", "error" or "keep". Default is "error" + /// + /// `BucketizerInvalidOptions`, "skip", "error" or "keep" + /// `Bucketizer` + public Bucketizer SetHandleInvalid(BucketizerInvalidOptions value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString())); + } + + /// + /// dotnet version of the options that can be passed to the `Bucketizer` to tell it how to + /// handle invalid data. + /// + public enum BucketizerInvalidOptions + { + unknown, + skip, + error, + keep + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs b/src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs new file mode 100644 index 000000000..7afe243c5 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs @@ -0,0 +1,38 @@ +using System; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Newtonsoft.Json; + +namespace Microsoft.Spark.ML.Param +{ + /// + /// Internal class used to help the `Bucketizer` pass a double[][] into the JVM. 
+ /// + class DoubleArrayArrayParam : IJvmObjectReferenceProvider + { + private readonly JvmObjectReference _jvmObject; + + public DoubleArrayArrayParam(object parent, string name, string doc, double[][] param) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.param.DoubleArrayArrayParam", + parent, name, doc); + + string json = JsonConvert.SerializeObject(param); + ReferenceValue = jsonDecode(json); + } + + private JvmObjectReference jsonDecode(string json) + { + return (JvmObjectReference)_jvmObject.Invoke("jsonDecode", json); + } + public JvmObjectReference Reference => _jvmObject; + + /// + /// This is the JVM version of the double[][] so that it can be used by the `Bucketizer`. To + /// get the double[][] across the SerDe this serializes it as JSON and uses jsonDecode on the + /// JVM side to get a double[][]. ReferenceValue is the double[][]. + /// + public JvmObjectReference ReferenceValue { get; } + } +} From fb2d0190c33fad85dcd716bc9ee50440e46e0b69 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sun, 29 Dec 2019 13:21:49 +0000 Subject: [PATCH 03/47] first tests --- .../IpcTests/ML/Feature/BucketizerTests.cs | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs new file mode 100644 index 000000000..bd6ff9231 --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -0,0 +1,39 @@ +using System; +using System.Linq; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class BucketizerTests + { + private readonly SparkSession _spark; + + public BucketizerTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestBucketizer() + { + Bucketizer bucketizer = new Bucketizer("uid") + .SetInputCol("input_col") + .SetOutputCol("output_col") + .SetHandleInvalid(Bucketizer.BucketizerInvalidOptions.skip) + .SetSplits(new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}); + + Assert.Equal(Bucketizer.BucketizerInvalidOptions.skip, + bucketizer.GetHandleInvalid()); + + Assert.Equal("uid", bucketizer.Uid()); + + DataFrame input = _spark.Sql("SELECT ID as input_col from range(100)"); + + DataFrame output = bucketizer.Transform(input); + Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); + } + } +} From d759e60110ba0a59bdb1e7ecc3c1b4a9c86cc857 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sun, 29 Dec 2019 13:27:51 +0000 Subject: [PATCH 04/47] multi column tests --- .../IpcTests/ML/Feature/BucketizerTests.cs | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index bd6ff9231..002a9812f 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.Linq; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; @@ -35,5 +36,28 @@ public void TestBucketizer() DataFrame output = bucketizer.Transform(input); Assert.Contains(output.Schema().Fields, (f => 
f.Name == "output_col")); } + + [Fact] + public void TestBucketizer_MultipleColumns() + { + Bucketizer bucketizer = new Bucketizer() + .SetInputCols(new List<string>(){"input_col_a", "input_col_b"}) + .SetOutputCols(new List<string>(){"output_col_a", "output_col_b"}) + .SetHandleInvalid(Bucketizer.BucketizerInvalidOptions.keep) + .SetSplitsArray(new []{ + new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}, + new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue} + }); + + Assert.Equal(Bucketizer.BucketizerInvalidOptions.keep, + bucketizer.GetHandleInvalid()); + + DataFrame input = + _spark.Sql("SELECT ID as input_col_a, ID as input_col_b from range(100)"); + + DataFrame output = bucketizer.Transform(input); + Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col_a")); + Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col_b")); + } } } From 97ef66865fde34f7ff32d619113f06bc478a2bf9 Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 8 Jan 2020 21:40:36 +0000 Subject: [PATCH 05/47] Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs Co-Authored-By: Steve Suh --- src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 260febd5e..766bd64a5 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -31,7 +31,7 @@ internal Bucketizer(JvmObjectReference jvmObject) } /// - /// Create a `Bucketizer` without any parameters + /// Create a without any parameters /// public Bucketizer() { From 45439742ad039efeee768465cd8d030438c7512f Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 8 Jan 2020 21:40:43 +0000 Subject: [PATCH 06/47] Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs Co-Authored-By: Steve Suh --- src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 766bd64a5..8d4882364 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -104,7 +104,7 @@ public Bucketizer SetInputCol(string value) /// /// List of input columns to use as sources for buckets /// `Bucketizer` - public Bucketizer SetInputCols(List<string> value) + public Bucketizer SetInputCols(IEnumerable<string> value) { return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value)); } From fd18cf425bd8e873a4436e0aa21f6d191c1587ef Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 8 Jan 2020 21:41:00 +0000 Subject: [PATCH 07/47] Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs Co-Authored-By: Steve Suh --- src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 8d4882364..a169cc5e7 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -186,7 +186,7 @@ /// /// `BucketizerInvalidOptions`, "skip", "error" or "keep" /// `Bucketizer` - public Bucketizer SetHandleInvalid(BucketizerInvalidOptions value) + public Bucketizer SetHandleInvalid(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString())); 
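At this point in the series the single-column API is usable end to end: construction, SetInputCol, SetOutputCol, SetSplits, SetHandleInvalid (a plain string after patch 07) and Transform. The following is a minimal usage sketch, not part of the patch set; it assumes an existing SparkSession, and the column names and split values are illustrative:

using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;

internal static class SingleColumnBucketizerSketch
{
    // Hedged sketch: buckets a numeric column into the four buckets bounded
    // by the five split points below.
    public static void Run(SparkSession spark)
    {
        DataFrame input = spark.Sql("SELECT ID AS input_col FROM range(100)");

        Bucketizer bucketizer = new Bucketizer()
            .SetInputCol("input_col")
            .SetOutputCol("output_col")
            .SetHandleInvalid("keep") // "skip", "error" or "keep" for invalid values
            .SetSplits(new[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue });

        // output gains an output_col holding the bucket index (0-3) for each row.
        DataFrame output = bucketizer.Transform(input);
        output.Show();
    }
}

Each Set* call returns a new Bucketizer wrapping the JvmObjectReference that comes back over the bridge, which is why the calls chain.
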
From 64551c93636e7178d7a85535c26d8d254f804c74 Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 8 Jan 2020 21:41:20 +0000 Subject: [PATCH 08/47] Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs Co-Authored-By: Steve Suh --- src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index a169cc5e7..f6b05305e 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -167,7 +167,7 @@ public string Uid() /// How should the `Bucketizer` handle invalid data, choices are "skip", "error" or "keep" /// /// `BucketizerInvalidOptions` - public BucketizerInvalidOptions GetHandleInvalid() + public string GetHandleInvalid() { string handleInvalid = (string)_jvmObject.Invoke("getHandleInvalid"); if (BucketizerInvalidOptions.TryParse(handleInvalid, true, From fb70f403d8af4ae1a659247c039d042e8eb97b56 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 8 Jan 2020 21:41:54 +0000 Subject: [PATCH 09/47] tidying --- .../IpcTests/ML/Feature/BucketizerTests.cs | 5 ++++- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 16 +++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 002a9812f..72da9cbc5 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -1,6 +1,9 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + using System; using System.Collections.Generic; -using System.Linq; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; using Xunit; diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 260febd5e..22d62128c 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -22,14 +22,14 @@ namespace Microsoft.Spark.ML.Feature /// public class Bucketizer : IJvmObjectReferenceProvider { - private readonly JvmObjectReference _jvmObject = null; - JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - internal Bucketizer(JvmObjectReference jvmObject) { _jvmObject = jvmObject; } - + + private readonly JvmObjectReference _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + /// /// Create a `Bucketizer` without any parameters /// @@ -51,7 +51,7 @@ public Bucketizer(string uid) /// /// Split points for splitting a single column into buckets. To split multiple columns use - /// `SetSplitsArray`. You cannot use both `SetSplits` and `SetSplitsArray` at the same time. + /// `SetSplitsArray`. You cannot use both `SetSplits` and `SetSplitsArray` at the same time /// /// /// Split points for mapping continuous features into buckets. With n+1 splits, there are n @@ -135,7 +135,8 @@ public Bucketizer SetOutputCols(List value) /// columns with the bucketed data. 
/// /// The DataFrame to add the bucketed data to - /// `DataFrame` containing the original data and the new bucketed columns + /// `DataFrame` containing the original data and the new bucketed + /// columns public DataFrame Transform(DataFrame source) { return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform" @@ -146,7 +147,8 @@ public DataFrame Transform(DataFrame source) /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet /// `Bucketizer` /// - /// The `JvmObjectReference` to convert into a dotnet `Bucketizer` + /// The `JvmObjectReference` to convert into a dotnet + /// `Bucketizer` /// `Bucketizer` private static Bucketizer WrapAsBucketizer(object obj) { From 9891847c940a3d9777d53aea25499e30eb281597 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 8 Jan 2020 22:17:19 +0000 Subject: [PATCH 10/47] changes after review --- .../IpcTests/ML/Feature/BucketizerTests.cs | 25 ++-- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 110 ++++++++---------- 2 files changed, 61 insertions(+), 74 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 72da9cbc5..9ec077d23 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -11,7 +11,7 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { [Collection("Spark E2E Tests")] - public class BucketizerTests + public class BucketizerTests { private readonly SparkSession _spark; @@ -19,43 +19,44 @@ public BucketizerTests(SparkFixture fixture) { _spark = fixture.Spark; } - + [Fact] public void TestBucketizer() { Bucketizer bucketizer = new Bucketizer("uid") .SetInputCol("input_col") .SetOutputCol("output_col") - .SetHandleInvalid(Bucketizer.BucketizerInvalidOptions.skip) + .SetHandleInvalid("skip") .SetSplits(new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}); - Assert.Equal(Bucketizer.BucketizerInvalidOptions.skip, + Assert.Equal("skip", bucketizer.GetHandleInvalid()); Assert.Equal("uid", bucketizer.Uid()); - DataFrame input = _spark.Sql("SELECT ID as input_col from range(100)"); DataFrame output = bucketizer.Transform(input); Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); } - + [Fact] public void TestBucketizer_MultipleColumns() { Bucketizer bucketizer = new Bucketizer() - .SetInputCols(new List<string>(){"input_col_a", "input_col_b"}) - .SetOutputCols(new List<string>(){"output_col_a", "output_col_b"}) - .SetHandleInvalid(Bucketizer.BucketizerInvalidOptions.keep) - .SetSplitsArray(new []{ + .SetInputCols(new List<string>() {"input_col_a", "input_col_b"}) + .SetOutputCols(new List<string>() {"output_col_a", "output_col_b"}) + .SetHandleInvalid("keep") + .SetSplitsArray(new[] + { new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}, new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue} }); - Assert.Equal(Bucketizer.BucketizerInvalidOptions.keep, + Assert.Equal("keep", bucketizer.GetHandleInvalid()); - DataFrame input = + DataFrame input = _spark.Sql("SELECT ID as input_col_a, ID as input_col_b from range(100)"); DataFrame output = bucketizer.Transform(input); diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 25465a0b1..a90582584 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -13,12 +13,13 @@ namespace 
Microsoft.Spark.ML.Feature { /// - /// `Bucketizer` maps a column of continuous features to a column of feature buckets. + /// maps a column of continuous features to a column of feature + /// buckets. /// - /// `Bucketizer` can map multiple columns at once by setting the `inputCols` parameter. Note - /// that when both the `inputCol` and `inputCols` parameters are set, an Exception will be - /// thrown. The `splits` parameter is only used for single column usage, and `splitsArray` is - /// for multiple columns. + /// can map multiple columns at once by setting the inputCols + /// parameter. Note that when both the inputCol and inputCols parameters are set, an Exception + /// will be thrown. The splits parameter is only used for single column usage, and splitsArray + /// is for multiple columns. /// public class Bucketizer : IJvmObjectReferenceProvider { @@ -26,10 +27,10 @@ internal Bucketizer(JvmObjectReference jvmObject) { _jvmObject = jvmObject; } - + private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - + /// /// Create a without any parameters /// @@ -40,7 +41,8 @@ public Bucketizer() } /// - /// Create a `Bucketizer` with a UID that is used to give the `Bucketizer` a unique ID + /// Create a with a UID that is used to give the + /// a unique ID /// /// An immutable unique ID for the object and its derivatives. public Bucketizer(string uid) @@ -48,10 +50,10 @@ public Bucketizer(string uid) _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( "org.apache.spark.ml.feature.Bucketizer", uid); } - + /// /// Split points for splitting a single column into buckets. To split multiple columns use - /// `SetSplitsArray`. You cannot use both `SetSplits` and `SetSplitsArray` at the same time + /// SetSplitsArray. You cannot use both SetSplits and SetSplitsArray at the same time /// /// /// Split points for mapping continuous features into buckets. With n+1 splits, there are n @@ -59,7 +61,7 @@ public Bucketizer(string uid) /// bucket, which also includes y. The splits should be of length >= 3 and strictly /// increasing. Values outside the splits specified will be treated as errors. /// - /// `Bucketizer` + /// public Bucketizer SetSplits(double[] value) { return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value)); @@ -67,7 +69,7 @@ public Bucketizer SetSplits(double[] value) /// /// Split points for splitting multiple columns into buckets. To split a single column use - /// `SetSplits`. You cannot use both `SetSplits` and `SetSplitsArray` at the same time. + /// SetSplits. You cannot use both SetSplits and SetSplitsArray at the same time. /// /// /// The array of split points for mapping continuous features into buckets for multiple /// columns. For each input column, with n+1 splits, there are n buckets. A bucket defined /// by splits x,y holds values in the range [x,y) except the last bucket, which also /// includes y. The splits should be of length >= 3 and strictly increasing. /// Values outside the splits specified will be treated as errors. 
- /// `Bucketizer` + /// public Bucketizer SetSplitsArray(double[][] value) { DoubleArrayArrayParam doubleArrayArray = new DoubleArrayArrayParam(_jvmObject, "setSplitsArray", @@ -87,122 +89,106 @@ public Bucketizer SetSplitsArray(double[][] value) } /// - /// Sets the column that the `Bucketizer` should read from and convert into buckets + /// Sets the column that the should read from and convert into + /// buckets /// /// The name of the column to as the source of the buckets - /// `Bucketizer` + /// public Bucketizer SetInputCol(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value)); } /// - /// Sets the columns that `Bucketizer` should read from and convert into buckets. + /// Sets the columns that should read from and convert into + /// buckets. /// /// Each column is one set of buckets so if you have two input columns you can have two /// sets of buckets and two output columns. /// /// List of input columns to use as sources for buckets - /// `Bucketizer` + /// public Bucketizer SetInputCols(IEnumerable<string> value) { return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value)); } /// - /// The `Bucketizer` will create a new column in the DataFrame, this is the name of the - /// new column. + /// The will create a new column in the DataFrame, this is the + /// name of the new column. /// /// The name of the new column which contains the bucket ID - /// `Bucketizer` + /// public Bucketizer SetOutputCol(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value)); } - + /// - /// The list of columns that the `Bucketizer` will create in the DataFrame. + /// The list of columns that the will create in the DataFrame. /// /// List of column names which will contain the bucket ID - /// `Bucketizer` + /// public Bucketizer SetOutputCols(List<string> value) { return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value)); } - + /// - /// Executes the `Bucketizer` and transforms the DataFrame to include the new column or - /// columns with the bucketed data. + /// Executes the and transforms the DataFrame to include the new + /// column or columns with the bucketed data. /// /// The DataFrame to add the bucketed data to - /// `DataFrame` containing the original data and the new bucketed - /// columns + /// containing the original data and the new bucketed + /// columns public DataFrame Transform(DataFrame source) { - return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform" - , source)); + return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); } /// /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet - /// `Bucketizer` + /// /// - /// The `JvmObjectReference` to convert into a dotnet - /// `Bucketizer` - /// `Bucketizer` + /// The to convert into a dotnet + /// + /// private static Bucketizer WrapAsBucketizer(object obj) { return new Bucketizer((JvmObjectReference)obj); } /// - /// The uid that was used to create the `Bucketizer`. If no `UID` is passed in when creating - /// the `Bucketizer` then a random `UID` is created when the `Bucketizer` is created. + /// The uid that was used to create the . If no UID is passed in + /// when creating the then a random UID is created when the + /// is created. 
/// - /// string `UID` identifying the `Bucketizer` + /// string UID identifying the public string Uid() { return (string)_jvmObject.Invoke("uid"); } /// - /// How should the `Bucketizer` handle invalid data, choices are "skip", "error" or "keep" + /// How should the handle invalid data, choices are "skip", + /// "error" or "keep" /// - /// `BucketizerInvalidOptions` + /// string showing the way Spark will handle invalid data public string GetHandleInvalid() { - string handleInvalid = (string)_jvmObject.Invoke("getHandleInvalid"); - if (BucketizerInvalidOptions.TryParse(handleInvalid, true, - out BucketizerInvalidOptions result)) - { - return result; - } - - return result; + return (string)_jvmObject.Invoke("getHandleInvalid"); } /// - /// Tells the `Bucketizer` what to do with invalid data. + /// Tells the what to do with invalid data. /// /// Choices are "skip", "error" or "keep". Default is "error" /// - /// `BucketizerInvalidOptions`, "skip", "error" or "keep" - /// `Bucketizer` + /// "skip", "error" or "keep" + /// public Bucketizer SetHandleInvalid(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString())); } - - /// - /// dotnet version of the options that can be passed to the `Bucketizer` to tell it how to - /// handle invalid data. - /// - public enum BucketizerInvalidOptions - { - unknown, - skip, - error, - keep - } } } From e2ce7369e71110d25c5f7c466363e3356bd1c55d Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Thu, 9 Jan 2020 22:41:40 +0000 Subject: [PATCH 11/47] TF-IDF --- .../IpcTests/ML/Feature/HashingTFTests.cs | 40 ++++++ .../IpcTests/ML/Feature/IDFModelTests.cs | 41 ++++++ .../IpcTests/ML/Feature/IDFTests.cs | 40 ++++++ .../IpcTests/ML/Feature/TokenizerTests.cs | 39 ++++++ .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 1 - .../Microsoft.Spark/ML/Feature/HashingTF.cs | 113 +++++++++++++++++ src/csharp/Microsoft.Spark/ML/Feature/IDF.cs | 118 ++++++++++++++++++ .../Microsoft.Spark/ML/Feature/IDFModel.cs | 105 ++++++++++++++++ .../Microsoft.Spark/ML/Feature/Tokenizer.cs | 104 +++++++++++++++ 9 files changed, 600 insertions(+), 1 deletion(-) create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/IDF.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs new file mode 100644 index 000000000..37de01c40 --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -0,0 +1,40 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class HashingTFTests + { + private readonly SparkSession _spark; + + public HashingTFTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestHashingTF() + { + HashingTF HashingTF = new HashingTF("uid") + .SetNumFeatures(10) + .SetInputCol("input_col") + .SetOutputCol("output_col"); + + Assert.Equal("uid", HashingTF.Uid()); + + DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + + " as input_col"); + + DataFrame output = HashingTF.Transform(input); + Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); + } + } +} diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs new file mode 100644 index 000000000..3c88f5872 --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -0,0 +1,41 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class IDFModelTests + { + private readonly SparkSession _spark; + + public IDFModelTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestIDFModel() + { + IDF idf = new IDF("uid") + .SetMinDocFreq(2) + .SetInputCol("input_col") + .SetOutputCol("output_col"); + + Assert.Equal("uid", idf.Uid()); + + DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + + " as input_col"); + + IDFModel model = idf.Fit(input); + model.Transform(input); + + } + } +} diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs new file mode 100644 index 000000000..c901e813a --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs @@ -0,0 +1,40 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class IDFTests + { + private readonly SparkSession _spark; + + public IDFTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestIDF() + { + IDF idf = new IDF("uid") + .SetMinDocFreq(2) + .SetInputCol("input_col") + .SetOutputCol("output_col"); + + Assert.Equal("uid", idf.Uid()); + + DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + + " as input_col"); + + IDFModel model = idf.Fit(input); + + } + } +} diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs new file mode 100644 index 000000000..19eb9216f --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -0,0 +1,39 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class TokenizerTests + { + private readonly SparkSession _spark; + + public TokenizerTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestTokenizer() + { + Tokenizer Tokenizer = new Tokenizer("uid") + .SetInputCol("input_col") + .SetOutputCol("output_col"); + + Assert.Equal("uid", Tokenizer.Uid()); + + DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + + " from range(100)"); + + DataFrame output = Tokenizer.Transform(input); + Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index a90582584..18afccc87 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -8,7 +8,6 @@ using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.ML.Param; using Microsoft.Spark.Sql; -using Microsoft.Spark.Sql.Types; namespace Microsoft.Spark.ML.Feature { diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs new file mode 100644 index 000000000..6356ea53b --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs @@ -0,0 +1,113 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; + +namespace Microsoft.Spark.ML.Feature +{ + /// + /// A Maps a sequence of terms to their term frequencies using the + /// hashing trick. Currently we use Austin Appleby's MurmurHash 3 algorithm + /// (MurmurHash3_x86_32) to calculate the hash code value for the term object. 
Since a simple + /// modulo is used to transform the hash function to a column index, it is advisable to use a + /// power of two as the numFeatures parameter; otherwise the features will not be mapped evenly + /// to the columns. + /// + public class HashingTF : IJvmObjectReferenceProvider + { + + /// + /// Create a without any parameters + /// + public HashingTF() + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.HashingTF"); + } + + /// + /// Create a with a UID that is used to give the + /// a unique ID + /// + /// An immutable unique ID for the object and its derivatives. + public HashingTF(string uid) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.HashingTF", uid); + } + + internal HashingTF(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + private readonly JvmObjectReference _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Sets the column that the should read from + /// + /// The name of the column to as the source + /// + public HashingTF SetInputCol(string value) + { + return WrapAsHashingTF(_jvmObject.Invoke("setInputCol", value)); + } + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// The name of the new column + /// + /// + public HashingTF SetOutputCol(string value) + { + return WrapAsHashingTF(_jvmObject.Invoke("setOutputCol", value)); + } + + public HashingTF SetNumFeatures(int value) + { + return WrapAsHashingTF(_jvmObject.Invoke("setNumFeatures", value)); + } + + /// + /// Executes the and transforms the DataFrame to include the new + /// column or columns with the tokens. + /// + /// The DataFrame to add the tokens to + /// containing the original data and the tokens + public DataFrame Transform(DataFrame source) + { + return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); + } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static HashingTF WrapAsHashingTF(object obj) + { + return new HashingTF((JvmObjectReference)obj); + } + + /// + /// The uid that was used to create the . If no UID is passed in + /// when creating the then a random UID is created when the + /// is created. + /// + /// string UID identifying the + public string Uid() + { + return (string)_jvmObject.Invoke("uid"); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs new file mode 100644 index 000000000..663b887d0 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs @@ -0,0 +1,118 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; + +namespace Microsoft.Spark.ML.Feature +{ + /// + /// Inverse document frequency (IDF). The standard formulation is used: + /// idf = log((m + 1) / (d(t) + 1)), where m is the total number of documents and d(t) is + /// the number of documents that contain term t. + /// + /// This implementation supports filtering out terms which do not appear in a minimum number + /// of documents (controlled by the variable minDocFreq). 
For terms that are not in at least + /// minDocFreq documents, the IDF is found as 0, resulting in TF-IDFs of 0. + /// + public class IDF : IJvmObjectReferenceProvider + { + + /// + /// Create a without any parameters + /// + public IDF() + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.IDF"); + } + + /// + /// Create a with a UID that is used to give the + /// a unique ID + /// + /// An immutable unique ID for the object and its derivatives. + public IDF(string uid) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.IDF", uid); + } + + internal IDF(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + private readonly JvmObjectReference _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Sets the column that the should read from + /// + /// The name of the column to as the source + /// + public IDF SetInputCol(string value) + { + return WrapAsIDF(_jvmObject.Invoke("setInputCol", value)); + } + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// The name of the new column + /// + /// + public IDF SetOutputCol(string value) + { + return WrapAsIDF(_jvmObject.Invoke("setOutputCol", value)); + } + + /// + /// Minimum of documents in which a term should appear for filtering + /// + /// + /// + public IDF SetMinDocFreq(int value) + { + return WrapAsIDF(_jvmObject.Invoke("setMinDocFreq", value)); + } + + /// + /// Fits a model to the input data. + /// + /// The DataFrame to fit the model to + /// + public IDFModel Fit(DataFrame source) + { + return new IDFModel((JvmObjectReference)_jvmObject.Invoke("fit", source)); + } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static IDF WrapAsIDF(object obj) + { + return new IDF((JvmObjectReference)obj); + } + + /// + /// The uid that was used to create the . If no UID is passed in + /// when creating the then a random UID is created when the + /// is created. + /// + /// string UID identifying the + public string Uid() + { + return (string)_jvmObject.Invoke("uid"); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs new file mode 100644 index 000000000..d9cc13882 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs @@ -0,0 +1,105 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; + +namespace Microsoft.Spark.ML.Feature +{ + /// + /// A that converts the input string to lowercase and then splits it by + /// white spaces. + /// + public class IDFModel : IJvmObjectReferenceProvider + { + + /// + /// Create a without any parameters + /// + public IDFModel() + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.IDFModel"); + } + + /// + /// Create a with a UID that is used to give the + /// a unique ID + /// + /// An immutable unique ID for the object and its derivatives. 
+ public IDFModel(string uid) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.IDFModel", uid); + } + + internal IDFModel(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + private readonly JvmObjectReference _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Sets the column that the should read from and convert into + /// buckets + /// + /// The name of the column to as the source + /// + public IDFModel SetInputCol(string value) + { + return WrapAsIDFModel(_jvmObject.Invoke("setInputCol", value)); + } + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// The name of the new column which contains the tokens + /// + /// + public IDFModel SetOutputCol(string value) + { + return WrapAsIDFModel(_jvmObject.Invoke("setOutputCol", value)); + } + + /// + /// Executes the and transforms the DataFrame to include the new + /// column or columns with the tokens. + /// + /// The DataFrame to add the tokens to + /// containing the original data and the tokens + public DataFrame Transform(DataFrame source) + { + return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); + } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static IDFModel WrapAsIDFModel(object obj) + { + return new IDFModel((JvmObjectReference)obj); + } + + /// + /// The uid that was used to create the . If no UID is passed in + /// when creating the then a random UID is created when the + /// is created. + /// + /// string UID identifying the + public string Uid() + { + return (string)_jvmObject.Invoke("uid"); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs new file mode 100644 index 000000000..3b2d395e9 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs @@ -0,0 +1,104 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; + +namespace Microsoft.Spark.ML.Feature +{ + /// + /// A that converts the input string to lowercase and then splits it by + /// white spaces. + /// + public class Tokenizer : IJvmObjectReferenceProvider + { + + /// + /// Create a without any parameters + /// + public Tokenizer() + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.Tokenizer"); + } + + /// + /// Create a with a UID that is used to give the + /// a unique ID + /// + /// An immutable unique ID for the object and its derivatives. 
+ public Tokenizer(string uid) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.Tokenizer", uid); + } + + internal Tokenizer(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + private readonly JvmObjectReference _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Sets the column that the should read from + /// + /// The name of the column to as the source + /// + public Tokenizer SetInputCol(string value) + { + return WrapAsTokenizer(_jvmObject.Invoke("setInputCol", value)); + } + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// The name of the new column + /// + /// + public Tokenizer SetOutputCol(string value) + { + return WrapAsTokenizer(_jvmObject.Invoke("setOutputCol", value)); + } + + /// + /// Executes the and transforms the DataFrame to include the new + /// column + /// + /// The DataFrame to transform + /// + public DataFrame Transform(DataFrame source) + { + return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); + } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static Tokenizer WrapAsTokenizer(object obj) + { + return new Tokenizer((JvmObjectReference)obj); + } + + /// + /// The uid that was used to create the . If no UID is passed in + /// when creating the then a random UID is created when the + /// is created. + /// + /// string UID identifying the + public string Uid() + { + return (string)_jvmObject.Invoke("uid"); + } + } +} From 3cc3f8def7f45ab021e500c79cb6545a1b1b8592 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Thu, 9 Jan 2020 23:40:45 +0000 Subject: [PATCH 12/47] removing step --- .../IpcTests/ML/Feature/HashingTFTests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 37de01c40..80f35e7bd 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -34,7 +34,6 @@ public void TestHashingTF() " as input_col"); DataFrame output = HashingTF.Transform(input); - Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); } } } From 6cfd0e4e9a2e8a54cf57fbe0dc514c3c53093896 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Thu, 9 Jan 2020 23:50:12 +0000 Subject: [PATCH 13/47] single test for IDF and IDFModel --- .../IpcTests/ML/Feature/IDFModelTests.cs | 20 +++++----- .../IpcTests/ML/Feature/IDFTests.cs | 40 ------------------- 2 files changed, 11 insertions(+), 49 deletions(-) delete mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 3c88f5872..06a4a3be0 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -23,19 +23,21 @@ public IDFModelTests(SparkFixture fixture) [Fact] public void TestIDFModel() { - IDF idf = new IDF("uid") - .SetMinDocFreq(2) - .SetInputCol("input_col") - .SetOutputCol("output_col"); + DataFrame sentenceData = + _spark.Sql("SELECT 0.0 as label, 'Hi I heard about 
Spark' as sentence"); + Tokenizer tokenizer = new Tokenizer().SetInputCol("sentence").SetOutputCol("words"); + DataFrame wordsData = tokenizer.Transform(sentenceData); - Assert.Equal("uid", idf.Uid()); + HashingTF hashingTF = new HashingTF() + .SetInputCol("words").SetOutputCol("rawFeatures").SetNumFeatures(20); - DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + - " as input_col"); + DataFrame featurizedData = hashingTF.Transform(wordsData); - IDFModel model = idf.Fit(input); - model.Transform(input); + IDF idf = new IDF().SetInputCol("rawFeatures").SetOutputCol("features"); + IDFModel idfModel = idf.Fit(featurizedData); + DataFrame rescaledData = idfModel.Transform(featurizedData); + } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs deleted file mode 100644 index c901e813a..000000000 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs +++ /dev/null @@ -1,40 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using Microsoft.Spark.ML.Feature; -using Microsoft.Spark.Sql; -using Xunit; - -namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature -{ - [Collection("Spark E2E Tests")] - public class IDFTests - { - private readonly SparkSession _spark; - - public IDFTests(SparkFixture fixture) - { - _spark = fixture.Spark; - } - - [Fact] - public void TestIDF() - { - IDF idf = new IDF("uid") - .SetMinDocFreq(2) - .SetInputCol("input_col") - .SetOutputCol("output_col"); - - Assert.Equal("uid", idf.Uid()); - - DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + - " as input_col"); - - IDFModel model = idf.Fit(input); - - } - } -} From 633a843618ab8156af6c51b8f64ed4150196c0a7 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 07:49:05 +0000 Subject: [PATCH 14/47] SerDe to handle double[][] for Bucketizer --- .../IpcTests/ML/Feature/BucketizerTests.cs | 12 +++++++----- .../Interop/Ipc/PayloadHelper.cs | 15 +++++++++++++++ .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 18 +++++++----------- .../org/apache/spark/api/dotnet/SerDe.scala | 8 +++++++- .../org/apache/spark/api/dotnet/SerDe.scala | 8 +++++++- .../org/apache/spark/api/dotnet/SerDe.scala | 8 +++++++- 6 files changed, 50 insertions(+), 19 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 9ec077d23..7ee217eca 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -43,15 +43,17 @@ public void TestBucketizer() [Fact] public void TestBucketizer_MultipleColumns() { + double[][] splitsArray = new[] + { + new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}, + new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue} + }; + Bucketizer bucketizer = new Bucketizer() .SetInputCols(new List() {"input_col_a", "input_col_b"}) .SetOutputCols(new List() {"output_col_a", "output_col_b"}) .SetHandleInvalid("keep") - .SetSplitsArray(new[] - { - new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}, - new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue} - }); + .SetSplitsArray(splitsArray); 
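+            // A sketch, for illustration only, of the bucketing rule the splits
+            // encode: a value v lands in bucket i when splits[i] <= v < splits[i + 1],
+            // and the last bucket also includes its upper bound. Not used by the
+            // assertions below.
+            int BucketIndex(double[] splits, double v)
+            {
+                for (int i = 0; i < splits.Length - 1; i++)
+                {
+                    bool isLastBucket = i == splits.Length - 2;
+                    if (v >= splits[i] &&
+                        (v < splits[i + 1] || (isLastBucket && v == splits[i + 1])))
+                    {
+                        return i;
+                    }
+                }
+                return -1; // outside all splits; the real Bucketizer applies HandleInvalid
+            }
+            // e.g. BucketIndex(splitsArray[1], 5000.0) == 1, the [0.0, 10000.0) bucket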
Assert.Equal("keep", bucketizer.GetHandleInvalid()); diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs index 5bfeee865..06dcb8969 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs @@ -24,6 +24,7 @@ internal class PayloadHelper private static readonly byte[] s_doubleTypeId = new[] { (byte)'d' }; private static readonly byte[] s_jvmObjectTypeId = new[] { (byte)'j' }; private static readonly byte[] s_byteArrayTypeId = new[] { (byte)'r' }; + private static readonly byte[] s_doubleArrayArrayTypeId = new[] {(byte)'A'}; private static readonly byte[] s_arrayTypeId = new[] { (byte)'l' }; private static readonly byte[] s_dictionaryTypeId = new[] { (byte)'e' }; private static readonly byte[] s_rowArrTypeId = new[] { (byte)'R' }; @@ -135,6 +136,19 @@ internal static void ConvertArgsToBytes( SerDe.Write(destination, d); } break; + + case double[][] argDoubleArrayArray: + SerDe.Write(destination, s_doubleArrayArrayTypeId); + SerDe.Write(destination, argDoubleArrayArray.Length); + foreach (double[] doubleArray in argDoubleArrayArray) + { + SerDe.Write(destination, doubleArray.Length); + foreach (double d in doubleArray) + { + SerDe.Write(destination, d); + } + } + break; case IEnumerable argByteArrayEnumerable: SerDe.Write(destination, s_byteArrayTypeId); @@ -286,6 +300,7 @@ internal static byte[] GetTypeId(Type type) if (type == typeof(int[]) || type == typeof(long[]) || type == typeof(double[]) || + type == typeof(double[][]) || typeof(IEnumerable).IsAssignableFrom(type) || typeof(IEnumerable).IsAssignableFrom(type)) { diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index a90582584..af2041945 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -27,10 +27,7 @@ internal Bucketizer(JvmObjectReference jvmObject) { _jvmObject = jvmObject; } - - private readonly JvmObjectReference _jvmObject; - JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - + /// /// Create a without any parameters /// @@ -50,7 +47,10 @@ public Bucketizer(string uid) _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( "org.apache.spark.ml.feature.Bucketizer", uid); } - + + private readonly JvmObjectReference _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + /// /// Split points for splitting a single column into buckets. To split multiple columns use /// SetSplitsArray. 
You cannot use both SetSplits and SetSplitsArray at the same time @@ -80,12 +80,8 @@ public Bucketizer SetSplits(double[] value) /// public Bucketizer SetSplitsArray(double[][] value) { - DoubleArrayArrayParam doubleArrayArray = new DoubleArrayArrayParam(_jvmObject, - "setSplitsArray", - "wrapper for double[][] from csharp", value); - - return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", - doubleArrayArray.ReferenceValue)); + double[][][] wrappedValue = new[] {value}; + return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", wrappedValue)); } /// diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 6d1ba1077..169e244e1 100644 --- a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -118,6 +118,11 @@ object SerDe { (0 until len).map(_ => readDouble(in)).toArray } + def readDoubleArrArr(in: DataInputStream): Array[Array[Double]] = { + val len = readInt(in) + (0 until len).map(_ => readDoubleArr(in)).toArray + } + def readBooleanArr(in: DataInputStream): Array[Boolean] = { val len = readInt(in) (0 until len).map(_ => readBoolean(in)).toArray @@ -140,6 +145,7 @@ object SerDe { case 'g' => readLongArr(dis) case 'c' => readStringArr(dis) case 'd' => readDoubleArr(dis) + case 'A' => readDoubleArrArr(dis) case 'b' => readBooleanArr(dis) case 'j' => readStringArr(dis).map(x => JVMObjectTracker.getObject(x)) case 'r' => readBytesArr(dis) @@ -360,4 +366,4 @@ private object SerializationFormats { val BYTE = "byte" val STRING = "string" val ROW = "row" -} +} \ No newline at end of file diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 6d1ba1077..169e244e1 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -118,6 +118,11 @@ object SerDe { (0 until len).map(_ => readDouble(in)).toArray } + def readDoubleArrArr(in: DataInputStream): Array[Array[Double]] = { + val len = readInt(in) + (0 until len).map(_ => readDoubleArr(in)).toArray + } + def readBooleanArr(in: DataInputStream): Array[Boolean] = { val len = readInt(in) (0 until len).map(_ => readBoolean(in)).toArray @@ -140,6 +145,7 @@ object SerDe { case 'g' => readLongArr(dis) case 'c' => readStringArr(dis) case 'd' => readDoubleArr(dis) + case 'A' => readDoubleArrArr(dis) case 'b' => readBooleanArr(dis) case 'j' => readStringArr(dis).map(x => JVMObjectTracker.getObject(x)) case 'r' => readBytesArr(dis) @@ -360,4 +366,4 @@ private object SerializationFormats { val BYTE = "byte" val STRING = "string" val ROW = "row" -} +} \ No newline at end of file diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 6d1ba1077..169e244e1 100644 --- a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -118,6 +118,11 @@ object SerDe { (0 until len).map(_ => readDouble(in)).toArray } + def readDoubleArrArr(in: DataInputStream): 
Array[Array[Double]] = { + val len = readInt(in) + (0 until len).map(_ => readDoubleArr(in)).toArray + } + def readBooleanArr(in: DataInputStream): Array[Boolean] = { val len = readInt(in) (0 until len).map(_ => readBoolean(in)).toArray @@ -140,6 +145,7 @@ object SerDe { case 'g' => readLongArr(dis) case 'c' => readStringArr(dis) case 'd' => readDoubleArr(dis) + case 'A' => readDoubleArrArr(dis) case 'b' => readBooleanArr(dis) case 'j' => readStringArr(dis).map(x => JVMObjectTracker.getObject(x)) case 'r' => readBytesArr(dis) @@ -360,4 +366,4 @@ private object SerializationFormats { val BYTE = "byte" val STRING = "string" val ROW = "row" -} +} \ No newline at end of file From f4ecbb0106564d7835c3238cce32297a083da68e Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 08:25:29 +0000 Subject: [PATCH 15/47] remove DoubleArrayArrayParam --- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 1 - .../ML/Param/DoubleArrayArrayParam.cs | 38 ------------------- .../org/apache/spark/api/dotnet/SerDe.scala | 2 +- .../org/apache/spark/api/dotnet/SerDe.scala | 2 +- .../org/apache/spark/api/dotnet/SerDe.scala | 2 +- 5 files changed, 3 insertions(+), 42 deletions(-) delete mode 100644 src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index af2041945..380adf887 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -6,7 +6,6 @@ using System.Collections.Generic; using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; -using Microsoft.Spark.ML.Param; using Microsoft.Spark.Sql; using Microsoft.Spark.Sql.Types; diff --git a/src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs b/src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs deleted file mode 100644 index 7afe243c5..000000000 --- a/src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs +++ /dev/null @@ -1,38 +0,0 @@ -using System; -using Microsoft.Spark.Interop; -using Microsoft.Spark.Interop.Ipc; -using Newtonsoft.Json; - -namespace Microsoft.Spark.ML.Param -{ - /// - /// Internal class used to help the `Bucketizer` pass a double[][] into the JVM. - /// - class DoubleArrayArrayParam : IJvmObjectReferenceProvider - { - private readonly JvmObjectReference _jvmObject; - - public DoubleArrayArrayParam(object parent, string name, string doc, double[][] param) - { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.param.DoubleArrayArrayParam", - parent, name, doc); - - string json = JsonConvert.SerializeObject(param); - ReferenceValue = jsonDecode(json); - } - - private JvmObjectReference jsonDecode(string json) - { - return (JvmObjectReference)_jvmObject.Invoke("jsonDecode", json); - } - public JvmObjectReference Reference { get; } - - /// - /// This is the JVM version of the double[][] so that it can be used by the `Bucketizer`, to - /// get the double[][] across the SerDe this serializes as JSON and used jsonDecode on the - /// JVM side to get a double[][]. ReferenceValue is the double[][]. 
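The param class being removed here pushed a double[][] to the JVM by serializing it to JSON and calling jsonDecode on a DoubleArrayArrayParam; the SerDe changes in this patch replace that detour with a native binary framing. A hedged sketch of that framing, with BinaryWriter standing in for SerDe.Write (the real helper also handles the byte order the JVM's DataInputStream expects):

    using System.IO;

    static void WriteDoubleArrayArray(BinaryWriter destination, double[][] value)
    {
        destination.Write((byte)'A');        // type id, matching s_doubleArrayArrayTypeId
        destination.Write(value.Length);     // outer array length
        foreach (double[] inner in value)
        {
            destination.Write(inner.Length); // inner array length
            foreach (double d in inner)
            {
                destination.Write(d);        // each element as an IEEE-754 double
            }
        }
    }

The readDoubleArrArr added to each SerDe.scala is the mirror image: it reads the outer length, then one length-prefixed double array per entry.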
- /// - public JvmObjectReference ReferenceValue { get; } - } -} diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 169e244e1..2dca7fbdb 100644 --- a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -366,4 +366,4 @@ private object SerializationFormats { val BYTE = "byte" val STRING = "string" val ROW = "row" -} \ No newline at end of file +} diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 169e244e1..2dca7fbdb 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -366,4 +366,4 @@ private object SerializationFormats { val BYTE = "byte" val STRING = "string" val ROW = "row" -} \ No newline at end of file +} diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 169e244e1..2dca7fbdb 100644 --- a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -366,4 +366,4 @@ private object SerializationFormats { val BYTE = "byte" val STRING = "string" val ROW = "row" -} \ No newline at end of file +} From b3d4d0fc780df0e1df3690dc1620d7960be405ea Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 21:58:35 +0000 Subject: [PATCH 16/47] SerDe for double[][] --- .../Microsoft.Spark/Interop/Ipc/JvmBridge.cs | 9 +++ .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 69 ++++++++++++++++++- .../org/apache/spark/api/dotnet/SerDe.scala | 10 +++ .../org/apache/spark/api/dotnet/SerDe.scala | 10 +++ .../org/apache/spark/api/dotnet/SerDe.scala | 10 +++ 5 files changed, 107 insertions(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs index 961200ef3..f8b2e9648 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs @@ -364,6 +364,15 @@ private object ReadCollection(Stream s) doubleArray[itemIndex] = SerDe.ReadDouble(s); } returnValue = doubleArray; + break; + case 'A': + var doubleArrayArray = new double[numOfItemsInList][]; + for (int itemIndex = 0; itemIndex < numOfItemsInList; ++itemIndex) + { + doubleArrayArray[itemIndex] = ReadCollection(s) as double[]; + } + returnValue = doubleArrayArray; + break; case 'b': var boolArray = new bool[numOfItemsInList]; diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 380adf887..0c3acbe2b 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.Linq; using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Sql; @@ -50,6 +51,20 @@ public Bucketizer(string uid) private readonly JvmObjectReference _jvmObject; JvmObjectReference 
IJvmObjectReferenceProvider.Reference => _jvmObject; + /// + /// Split points for splitting a single column into buckets. To split multiple columns use + /// SetSplitsArray. You cannot use both SetSplits and SetSplitsArray at the same time + /// + /// Split points for mapping continuous features into buckets. With n+1 splits, there are n + /// buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last + /// bucket, which also includes y. The splits should be of length >= 3 and strictly + /// increasing. Values outside the splits specified will be treated as errors. + /// + public double[] GetSplits() + { + return (double[])_jvmObject.Invoke("getSplits"); + } + /// /// Split points for splitting a single column into buckets. To split multiple columns use /// SetSplitsArray. You cannot use both SetSplits and SetSplitsArray at the same time @@ -66,6 +81,16 @@ public Bucketizer SetSplits(double[] value) return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value)); } + /// + /// Split points fot splitting multiple columns into buckets. To split a single column use + /// SetSplits. You cannot use both SetSplits and SetSplitsArray at the same time. + /// + /// + public double[][] GetSplitsArray() + { + return (double[][])_jvmObject.Invoke("getSplitsArray"); + } + /// /// Split points fot splitting multiple columns into buckets. To split a single column use /// SetSplits. You cannot use both SetSplits and SetSplitsArray at the same time. @@ -83,6 +108,16 @@ public Bucketizer SetSplitsArray(double[][] value) return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", wrappedValue)); } + /// + /// Gets the column that the should read from and convert into + /// buckets + /// + /// + public string GetInputCol() + { + return (string)_jvmObject.Invoke("getInputCol"); + } + /// /// Sets the column that the should read from and convert into /// buckets @@ -93,6 +128,19 @@ public Bucketizer SetInputCol(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value)); } + + /// + /// Gets the columns that should read from and convert into + /// buckets. + /// + /// Each column is one set of buckets so if you have two input columns you can have two + /// sets of buckets and two output columns. + /// + /// + public IEnumerable GetInputCols() + { + return ((string[])(_jvmObject.Invoke("getInputCols"))).ToList(); + } /// /// Sets the columns that should read from and convert into @@ -107,7 +155,17 @@ public Bucketizer SetInputCols(IEnumerable value) { return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value)); } - + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + // + public string GetOutputCol() + { + return (string)_jvmObject.Invoke("getOutputCol"); + } + /// /// The will create a new column in the DataFrame, this is the /// name of the new column. @@ -119,6 +177,15 @@ public Bucketizer SetOutputCol(string value) return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value)); } + /// + /// The list of columns that the will create in the DataFrame. + /// + /// + public IEnumerable GetOutputCols() + { + return ((string[])_jvmObject.Invoke("getOutputCols")).ToList(); + } + /// /// The list of columns that the will create in the DataFrame. 
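+        /// <example>
+        /// Illustrative only (column names assumed):
+        /// <code>
+        /// Bucketizer bucketizer = new Bucketizer()
+        ///     .SetInputCols(new List&lt;string&gt; { "col_a", "col_b" })
+        ///     .SetOutputCols(new List&lt;string&gt; { "col_a_buckets", "col_b_buckets" });
+        /// // GetOutputCols() now returns ["col_a_buckets", "col_b_buckets"]
+        /// </code>
+        /// </example>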
/// diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 2dca7fbdb..ca5973b96 100644 --- a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -195,6 +195,7 @@ object SerDe { case "void" => dos.writeByte('n') case "character" => dos.writeByte('c') case "double" => dos.writeByte('d') + case "doublearray" => dos.writeByte('A') case "long" => dos.writeByte('g') case "integer" => dos.writeByte('i') case "logical" => dos.writeByte('b') @@ -258,6 +259,9 @@ object SerDe { case "[D" => writeType(dos, "list") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) + case "[[D" => + writeType(dos, "list") + writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) case "[Z" => writeType(dos, "list") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) @@ -343,6 +347,12 @@ object SerDe { value.foreach(v => out.writeDouble(v)) } + def writeDoubleArrArr(out: DataOutputStream, value: Array[Array[Double]]): Unit = { + writeType(out, "doublearray") + out.writeInt(value.length) + value.foreach(v => writeDoubleArr(out, v)) + } + def writeBooleanArr(out: DataOutputStream, value: Array[Boolean]): Unit = { writeType(out, "logical") out.writeInt(value.length) diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 2dca7fbdb..ca5973b96 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -195,6 +195,7 @@ object SerDe { case "void" => dos.writeByte('n') case "character" => dos.writeByte('c') case "double" => dos.writeByte('d') + case "doublearray" => dos.writeByte('A') case "long" => dos.writeByte('g') case "integer" => dos.writeByte('i') case "logical" => dos.writeByte('b') @@ -258,6 +259,9 @@ object SerDe { case "[D" => writeType(dos, "list") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) + case "[[D" => + writeType(dos, "list") + writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) case "[Z" => writeType(dos, "list") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) @@ -343,6 +347,12 @@ object SerDe { value.foreach(v => out.writeDouble(v)) } + def writeDoubleArrArr(out: DataOutputStream, value: Array[Array[Double]]): Unit = { + writeType(out, "doublearray") + out.writeInt(value.length) + value.foreach(v => writeDoubleArr(out, v)) + } + def writeBooleanArr(out: DataOutputStream, value: Array[Boolean]): Unit = { writeType(out, "logical") out.writeInt(value.length) diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 2dca7fbdb..ca5973b96 100644 --- a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -195,6 +195,7 @@ object SerDe { case "void" => dos.writeByte('n') case "character" => dos.writeByte('c') case "double" => dos.writeByte('d') + case "doublearray" => dos.writeByte('A') case "long" => dos.writeByte('g') case "integer" => dos.writeByte('i') case 
"logical" => dos.writeByte('b') @@ -258,6 +259,9 @@ object SerDe { case "[D" => writeType(dos, "list") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) + case "[[D" => + writeType(dos, "list") + writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) case "[Z" => writeType(dos, "list") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) @@ -343,6 +347,12 @@ object SerDe { value.foreach(v => out.writeDouble(v)) } + def writeDoubleArrArr(out: DataOutputStream, value: Array[Array[Double]]): Unit = { + writeType(out, "doublearray") + out.writeInt(value.length) + value.foreach(v => writeDoubleArr(out, v)) + } + def writeBooleanArr(out: DataOutputStream, value: Array[Boolean]): Unit = { writeType(out, "logical") out.writeInt(value.length) From 500e7ad691477fb3e711f13c731bc50e5ceb26bd Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 22:00:36 +0000 Subject: [PATCH 17/47] spacing as per other fields --- src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs index 06dcb8969..8b6977025 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs @@ -24,7 +24,7 @@ internal class PayloadHelper private static readonly byte[] s_doubleTypeId = new[] { (byte)'d' }; private static readonly byte[] s_jvmObjectTypeId = new[] { (byte)'j' }; private static readonly byte[] s_byteArrayTypeId = new[] { (byte)'r' }; - private static readonly byte[] s_doubleArrayArrayTypeId = new[] {(byte)'A'}; + private static readonly byte[] s_doubleArrayArrayTypeId = new[] {( byte)'A' }; private static readonly byte[] s_arrayTypeId = new[] { (byte)'l' }; private static readonly byte[] s_dictionaryTypeId = new[] { (byte)'e' }; private static readonly byte[] s_rowArrTypeId = new[] { (byte)'R' }; From 298f4ece6b36487fd83fc30d68dbecc48e6ccb2a Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 22:03:11 +0000 Subject: [PATCH 18/47] formatting --- .../src/main/scala/org/apache/spark/api/dotnet/SerDe.scala | 4 ++-- .../src/main/scala/org/apache/spark/api/dotnet/SerDe.scala | 4 ++-- .../src/main/scala/org/apache/spark/api/dotnet/SerDe.scala | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index ca5973b96..4a6b27a58 100644 --- a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -260,8 +260,8 @@ object SerDe { writeType(dos, "list") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) case "[[D" => - writeType(dos, "list") - writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) + writeType(dos, "list") + writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) case "[Z" => writeType(dos, "list") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index ca5973b96..4a6b27a58 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ 
b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -260,8 +260,8 @@ object SerDe { writeType(dos, "list") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) case "[[D" => - writeType(dos, "list") - writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) + writeType(dos, "list") + writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) case "[Z" => writeType(dos, "list") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index ca5973b96..4a6b27a58 100644 --- a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -260,8 +260,8 @@ object SerDe { writeType(dos, "list") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) case "[[D" => - writeType(dos, "list") - writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) + writeType(dos, "list") + writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) case "[Z" => writeType(dos, "list") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) From 72d36fd0fa85d665a2d475fbecf59cb8e629f260 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 22:09:21 +0000 Subject: [PATCH 19/47] adding getters to tests --- artifaa. | 0 build.sh | 0 eng/common/build.sh | 0 .../IpcTests/ML/Feature/BucketizerTests.cs | 8 ++++++++ 4 files changed, 8 insertions(+) create mode 100644 artifaa. mode change 100644 => 100755 build.sh mode change 100644 => 100755 eng/common/build.sh diff --git a/artifaa. b/artifaa. new file mode 100644 index 000000000..e69de29bb diff --git a/build.sh b/build.sh old mode 100644 new mode 100755 diff --git a/eng/common/build.sh b/eng/common/build.sh old mode 100644 new mode 100755 diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 7ee217eca..dbaec110c 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -38,6 +38,10 @@ public void TestBucketizer() DataFrame output = bucketizer.Transform(input); Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); + + Assert.IsType(bucketizer.GetInputCol()); + Assert.IsType(bucketizer.GetOutputCol()); + Assert.IsType(bucketizer.GetSplits()); } [Fact] @@ -64,6 +68,10 @@ public void TestBucketizer_MultipleColumns() DataFrame output = bucketizer.Transform(input); Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col_a")); Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col_b")); + + Assert.IsType>(bucketizer.GetInputCols()); + Assert.IsType>(bucketizer.GetOutputCols()); + Assert.IsType(bucketizer.GetSplitsArray()); } } } From 696186c84bafa2736a3b2ee136b1051ce36d6b89 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 22:11:46 +0000 Subject: [PATCH 20/47] rollback --- artifaa. | 0 build.sh | 0 eng/common/build.sh | 0 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 artifaa. mode change 100755 => 100644 build.sh mode change 100755 => 100644 eng/common/build.sh diff --git a/artifaa. b/artifaa. 
deleted file mode 100644 index e69de29bb..000000000 diff --git a/build.sh b/build.sh old mode 100755 new mode 100644 diff --git a/eng/common/build.sh b/eng/common/build.sh old mode 100755 new mode 100644 From 33699ea327c39865e8caf1cb23925ed50a6a4b09 Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 15 Jan 2020 07:27:11 +0000 Subject: [PATCH 21/47] Apply suggestions from code review Co-Authored-By: Steve Suh --- .../IpcTests/ML/Feature/BucketizerTests.cs | 6 +++--- src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index dbaec110c..8b9a85aab 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -39,9 +39,9 @@ public void TestBucketizer() DataFrame output = bucketizer.Transform(input); Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); - Assert.IsType(bucketizer.GetInputCol()); - Assert.IsType(bucketizer.GetOutputCol()); - Assert.IsType(bucketizer.GetSplits()); + Assert.Equal("input_col", bucketizer.GetInputCol()); + Assert.Equal("output_col", bucketizer.GetOutputCol()); + Assert.Equal(expectedSplits, bucketizer.GetSplits()); } [Fact] diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 0c3acbe2b..7ace8c64e 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -134,7 +134,7 @@ public Bucketizer SetInputCol(string value) /// buckets. /// /// Each column is one set of buckets so if you have two input columns you can have two - /// sets of buckets and two output columns. + /// sets of buckets and two output columns. /// /// public IEnumerable GetInputCols() @@ -147,7 +147,7 @@ public IEnumerable GetInputCols() /// buckets. /// /// Each column is one set of buckets so if you have two input columns you can have two - /// sets of buckets and two output columns. + /// sets of buckets and two output columns. /// /// List of input columns to use as sources for buckets /// From 5b8060606d6831f92555b2bb17f089d58b4928cd Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 15 Jan 2020 07:38:01 +0000 Subject: [PATCH 22/47] Fixing comments after review --- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 34 +++++++------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 7ace8c64e..f62525a3f 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -52,14 +52,9 @@ public Bucketizer(string uid) JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// - /// Split points for splitting a single column into buckets. To split multiple columns use - /// SetSplitsArray. You cannot use both SetSplits and SetSplitsArray at the same time + /// Gets the splits that were set using SetSplits /// - /// Split points for mapping continuous features into buckets. With n+1 splits, there are n - /// buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last - /// bucket, which also includes y. The splits should be of length >= 3 and strictly - /// increasing. 
Values outside the splits specified will be treated as errors. - /// + /// double[], the splits to be used to bucket the input column public double[] GetSplits() { return (double[])_jvmObject.Invoke("getSplits"); @@ -82,10 +77,9 @@ public Bucketizer SetSplits(double[] value) } /// - /// Split points fot splitting multiple columns into buckets. To split a single column use - /// SetSplits. You cannot use both SetSplits and SetSplitsArray at the same time. + /// Gets the splits that were set by SetSplitsArray /// - /// + /// double[][], the splits to be used to bucket the input columns public double[][] GetSplitsArray() { return (double[][])_jvmObject.Invoke("getSplitsArray"); @@ -110,9 +104,9 @@ public Bucketizer SetSplitsArray(double[][] value) /// /// Gets the column that the should read from and convert into - /// buckets + /// buckets. This would have been set by SetInputCol /// - /// + /// string, the input column public string GetInputCol() { return (string)_jvmObject.Invoke("getInputCol"); @@ -131,12 +125,9 @@ public Bucketizer SetInputCol(string value) /// /// Gets the columns that should read from and convert into - /// buckets. - /// - /// Each column is one set of buckets so if you have two input columns you can have two - /// sets of buckets and two output columns. + /// buckets. This is set by SetInputCol /// - /// + /// IEnumerable, list of input columns public IEnumerable GetInputCols() { return ((string[])(_jvmObject.Invoke("getInputCols"))).ToList(); @@ -157,10 +148,10 @@ public Bucketizer SetInputCols(IEnumerable value) } /// - /// The will create a new column in the DataFrame, this is the - /// name of the new column. + /// Gets the name of the column the output data will be written to. This is set by + /// SetInputCol /// - // + // string, the output column public string GetOutputCol() { return (string)_jvmObject.Invoke("getOutputCol"); @@ -179,8 +170,9 @@ public Bucketizer SetOutputCol(string value) /// /// The list of columns that the will create in the DataFrame. 
+ /// This is set by SetOutputCols /// - /// + /// IEnumerable, list of output columns public IEnumerable GetOutputCols() { return ((string[])_jvmObject.Invoke("getOutputCols")).ToList(); From 6c12e6aa808fcc73f156433c31e4271687cc642b Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 15 Jan 2020 07:56:54 +0000 Subject: [PATCH 23/47] fixes after review --- .../IpcTests/ML/Feature/BucketizerTests.cs | 55 +++++++++++-------- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 4 +- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 8b9a85aab..a3578fc33 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.Security.Cryptography; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; using Xunit; @@ -23,44 +24,54 @@ public BucketizerTests(SparkFixture fixture) [Fact] public void TestBucketizer() { - Bucketizer bucketizer = new Bucketizer("uid") - .SetInputCol("input_col") - .SetOutputCol("output_col") - .SetHandleInvalid("skip") - .SetSplits(new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}); + double[] expectedSplits = new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}; - Assert.Equal("skip", - bucketizer.GetHandleInvalid()); + string expectedHandle = "skip"; + string expectedUid = "uid"; + string expectedInputCol = "input_col"; + string expectedOutputCol = "output_col"; + + Bucketizer bucketizer = new Bucketizer(expectedUid) + .SetInputCol(expectedInputCol) + .SetOutputCol(expectedOutputCol) + .SetHandleInvalid(expectedHandle) + .SetSplits(expectedSplits); + + Assert.Equal(expectedHandle, bucketizer.GetHandleInvalid()); - Assert.Equal("uid", bucketizer.Uid()); + Assert.Equal(expectedUid, bucketizer.Uid()); DataFrame input = _spark.Sql("SELECT ID as input_col from range(100)"); DataFrame output = bucketizer.Transform(input); - Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); + Assert.Contains(output.Schema().Fields, (f => f.Name == expectedOutputCol)); - Assert.Equal("input_col", bucketizer.GetInputCol()); - Assert.Equal("output_col", bucketizer.GetOutputCol()); + Assert.Equal(expectedInputCol, bucketizer.GetInputCol()); + Assert.Equal(expectedOutputCol, bucketizer.GetOutputCol()); Assert.Equal(expectedSplits, bucketizer.GetSplits()); } [Fact] public void TestBucketizer_MultipleColumns() { - double[][] splitsArray = new[] + double[][] expectedSplitsArray = new[] { new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}, new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue} }; - + + string expectedHandle = "keep"; + + List expectedInputCols = new List() {"input_col_a", "input_col_b"}; + List expectedOutputCols = new List() {"output_col_a", "output_col_b"}; + Bucketizer bucketizer = new Bucketizer() - .SetInputCols(new List() {"input_col_a", "input_col_b"}) - .SetOutputCols(new List() {"output_col_a", "output_col_b"}) - .SetHandleInvalid("keep") - .SetSplitsArray(splitsArray); + .SetInputCols(expectedInputCols) + .SetOutputCols(expectedOutputCols) + .SetHandleInvalid(expectedHandle) + .SetSplitsArray(expectedSplitsArray); - Assert.Equal("keep", - bucketizer.GetHandleInvalid()); + Assert.Equal(expectedHandle, bucketizer.GetHandleInvalid()); DataFrame input = _spark.Sql("SELECT ID as input_col_a, ID as 
input_col_b from range(100)"); @@ -69,9 +80,9 @@ public void TestBucketizer_MultipleColumns() Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col_a")); Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col_b")); - Assert.IsType>(bucketizer.GetInputCols()); - Assert.IsType>(bucketizer.GetOutputCols()); - Assert.IsType(bucketizer.GetSplitsArray()); + Assert.Equal(expectedInputCols, bucketizer.GetInputCols()); + Assert.Equal(expectedOutputCols, bucketizer.GetOutputCols()); + Assert.Equal(expectedSplitsArray, bucketizer.GetSplitsArray()); } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index f62525a3f..1d5e047d2 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -127,7 +127,7 @@ public Bucketizer SetInputCol(string value) /// Gets the columns that should read from and convert into /// buckets. This is set by SetInputCol /// - /// IEnumerable, list of input columns + /// IEnumerable<string>, list of input columns public IEnumerable GetInputCols() { return ((string[])(_jvmObject.Invoke("getInputCols"))).ToList(); @@ -172,7 +172,7 @@ public Bucketizer SetOutputCol(string value) /// The list of columns that the will create in the DataFrame. /// This is set by SetOutputCols /// - /// IEnumerable, list of output columns + /// IEnumerable<string>, list of output columns public IEnumerable GetOutputCols() { return ((string[])_jvmObject.Invoke("getOutputCols")).ToList(); From dc7bf4b05ea36f16ccd31e8a772c5b53c8183563 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 15 Jan 2020 08:15:58 +0000 Subject: [PATCH 24/47] wip --- .../IpcTests/ML/Feature/HashingTFTests.cs | 4 +- .../Microsoft.Spark/ML/Feature/HashingTF.cs | 60 +++++++++++++------ 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 80f35e7bd..3098c92dc 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -23,12 +23,12 @@ public HashingTFTests(SparkFixture fixture) [Fact] public void TestHashingTF() { - HashingTF HashingTF = new HashingTF("uid") + HashingTF HashingTF = new HashingTF(100) .SetNumFeatures(10) .SetInputCol("input_col") .SetOutputCol("output_col"); - Assert.Equal("uid", HashingTF.Uid()); + Assert.Equal(10, HashingTF.GetNumFeatures()); DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + " as input_col"); diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs index 6356ea53b..58d097328 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs @@ -33,12 +33,12 @@ public HashingTF() /// /// Create a with a UID that is used to give the /// a unique ID + /// numFeatures number of features (default: 2^20^) /// - /// An immutable unique ID for the object and its derivatives. 
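For context on the constructor change above: on the JVM side HashingTF implements the hashing trick, mapping each term into one of numFeatures buckets and counting hits per bucket. A hedged C# illustration of the idea (Spark uses MurmurHash3 on the JVM; string.GetHashCode is only a stand-in, and the method name is invented for this sketch):

    using System.Collections.Generic;

    static double[] HashedTermFrequencies(IEnumerable<string> terms, int numFeatures)
    {
        var vector = new double[numFeatures];
        foreach (string term in terms)
        {
            // non-negative modulo keeps the index inside the feature vector
            int index = ((term.GetHashCode() % numFeatures) + numFeatures) % numFeatures;
            vector[index] += 1.0; // in binary mode this would be set to 1.0 instead
        }
        return vector;
    }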
- public HashingTF(string uid) + public HashingTF(int numFeatures) { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.HashingTF", uid); + "org.apache.spark.ml.feature.HashingTF", numFeatures); } internal HashingTF(JvmObjectReference jvmObject) @@ -49,6 +49,15 @@ internal HashingTF(JvmObjectReference jvmObject) private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + /// + /// Gets the column that the should read from + /// + /// string, the name of the input column + public string GetInputCol() + { + return (string)_jvmObject.Invoke("getInputCol"); + } + /// /// Sets the column that the should read from /// @@ -63,19 +72,43 @@ public HashingTF SetInputCol(string value) /// The will create a new column in the DataFrame, this is the /// name of the new column. /// - /// The name of the new column - /// + /// string, the name of the output col + public string GetOutputCol() + { + return (string)_jvmObject.Invoke("getOutputCol"); + } + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// The name of the new column /// public HashingTF SetOutputCol(string value) { return WrapAsHashingTF(_jvmObject.Invoke("setOutputCol", value)); } - public HashingTF SetNumFeatures(int value) + /// + /// Gets the number of features that should be used + /// + /// + public int NumFeatures() + { + return (int)_jvmObject.Invoke("NumFeatures"); + } + + /// + /// If true, term frequency vector will be binary such that non-zero term counts will be + /// set to 1, default: false + /// + /// Term frequency vector, default: false + /// + public HashingTF SetBinary(bool value) { - return WrapAsHashingTF(_jvmObject.Invoke("setNumFeatures", value)); + return WrapAsHashingTF(_jvmObject.Invoke("setBinary", value)); } - + /// /// Executes the and transforms the DataFrame to include the new /// column or columns with the tokens. @@ -98,16 +131,5 @@ private static HashingTF WrapAsHashingTF(object obj) { return new HashingTF((JvmObjectReference)obj); } - - /// - /// The uid that was used to create the . If no UID is passed in - /// when creating the then a random UID is created when the - /// is created. 
- /// - /// string UID identifying the - public string Uid() - { - return (string)_jvmObject.Invoke("uid"); - } } } From 283f8ea594632c0f6f73cae57bb9c9f2073df344 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 15 Jan 2020 23:00:16 +0000 Subject: [PATCH 25/47] Hashing TF from ml not mllib --- .../IpcTests/ML/Feature/HashingTFTests.cs | 31 ++++++-- .../Microsoft.Spark/ML/Feature/HashingTF.cs | 77 +++++++++++++++---- 2 files changed, 89 insertions(+), 19 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 3098c92dc..398eab47c 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -4,8 +4,10 @@ using System; using System.Collections.Generic; +using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; +using Microsoft.Spark.Sql.Types; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature @@ -23,17 +25,34 @@ public HashingTFTests(SparkFixture fixture) [Fact] public void TestHashingTF() { - HashingTF HashingTF = new HashingTF(100) - .SetNumFeatures(10) - .SetInputCol("input_col") - .SetOutputCol("output_col"); + string expectedInputCol = "input_col"; + string expectedOutputCol = "output_col"; + int expectedFeatures = 10; + + HashingTF hashingTf = new HashingTF("my-unique-id") + .SetNumFeatures(expectedFeatures) + .SetInputCol(expectedInputCol) + .SetOutputCol(expectedOutputCol); - Assert.Equal(10, HashingTF.GetNumFeatures()); + Assert.Equal(expectedFeatures, hashingTf.GetNumFeatures()); + Assert.Equal(expectedInputCol, hashingTf.GetInputCol()); + Assert.Equal(expectedOutputCol, hashingTf.GetOutputCol()); DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + " as input_col"); - DataFrame output = HashingTF.Transform(input); + DataFrame output = hashingTf.Transform(input); + Assert.Contains(expectedOutputCol, output.Columns()); + + using (var tempDirectory = new TemporaryDirectory()) + { + hashingTf.Save(tempDirectory.Path); + var loadedHashingTf = HashingTF.Load(tempDirectory.Path); + Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid()); + } + + hashingTf.SetBinary(true); + Assert.True(hashingTf.GetBinary()); } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs index 58d097328..b0d8fc078 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs @@ -7,6 +7,7 @@ using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Sql; +using Microsoft.Spark.Sql.Types; namespace Microsoft.Spark.ML.Feature { @@ -27,18 +28,18 @@ public class HashingTF : IJvmObjectReferenceProvider public HashingTF() { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.HashingTF"); + _javaClassName); } /// /// Create a with a UID that is used to give the /// a unique ID - /// numFeatures number of features (default: 2^20^) + /// unique identifier /// - public HashingTF(int numFeatures) + public HashingTF(string uid) { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.HashingTF", numFeatures); + _javaClassName, uid); } internal HashingTF(JvmObjectReference jvmObject) @@ -47,8 +48,51 @@ internal HashingTF(JvmObjectReference jvmObject) } private readonly JvmObjectReference 
_jvmObject; + private const string _javaClassName = "org.apache.spark.ml.feature.HashingTF"; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + /// + /// Loads the that was previously saved using Save + /// + /// + /// + public static HashingTF Load(string path) + { + return WrapAsHashingTF(SparkEnvironment.JvmBridge.CallStaticJavaMethod( + _javaClassName, + "load", path)); + } + + /// + /// Saves the so that it can be loaded later using Load + /// + /// + /// + public HashingTF Save(string path) + { + return WrapAsHashingTF(_jvmObject.Invoke("save", path)); + } + + /// + /// Gets the binary toggle that controls term frequency counts + /// + /// + public bool GetBinary() + { + return (bool)_jvmObject.Invoke("getBinary"); + } + + /// + /// Binary toggle to control term frequency counts. + /// If true, all non-zero counts are set to 1. This is useful for discrete probabilistic + /// models that model binary events rather than integer counts + /// + /// binary toggle, default is false + public HashingTF SetBinary(bool value) + { + return WrapAsHashingTF(_jvmObject.Invoke("setBinary", value)); + } + /// /// Gets the column that the should read from /// @@ -92,21 +136,28 @@ public HashingTF SetOutputCol(string value) /// /// Gets the number of features that should be used /// - /// - public int NumFeatures() + /// int + public int GetNumFeatures() { - return (int)_jvmObject.Invoke("NumFeatures"); + return (int)_jvmObject.Invoke("getNumFeatures"); + } + + /// + /// Sets the number of features that should be used + /// + /// + public HashingTF SetNumFeatures(int value) + { + return WrapAsHashingTF(_jvmObject.Invoke("setNumFeatures", value)); } /// - /// If true, term frequency vector will be binary such that non-zero term counts will be - /// set to 1, default: false + /// An immutable unique ID for the object and its derivatives. 
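+        /// <example>
+        /// Illustrative only (path assumed); the uid survives a save/load round
+        /// trip, which the E2E tests later in this series assert:
+        /// <code>
+        /// hashingTF.Save("/tmp/hashingTF");
+        /// HashingTF loaded = HashingTF.Load("/tmp/hashingTF");
+        /// // loaded.Uid() equals hashingTF.Uid()
+        /// </code>
+        /// </example>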
/// - /// Term frequency vector, default: false - /// - public HashingTF SetBinary(bool value) + /// string + public string Uid() { - return WrapAsHashingTF(_jvmObject.Invoke("setBinary", value)); + return (string)_jvmObject.Invoke("uid"); } /// From 9d0f7ea7df5a3ef3a34fc2bd34b0270c8f34ebea Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Thu, 16 Jan 2020 21:12:47 +0000 Subject: [PATCH 26/47] tests for HashingTF --- .../IpcTests/ML/Feature/HashingTFTests.cs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 398eab47c..ce20736b7 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -4,6 +4,8 @@ using System; using System.Collections.Generic; +using System.IO; +using System.Linq; using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; @@ -28,6 +30,8 @@ public void TestHashingTF() string expectedInputCol = "input_col"; string expectedOutputCol = "output_col"; int expectedFeatures = 10; + + Assert.IsType(new HashingTF()); HashingTF hashingTf = new HashingTF("my-unique-id") .SetNumFeatures(expectedFeatures) @@ -42,12 +46,13 @@ public void TestHashingTF() " as input_col"); DataFrame output = hashingTf.Transform(input); - Assert.Contains(expectedOutputCol, output.Columns()); + DataFrame outputColumn = output.Select(expectedOutputCol); using (var tempDirectory = new TemporaryDirectory()) { - hashingTf.Save(tempDirectory.Path); - var loadedHashingTf = HashingTF.Load(tempDirectory.Path); + var bucketPath = Path.Join(tempDirectory.Path, "bucket"); + hashingTf.Save(bucketPath); + var loadedHashingTf = HashingTF.Load(bucketPath); Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid()); } From 107e01b7267afc1cf1661c3bd79574881403a71b Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Thu, 16 Jan 2020 22:12:21 +0000 Subject: [PATCH 27/47] adding tests --- .../IpcTests/ML/Feature/BucketizerTests.cs | 10 ++++ .../IpcTests/ML/Feature/HashingTFTests.cs | 6 +- .../IpcTests/ML/Feature/IDFModelTests.cs | 29 +++++++++- .../IpcTests/ML/Feature/IDFTests.cs | 50 +++++++++++++++++ .../IpcTests/ML/Feature/TokenizerTests.cs | 35 +++++++++--- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 29 ++++++++-- src/csharp/Microsoft.Spark/ML/Feature/IDF.cs | 56 ++++++++++++++++++- .../Microsoft.Spark/ML/Feature/IDFModel.cs | 46 ++++++++++++++- .../Microsoft.Spark/ML/Feature/Tokenizer.cs | 49 ++++++++++++++-- .../Microsoft.Spark/Microsoft.Spark.csproj | 5 +- 10 files changed, 283 insertions(+), 32 deletions(-) create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index a3578fc33..08282320e 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -4,7 +4,9 @@ using System; using System.Collections.Generic; +using System.IO; using System.Security.Cryptography; +using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; using Xunit; @@ -49,6 +51,14 @@ public void TestBucketizer() Assert.Equal(expectedInputCol, bucketizer.GetInputCol()); Assert.Equal(expectedOutputCol, 
bucketizer.GetOutputCol());
             Assert.Equal(expectedSplits, bucketizer.GetSplits());
+
+            using (var tempDirectory = new TemporaryDirectory())
+            {
+                var savePath = Path.Join(tempDirectory.Path, "bucket");
+                bucketizer.Save(savePath);
+                var loadedBucketizer = Bucketizer.Load(savePath);
+                Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid());
+            }
         }
 
         [Fact]
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
index ce20736b7..9484fb30f 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
@@ -50,9 +50,9 @@ public void TestHashingTF()
 
         using (var tempDirectory = new TemporaryDirectory())
         {
-            var bucketPath = Path.Join(tempDirectory.Path, "bucket");
-            hashingTf.Save(bucketPath);
-            var loadedHashingTf = HashingTF.Load(bucketPath);
+            var savePath = Path.Join(tempDirectory.Path, "hashingTF");
+            hashingTf.Save(savePath);
+            var loadedHashingTf = HashingTF.Load(savePath);
             Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid());
         }
 
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
index 06a4a3be0..cd55e90a7 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
@@ -4,6 +4,8 @@
 using System;
 using System.Collections.Generic;
+using System.IO;
+using Microsoft.Spark.E2ETest.Utils;
 using Microsoft.Spark.ML.Feature;
 using Microsoft.Spark.Sql;
 using Xunit;
@@ -23,21 +25,42 @@ public IDFModelTests(SparkFixture fixture)
         [Fact]
         public void TestIDFModel()
         {
+            int expectedDocFrequency = 1980;
+            string expectedInputCol = "rawFeatures";
+            string expectedOutputCol = "features";
+
+
             DataFrame sentenceData =
                 _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");
 
             Tokenizer tokenizer = new Tokenizer().SetInputCol("sentence").SetOutputCol("words");
             DataFrame wordsData = tokenizer.Transform(sentenceData);
 
             HashingTF hashingTF = new HashingTF()
-                .SetInputCol("words").SetOutputCol("rawFeatures").SetNumFeatures(20);
+                .SetInputCol("words")
+                .SetOutputCol(expectedInputCol)
+                .SetNumFeatures(20);
 
             DataFrame featurizedData = hashingTF.Transform(wordsData);
-
-            IDF idf = new IDF().SetInputCol("rawFeatures").SetOutputCol("features");
+
+            IDF idf = new IDF()
+                .SetInputCol(expectedInputCol)
+                .SetOutputCol(expectedOutputCol)
+                .SetMinDocFreq(expectedDocFrequency);
+
             IDFModel idfModel = idf.Fit(featurizedData);
 
             DataFrame rescaledData = idfModel.Transform(featurizedData);
+
+            Assert.Equal(expectedInputCol, idf.GetInputCol());
+            Assert.Equal(expectedOutputCol, idf.GetOutputCol());
+
+            Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq());
+
+            using (var tempDirectory = new TemporaryDirectory())
+            {
+                var modelPath = Path.Join(tempDirectory.Path, "idfModel");
+                idfModel.Save(modelPath);
+            }
         }
     }
 }
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
new file mode 100644
index 000000000..fe7f6d30c
--- /dev/null
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
@@ -0,0 +1,50 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.IO; +using Microsoft.Spark.E2ETest.Utils; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class IDFTests + { + private readonly SparkSession _spark; + + public IDFTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestIDFModel() + { + string expectedInputCol = "rawFeatures"; + string expectedOutputCol = "features"; + int expectedDocFrequency = 100; + + IDF idf = new IDF() + .SetInputCol(expectedInputCol) + .SetOutputCol(expectedOutputCol) + .SetMinDocFreq(expectedDocFrequency); + + Assert.Equal(expectedInputCol, idf.GetInputCol()); + Assert.Equal(expectedOutputCol, idf.GetOutputCol()); + Assert.Equal(expectedDocFrequency, idf.GetMinDocFreq()); + + using (var tempDirectory = new TemporaryDirectory()) + { + var savePath = Path.Join(tempDirectory.Path, "IDF"); + idf.Save(savePath); + var loadedIdf = IDF.Load(savePath); + Assert.Equal(idf.Uid(), loadedIdf.Uid()); + } + } + } +} diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index 19eb9216f..2a2d9139f 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -4,6 +4,8 @@ using System; using System.Collections.Generic; +using System.IO; +using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; using Xunit; @@ -23,17 +25,34 @@ public TokenizerTests(SparkFixture fixture) [Fact] public void TestTokenizer() { - Tokenizer Tokenizer = new Tokenizer("uid") - .SetInputCol("input_col") - .SetOutputCol("output_col"); - - Assert.Equal("uid", Tokenizer.Uid()); - + string expectedUid = "theUid"; + string expectedInputCol = "input_col"; + string expectedOutputCol = "output_col"; + DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + " from range(100)"); + + Tokenizer tokenizer = new Tokenizer(expectedUid); + + tokenizer + .SetInputCol(expectedInputCol) + .SetOutputCol(expectedOutputCol); + + DataFrame output = tokenizer.Transform(input); + Assert.Contains(output.Schema().Fields, (f => f.Name == expectedOutputCol)); - DataFrame output = Tokenizer.Transform(input); - Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); + Assert.Equal(expectedInputCol, tokenizer.GetInputCol()); + Assert.Equal(expectedOutputCol, tokenizer.GetOutputCol()); + + using (var tempDirectory = new TemporaryDirectory()) + { + var savePath = Path.Join(tempDirectory.Path, "Tokenizer"); + tokenizer.Save(savePath); + var loadedIdf = Tokenizer.Load(savePath); + Assert.Equal(tokenizer.Uid(), loadedIdf.Uid()); + } + + Assert.Equal(expectedUid, tokenizer.Uid()); } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index ce1436fe4..2db86f5cc 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -32,8 +32,7 @@ internal Bucketizer(JvmObjectReference jvmObject) /// public Bucketizer() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.Bucketizer"); + _jvmObject = 
SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); } /// @@ -43,11 +42,11 @@ public Bucketizer() /// An immutable unique ID for the object and its derivatives. public Bucketizer(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.Bucketizer", uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); } private readonly JvmObjectReference _jvmObject; + private const string JavaClassName = "org.apache.spark.ml.feature.Bucketizer"; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// @@ -243,5 +242,27 @@ public Bucketizer SetHandleInvalid(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString())); } + + /// + /// Loads the that was previously saved using Save + /// + /// + /// + public static Bucketizer Load(string path) + { + return WrapAsBucketizer(SparkEnvironment.JvmBridge.CallStaticJavaMethod( + JavaClassName, + "load", path)); + } + + /// + /// Saves the so that it can be loaded later using Load + /// + /// + /// + public Bucketizer Save(string path) + { + return WrapAsBucketizer(_jvmObject.Invoke("save", path)); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs index 663b887d0..5f9e376a0 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs @@ -21,14 +21,13 @@ namespace Microsoft.Spark.ML.Feature /// public class IDF : IJvmObjectReferenceProvider { - /// /// Create a without any parameters /// public IDF() { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.IDF"); + JavaClassName); } /// @@ -39,7 +38,7 @@ public IDF() public IDF(string uid) { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.IDF", uid); + JavaClassName, uid); } internal IDF(JvmObjectReference jvmObject) @@ -47,9 +46,20 @@ internal IDF(JvmObjectReference jvmObject) _jvmObject = jvmObject; } + private const string JavaClassName = "org.apache.spark.ml.feature.IDF"; + private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + /// + /// Gets the column that the should read from + /// + /// string, input column + public string GetInputCol() + { + return (string)(_jvmObject.Invoke("getInputCol")); + } + /// /// Sets the column that the should read from /// @@ -60,6 +70,16 @@ public IDF SetInputCol(string value) return WrapAsIDF(_jvmObject.Invoke("setInputCol", value)); } + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// string, the output column + public string GetOutputCol() + { + return (string)(_jvmObject.Invoke("getOutputCol")); + } + /// /// The will create a new column in the DataFrame, this is the /// name of the new column. 
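For context between these hunks, a minimal usage sketch of the paired getters and fluent setters this diff adds to IDF; the column names, the frequency threshold, and the surrounding SparkSession are illustrative assumptions, not taken from the patch:

// Sketch only: assumes a running SparkSession and illustrative column names.
IDF idf = new IDF()
    .SetInputCol("rawFeatures")    // e.g. the output column of a HashingTF
    .SetOutputCol("features")
    .SetMinDocFreq(2);             // ignore terms seen in fewer than 2 documents

// Each Set* call is forwarded over the JVM bridge; the matching Get* call
// reads the same parameter back from the underlying Scala IDF object.
string inputCol = idf.GetInputCol();    // "rawFeatures"
int minDocFreq = idf.GetMinDocFreq();   // 2
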
@@ -72,6 +92,15 @@ public IDF SetOutputCol(string value) return WrapAsIDF(_jvmObject.Invoke("setOutputCol", value)); } + /// + /// Minimum of documents in which a term should appear for filtering + /// + /// int + public int GetMinDocFreq() + { + return (int)_jvmObject.Invoke("getMinDocFreq"); + } + /// /// Minimum of documents in which a term should appear for filtering /// @@ -114,5 +143,26 @@ public string Uid() { return (string)_jvmObject.Invoke("uid"); } + + /// + /// Loads the that was previously saved using Save + /// + /// + /// + public static IDF Load(string path) + { + return WrapAsIDF( + SparkEnvironment.JvmBridge.CallStaticJavaMethod(JavaClassName, "load", path)); + } + + /// + /// Saves the so that it can be loaded later using Load + /// + /// + /// + public IDF Save(string path) + { + return WrapAsIDF(_jvmObject.Invoke("save", path)); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs index d9cc13882..0ee54564c 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs @@ -23,7 +23,7 @@ public class IDFModel : IJvmObjectReferenceProvider public IDFModel() { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.IDFModel"); + JavaClassName); } /// @@ -34,7 +34,7 @@ public IDFModel() public IDFModel(string uid) { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.IDFModel", uid); + JavaClassName, uid); } internal IDFModel(JvmObjectReference jvmObject) @@ -44,7 +44,18 @@ internal IDFModel(JvmObjectReference jvmObject) private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - + + private const string JavaClassName = "org.apache.spark.ml.feature.IDFModel"; + + /// + /// Gets the column that the should read from + /// + /// string, input column + public string GetInputCol() + { + return (string)(_jvmObject.Invoke("getInputCol")); + } + /// /// Sets the column that the should read from and convert into /// buckets @@ -56,6 +67,16 @@ public IDFModel SetInputCol(string value) return WrapAsIDFModel(_jvmObject.Invoke("setInputCol", value)); } + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// string, the output column + public string GetOutputCol() + { + return (string)(_jvmObject.Invoke("getOutputCol")); + } + /// /// The will create a new column in the DataFrame, this is the /// name of the new column. @@ -68,6 +89,15 @@ public IDFModel SetOutputCol(string value) return WrapAsIDFModel(_jvmObject.Invoke("setOutputCol", value)); } + /// + /// Minimum of documents in which a term should appear for filtering + /// + /// int + public int GetMinDocFreq() + { + return (int)_jvmObject.Invoke("getMinDocFreq"); + } + /// /// Executes the and transforms the DataFrame to include the new /// column or columns with the tokens. 
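As a sketch of the flow these IDFModel accessors support, mirroring the IDFModelTests above, fitting and applying the model looks like this; `featurizedData` and the save path are placeholders rather than values from the patch:

// Sketch only: featurizedData is any DataFrame with a "rawFeatures" vector column.
IDFModel idfModel = idf.Fit(featurizedData);            // estimator -> fitted model
DataFrame rescaledData = idfModel.Transform(featurizedData);

// The fitted model exposes the parameters it was trained with.
int minDocFreq = idfModel.GetMinDocFreq();

// Persist the model for later use; the path is illustrative.
idfModel.Save("/tmp/idf-model");
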
@@ -101,5 +131,15 @@ public string Uid() { return (string)_jvmObject.Invoke("uid"); } + + /// + /// Saves the so that it can be loaded later using Load + /// + /// + /// + public IDFModel Save(string path) + { + return WrapAsIDFModel(_jvmObject.Invoke("save", path)); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs index 3b2d395e9..c4ee596a9 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs @@ -22,8 +22,7 @@ public class Tokenizer : IJvmObjectReferenceProvider /// public Tokenizer() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.Tokenizer"); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); } /// @@ -33,8 +32,7 @@ public Tokenizer() /// An immutable unique ID for the object and its derivatives. public Tokenizer(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.Tokenizer", uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); } internal Tokenizer(JvmObjectReference jvmObject) @@ -45,6 +43,17 @@ internal Tokenizer(JvmObjectReference jvmObject) private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + private const string JavaClassName = "org.apache.spark.ml.feature.Tokenizer"; + + /// + /// Gets the column that the should read from + /// + /// string, input column + public string GetInputCol() + { + return (string)(_jvmObject.Invoke("getInputCol")); + } + /// /// Sets the column that the should read from /// @@ -55,6 +64,16 @@ public Tokenizer SetInputCol(string value) return WrapAsTokenizer(_jvmObject.Invoke("setInputCol", value)); } + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// string, the output column + public string GetOutputCol() + { + return (string)(_jvmObject.Invoke("getOutputCol")); + } + /// /// The will create a new column in the DataFrame, this is the /// name of the new column. 
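The next hunk adds Load and Save to Tokenizer; the round trip the E2E tests rely on looks roughly like this, where the uid and the save path are illustrative:

// Sketch only: persist a configured Tokenizer and read it back.
Tokenizer tokenizer = new Tokenizer("theUid")
    .SetInputCol("input_col")
    .SetOutputCol("output_col");

tokenizer.Save("/tmp/tokenizer");
Tokenizer loadedTokenizer = Tokenizer.Load("/tmp/tokenizer");
// The uid is persisted with the metadata, so loadedTokenizer.Uid() == "theUid".
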
@@ -100,5 +119,27 @@ public string Uid() { return (string)_jvmObject.Invoke("uid"); } + + + /// + /// Loads the that was previously saved using Save + /// + /// + /// + public static Tokenizer Load(string path) + { + return WrapAsTokenizer( + SparkEnvironment.JvmBridge.CallStaticJavaMethod(JavaClassName, "load", path)); + } + + /// + /// Saves the so that it can be loaded later using Load + /// + /// + /// + public Tokenizer Save(string path) + { + return WrapAsTokenizer(_jvmObject.Invoke("save", path)); + } } } diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index d473408b1..01a6fd7ec 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -32,10 +32,7 @@ - + From d85ca33540a0ee4e8bc139b88f8c7decf0000e11 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 21:52:59 +0000 Subject: [PATCH 28/47] removing project, in spark main project --- .../Microsoft.Spark.Extensions.ML/Class1.cs | 12 ------------ .../Microsoft.Spark.Extensions.ML.csproj | 7 ------- src/csharp/Microsoft.Spark.sln | 7 ------- 3 files changed, 26 deletions(-) delete mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs delete mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs deleted file mode 100644 index 5874db8d0..000000000 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs +++ /dev/null @@ -1,12 +0,0 @@ -using System; - -namespace Microsoft.Spark.Extensions.ML -{ - public class Pipeline where T : new() - { - public T Load(string path) - { - return new T(); - } - } -} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj b/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj deleted file mode 100644 index 27560206d..000000000 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj +++ /dev/null @@ -1,7 +0,0 @@ - - - - netstandard2.0 - - - diff --git a/src/csharp/Microsoft.Spark.sln b/src/csharp/Microsoft.Spark.sln index 4b76eb777..b31c377c7 100644 --- a/src/csharp/Microsoft.Spark.sln +++ b/src/csharp/Microsoft.Spark.sln @@ -33,8 +33,6 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions. 
EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions.Delta.E2ETest", "Extensions\Microsoft.Spark.Extensions.Delta.E2ETest\Microsoft.Spark.Extensions.Delta.E2ETest.csproj", "{206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.Spark.Extensions.ML", "Extensions\Microsoft.Spark.Extensions.ML\Microsoft.Spark.Extensions.ML.csproj", "{38672397-3BC7-4818-A84A-7EE1618311CA}" -EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -85,10 +83,6 @@ Global {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}.Debug|Any CPU.Build.0 = Debug|Any CPU {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}.Release|Any CPU.ActiveCfg = Release|Any CPU {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}.Release|Any CPU.Build.0 = Release|Any CPU - {38672397-3BC7-4818-A84A-7EE1618311CA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {38672397-3BC7-4818-A84A-7EE1618311CA}.Debug|Any CPU.Build.0 = Debug|Any CPU - {38672397-3BC7-4818-A84A-7EE1618311CA}.Release|Any CPU.ActiveCfg = Release|Any CPU - {38672397-3BC7-4818-A84A-7EE1618311CA}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -98,7 +92,6 @@ Global {4E379DB3-7741-43C2-B32D-17AD96FEA7D0} = {C8C53525-4FEB-4B5B-91A2-619566C72F3E} {2048446B-45AB-4304-B230-50EDF6E8E6A4} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} - {38672397-3BC7-4818-A84A-7EE1618311CA} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {FD15FFDB-EA1B-436F-841D-3386DDF94538} From c15ad6b018322487a69dd887234dd343ae3c9d5c Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:02:26 +0000 Subject: [PATCH 29/47] merge --- .../Microsoft.Spark/Interop/Ipc/JvmBridge.cs | 4 ---- .../Interop/Ipc/PayloadHelper.cs | 4 ---- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 19 ++----------------- 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs index 0bd02aa7e..887e8304f 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs @@ -372,10 +372,6 @@ private object ReadCollection(Stream s) doubleArrayArray[itemIndex] = ReadCollection(s) as double[]; } returnValue = doubleArrayArray; -<<<<<<< HEAD - -======= ->>>>>>> 739688e1906d209f9fef9d5078a529ce3f1746ce break; case 'b': var boolArray = new bool[numOfItemsInList]; diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs index eeedf7bf4..e1771405d 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs @@ -24,11 +24,7 @@ internal class PayloadHelper private static readonly byte[] s_doubleTypeId = new[] { (byte)'d' }; private static readonly byte[] s_jvmObjectTypeId = new[] { (byte)'j' }; private static readonly byte[] s_byteArrayTypeId = new[] { (byte)'r' }; -<<<<<<< HEAD - private static readonly byte[] s_doubleArrayArrayTypeId = new[] {( byte)'A' }; -======= private static readonly byte[] s_doubleArrayArrayTypeId = new[] { ( byte)'A' }; ->>>>>>> 739688e1906d209f9fef9d5078a529ce3f1746ce private static readonly byte[] s_arrayTypeId = new[] { (byte)'l' }; private static readonly byte[] 
s_dictionaryTypeId = new[] { (byte)'e' }; private static readonly byte[] s_rowArrTypeId = new[] { (byte)'R' }; diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 70339d03a..1198e6a98 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -27,18 +27,13 @@ internal Bucketizer(JvmObjectReference jvmObject) { _jvmObject = jvmObject; } - + /// /// Create a without any parameters /// public Bucketizer() { -<<<<<<< HEAD _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); -======= - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.Bucketizer"); ->>>>>>> 739688e1906d209f9fef9d5078a529ce3f1746ce } /// @@ -48,19 +43,12 @@ public Bucketizer() /// An immutable unique ID for the object and its derivatives. public Bucketizer(string uid) { -<<<<<<< HEAD _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); } private readonly JvmObjectReference _jvmObject; private const string JavaClassName = "org.apache.spark.ml.feature.Bucketizer"; -======= - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.Bucketizer", uid); - } - - private readonly JvmObjectReference _jvmObject; ->>>>>>> 739688e1906d209f9fef9d5078a529ce3f1746ce + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// @@ -198,7 +186,6 @@ public Bucketizer SetOutputCols(List value) { return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value)); } -<<<<<<< HEAD /// /// Loads the that was previously saved using Save @@ -220,8 +207,6 @@ public Bucketizer Save(string path) { return WrapAsBucketizer(_jvmObject.Invoke("save", path)); } -======= ->>>>>>> 739688e1906d209f9fef9d5078a529ce3f1746ce /// /// Executes the and transforms the DataFrame to include the new From 5c358d1eb9ee10bba47dd25b3c405e1ae1849580 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:18:14 +0000 Subject: [PATCH 30/47] testing --- .../IpcTests/ML/Feature/BucketizerTests.cs | 2 ++ .../IpcTests/ML/Feature/IDFModelTests.cs | 12 ++++++++---- .../IpcTests/ML/Feature/TokenizerTests.cs | 6 ++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index a07bd9172..5fc52261e 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -4,6 +4,8 @@ using System; using System.Collections.Generic; +using System.IO; +using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; using Xunit; diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 97c08262a..896df31f5 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -32,7 +32,11 @@ public void TestIDFModel() DataFrame sentenceData = _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence"); - var tokenizer = new Tokenizer().SetInputCol("sentence").SetOutputCol("words"); + + var tokenizer = new Tokenizer() + .SetInputCol("sentence") + .SetOutputCol("words"); + DataFrame wordsData = 
tokenizer.Transform(sentenceData); var hashingTF = new HashingTF() @@ -50,9 +54,9 @@ public void TestIDFModel() var idfModel = idf.Fit(featurizedData); DataFrame rescaledData = idfModel.Transform(featurizedData); - - Assert.Equal(expectedInputCol, idf.GetInputCol()); - Assert.Equal(expectedOutputCol, idf.GetOutputCol()); + + Assert.Equal(expectedInputCol, idfModel.GetInputCol()); + Assert.Equal(expectedOutputCol, idfModel.GetOutputCol()); Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq()); diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index 9de3e2f8d..b34a8a2d9 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -32,15 +32,13 @@ public void TestTokenizer() DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + " from range(100)"); - var tokenizer = new Tokenizer(expectedUid); - - tokenizer + var tokenizer = new Tokenizer(expectedUid) .SetInputCol(expectedInputCol) .SetOutputCol(expectedOutputCol); DataFrame output = tokenizer.Transform(input); + Assert.Contains(output.Schema().Fields, (f => f.Name == expectedOutputCol)); - Assert.Equal(expectedInputCol, tokenizer.GetInputCol()); Assert.Equal(expectedOutputCol, tokenizer.GetOutputCol()); From 9234dba3cbc4d6a247d8e549eb4f34fbec43f3c1 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:21:02 +0000 Subject: [PATCH 31/47] formatting --- .../Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 896df31f5..55946abb4 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -28,7 +28,6 @@ public void TestIDFModel() var expectedDocFrequency = 1980; var expectedInputCol = "rawFeatures"; var expectedOutputCol = "features"; - DataFrame sentenceData = _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence"); From a524396d9822842d1f78c80da31f8634be6f3028 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:29:21 +0000 Subject: [PATCH 32/47] tidying: --- .../IpcTests/ML/Feature/IDFModelTests.cs | 1 - src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs | 1 - src/csharp/Microsoft.Spark/RDD.cs | 2 +- .../src/main/scala/org/apache/spark/api/dotnet/SerDe.scala | 4 ++-- .../src/main/scala/org/apache/spark/api/dotnet/SerDe.scala | 4 ++-- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 55946abb4..27e7b3a2d 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -56,7 +56,6 @@ public void TestIDFModel() Assert.Equal(expectedInputCol, idfModel.GetInputCol()); Assert.Equal(expectedOutputCol, idfModel.GetOutputCol()); - Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq()); using (var tempDirectory = new TemporaryDirectory()) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs index c4ee596a9..566885a0c 
100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs @@ -120,7 +120,6 @@ public string Uid() return (string)_jvmObject.Invoke("uid"); } - /// /// Loads the that was previously saved using Save /// diff --git a/src/csharp/Microsoft.Spark/RDD.cs b/src/csharp/Microsoft.Spark/RDD.cs index 556884560..7eda57c61 100644 --- a/src/csharp/Microsoft.Spark/RDD.cs +++ b/src/csharp/Microsoft.Spark/RDD.cs @@ -102,7 +102,7 @@ internal RDD( _prevSerializedMode = prevSerializedMode; } - JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// /// Persist this RDD with the default storage level (MEMORY_ONLY). diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 4a6b27a58..7a77af9b7 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -119,8 +119,8 @@ object SerDe { } def readDoubleArrArr(in: DataInputStream): Array[Array[Double]] = { - val len = readInt(in) - (0 until len).map(_ => readDoubleArr(in)).toArray + val len = readInt(in) + (0 until len).map(_ => readDoubleArr(in)).toArray } def readBooleanArr(in: DataInputStream): Array[Boolean] = { diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 4a6b27a58..7a77af9b7 100644 --- a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -119,8 +119,8 @@ object SerDe { } def readDoubleArrArr(in: DataInputStream): Array[Array[Double]] = { - val len = readInt(in) - (0 until len).map(_ => readDoubleArr(in)).toArray + val len = readInt(in) + (0 until len).map(_ => readDoubleArr(in)).toArray } def readBooleanArr(in: DataInputStream): Array[Boolean] = { From fa9c065b5709799eaa45244d6ddf0921eb95a94e Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:31:56 +0000 Subject: [PATCH 33/47] removing change --- src/csharp/Microsoft.Spark/Microsoft.Spark.csproj | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index 01a6fd7ec..3bfdd951a 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -32,7 +32,10 @@ - + From 13adf7b21529df51a7b4d2e6aa67cf2bc63b0f17 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:33:05 +0000 Subject: [PATCH 34/47] removing change --- src/csharp/Microsoft.Spark/Microsoft.Spark.csproj | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index 3bfdd951a..d473408b1 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -32,9 +32,9 @@ - From 9147c121a5b6f65761781faaa19a08670f5a658d Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:42:33 +0000 Subject: [PATCH 35/47] docs --- .../Microsoft.Spark/ML/Feature/HashingTF.cs | 24 
+++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs index b0d8fc078..f04f5314d 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs @@ -27,8 +27,7 @@ public class HashingTF : IJvmObjectReferenceProvider /// public HashingTF() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - _javaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(_javaClassName); } /// @@ -38,8 +37,7 @@ public HashingTF() /// public HashingTF(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - _javaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(_javaClassName, uid); } internal HashingTF(JvmObjectReference jvmObject) @@ -58,9 +56,8 @@ internal HashingTF(JvmObjectReference jvmObject) /// public static HashingTF Load(string path) { - return WrapAsHashingTF(SparkEnvironment.JvmBridge.CallStaticJavaMethod( - _javaClassName, - "load", path)); + return WrapAsHashingTF( + SparkEnvironment.JvmBridge.CallStaticJavaMethod(_javaClassName,"load", path)); } /// @@ -76,7 +73,7 @@ public HashingTF Save(string path) /// /// Gets the binary toggle that controls term frequency counts /// - /// + /// bool public bool GetBinary() { return (bool)_jvmObject.Invoke("getBinary"); @@ -113,8 +110,8 @@ public HashingTF SetInputCol(string value) } /// - /// The will create a new column in the DataFrame, this is the - /// name of the new column. + /// The will create a new column in the , + /// this is the name of the new column. /// /// string, the name of the output col public string GetOutputCol() @@ -123,8 +120,8 @@ public string GetOutputCol() } /// - /// The will create a new column in the DataFrame, this is the - /// name of the new column. + /// The will create a new column in the , + /// this is the name of the new column. /// /// The name of the new column /// @@ -145,6 +142,7 @@ public int GetNumFeatures() /// /// Sets the number of features that should be used /// + /// int /// public HashingTF SetNumFeatures(int value) { @@ -164,7 +162,7 @@ public string Uid() /// Executes the and transforms the DataFrame to include the new /// column or columns with the tokens. 
/// - /// The DataFrame to add the tokens to + /// The to add the tokens to /// containing the original data and the tokens public DataFrame Transform(DataFrame source) { From 90937a8dac64a1399dd4715680489f8e3fb5f730 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:50:16 +0000 Subject: [PATCH 36/47] formatting --- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 24 ++++++------- src/csharp/Microsoft.Spark/ML/Feature/IDF.cs | 32 ++++++++--------- .../Microsoft.Spark/ML/Feature/IDFModel.cs | 34 +++++++++---------- .../Microsoft.Spark/ML/Feature/Tokenizer.cs | 24 ++++++------- 4 files changed, 56 insertions(+), 58 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 1198e6a98..a43981b27 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -220,18 +220,6 @@ public DataFrame Transform(DataFrame source) return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); } - /// - /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet - /// - /// - /// The to convert into a dotnet - /// - /// - private static Bucketizer WrapAsBucketizer(object obj) - { - return new Bucketizer((JvmObjectReference)obj); - } - /// /// The uid that was used to create the . If no UID is passed in /// when creating the then a random UID is created when the @@ -264,5 +252,17 @@ public Bucketizer SetHandleInvalid(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString())); } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static Bucketizer WrapAsBucketizer(object obj) + { + return new Bucketizer((JvmObjectReference)obj); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs index 5f9e376a0..f30e15d72 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs @@ -26,8 +26,7 @@ public class IDF : IJvmObjectReferenceProvider /// public IDF() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - JavaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); } /// @@ -37,8 +36,7 @@ public IDF() /// An immutable unique ID for the object and its derivatives. public IDF(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - JavaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); } internal IDF(JvmObjectReference jvmObject) @@ -114,25 +112,13 @@ public IDF SetMinDocFreq(int value) /// /// Fits a model to the input data. /// - /// The DataFrame to fit the model to + /// The to fit the model to /// public IDFModel Fit(DataFrame source) { return new IDFModel((JvmObjectReference)_jvmObject.Invoke("fit", source)); } - /// - /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet - /// - /// - /// The to convert into a dotnet - /// - /// - private static IDF WrapAsIDF(object obj) - { - return new IDF((JvmObjectReference)obj); - } - /// /// The uid that was used to create the . 
If no UID is passed in /// when creating the then a random UID is created when the @@ -164,5 +150,17 @@ public IDF Save(string path) { return WrapAsIDF(_jvmObject.Invoke("save", path)); } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static IDF WrapAsIDF(object obj) + { + return new IDF((JvmObjectReference)obj); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs index 3953ce58c..702a45aec 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs @@ -66,8 +66,8 @@ public IDFModel SetInputCol(string value) } /// - /// The will create a new column in the DataFrame, this is the - /// name of the new column. + /// The will create a new column in the , + /// this is the name of the new column. /// /// string, the output column public string GetOutputCol() @@ -97,28 +97,16 @@ public int GetMinDocFreq() } /// - /// Executes the and transforms the DataFrame to include the new - /// column or columns with the tokens. + /// Executes the and transforms the to + /// include the new column or columns with the tokens. /// - /// The DataFrame to add the tokens to + /// The to add the tokens to /// containing the original data and the tokens public DataFrame Transform(DataFrame source) { return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); } - /// - /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet - /// - /// - /// The to convert into a dotnet - /// - /// - private static IDFModel WrapAsIDFModel(object obj) - { - return new IDFModel((JvmObjectReference)obj); - } - /// /// The uid that was used to create the . If no UID is passed in /// when creating the then a random UID is created when the @@ -139,5 +127,17 @@ public IDFModel Save(string path) { return WrapAsIDFModel(_jvmObject.Invoke("save", path)); } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static IDFModel WrapAsIDFModel(object obj) + { + return new IDFModel((JvmObjectReference)obj); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs index 566885a0c..35e86b039 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs @@ -97,18 +97,6 @@ public DataFrame Transform(DataFrame source) return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); } - /// - /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet - /// - /// - /// The to convert into a dotnet - /// - /// - private static Tokenizer WrapAsTokenizer(object obj) - { - return new Tokenizer((JvmObjectReference)obj); - } - /// /// The uid that was used to create the . 
If no UID is passed in /// when creating the then a random UID is created when the @@ -140,5 +128,17 @@ public Tokenizer Save(string path) { return WrapAsTokenizer(_jvmObject.Invoke("save", path)); } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static Tokenizer WrapAsTokenizer(object obj) + { + return new Tokenizer((JvmObjectReference)obj); + } } } From adca1d675e3f1ec5cfa59f1f9c72bec455ecce1d Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 5 Feb 2020 22:32:31 +0000 Subject: [PATCH 37/47] Apply suggestions from code review Co-Authored-By: elvaliuliuliu <47404285+elvaliuliuliu@users.noreply.github.com> --- .../IpcTests/ML/Feature/HashingTFTests.cs | 3 ++- .../IpcTests/ML/Feature/TokenizerTests.cs | 3 ++- src/csharp/Microsoft.Spark/ML/Feature/IDF.cs | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index f8aa7befa..390faaeb4 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -42,7 +42,8 @@ public void TestHashingTF() Assert.Equal(expectedInputCol, hashingTf.GetInputCol()); Assert.Equal(expectedOutputCol, hashingTf.GetOutputCol()); - DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + + DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + + " as input_col"); " as input_col"); DataFrame output = hashingTf.Transform(input); diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index b34a8a2d9..59255e149 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -29,7 +29,8 @@ public void TestTokenizer() var expectedInputCol = "input_col"; var expectedOutputCol = "output_col"; - DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + + DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + + " from range(100)"); " from range(100)"); var tokenizer = new Tokenizer(expectedUid) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs index f30e15d72..7873b085a 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs @@ -12,7 +12,7 @@ namespace Microsoft.Spark.ML.Feature { /// /// Inverse document frequency (IDF). The standard formulation is used: - /// idf = log((m + 1) / (d(t) + 1)), where m is the total number of documents and d(t) is + /// idf = log((m + 1) / (d(t) + 1)), where m is the total number of documents and d(t) is /// the number of documents that contain term t. 
/// /// This implementation supports filtering out terms which do not appear in a minimum number From 44a4bb51f487daefd1d66192afacff18274e263e Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 7 Feb 2020 20:36:41 +0000 Subject: [PATCH 38/47] adding datatype udf where sqlType is available --- .../IpcTests/ML/Feature/HashingTFTests.cs | 7 +++---- .../IpcTests/ML/Feature/TokenizerTests.cs | 1 - src/csharp/Microsoft.Spark/Microsoft.Spark.csproj | 5 +---- src/csharp/Microsoft.Spark/Sql/Types/DataType.cs | 8 ++++++++ 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 390faaeb4..aaf12af02 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -3,13 +3,11 @@ // See the LICENSE file in the project root for more information. using System; -using System.Collections.Generic; using System.IO; using System.Linq; using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; -using Microsoft.Spark.Sql.Types; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature @@ -44,11 +42,12 @@ public void TestHashingTF() DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + " as input_col"); - " as input_col"); DataFrame output = hashingTf.Transform(input); - DataFrame outputColumn = output.Select(expectedOutputCol); + DataFrame outputVector = output.Select(expectedOutputCol); + Assert.Contains(expectedOutputCol, outputVector.Columns()); + using (var tempDirectory = new TemporaryDirectory()) { var savePath = Path.Join(tempDirectory.Path, "hashingTF"); diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index 59255e149..3c99fa1e3 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -31,7 +31,6 @@ public void TestTokenizer() DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + " from range(100)"); - " from range(100)"); var tokenizer = new Tokenizer(expectedUid) .SetInputCol(expectedInputCol) diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index d473408b1..01a6fd7ec 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -32,10 +32,7 @@ - + diff --git a/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs b/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs index 83bd1770f..a82babbf5 100644 --- a/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs +++ b/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs @@ -160,6 +160,14 @@ internal static DataType ParseDataType(JToken json) } else if (typeName == "udt") { + if (typeJObject.TryGetValue("class", out JToken classToken)) + { + if (typeJObject.TryGetValue("sqlType", out JToken sqlTypeToken)) + { + return new StructType(sqlTypeToken as JObject); + } + } + throw new NotImplementedException(); } } From 15bae3eeb5c94f9b25f41d64c028cc7f5d21dc6a Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 2 Mar 2020 21:53:29 +0000 Subject: [PATCH 39/47] feedback from review --- .../IpcTests/ML/Feature/BucketizerTests.cs | 5 +++-- 
.../IpcTests/ML/Feature/HashingTFTests.cs | 11 +++++----- .../IpcTests/ML/Feature/IDFModelTests.cs | 13 +++++++----- .../IpcTests/ML/Feature/IDFTests.cs | 11 +++++----- .../IpcTests/ML/Feature/TokenizerTests.cs | 13 ++++++------ .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 21 ++++++++++--------- .../Microsoft.Spark/ML/Feature/HashingTF.cs | 21 +++++++++++++------ src/csharp/Microsoft.Spark/ML/Feature/IDF.cs | 10 ++++----- .../Microsoft.Spark/ML/Feature/IDFModel.cs | 21 ++++++++++++++----- .../Microsoft.Spark/ML/Feature/Tokenizer.cs | 11 +++++----- .../Microsoft.Spark/Sql/Types/DataType.cs | 2 +- 11 files changed, 84 insertions(+), 55 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index a8b2c1c20..10b48e634 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -52,9 +52,10 @@ public void TestBucketizer() using (var tempDirectory = new TemporaryDirectory()) { - var savePath = Path.Join(tempDirectory.Path, "bucket"); + string savePath = Path.Join(tempDirectory.Path, "bucket"); bucketizer.Save(savePath); - var loadedBucketizer = Bucketizer.Load(savePath); + + Bucketizer loadedBucketizer = Bucketizer.Load(savePath); Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid()); } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 20beb5be2..106bca3f2 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -23,9 +23,9 @@ public HashingTFTests(SparkFixture fixture) [Fact] public void TestHashingTF() { - var expectedInputCol = "input_col"; - var expectedOutputCol = "output_col"; - var expectedFeatures = 10; + string expectedInputCol = "input_col"; + string expectedOutputCol = "output_col"; + int expectedFeatures = 10; Assert.IsType(new HashingTF()); @@ -48,9 +48,10 @@ public void TestHashingTF() using (var tempDirectory = new TemporaryDirectory()) { - var savePath = Path.Join(tempDirectory.Path, "hashingTF"); + string savePath = Path.Join(tempDirectory.Path, "hashingTF"); hashingTf.Save(savePath); - var loadedHashingTf = HashingTF.Load(savePath); + + HashingTF loadedHashingTf = HashingTF.Load(savePath); Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid()); } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 8062c66c2..c695f8515 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -23,9 +23,9 @@ public IDFModelTests(SparkFixture fixture) [Fact] public void TestIDFModel() { - var expectedDocFrequency = 1980; - var expectedInputCol = "rawFeatures"; - var expectedOutputCol = "features"; + int expectedDocFrequency = 1980; + string expectedInputCol = "rawFeatures"; + string expectedOutputCol = "features"; DataFrame sentenceData = _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence"); @@ -48,7 +48,7 @@ public void TestIDFModel() .SetOutputCol(expectedOutputCol) .SetMinDocFreq(expectedDocFrequency); - var idfModel = idf.Fit(featurizedData); + IDFModel idfModel = idf.Fit(featurizedData); DataFrame 
rescaledData = idfModel.Transform(featurizedData); Assert.Contains(expectedOutputCol, rescaledData.Columns()); @@ -59,8 +59,11 @@ public void TestIDFModel() using (var tempDirectory = new TemporaryDirectory()) { - var modelPath = Path.Join(tempDirectory.Path, "ideModel"); + string modelPath = Path.Join(tempDirectory.Path, "ideModel"); idfModel.Save(modelPath); + + IDFModel loadedModel = IDFModel.Load(modelPath); + Assert.Equal(idfModel.Uid(), loadedModel.Uid()); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs index c556a37e3..944d2b24b 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs @@ -23,9 +23,9 @@ public IDFTests(SparkFixture fixture) [Fact] public void TestIDFModel() { - var expectedInputCol = "rawFeatures"; - var expectedOutputCol = "features"; - var expectedDocFrequency = 100; + string expectedInputCol = "rawFeatures"; + string expectedOutputCol = "features"; + int expectedDocFrequency = 100; var idf = new IDF() .SetInputCol(expectedInputCol) @@ -38,9 +38,10 @@ public void TestIDFModel() using (var tempDirectory = new TemporaryDirectory()) { - var savePath = Path.Join(tempDirectory.Path, "IDF"); + string savePath = Path.Join(tempDirectory.Path, "IDF"); idf.Save(savePath); - var loadedIdf = IDF.Load(savePath); + + IDF loadedIdf = IDF.Load(savePath); Assert.Equal(idf.Uid(), loadedIdf.Uid()); } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index 0b6611437..0039d9e07 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -23,9 +23,9 @@ public TokenizerTests(SparkFixture fixture) [Fact] public void TestTokenizer() { - var expectedUid = "theUid"; - var expectedInputCol = "input_col"; - var expectedOutputCol = "output_col"; + string expectedUid = "theUid"; + string expectedInputCol = "input_col"; + string expectedOutputCol = "output_col"; DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + " from range(100)"); @@ -42,10 +42,11 @@ public void TestTokenizer() using (var tempDirectory = new TemporaryDirectory()) { - var savePath = Path.Join(tempDirectory.Path, "Tokenizer"); + string savePath = Path.Join(tempDirectory.Path, "Tokenizer"); tokenizer.Save(savePath); - var loadedIdf = Tokenizer.Load(savePath); - Assert.Equal(tokenizer.Uid(), loadedIdf.Uid()); + + Tokenizer loadedTokenizer = Tokenizer.Load(savePath); + Assert.Equal(tokenizer.Uid(), loadedTokenizer.Uid()); } Assert.Equal(expectedUid, tokenizer.Uid()); diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index a43981b27..d870baf66 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -8,7 +8,6 @@ using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Sql; -using Microsoft.Spark.Sql.Types; namespace Microsoft.Spark.ML.Feature { @@ -23,17 +22,15 @@ namespace Microsoft.Spark.ML.Feature /// public class Bucketizer : IJvmObjectReferenceProvider { - internal Bucketizer(JvmObjectReference jvmObject) - { - _jvmObject = jvmObject; - } + private static readonly string s_bucketizerClassName = + 
"org.apache.spark.ml.feature.Bucketizer"; /// /// Create a without any parameters /// public Bucketizer() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_bucketizerClassName); } /// @@ -43,12 +40,16 @@ public Bucketizer() /// An immutable unique ID for the object and its derivatives. public Bucketizer(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_bucketizerClassName, uid); + } + + internal Bucketizer(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; } private readonly JvmObjectReference _jvmObject; - private const string JavaClassName = "org.apache.spark.ml.feature.Bucketizer"; - + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// @@ -195,7 +196,7 @@ public Bucketizer SetOutputCols(List value) public static Bucketizer Load(string path) { return WrapAsBucketizer( - SparkEnvironment.JvmBridge.CallStaticJavaMethod(JavaClassName,"load", path)); + SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_bucketizerClassName,"load", path)); } /// diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs index 6471ca563..9aef51934 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs @@ -21,12 +21,15 @@ namespace Microsoft.Spark.ML.Feature /// public class HashingTF : IJvmObjectReferenceProvider { + private static readonly string s_hashingTfClassName = + "org.apache.spark.ml.feature.HashingTF"; + /// /// Create a without any parameters /// public HashingTF() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(_javaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_hashingTfClassName); } /// @@ -36,7 +39,7 @@ public HashingTF() /// public HashingTF(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(_javaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_hashingTfClassName, uid); } internal HashingTF(JvmObjectReference jvmObject) @@ -45,7 +48,7 @@ internal HashingTF(JvmObjectReference jvmObject) } private readonly JvmObjectReference _jvmObject; - private const string _javaClassName = "org.apache.spark.ml.feature.HashingTF"; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// @@ -56,7 +59,7 @@ internal HashingTF(JvmObjectReference jvmObject) public static HashingTF Load(string path) { return WrapAsHashingTF( - SparkEnvironment.JvmBridge.CallStaticJavaMethod(_javaClassName,"load", path)); + SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_hashingTfClassName,"load", path)); } /// @@ -130,7 +133,10 @@ public HashingTF SetOutputCol(string value) } /// - /// Gets the number of features that should be used + /// Gets the number of features that should be used. Since a simple modulo is used to + /// transform the hash function to a column index, it is advisable to use a power of two + /// as the numFeatures parameter; otherwise the features will not be mapped evenly to the + /// columns. /// /// int public int GetNumFeatures() @@ -139,7 +145,10 @@ public int GetNumFeatures() } /// - /// Sets the number of features that should be used + /// Sets the number of features that should be used. 
Since a simple modulo is used to + /// transform the hash function to a column index, it is advisable to use a power of two as + /// the numFeatures parameter; otherwise the features will not be mapped evenly to the + /// columns. /// /// int /// diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs index b774b4257..67fec0890 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs @@ -19,12 +19,14 @@ namespace Microsoft.Spark.ML.Feature /// public class IDF : IJvmObjectReferenceProvider { + private static readonly string s_IDFClassName = "org.apache.spark.ml.feature.IDF"; + /// /// Create a without any parameters /// public IDF() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFClassName); } /// @@ -34,7 +36,7 @@ public IDF() /// An immutable unique ID for the object and its derivatives. public IDF(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFClassName, uid); } internal IDF(JvmObjectReference jvmObject) @@ -42,8 +44,6 @@ internal IDF(JvmObjectReference jvmObject) _jvmObject = jvmObject; } - private const string JavaClassName = "org.apache.spark.ml.feature.IDF"; - private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; @@ -136,7 +136,7 @@ public string Uid() public static IDF Load(string path) { return WrapAsIDF( - SparkEnvironment.JvmBridge.CallStaticJavaMethod(JavaClassName, "load", path)); + SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_IDFClassName, "load", path)); } /// diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs index aeaedd182..c222e1ada 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs @@ -14,13 +14,15 @@ namespace Microsoft.Spark.ML.Feature /// public class IDFModel : IJvmObjectReferenceProvider { - + private static readonly string s_IDFModelClassName = + "org.apache.spark.ml.feature.IDFModel"; + /// /// Create a without any parameters /// public IDFModel() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFModelClassName); } /// @@ -30,7 +32,7 @@ public IDFModel() /// An immutable unique ID for the object and its derivatives. 
public IDFModel(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFModelClassName, uid); } internal IDFModel(JvmObjectReference jvmObject) @@ -41,8 +43,6 @@ internal IDFModel(JvmObjectReference jvmObject) private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - private const string JavaClassName = "org.apache.spark.ml.feature.IDFModel"; - /// /// Gets the column that the should read from /// @@ -116,6 +116,17 @@ public string Uid() return (string)_jvmObject.Invoke("uid"); } + /// + /// Loads the that was previously saved using Save + /// + /// + /// + public static IDFModel Load(string path) + { + return WrapAsIDFModel( + SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_IDFModelClassName, "load", path)); + } + /// /// Saves the so that it can be loaded later using Load /// diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs index ab2e0ec76..904978a45 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs @@ -14,12 +14,15 @@ namespace Microsoft.Spark.ML.Feature /// public class Tokenizer : IJvmObjectReferenceProvider { + private static readonly string s_tokenizerClassName = + "org.apache.spark.ml.feature.Tokenizer"; + /// /// Create a without any parameters /// public Tokenizer() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_tokenizerClassName); } /// @@ -29,7 +32,7 @@ public Tokenizer() /// An immutable unique ID for the object and its derivatives. public Tokenizer(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_tokenizerClassName, uid); } internal Tokenizer(JvmObjectReference jvmObject) @@ -39,8 +42,6 @@ internal Tokenizer(JvmObjectReference jvmObject) private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - - private const string JavaClassName = "org.apache.spark.ml.feature.Tokenizer"; /// /// Gets the column that the should read from @@ -113,7 +114,7 @@ public string Uid() public static Tokenizer Load(string path) { return WrapAsTokenizer( - SparkEnvironment.JvmBridge.CallStaticJavaMethod(JavaClassName, "load", path)); + SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_tokenizerClassName, "load", path)); } /// diff --git a/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs b/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs index a82babbf5..20698cace 100644 --- a/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs +++ b/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs @@ -164,7 +164,7 @@ internal static DataType ParseDataType(JToken json) { if (typeJObject.TryGetValue("sqlType", out JToken sqlTypeToken)) { - return new StructType(sqlTypeToken as JObject); + return new StructType((JObject)sqlTypeToken); } } From 64066a5074e3d2b30936235dc31f6cfd116da7a3 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 2 Mar 2020 22:54:20 +0000 Subject: [PATCH 40/47] fixes from feedback --- .../IpcTests/ML/Feature/HashingTFTests.cs | 8 ++++++++ src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 3 ++- src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs | 3 ++- src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs | 3 ++- 
From 64066a5074e3d2b30936235dc31f6cfd116da7a3 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Mon, 2 Mar 2020 22:54:20 +0000
Subject: [PATCH 40/47] fixes from feedback

---
 .../IpcTests/ML/Feature/HashingTFTests.cs           |  8 ++++++++
 src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs |  3 ++-
 src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs   |  3 ++-
 src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs  |  3 ++-
 .../Microsoft.Spark/Sql/Types/ComplexTypes.cs       | 13 ++++++++++++-
 5 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
index 106bca3f2..63bc54bdf 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
@@ -2,7 +2,10 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.

+using System;
+using System.Collections.Generic;
 using System.IO;
+using System.Linq;
 using Microsoft.Spark.E2ETest.Utils;
 using Microsoft.Spark.ML.Feature;
 using Microsoft.Spark.Sql;
@@ -57,6 +60,11 @@ public void TestHashingTF()

             hashingTf.SetBinary(true);
             Assert.True(hashingTf.GetBinary());
+
+            IEnumerable<Row> vectors = outputVector.Collect();
+            Row row = vectors.First();
+            Assert.Equal(1.0, ((row.Values[0] as Row).Values[3] as object[])[1]);
+
         }
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
index d870baf66..32abf176b 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
@@ -196,7 +196,8 @@ public Bucketizer SetOutputCols(List<string> value)
         public static Bucketizer Load(string path)
         {
             return WrapAsBucketizer(
-                SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_bucketizerClassName,"load", path));
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
+                    s_bucketizerClassName,"load", path));
         }

         /// <summary>
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
index c222e1ada..0b2a1e802 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
@@ -124,7 +124,8 @@ public string Uid()
         public static IDFModel Load(string path)
         {
             return WrapAsIDFModel(
-                SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_IDFModelClassName, "load", path));
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
+                    s_IDFModelClassName, "load", path));
         }

         /// <summary>
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
index 904978a45..b69712227 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
@@ -114,7 +114,7 @@ public string Uid()
         public static Tokenizer Load(string path)
         {
             return WrapAsTokenizer(
-                SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_tokenizerClassName, "load", path));
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
+                    s_tokenizerClassName, "load", path));
         }

         /// <summary>
diff --git a/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs b/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
index 2b65ea6d1..909266133 100644
--- a/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
+++ b/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
@@ -3,7 +3,9 @@
 // See the LICENSE file in the project root for more information.

 using System;
+using System.Collections;
 using System.Collections.Generic;
+using System.Diagnostics;
 using System.Linq;
 using Microsoft.Spark.Interop.Ipc;
 using Newtonsoft.Json.Linq;
@@ -71,7 +73,16 @@ private DataType FromJson(JObject json)

         internal override bool NeedConversion() => true;

-        internal override object FromInternal(object obj) => throw new NotImplementedException();
+        internal override object FromInternal(object obj)
+        {
+            switch (obj)
+            {
+                case ArrayList objArrayList:
+                    return objArrayList.ToArray();
+            }
+
+            throw new NotImplementedException();
+        }
     }

From 37cf616918733239683c975cb2d5efb85c137f9c Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Tue, 3 Mar 2020 20:50:29 +0000
Subject: [PATCH 41/47] reverting fix for ArrayType

---
 .../IpcTests/ML/Feature/HashingTFTests.cs            |  5 -----
 src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs | 11 +----------
 2 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
index 63bc54bdf..a6d9952da 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
@@ -60,11 +60,6 @@ public void TestHashingTF()

             hashingTf.SetBinary(true);
             Assert.True(hashingTf.GetBinary());
-
-            IEnumerable<Row> vectors = outputVector.Collect();
-            Row row = vectors.First();
-            Assert.Equal(1.0, ((row.Values[0] as Row).Values[3] as object[])[1]);
-
         }
     }
 }
diff --git a/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs b/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
index 909266133..c99b141b9 100644
--- a/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
+++ b/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
@@ -73,16 +73,7 @@ private DataType FromJson(JObject json)

         internal override bool NeedConversion() => true;

-        internal override object FromInternal(object obj)
-        {
-            switch (obj)
-            {
-                case ArrayList objArrayList:
-                    return objArrayList.ToArray();
-            }
-
-            throw new NotImplementedException();
-        }
+        internal override object FromInternal(object obj) => throw new NotImplementedException();
     }
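Patch 41 backs the ArrayType.FromInternal change out again, leaving the NotImplementedException in place until array conversion is handled properly. For reference, the shape of the conversion that patch 40 attempted, restated as a standalone hedged sketch rather than anything that ships in this series:

    using System;
    using System.Collections;

    internal static class ArrayConversionSketch
    {
        // Collected array columns arrive from the JVM side as ArrayList;
        // the attempted fix surfaced them to callers as object[].
        internal static object FromInternal(object obj)
        {
            switch (obj)
            {
                case ArrayList objArrayList:
                    return objArrayList.ToArray(); // object[] copy of the elements
            }

            throw new NotImplementedException();
        }
    }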
From cd07e5682ed962f4d424fde3a296de04a57fda1d Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Tue, 3 Mar 2020 21:08:02 +0000
Subject: [PATCH 42/47] params comments

---
 src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 4 ++--
 src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs  | 4 ++--
 src/csharp/Microsoft.Spark/ML/Feature/IDF.cs        | 9 ++++-----
 src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs   | 4 ++--
 src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs  | 7 +++----
 5 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
index 32abf176b..9f68546be 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
@@ -191,7 +191,7 @@ public Bucketizer SetOutputCols(List<string> value)
         /// <summary>
         /// Loads the <see cref="Bucketizer"/> that was previously saved using Save
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path the previous <see cref="Bucketizer"/> was saved to</param>
         /// <returns></returns>
         public static Bucketizer Load(string path)
         {
@@ -203,7 +203,7 @@ public static Bucketizer Load(string path)
         /// <summary>
         /// Saves the <see cref="Bucketizer"/> so that it can be loaded later using Load
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path to save the <see cref="Bucketizer"/> to</param>
         /// <returns></returns>
         public Bucketizer Save(string path)
         {
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
index 9aef51934..77eb1b2e7 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
@@ -54,7 +54,7 @@ internal HashingTF(JvmObjectReference jvmObject)
         /// <summary>
         /// Loads the <see cref="HashingTF"/> that was previously saved using Save
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path the previous <see cref="HashingTF"/> was saved to</param>
         /// <returns></returns>
         public static HashingTF Load(string path)
         {
@@ -65,7 +65,7 @@ public static HashingTF Load(string path)
         /// <summary>
         /// Saves the <see cref="HashingTF"/> so that it can be loaded later using Load
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path to save the <see cref="HashingTF"/> to</param>
         /// <returns></returns>
         public HashingTF Save(string path)
         {
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
index 67fec0890..fe92b1e23 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
@@ -80,8 +80,7 @@ public string GetOutputCol()
         /// The <see cref="IDF"/> will create a new column in the DataFrame, this is the
         /// name of the new column.
         /// </summary>
-        /// <param name="value">The name of the new column
-        /// </param>
+        /// <param name="value">The name of the new column</param>
         /// <returns></returns>
         public IDF SetOutputCol(string value)
         {
@@ -100,7 +99,7 @@ public int GetMinDocFreq()
         /// <summary>
         /// Minimum of documents in which a term should appear for filtering
         /// </summary>
-        /// <param name="value"></param>
+        /// <param name="value">int, the minimum of documents a term should appear in</param>
         /// <returns></returns>
         public IDF SetMinDocFreq(int value)
         {
@@ -131,7 +130,7 @@ public string Uid()
         /// <summary>
         /// Loads the <see cref="IDF"/> that was previously saved using Save
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path the previous <see cref="IDF"/> was saved to</param>
         /// <returns></returns>
         public static IDF Load(string path)
         {
@@ -142,7 +141,7 @@ public static IDF Load(string path)
         /// <summary>
         /// Saves the <see cref="IDF"/> so that it can be loaded later using Load
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path to save the <see cref="IDF"/> to</param>
         /// <returns></returns>
         public IDF Save(string path)
         {
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
index 0b2a1e802..20cc6a886 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
@@ -119,7 +119,7 @@ public string Uid()
         /// <summary>
         /// Loads the <see cref="IDFModel"/> that was previously saved using Save
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path the previous <see cref="IDFModel"/> was saved to</param>
         /// <returns></returns>
         public static IDFModel Load(string path)
         {
@@ -131,7 +131,7 @@ public static IDFModel Load(string path)
         /// <summary>
         /// Saves the <see cref="IDFModel"/> so that it can be loaded later using Load
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path to save the <see cref="IDFModel"/> to</param>
         /// <returns></returns>
         public IDFModel Save(string path)
         {
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
index b69712227..cfd605f33 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
@@ -76,8 +76,7 @@ public string GetOutputCol()
         /// The <see cref="Tokenizer"/> will create a new column in the DataFrame, this is the
         /// name of the new column.
         /// </summary>
-        /// <param name="value">The name of the new column
-        /// </param>
+        /// <param name="value">The name of the new column</param>
         /// <returns></returns>
         public Tokenizer SetOutputCol(string value)
         {
@@ -109,7 +108,7 @@ public string Uid()
         /// <summary>
         /// Loads the <see cref="Tokenizer"/> that was previously saved using Save
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path the previous <see cref="Tokenizer"/> was saved to</param>
         /// <returns></returns>
         public static Tokenizer Load(string path)
         {
@@ -121,7 +120,7 @@ public static Tokenizer Load(string path)
         /// <summary>
         /// Saves the <see cref="Tokenizer"/> so that it can be loaded later using Load
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path to save the <see cref="Tokenizer"/> to</param>
         /// <returns></returns>
         public Tokenizer Save(string path)
         {
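With the param text filled in, the documented surface reads naturally as a call chain. A hedged usage sketch of the single-column Bucketizer API (the column names and the input DataFrame are invented for illustration; the split values mirror those used in BucketizerTests):

    // Bucket a numeric column into four ranges; each row's bucket index
    // is written to the "bucket" output column.
    Bucketizer bucketizer = new Bucketizer()
        .SetSplits(new[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue })
        .SetInputCol("raw_value")       // hypothetical input column
        .SetOutputCol("bucket");
    DataFrame bucketed = bucketizer.Transform(input);   // input: assumed DataFrame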
From 573fc1a38aea7e20e1686b8b9ade0af0d5c5066a Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Mon, 23 Mar 2020 21:41:59 +0000
Subject: [PATCH 43/47] formatting and comments from feedback

---
 .../IpcTests/ML/Feature/BucketizerTests.cs    |  2 +-
 .../IpcTests/ML/Feature/HashingTFTests.cs     |  2 +-
 .../IpcTests/ML/Feature/IDFModelTests.cs      |  6 +--
 .../Microsoft.Spark/ML/Feature/Bucketizer.cs  | 37 +++++++-----------
 .../Microsoft.Spark/ML/Feature/HashingTF.cs   | 38 ++++++++-----------
 src/csharp/Microsoft.Spark/ML/Feature/IDF.cs  | 31 ++++++---------
 .../Microsoft.Spark/ML/Feature/IDFModel.cs    | 28 +++++---------
 .../Microsoft.Spark/ML/Feature/Tokenizer.cs   | 29 ++++++--------
 .../Microsoft.Spark/Microsoft.Spark.csproj    |  5 ++-
 .../Microsoft.Spark/Sql/Types/ComplexTypes.cs |  2 -
 10 files changed, 70 insertions(+), 110 deletions(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs
index 10b48e634..11037bc6d 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs
@@ -63,7 +63,7 @@ public void TestBucketizer()
         [Fact]
         public void TestBucketizer_MultipleColumns()
         {
-            var expectedSplitsArray = new[]
+            var expectedSplitsArray = new double[][]
             {
                 new[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue},
                 new[] { double.MinValue, 0.0, 10000.0, double.MaxValue}
             };
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
index a6d9952da..7b6882bea 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
@@ -32,7 +32,7 @@ public void TestHashingTF()

             Assert.IsType<HashingTF>(new HashingTF());

-            var hashingTf = new HashingTF("my-unique-id")
+            HashingTF hashingTf = new HashingTF("my-unique-id")
                 .SetNumFeatures(expectedFeatures)
                 .SetInputCol(expectedInputCol)
                 .SetOutputCol(expectedOutputCol);
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
index c695f8515..314030ca7 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
@@ -30,20 +30,20 @@ public void TestIDFModel()
             DataFrame sentenceData =
                 _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

-            var tokenizer = new Tokenizer()
+            Tokenizer tokenizer = new Tokenizer()
                 .SetInputCol("sentence")
                 .SetOutputCol("words");

             DataFrame wordsData = tokenizer.Transform(sentenceData);

-            var hashingTF = new HashingTF()
+            HashingTF hashingTF = new HashingTF()
                 .SetInputCol("words")
                 .SetOutputCol(expectedInputCol)
                 .SetNumFeatures(20);

             DataFrame featurizedData = hashingTF.Transform(wordsData);

-            var idf = new IDF()
+            IDF idf = new IDF()
                 .SetInputCol(expectedInputCol)
                 .SetOutputCol(expectedOutputCol)
                 .SetMinDocFreq(expectedDocFrequency);
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
index 9f68546be..02561b0a1 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
@@ -24,6 +24,8 @@ public class Bucketizer : IJvmObjectReferenceProvider
     {
         private static readonly string s_bucketizerClassName =
             "org.apache.spark.ml.feature.Bucketizer";
+
+        private readonly JvmObjectReference _jvmObject;

         /// <summary>
         /// Create a <see cref="Bucketizer"/> without any parameters
@@ -47,9 +49,7 @@ internal Bucketizer(JvmObjectReference jvmObject)
         {
             _jvmObject = jvmObject;
         }
-
-        private readonly JvmObjectReference _jvmObject;
-
+
         JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

         /// <summary>
@@ -71,7 +71,7 @@ public double[] GetSplits()
         /// bucket, which also includes y. The splits should be of length >= 3 and strictly
         /// increasing. Values outside the splits specified will be treated as errors.
         /// </param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetSplits(double[] value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value));
@@ -96,7 +96,7 @@ public double[][] GetSplitsArray()
         /// by splits x,y holds values in the range [x,y) except the last bucket, which also
         /// includes y. The splits should be of length >= 3 and strictly increasing.
         /// Values outside the splits specified will be treated as errors.</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetSplitsArray(double[][] value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", (object)value));
@@ -117,7 +117,7 @@ public string GetInputCol()
         /// buckets
         /// </summary>
         /// <param name="value">The name of the column to as the source of the buckets</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetInputCol(string value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value));
@@ -141,7 +141,7 @@ public IEnumerable<string> GetInputCols()
         /// sets of buckets and two output columns.
         /// </summary>
         /// <param name="value">List of input columns to use as sources for buckets</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetInputCols(IEnumerable<string> value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value));
@@ -162,7 +162,7 @@ public string GetOutputCol()
         /// name of the new column.
         /// </summary>
         /// <param name="value">The name of the new column which contains the bucket ID</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetOutputCol(string value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value));
@@ -182,7 +182,7 @@ public IEnumerable<string> GetOutputCols()
         /// The list of columns that the <see cref="Bucketizer"/> will create in the DataFrame.
         /// </summary>
         /// <param name="value">List of column names which will contain the bucket ID</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetOutputCols(List<string> value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value));
@@ -192,7 +192,7 @@ public Bucketizer SetOutputCols(List<string> value)
         /// Loads the <see cref="Bucketizer"/> that was previously saved using Save
         /// </summary>
         /// <param name="path">The path the previous <see cref="Bucketizer"/> was saved to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public static Bucketizer Load(string path)
         {
             return WrapAsBucketizer(
@@ -204,7 +204,7 @@ public static Bucketizer Load(string path)
         /// Saves the <see cref="Bucketizer"/> so that it can be loaded later using Load
         /// </summary>
         /// <param name="path">The path to save the <see cref="Bucketizer"/> to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer Save(string path)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("save", path));
@@ -249,22 +249,13 @@ public string GetHandleInvalid()
         /// Choices are "skip", "error" or "keep". Default is "error"
         /// </summary>
         /// <param name="value">"skip", "error" or "keep"</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetHandleInvalid(string value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString()));
         }

-        /// <summary>
-        /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet
-        /// <see cref="Bucketizer"/>
-        /// </summary>
-        /// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
-        /// <see cref="Bucketizer"/></param>
-        /// <returns></returns>
-        private static Bucketizer WrapAsBucketizer(object obj)
-        {
-            return new Bucketizer((JvmObjectReference)obj);
-        }
+        private static Bucketizer WrapAsBucketizer(object obj)
+            => new Bucketizer((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
index 77eb1b2e7..5fa774e00 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
@@ -24,6 +24,8 @@ public class HashingTF : IJvmObjectReferenceProvider
         private static readonly string s_hashingTfClassName =
             "org.apache.spark.ml.feature.HashingTF";

+        private readonly JvmObjectReference _jvmObject;
+
         /// <summary>
         /// Create a <see cref="HashingTF"/> without any parameters
         /// </summary>
@@ -47,26 +49,25 @@ internal HashingTF(JvmObjectReference jvmObject)
             _jvmObject = jvmObject;
         }

-        private readonly JvmObjectReference _jvmObject;
-
         JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

         /// <summary>
         /// Loads the <see cref="HashingTF"/> that was previously saved using Save
         /// </summary>
         /// <param name="path">The path the previous <see cref="HashingTF"/> was saved to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
         public static HashingTF Load(string path)
         {
             return WrapAsHashingTF(
-                SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_hashingTfClassName,"load", path));
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
+                    s_hashingTfClassName, "load", path));
         }

         /// <summary>
         /// Saves the <see cref="HashingTF"/> so that it can be loaded later using Load
         /// </summary>
         /// <param name="path">The path to save the <see cref="HashingTF"/> to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
         public HashingTF Save(string path)
         {
             return WrapAsHashingTF(_jvmObject.Invoke("save", path));
@@ -75,7 +76,7 @@ public HashingTF Save(string path)
         /// <summary>
         /// Gets the binary toggle that controls term frequency counts
         /// </summary>
-        /// <returns>bool</returns>
+        /// <returns>bool showing term frequency counts</returns>
         public bool GetBinary()
         {
             return (bool)_jvmObject.Invoke("getBinary");
@@ -105,7 +106,7 @@ public string GetInputCol()
         /// Sets the column that the <see cref="HashingTF"/> should read from
         /// </summary>
         /// <param name="value">The name of the column to as the source</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
         public HashingTF SetInputCol(string value)
         {
             return WrapAsHashingTF(_jvmObject.Invoke("setInputCol", value));
@@ -126,7 +127,7 @@ public string GetOutputCol()
         /// this is the name of the new column.
         /// </summary>
         /// <param name="value">The name of the new column</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
         public HashingTF SetOutputCol(string value)
         {
             return WrapAsHashingTF(_jvmObject.Invoke("setOutputCol", value));
@@ -138,7 +139,7 @@ public HashingTF SetOutputCol(string value)
         /// as the numFeatures parameter; otherwise the features will not be mapped evenly to the
         /// columns.
         /// </summary>
-        /// <returns>int</returns>
+        /// <returns>int, the number of features to be used</returns>
         public int GetNumFeatures()
         {
             return (int)_jvmObject.Invoke("getNumFeatures");
@@ -151,7 +152,7 @@ public int GetNumFeatures()
         /// columns.
         /// </summary>
         /// <param name="value">int</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
         public HashingTF SetNumFeatures(int value)
         {
             return WrapAsHashingTF(_jvmObject.Invoke("setNumFeatures", value));
@@ -160,7 +161,7 @@ public HashingTF SetNumFeatures(int value)
         /// <summary>
         /// An immutable unique ID for the object and its derivatives.
         /// </summary>
-        /// <returns>string</returns>
+        /// <returns>string, unique ID for the object</returns>
         public string Uid()
         {
             return (string)_jvmObject.Invoke("uid");
@@ -176,17 +177,8 @@ public DataFrame Transform(DataFrame source)
         {
             return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
         }
-
-        /// <summary>
-        /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet
-        /// <see cref="HashingTF"/>
-        /// </summary>
-        /// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
-        /// <see cref="HashingTF"/></param>
-        /// <returns></returns>
-        private static HashingTF WrapAsHashingTF(object obj)
-        {
-            return new HashingTF((JvmObjectReference)obj);
-        }
+
+        private static HashingTF WrapAsHashingTF(object obj)
+            => new HashingTF((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
index fe92b1e23..5c2259aaf 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
@@ -21,6 +21,8 @@ public class IDF : IJvmObjectReferenceProvider
     {
         private static readonly string s_IDFClassName = "org.apache.spark.ml.feature.IDF";

+        private readonly JvmObjectReference _jvmObject;
+
         /// <summary>
         /// Create a <see cref="IDF"/> without any parameters
         /// </summary>
@@ -43,8 +45,7 @@ internal IDF(JvmObjectReference jvmObject)
         {
             _jvmObject = jvmObject;
         }
-
-        private readonly JvmObjectReference _jvmObject;
+
         JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

         /// <summary>
@@ -60,7 +61,7 @@ public string GetInputCol()
         /// Sets the column that the <see cref="IDF"/> should read from
         /// </summary>
         /// <param name="value">The name of the column to as the source</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDF"/> object</returns>
         public IDF SetInputCol(string value)
         {
             return WrapAsIDF(_jvmObject.Invoke("setInputCol", value));
@@ -81,7 +82,7 @@ public string GetOutputCol()
         /// name of the new column.
         /// </summary>
         /// <param name="value">The name of the new column</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDF"/> object</returns>
         public IDF SetOutputCol(string value)
         {
             return WrapAsIDF(_jvmObject.Invoke("setOutputCol", value));
@@ -90,7 +91,7 @@ public IDF SetOutputCol(string value)
         /// <summary>
         /// Minimum of documents in which a term should appear for filtering
         /// </summary>
-        /// <returns>int</returns>
+        /// <returns>int, minimum number of documents in which a term should appear</returns>
         public int GetMinDocFreq()
         {
             return (int)_jvmObject.Invoke("getMinDocFreq");
@@ -100,7 +101,7 @@ public int GetMinDocFreq()
         /// Minimum of documents in which a term should appear for filtering
         /// </summary>
         /// <param name="value">int, the minimum of documents a term should appear in</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDF"/> object</returns>
         public IDF SetMinDocFreq(int value)
         {
             return WrapAsIDF(_jvmObject.Invoke("setMinDocFreq", value));
@@ -110,7 +111,7 @@ public IDF SetMinDocFreq(int value)
         /// Fits a model to the input data.
         /// </summary>
         /// <param name="source">The <see cref="DataFrame"/> to fit the model to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDFModel"/> object</returns>
         public IDFModel Fit(DataFrame source)
         {
             return new IDFModel((JvmObjectReference)_jvmObject.Invoke("fit", source));
@@ -131,7 +132,7 @@ public string Uid()
         /// Loads the <see cref="IDF"/> that was previously saved using Save
         /// </summary>
         /// <param name="path">The path the previous <see cref="IDF"/> was saved to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDF"/> object, loaded from path</returns>
         public static IDF Load(string path)
         {
             return WrapAsIDF(
@@ -142,22 +143,12 @@ public static IDF Load(string path)
         /// Saves the <see cref="IDF"/> so that it can be loaded later using Load
         /// </summary>
         /// <param name="path">The path to save the <see cref="IDF"/> to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDF"/> object</returns>
         public IDF Save(string path)
         {
             return WrapAsIDF(_jvmObject.Invoke("save", path));
         }

-        /// <summary>
-        /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet
-        /// <see cref="IDF"/>
-        /// </summary>
-        /// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
-        /// <see cref="IDF"/></param>
-        /// <returns></returns>
-        private static IDF WrapAsIDF(object obj)
-        {
-            return new IDF((JvmObjectReference)obj);
-        }
+        private static IDF WrapAsIDF(object obj) => new IDF((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
index 20cc6a886..16bccb50a 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
@@ -16,6 +16,8 @@ public class IDFModel : IJvmObjectReferenceProvider
     {
         private static readonly string s_IDFModelClassName =
             "org.apache.spark.ml.feature.IDFModel";
+
+        private readonly JvmObjectReference _jvmObject;

         /// <summary>
         /// Create a <see cref="IDFModel"/> without any parameters
@@ -39,8 +41,7 @@ internal IDFModel(JvmObjectReference jvmObject)
         {
             _jvmObject = jvmObject;
         }
-
-        private readonly JvmObjectReference _jvmObject;
+
         JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

         /// <summary>
@@ -57,7 +58,7 @@ public string GetInputCol()
         /// buckets
         /// </summary>
         /// <param name="value">The name of the column to as the source</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDFModel"/> object</returns>
         public IDFModel SetInputCol(string value)
         {
             return WrapAsIDFModel(_jvmObject.Invoke("setInputCol", value));
@@ -79,7 +80,7 @@ public string GetOutputCol()
         /// </summary>
         /// <param name="value">The name of the new column which contains the tokens
         /// </param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDFModel"/> object</returns>
         public IDFModel SetOutputCol(string value)
         {
             return WrapAsIDFModel(_jvmObject.Invoke("setOutputCol", value));
@@ -88,7 +89,7 @@ public IDFModel SetOutputCol(string value)
         /// <summary>
         /// Minimum of documents in which a term should appear for filtering
         /// </summary>
-        /// <returns>int</returns>
+        /// <returns>int, minimum number of documents a term should appear</returns>
         public int GetMinDocFreq()
         {
             return (int)_jvmObject.Invoke("getMinDocFreq");
@@ -120,7 +121,7 @@ public string Uid()
         /// Loads the <see cref="IDFModel"/> that was previously saved using Save
         /// </summary>
         /// <param name="path">The path the previous <see cref="IDFModel"/> was saved to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDFModel"/> object, loaded from path</returns>
         public static IDFModel Load(string path)
         {
             return WrapAsIDFModel(
@@ -132,7 +133,7 @@ public static IDFModel Load(string path)
         /// Saves the <see cref="IDFModel"/> so that it can be loaded later using Load
         /// </summary>
         /// <param name="path">The path to save the <see cref="IDFModel"/> to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDFModel"/> object</returns>
         public IDFModel Save(string path)
         {
             return WrapAsIDFModel(_jvmObject.Invoke("save", path));
         }

-        /// <summary>
-        /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet
-        /// <see cref="IDFModel"/>
-        /// </summary>
-        /// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
-        /// <see cref="IDFModel"/></param>
-        /// <returns></returns>
-        private static IDFModel WrapAsIDFModel(object obj)
-        {
-            return new IDFModel((JvmObjectReference)obj);
-        }
+        private static IDFModel WrapAsIDFModel(object obj)
+            => new IDFModel((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
index cfd605f33..4a323cb93 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
@@ -17,6 +17,8 @@ public class Tokenizer : IJvmObjectReferenceProvider
         private static readonly string s_tokenizerClassName =
             "org.apache.spark.ml.feature.Tokenizer";

+        private readonly JvmObjectReference _jvmObject;
+
         /// <summary>
         /// Create a <see cref="Tokenizer"/> without any parameters
         /// </summary>
@@ -39,8 +41,7 @@ internal Tokenizer(JvmObjectReference jvmObject)
         {
             _jvmObject = jvmObject;
         }
-
-        private readonly JvmObjectReference _jvmObject;
+
         JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

         /// <summary>
@@ -56,7 +57,7 @@ public string GetInputCol()
         /// Sets the column that the <see cref="Tokenizer"/> should read from
         /// </summary>
         /// <param name="value">The name of the column to as the source</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Tokenizer"/> object</returns>
         public Tokenizer SetInputCol(string value)
         {
             return WrapAsTokenizer(_jvmObject.Invoke("setInputCol", value));
@@ -77,7 +78,7 @@ public string GetOutputCol()
         /// name of the new column.
         /// </summary>
         /// <param name="value">The name of the new column</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Tokenizer"/> object</returns>
         public Tokenizer SetOutputCol(string value)
         {
             return WrapAsTokenizer(_jvmObject.Invoke("setOutputCol", value));
@@ -88,7 +89,8 @@ public Tokenizer SetOutputCol(string value)
         /// column
         /// </summary>
         /// <param name="source">The DataFrame to transform</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="DataFrame"/> object with the source
+        /// transformed</returns>
         public DataFrame Transform(DataFrame source)
         {
             return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
@@ -109,7 +111,7 @@ public string Uid()
         /// Loads the <see cref="Tokenizer"/> that was previously saved using Save
         /// </summary>
         /// <param name="path">The path the previous <see cref="Tokenizer"/> was saved to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Tokenizer"/> object, loaded from path</returns>
         public static Tokenizer Load(string path)
         {
             return WrapAsTokenizer(
@@ -121,7 +123,7 @@ public static Tokenizer Load(string path)
         /// Saves the <see cref="Tokenizer"/> so that it can be loaded later using Load
         /// </summary>
         /// <param name="path">The path to save the <see cref="Tokenizer"/> to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Tokenizer"/> object</returns>
         public Tokenizer Save(string path)
         {
             return WrapAsTokenizer(_jvmObject.Invoke("save", path));
         }

-        /// <summary>
-        /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet
-        /// <see cref="Tokenizer"/>
-        /// </summary>
-        /// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
-        /// <see cref="Tokenizer"/></param>
-        /// <returns></returns>
-        private static Tokenizer WrapAsTokenizer(object obj)
-        {
-            return new Tokenizer((JvmObjectReference)obj);
-        }
+        private static Tokenizer WrapAsTokenizer(object obj)
+            => new Tokenizer((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
index 35488668d..6520c9505 100644
--- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
+++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
@@ -33,7 +33,10 @@
-
+
diff --git a/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs b/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
index c99b141b9..2b65ea6d1 100644
--- a/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
+++ b/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
@@ -3,9 +3,7 @@
 // See the LICENSE file in the project root for more information.

 using System;
-using System.Collections;
 using System.Collections.Generic;
-using System.Diagnostics;
 using System.Linq;
 using Microsoft.Spark.Interop.Ipc;
 using Newtonsoft.Json.Linq;
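The refactored IDFModelTests above doubles as the canonical usage pattern: Tokenizer feeds HashingTF, and IDF.Fit turns the hashed counts into an IDFModel. Restated outside the test harness as a hedged sketch; only _spark is assumed, and the final IDFModel.Transform call is an assumption that the model follows the same Transform convention as the other wrappers:

    DataFrame sentenceData =
        _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

    // Split the sentence column into a words column.
    Tokenizer tokenizer = new Tokenizer()
        .SetInputCol("sentence")
        .SetOutputCol("words");
    DataFrame wordsData = tokenizer.Transform(sentenceData);

    // Hash the words into a fixed-width term-frequency vector.
    HashingTF hashingTF = new HashingTF()
        .SetInputCol("words")
        .SetOutputCol("rawFeatures")
        .SetNumFeatures(20);
    DataFrame featurizedData = hashingTF.Transform(wordsData);

    // Fit the IDF weighting and apply it to the hashed features.
    IDF idf = new IDF()
        .SetInputCol("rawFeatures")
        .SetOutputCol("features");
    IDFModel idfModel = idf.Fit(featurizedData);              // training pass
    DataFrame rescaled = idfModel.Transform(featurizedData);  // assumed API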
From 4cd86e38e14922ad39b5834a1ed047d89f345345 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Mon, 23 Mar 2020 21:50:56 +0000
Subject: [PATCH 44/47] typo ideModel and not idfModel

---
 .../IpcTests/ML/Feature/IDFModelTests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
index 314030ca7..623b7322c 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
@@ -59,7 +59,7 @@ public void TestIDFModel()

             using (var tempDirectory = new TemporaryDirectory())
             {
-                string modelPath = Path.Join(tempDirectory.Path, "ideModel");
+                string modelPath = Path.Join(tempDirectory.Path, "idfModel");
                 idfModel.Save(modelPath);

                 IDFModel loadedModel = IDFModel.Load(modelPath);

From 57729ee275730b52998b0064e100146548a95834 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Mon, 23 Mar 2020 21:51:45 +0000
Subject: [PATCH 45/47] can't use var here

---
 .../Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
index 944d2b24b..3dea63de7 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
@@ -27,7 +27,7 @@ public void TestIDFModel()
             string expectedOutputCol = "features";
             int expectedDocFrequency = 100;

-            var idf = new IDF()
+            IDF idf = new IDF()
                 .SetInputCol(expectedInputCol)
                 .SetOutputCol(expectedOutputCol)
                 .SetMinDocFreq(expectedDocFrequency);

From da7660eb6725b96b8c820206b0ed486a79203a64 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Mon, 23 Mar 2020 21:53:13 +0000
Subject: [PATCH 46/47] can't use var here

---
 .../IpcTests/ML/Feature/TokenizerTests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs
index 0039d9e07..8cdb4e03a 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs
@@ -30,7 +30,7 @@ public void TestTokenizer()
             DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" +
                 " from range(100)");

-            var tokenizer = new Tokenizer(expectedUid)
+            Tokenizer tokenizer = new Tokenizer(expectedUid)
                 .SetInputCol(expectedInputCol)
                 .SetOutputCol(expectedOutputCol);
From 22ff5e53f22560bdf9face72d219b6db36a360bb Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Wed, 25 Mar 2020 07:13:20 +0000
Subject: [PATCH 47/47] formatting from feedback

---
 src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 9 +++++----
 src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs  | 8 ++++----
 src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs   | 6 +++---
 src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs  | 9 +++++----
 4 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
index 02561b0a1..823f13c1a 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
@@ -215,8 +215,9 @@ public Bucketizer Save(string path)
         /// column or columns with the bucketed data.
         /// </summary>
         /// <param name="source">The DataFrame to add the bucketed data to</param>
-        /// <returns><see cref="DataFrame"/> containing the original data and the new bucketed
-        /// columns</returns>
+        /// <returns>
+        /// <see cref="DataFrame"/> containing the original data and the new bucketed columns
+        /// </returns>
         public DataFrame Transform(DataFrame source)
         {
             return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
@@ -255,7 +256,7 @@ public Bucketizer SetHandleInvalid(string value)
             return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString()));
         }

-        private static Bucketizer WrapAsBucketizer(object obj)
-            => new Bucketizer((JvmObjectReference)obj);
+        private static Bucketizer WrapAsBucketizer(object obj) =>
+            new Bucketizer((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
index 5fa774e00..50b4fe04a 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
@@ -76,7 +76,7 @@ public HashingTF Save(string path)
         /// <summary>
         /// Gets the binary toggle that controls term frequency counts
         /// </summary>
-        /// <returns>bool showing term frequency counts</returns>
+        /// <returns>Flag showing whether the binary toggle is on or off</returns>
         public bool GetBinary()
         {
             return (bool)_jvmObject.Invoke("getBinary");
@@ -139,7 +139,7 @@ public HashingTF SetOutputCol(string value)
         /// as the numFeatures parameter; otherwise the features will not be mapped evenly to the
         /// columns.
         /// </summary>
-        /// <returns>int, the number of features to be used</returns>
+        /// <returns>The number of features to be used</returns>
         public int GetNumFeatures()
         {
             return (int)_jvmObject.Invoke("getNumFeatures");
@@ -178,7 +178,7 @@ public DataFrame Transform(DataFrame source)
             return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
         }

-        private static HashingTF WrapAsHashingTF(object obj)
-            => new HashingTF((JvmObjectReference)obj);
+        private static HashingTF WrapAsHashingTF(object obj) =>
+            new HashingTF((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
index 16bccb50a..4fc8a4f30 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
@@ -89,7 +89,7 @@ public IDFModel SetOutputCol(string value)
         /// <summary>
         /// Minimum of documents in which a term should appear for filtering
         /// </summary>
-        /// <returns>int, minimum number of documents a term should appear</returns>
+        /// <returns>Minimum number of documents a term should appear</returns>
         public int GetMinDocFreq()
         {
             return (int)_jvmObject.Invoke("getMinDocFreq");
@@ -139,7 +139,7 @@ public IDFModel Save(string path)
             return WrapAsIDFModel(_jvmObject.Invoke("save", path));
         }

-        private static IDFModel WrapAsIDFModel(object obj)
-            => new IDFModel((JvmObjectReference)obj);
+        private static IDFModel WrapAsIDFModel(object obj) =>
+            new IDFModel((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
index 4a323cb93..c411309dc 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
@@ -89,8 +89,9 @@ public Tokenizer SetOutputCol(string value)
         /// column
         /// </summary>
         /// <param name="source">The DataFrame to transform</param>
-        /// <returns>New <see cref="DataFrame"/> object with the source
-        /// transformed</returns>
+        /// <returns>
+        /// New <see cref="DataFrame"/> object with the source transformed
+        /// </returns>
         public DataFrame Transform(DataFrame source)
         {
             return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
@@ -129,7 +130,7 @@ public Tokenizer Save(string path)
             return WrapAsTokenizer(_jvmObject.Invoke("save", path));
         }

-        private static Tokenizer WrapAsTokenizer(object obj)
-            => new Tokenizer((JvmObjectReference)obj);
+        private static Tokenizer WrapAsTokenizer(object obj) =>
+            new Tokenizer((JvmObjectReference)obj);
     }
 }
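The multi-column path exercised by TestBucketizer_MultipleColumns rounds out the API: one Bucketizer can bucket several columns in a single Transform, each against its own splits. A hedged sketch to close; the DataFrame input and its two numeric columns are assumed, while the splits mirror the test's expectedSplitsArray:

    // Two columns bucketed in one pass, each with its own split points.
    Bucketizer bucketizer = new Bucketizer()
        .SetSplitsArray(new double[][]
        {
            new[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue },
            new[] { double.MinValue, 0.0, 10000.0, double.MaxValue }
        })
        .SetInputCols(new List<string> { "values_a", "values_b" })   // assumed columns
        .SetOutputCols(new List<string> { "bucket_a", "bucket_b" });
    DataFrame bucketed = bucketizer.Transform(input);                // input: assumed DataFrame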