From 45404bb0b3f7a784370c1a85bb598ab5fd15924e Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 16 Dec 2019 23:25:30 +0000 Subject: [PATCH 01/47] bare bones bucketizer --- .../Microsoft.Spark.Extensions.ML/Class1.cs | 12 +++ .../Microsoft.Spark.Extensions.ML.csproj | 7 ++ src/csharp/Microsoft.Spark.sln | 7 ++ .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 96 +++++++++++++++++++ .../Microsoft.Spark/Microsoft.Spark.csproj | 5 +- src/csharp/Microsoft.Spark/RDD.cs | 2 +- 6 files changed, 124 insertions(+), 5 deletions(-) create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs create mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs new file mode 100644 index 000000000..5874db8d0 --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs @@ -0,0 +1,12 @@ +using System; + +namespace Microsoft.Spark.Extensions.ML +{ + public class Pipeline<T> where T : new() + { + public T Load(string path) + { + return new T(); + } + } +} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj b/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj new file mode 100644 index 000000000..27560206d --- /dev/null +++ b/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj @@ -0,0 +1,7 @@ +<Project Sdk="Microsoft.NET.Sdk"> + + <PropertyGroup> + <TargetFramework>netstandard2.0</TargetFramework> + </PropertyGroup> + +</Project> diff --git a/src/csharp/Microsoft.Spark.sln b/src/csharp/Microsoft.Spark.sln index b31c377c7..4b76eb777 100644 --- a/src/csharp/Microsoft.Spark.sln +++ b/src/csharp/Microsoft.Spark.sln @@ -33,6 +33,8 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions. 
EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions.Delta.E2ETest", "Extensions\Microsoft.Spark.Extensions.Delta.E2ETest\Microsoft.Spark.Extensions.Delta.E2ETest.csproj", "{206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.Spark.Extensions.ML", "Extensions\Microsoft.Spark.Extensions.ML\Microsoft.Spark.Extensions.ML.csproj", "{38672397-3BC7-4818-A84A-7EE1618311CA}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -83,6 +85,10 @@ Global {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}.Debug|Any CPU.Build.0 = Debug|Any CPU {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}.Release|Any CPU.ActiveCfg = Release|Any CPU {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}.Release|Any CPU.Build.0 = Release|Any CPU + {38672397-3BC7-4818-A84A-7EE1618311CA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {38672397-3BC7-4818-A84A-7EE1618311CA}.Debug|Any CPU.Build.0 = Debug|Any CPU + {38672397-3BC7-4818-A84A-7EE1618311CA}.Release|Any CPU.ActiveCfg = Release|Any CPU + {38672397-3BC7-4818-A84A-7EE1618311CA}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -92,6 +98,7 @@ Global {4E379DB3-7741-43C2-B32D-17AD96FEA7D0} = {C8C53525-4FEB-4B5B-91A2-619566C72F3E} {2048446B-45AB-4304-B230-50EDF6E8E6A4} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} + {38672397-3BC7-4818-A84A-7EE1618311CA} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {FD15FFDB-EA1B-436F-841D-3386DDF94538} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs new file mode 100644 index 000000000..28a2c768a --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -0,0 +1,96 @@ +using System; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; +using Microsoft.Spark.Sql.Types; + +namespace Microsoft.Spark.ML.Feature +{ + public class Bucketizer : IJvmObjectReferenceProvider + { + + internal Bucketizer(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + public Bucketizer() + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor("org.apache.spark.ml.feature.Bucketizer"); + } + + public Bucketizer(string uid) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor("org.apache.spark.ml.feature.Bucketizer", uid); + } + + public static Bucketizer Load(string path) + { + return + WrapAsBucketizer( + SparkEnvironment.JvmBridge.CallStaticJavaMethod("org.apache.spark.ml.feature.Bucketizer", "load", + path)); + } + + public void Save(string path) + { + _jvmObject.Invoke("save", path); + } + + private readonly JvmObjectReference _jvmObject = null; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + public Bucketizer SetSplits(double[] value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value)); + } + + public double[] GetSplits() + { + return (double[])_jvmObject.Invoke("getSplits"); + } + + public string GetInputCol() + { + return (string)_jvmObject.Invoke("getInputCol"); + } + + public Bucketizer SetInputCol(string value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value)); + } + + public Bucketizer SetOutputCol(string value) + { + return 
WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value)); + } + + public DataFrame Transform(DataFrame source) + { + return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); + } + private static Bucketizer WrapAsBucketizer(object obj) + { + return new Bucketizer((JvmObjectReference)obj); + } + + public string Uid() + { + return (string)_jvmObject.Invoke("uid"); + } + + public string GetHandleInvalid() + { + return (string)_jvmObject.Invoke("getHandleInvalid"); + } + + public Bucketizer SetHandleInvalid(string value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value)); + } + + public StructType TransformSchema(StructType schema) + { + return (StructType)_jvmObject.Invoke("transformSchema", schema); + } + } +} diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index 297e3eb41..778862e3c 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -32,10 +32,7 @@ - + diff --git a/src/csharp/Microsoft.Spark/RDD.cs b/src/csharp/Microsoft.Spark/RDD.cs index 7eda57c61..556884560 100644 --- a/src/csharp/Microsoft.Spark/RDD.cs +++ b/src/csharp/Microsoft.Spark/RDD.cs @@ -102,7 +102,7 @@ internal RDD( _prevSerializedMode = prevSerializedMode; } - JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// /// Persist this RDD with the default storage level (MEMORY_ONLY). From 95d0014b6971ec267939e8ae6de8a7f43cb3fa4f Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sun, 29 Dec 2019 12:55:56 +0000 Subject: [PATCH 02/47] implement bucketizer --- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 206 ++++++++++++++++++ .../ML/Param/DoubleArrayArrayParam.cs | 38 ++++ 2 files changed, 244 insertions(+) create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs new file mode 100644 index 000000000..260febd5e --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -0,0 +1,206 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.ML.Param; +using Microsoft.Spark.Sql; +using Microsoft.Spark.Sql.Types; + +namespace Microsoft.Spark.ML.Feature +{ + /// + /// `Bucketizer` maps a column of continuous features to a column of feature buckets. + /// + /// `Bucketizer` can map multiple columns at once by setting the `inputCols` parameter. Note + /// that when both the `inputCol` and `inputCols` parameters are set, an Exception will be + /// thrown. The `splits` parameter is only used for single column usage, and `splitsArray` is + /// for multiple columns. 
+ /// + public class Bucketizer : IJvmObjectReferenceProvider + { + private readonly JvmObjectReference _jvmObject = null; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + internal Bucketizer(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + /// + /// Create a `Bucketizer` without any parameters + /// + public Bucketizer() + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.Bucketizer"); + } + + /// + /// Create a `Bucketizer` with a UID that is used to give the `Bucketizer` a unique ID + /// + /// An immutable unique ID for the object and its derivatives. + public Bucketizer(string uid) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.Bucketizer", uid); + } + + /// + /// Split points for splitting a single column into buckets. To split multiple columns use + /// `SetSplitsArray`. You cannot use both `SetSplits` and `SetSplitsArray` at the same time. + /// + /// + /// Split points for mapping continuous features into buckets. With n+1 splits, there are n + /// buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last + /// bucket, which also includes y. The splits should be of length >= 3 and strictly + /// increasing. Values outside the splits specified will be treated as errors. + /// + /// `Bucketizer` + public Bucketizer SetSplits(double[] value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value)); + } + + /// + /// Split points for splitting multiple columns into buckets. To split a single column use + /// `SetSplits`. You cannot use both `SetSplits` and `SetSplitsArray` at the same time. + /// + /// + /// The array of split points for mapping continuous features into buckets for multiple + /// columns. For each input column, with n+1 splits, there are n buckets. A bucket defined + /// by splits x,y holds values in the range [x,y) except the last bucket, which also + /// includes y. The splits should be of length >= 3 and strictly increasing. + /// Values outside the splits specified will be treated as errors. + /// `Bucketizer` + public Bucketizer SetSplitsArray(double[][] value) + { + DoubleArrayArrayParam doubleArrayArray = new DoubleArrayArrayParam(_jvmObject, + "setSplitsArray", + "wrapper for double[][] from csharp", value); + + return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", + doubleArrayArray.ReferenceValue)); + } + + /// + /// Sets the column that the `Bucketizer` should read from and convert into buckets + /// + /// The name of the column to as the source of the buckets + /// `Bucketizer` + public Bucketizer SetInputCol(string value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value)); + } + + /// + /// Sets the columns that `Bucketizer` should read from and convert into buckets. + /// + /// Each column is one set of buckets so if you have two input columns you can have two + /// sets of buckets and two output columns. + /// + /// List of input columns to use as sources for buckets + /// `Bucketizer` + public Bucketizer SetInputCols(List<string> value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value)); + } + + /// + /// The `Bucketizer` will create a new column in the DataFrame, this is the name of the + /// new column. 
+ /// + /// The name of the new column which contains the bucket ID + /// `Bucketizer` + public Bucketizer SetOutputCol(string value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value)); + } + + /// + /// The list of columns that the `Bucketizer` will create in the DataFrame. + /// + /// List of column names which will contain the bucket ID + /// `Bucketizer` + public Bucketizer SetOutputCols(List<string> value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value)); + } + + /// + /// Executes the `Bucketizer` and transforms the DataFrame to include the new column or + /// columns with the bucketed data. + /// + /// The DataFrame to add the bucketed data to + /// `DataFrame` containing the original data and the new bucketed columns + public DataFrame Transform(DataFrame source) + { + return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform" + , source)); + } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// `Bucketizer` + /// + /// The `JvmObjectReference` to convert into a dotnet `Bucketizer` + /// `Bucketizer` + private static Bucketizer WrapAsBucketizer(object obj) + { + return new Bucketizer((JvmObjectReference)obj); + } + + /// + /// The uid that was used to create the `Bucketizer`. If no `UID` is passed in when creating + /// the `Bucketizer` then a random `UID` is created when the `Bucketizer` is created. + /// + /// string `UID` identifying the `Bucketizer` + public string Uid() + { + return (string)_jvmObject.Invoke("uid"); + } + + /// + /// How should the `Bucketizer` handle invalid data, choices are "skip", "error" or "keep" + /// + /// `BucketizerInvalidOptions` + public BucketizerInvalidOptions GetHandleInvalid() + { + string handleInvalid = (string)_jvmObject.Invoke("getHandleInvalid"); + if (BucketizerInvalidOptions.TryParse(handleInvalid, true, + out BucketizerInvalidOptions result)) + { + return result; + } + + return result; + } + + /// + /// Tells the `Bucketizer` what to do with invalid data. + /// + /// Choices are "skip", "error" or "keep". Default is "error" + /// + /// `BucketizerInvalidOptions`, "skip", "error" or "keep" + /// `Bucketizer` + public Bucketizer SetHandleInvalid(BucketizerInvalidOptions value) + { + return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString())); + } + + /// + /// dotnet version of the options that can be passed to the `Bucketizer` to tell it how to + /// handle invalid data. + /// + public enum BucketizerInvalidOptions + { + unknown, + skip, + error, + keep + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs b/src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs new file mode 100644 index 000000000..7afe243c5 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs @@ -0,0 +1,38 @@ +using System; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Newtonsoft.Json; + +namespace Microsoft.Spark.ML.Param +{ + /// + /// Internal class used to help the `Bucketizer` pass a double[][] into the JVM. 
+ /// + class DoubleArrayArrayParam : IJvmObjectReferenceProvider + { + private readonly JvmObjectReference _jvmObject; + + public DoubleArrayArrayParam(object parent, string name, string doc, double[][] param) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.param.DoubleArrayArrayParam", + parent, name, doc); + + string json = JsonConvert.SerializeObject(param); + ReferenceValue = jsonDecode(json); + } + + private JvmObjectReference jsonDecode(string json) + { + return (JvmObjectReference)_jvmObject.Invoke("jsonDecode", json); + } + public JvmObjectReference Reference => _jvmObject; + + /// + /// This is the JVM version of the double[][] so that it can be used by the `Bucketizer`. To + /// get the double[][] across the SerDe this serializes it as JSON and uses jsonDecode on the + /// JVM side to get a double[][]. ReferenceValue is the double[][]. + /// + public JvmObjectReference ReferenceValue { get; } + } +} From fb2d0190c33fad85dcd716bc9ee50440e46e0b69 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sun, 29 Dec 2019 13:21:49 +0000 Subject: [PATCH 03/47] first tests --- .../IpcTests/ML/Feature/BucketizerTests.cs | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs new file mode 100644 index 000000000..bd6ff9231 --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -0,0 +1,39 @@ +using System; +using System.Linq; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class BucketizerTests + { + private readonly SparkSession _spark; + + public BucketizerTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestBucketizer() + { + Bucketizer bucketizer = new Bucketizer("uid") + .SetInputCol("input_col") + .SetOutputCol("output_col") + .SetHandleInvalid(Bucketizer.BucketizerInvalidOptions.skip) + .SetSplits(new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}); + + Assert.Equal(Bucketizer.BucketizerInvalidOptions.skip, + bucketizer.GetHandleInvalid()); + + Assert.Equal("uid", bucketizer.Uid()); + + DataFrame input = _spark.Sql("SELECT ID as input_col from range(100)"); + + DataFrame output = bucketizer.Transform(input); + Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); + } + } +} From d759e60110ba0a59bdb1e7ecc3c1b4a9c86cc857 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sun, 29 Dec 2019 13:27:51 +0000 Subject: [PATCH 04/47] multi column tests --- .../IpcTests/ML/Feature/BucketizerTests.cs | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index bd6ff9231..002a9812f 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.Linq; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; @@ -35,5 +36,28 @@ public void TestBucketizer() DataFrame output = bucketizer.Transform(input); Assert.Contains(output.Schema().Fields, (f => 
f.Name == "output_col")); } + + [Fact] + public void TestBucketizer_MultipleColumns() + { + Bucketizer bucketizer = new Bucketizer() + .SetInputCols(new List<string>(){"input_col_a", "input_col_b"}) + .SetOutputCols(new List<string>(){"output_col_a", "output_col_b"}) + .SetHandleInvalid(Bucketizer.BucketizerInvalidOptions.keep) + .SetSplitsArray(new []{ + new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}, + new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue} + }); + + Assert.Equal(Bucketizer.BucketizerInvalidOptions.keep, + bucketizer.GetHandleInvalid()); + + DataFrame input = + _spark.Sql("SELECT ID as input_col_a, ID as input_col_b from range(100)"); + + DataFrame output = bucketizer.Transform(input); + Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col_a")); + Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col_b")); + } } } From 97ef66865fde34f7ff32d619113f06bc478a2bf9 Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 8 Jan 2020 21:40:36 +0000 Subject: [PATCH 05/47] Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs Co-Authored-By: Steve Suh --- src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 260febd5e..766bd64a5 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -31,7 +31,7 @@ internal Bucketizer(JvmObjectReference jvmObject) } /// - /// Create a `Bucketizer` without any parameters + /// Create a without any parameters /// public Bucketizer() { From 45439742ad039efeee768465cd8d030438c7512f Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 8 Jan 2020 21:40:43 +0000 Subject: [PATCH 06/47] Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs Co-Authored-By: Steve Suh --- src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 766bd64a5..8d4882364 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -104,7 +104,7 @@ public Bucketizer SetInputCol(string value) /// /// List of input columns to use as sources for buckets /// `Bucketizer` - public Bucketizer SetInputCols(List<string> value) + public Bucketizer SetInputCols(IEnumerable<string> value) { return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value)); } From fd18cf425bd8e873a4436e0aa21f6d191c1587ef Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 8 Jan 2020 21:41:00 +0000 Subject: [PATCH 07/47] Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs Co-Authored-By: Steve Suh --- src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 8d4882364..a169cc5e7 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -186,7 +186,7 @@ /// /// `BucketizerInvalidOptions`, "skip", "error" or "keep" /// `Bucketizer` - public Bucketizer SetHandleInvalid(BucketizerInvalidOptions value) + public Bucketizer SetHandleInvalid(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString())); 
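At this point in the series the single-column API is usable end to end: construction, SetInputCol, SetOutputCol, SetSplits, SetHandleInvalid (a plain string after patch 07) and Transform. The following is a minimal usage sketch, not part of the patch set; it assumes an existing SparkSession, and the column names and split values are illustrative:

using Microsoft.Spark.ML.Feature;
using Microsoft.Spark.Sql;

internal static class SingleColumnBucketizerSketch
{
    // Hedged sketch: buckets a numeric column into the four buckets bounded
    // by the five split points below.
    public static void Run(SparkSession spark)
    {
        DataFrame input = spark.Sql("SELECT ID AS input_col FROM range(100)");

        Bucketizer bucketizer = new Bucketizer()
            .SetInputCol("input_col")
            .SetOutputCol("output_col")
            .SetHandleInvalid("keep") // "skip", "error" or "keep" for invalid values
            .SetSplits(new[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue });

        // output gains an output_col holding the bucket index (0-3) for each row.
        DataFrame output = bucketizer.Transform(input);
        output.Show();
    }
}

Each Set* call returns a new Bucketizer wrapping the JvmObjectReference that comes back over the bridge, which is why the calls chain.
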
From 64551c93636e7178d7a85535c26d8d254f804c74 Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 8 Jan 2020 21:41:20 +0000 Subject: [PATCH 08/47] Update src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs Co-Authored-By: Steve Suh --- src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index a169cc5e7..f6b05305e 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -167,7 +167,7 @@ public string Uid() /// How should the `Bucketizer` handle invalid data, choices are "skip", "error" or "keep" /// /// `BucketizerInvalidOptions` - public BucketizerInvalidOptions GetHandleInvalid() + public string GetHandleInvalid() { string handleInvalid = (string)_jvmObject.Invoke("getHandleInvalid"); if (BucketizerInvalidOptions.TryParse(handleInvalid, true, From fb70f403d8af4ae1a659247c039d042e8eb97b56 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 8 Jan 2020 21:41:54 +0000 Subject: [PATCH 09/47] tidying --- .../IpcTests/ML/Feature/BucketizerTests.cs | 5 ++++- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 16 +++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 002a9812f..72da9cbc5 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -1,6 +1,9 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + using System; using System.Collections.Generic; -using System.Linq; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; using Xunit; diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 260febd5e..22d62128c 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -22,14 +22,14 @@ namespace Microsoft.Spark.ML.Feature /// public class Bucketizer : IJvmObjectReferenceProvider { - private readonly JvmObjectReference _jvmObject = null; - JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - internal Bucketizer(JvmObjectReference jvmObject) { _jvmObject = jvmObject; } - + + private readonly JvmObjectReference _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + /// /// Create a `Bucketizer` without any parameters /// @@ -51,7 +51,7 @@ public Bucketizer(string uid) /// /// Split points for splitting a single column into buckets. To split multiple columns use - /// `SetSplitsArray`. You cannot use both `SetSplits` and `SetSplitsArray` at the same time. + /// `SetSplitsArray`. You cannot use both `SetSplits` and `SetSplitsArray` at the same time /// /// /// Split points for mapping continuous features into buckets. With n+1 splits, there are n @@ -135,7 +135,8 @@ public Bucketizer SetOutputCols(List value) /// columns with the bucketed data. 
/// /// The DataFrame to add the bucketed data to - /// `DataFrame` containing the original data and the new bucketed columns + /// `DataFrame` containing the original data and the new bucketed + /// columns public DataFrame Transform(DataFrame source) { return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform" @@ -146,7 +147,8 @@ public DataFrame Transform(DataFrame source) /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet /// `Bucketizer` /// - /// The `JvmObjectReference` to convert into a dotnet `Bucketizer` + /// The `JvmObjectReference` to convert into a dotnet + /// `Bucketizer` /// `Bucketizer` private static Bucketizer WrapAsBucketizer(object obj) { From 9891847c940a3d9777d53aea25499e30eb281597 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 8 Jan 2020 22:17:19 +0000 Subject: [PATCH 10/47] changes after review --- .../IpcTests/ML/Feature/BucketizerTests.cs | 25 ++-- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 110 ++++++++---------- 2 files changed, 61 insertions(+), 74 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 72da9cbc5..9ec077d23 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -11,7 +11,7 @@ namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature { [Collection("Spark E2E Tests")] - public class BucketizerTests + public class BucketizerTests { private readonly SparkSession _spark; @@ -19,43 +19,44 @@ public BucketizerTests(SparkFixture fixture) { _spark = fixture.Spark; } - + [Fact] public void TestBucketizer() { Bucketizer bucketizer = new Bucketizer("uid") .SetInputCol("input_col") .SetOutputCol("output_col") - .SetHandleInvalid(Bucketizer.BucketizerInvalidOptions.skip) + .SetHandleInvalid("skip") .SetSplits(new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}); - Assert.Equal(Bucketizer.BucketizerInvalidOptions.skip, + Assert.Equal("skip", bucketizer.GetHandleInvalid()); Assert.Equal("uid", bucketizer.Uid()); - DataFrame input = _spark.Sql("SELECT ID as input_col from range(100)"); DataFrame output = bucketizer.Transform(input); Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); } - + [Fact] public void TestBucketizer_MultipleColumns() { Bucketizer bucketizer = new Bucketizer() - .SetInputCols(new List<string>(){"input_col_a", "input_col_b"}) - .SetOutputCols(new List<string>(){"output_col_a", "output_col_b"}) - .SetHandleInvalid(Bucketizer.BucketizerInvalidOptions.keep) - .SetSplitsArray(new []{ + .SetInputCols(new List<string>() {"input_col_a", "input_col_b"}) + .SetOutputCols(new List<string>() {"output_col_a", "output_col_b"}) + .SetHandleInvalid("keep") + .SetSplitsArray(new[] + { new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}, new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue} }); - Assert.Equal(Bucketizer.BucketizerInvalidOptions.keep, + Assert.Equal("keep", bucketizer.GetHandleInvalid()); - DataFrame input = + DataFrame input = _spark.Sql("SELECT ID as input_col_a, ID as input_col_b from range(100)"); DataFrame output = bucketizer.Transform(input); diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 25465a0b1..a90582584 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -13,12 +13,13 @@ namespace 
Microsoft.Spark.ML.Feature { /// - /// `Bucketizer` maps a column of continuous features to a column of feature buckets. + /// maps a column of continuous features to a column of feature + /// buckets. /// - /// `Bucketizer` can map multiple columns at once by setting the `inputCols` parameter. Note - /// that when both the `inputCol` and `inputCols` parameters are set, an Exception will be - /// thrown. The `splits` parameter is only used for single column usage, and `splitsArray` is - /// for multiple columns. + /// can map multiple columns at once by setting the inputCols + /// parameter. Note that when both the inputCol and inputCols parameters are set, an Exception + /// will be thrown. The splits parameter is only used for single column usage, and splitsArray + /// is for multiple columns. /// public class Bucketizer : IJvmObjectReferenceProvider { @@ -26,10 +27,10 @@ internal Bucketizer(JvmObjectReference jvmObject) { _jvmObject = jvmObject; } - + private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - + /// /// Create a without any parameters /// @@ -40,7 +41,8 @@ public Bucketizer() } /// - /// Create a `Bucketizer` with a UID that is used to give the `Bucketizer` a unique ID + /// Create a with a UID that is used to give the + /// a unique ID /// /// An immutable unique ID for the object and its derivatives. public Bucketizer(string uid) @@ -48,10 +50,10 @@ public Bucketizer(string uid) _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( "org.apache.spark.ml.feature.Bucketizer", uid); } - + /// /// Split points for splitting a single column into buckets. To split multiple columns use - /// `SetSplitsArray`. You cannot use both `SetSplits` and `SetSplitsArray` at the same time + /// SetSplitsArray. You cannot use both SetSplits and SetSplitsArray at the same time /// /// /// Split points for mapping continuous features into buckets. With n+1 splits, there are n @@ -59,7 +61,7 @@ public Bucketizer(string uid) /// bucket, which also includes y. The splits should be of length >= 3 and strictly /// increasing. Values outside the splits specified will be treated as errors. /// - /// `Bucketizer` + /// public Bucketizer SetSplits(double[] value) { return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value)); @@ -67,7 +69,7 @@ public Bucketizer SetSplits(double[] value) /// /// Split points for splitting multiple columns into buckets. To split a single column use - /// `SetSplits`. You cannot use both `SetSplits` and `SetSplitsArray` at the same time. + /// SetSplits. You cannot use both SetSplits and SetSplitsArray at the same time. /// /// /// The array of split points for mapping continuous features into buckets for multiple /// columns. For each input column, with n+1 splits, there are n buckets. A bucket defined /// by splits x,y holds values in the range [x,y) except the last bucket, which also /// includes y. The splits should be of length >= 3 and strictly increasing. /// Values outside the splits specified will be treated as errors. 
- /// `Bucketizer` + /// public Bucketizer SetSplitsArray(double[][] value) { DoubleArrayArrayParam doubleArrayArray = new DoubleArrayArrayParam(_jvmObject, "setSplitsArray", @@ -87,122 +89,106 @@ public Bucketizer SetSplitsArray(double[][] value) } /// - /// Sets the column that the `Bucketizer` should read from and convert into buckets + /// Sets the column that the should read from and convert into + /// buckets /// /// The name of the column to as the source of the buckets - /// `Bucketizer` + /// public Bucketizer SetInputCol(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value)); } /// - /// Sets the columns that `Bucketizer` should read from and convert into buckets. + /// Sets the columns that should read from and convert into + /// buckets. /// /// Each column is one set of buckets so if you have two input columns you can have two /// sets of buckets and two output columns. /// /// List of input columns to use as sources for buckets - /// `Bucketizer` + /// public Bucketizer SetInputCols(IEnumerable<string> value) { return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value)); } /// - /// The `Bucketizer` will create a new column in the DataFrame, this is the name of the - /// new column. + /// The will create a new column in the DataFrame, this is the + /// name of the new column. /// /// The name of the new column which contains the bucket ID - /// `Bucketizer` + /// public Bucketizer SetOutputCol(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value)); } - + /// - /// The list of columns that the `Bucketizer` will create in the DataFrame. + /// The list of columns that the will create in the DataFrame. /// /// List of column names which will contain the bucket ID - /// `Bucketizer` + /// public Bucketizer SetOutputCols(List<string> value) { return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value)); } - + /// - /// Executes the `Bucketizer` and transforms the DataFrame to include the new column or - /// columns with the bucketed data. + /// Executes the and transforms the DataFrame to include the new + /// column or columns with the bucketed data. /// /// The DataFrame to add the bucketed data to - /// `DataFrame` containing the original data and the new bucketed - /// columns + /// containing the original data and the new bucketed + /// columns public DataFrame Transform(DataFrame source) { - return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform" - , source)); + return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); } /// /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet - /// `Bucketizer` + /// /// - /// The `JvmObjectReference` to convert into a dotnet - /// `Bucketizer` - /// `Bucketizer` + /// The to convert into a dotnet + /// + /// private static Bucketizer WrapAsBucketizer(object obj) { return new Bucketizer((JvmObjectReference)obj); } /// - /// The uid that was used to create the `Bucketizer`. If no `UID` is passed in when creating - /// the `Bucketizer` then a random `UID` is created when the `Bucketizer` is created. + /// The uid that was used to create the . If no UID is passed in + /// when creating the then a random UID is created when the + /// is created. 
/// - /// string `UID` identifying the `Bucketizer` + /// string UID identifying the public string Uid() { return (string)_jvmObject.Invoke("uid"); } /// - /// How should the `Bucketizer` handle invalid data, choices are "skip", "error" or "keep" + /// How should the handle invalid data, choices are "skip", + /// "error" or "keep" /// - /// `BucketizerInvalidOptions` + /// string showing the way Spark will handle invalid data public string GetHandleInvalid() { - string handleInvalid = (string)_jvmObject.Invoke("getHandleInvalid"); - if (BucketizerInvalidOptions.TryParse(handleInvalid, true, - out BucketizerInvalidOptions result)) - { - return result; - } - - return result; + return (string)_jvmObject.Invoke("getHandleInvalid"); } /// - /// Tells the `Bucketizer` what to do with invalid data. + /// Tells the what to do with invalid data. /// /// Choices are "skip", "error" or "keep". Default is "error" /// - /// `BucketizerInvalidOptions`, "skip", "error" or "keep" - /// `Bucketizer` + /// "skip", "error" or "keep" + /// public Bucketizer SetHandleInvalid(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString())); } - - /// - /// dotnet version of the options that can be passed to the `Bucketizer` to tell it how to - /// handle invalid data. - /// - public enum BucketizerInvalidOptions - { - unknown, - skip, - error, - keep - } } } From e2ce7369e71110d25c5f7c466363e3356bd1c55d Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Thu, 9 Jan 2020 22:41:40 +0000 Subject: [PATCH 11/47] TF-IDF --- .../IpcTests/ML/Feature/HashingTFTests.cs | 40 ++++++ .../IpcTests/ML/Feature/IDFModelTests.cs | 41 ++++++ .../IpcTests/ML/Feature/IDFTests.cs | 40 ++++++ .../IpcTests/ML/Feature/TokenizerTests.cs | 39 ++++++ .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 1 - .../Microsoft.Spark/ML/Feature/HashingTF.cs | 113 +++++++++++++++++ src/csharp/Microsoft.Spark/ML/Feature/IDF.cs | 118 ++++++++++++++++++ .../Microsoft.Spark/ML/Feature/IDFModel.cs | 105 ++++++++++++++++ .../Microsoft.Spark/ML/Feature/Tokenizer.cs | 104 +++++++++++++++ 9 files changed, 600 insertions(+), 1 deletion(-) create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/IDF.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs new file mode 100644 index 000000000..37de01c40 --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -0,0 +1,40 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class HashingTFTests + { + private readonly SparkSession _spark; + + public HashingTFTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestHashingTF() + { + HashingTF HashingTF = new HashingTF("uid") + .SetNumFeatures(10) + .SetInputCol("input_col") + .SetOutputCol("output_col"); + + Assert.Equal("uid", HashingTF.Uid()); + + DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + + " as input_col"); + + DataFrame output = HashingTF.Transform(input); + Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); + } + } +} diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs new file mode 100644 index 000000000..3c88f5872 --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -0,0 +1,41 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class IDFModelTests + { + private readonly SparkSession _spark; + + public IDFModelTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestIDFModel() + { + IDF idf = new IDF("uid") + .SetMinDocFreq(2) + .SetInputCol("input_col") + .SetOutputCol("output_col"); + + Assert.Equal("uid", idf.Uid()); + + DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + + " as input_col"); + + IDFModel model = idf.Fit(input); + model.Transform(input); + + } + } +} diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs new file mode 100644 index 000000000..c901e813a --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs @@ -0,0 +1,40 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.Collections.Generic; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class IDFTests + { + private readonly SparkSession _spark; + + public IDFTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestIDF() + { + IDF idf = new IDF("uid") + .SetMinDocFreq(2) + .SetInputCol("input_col") + .SetOutputCol("output_col"); + + Assert.Equal("uid", idf.Uid()); + + DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + + " as input_col"); + + IDFModel model = idf.Fit(input); + + } + } +} diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs new file mode 100644 index 000000000..19eb9216f --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -0,0 +1,39 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class TokenizerTests + { + private readonly SparkSession _spark; + + public TokenizerTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestTokenizer() + { + Tokenizer Tokenizer = new Tokenizer("uid") + .SetInputCol("input_col") + .SetOutputCol("output_col"); + + Assert.Equal("uid", Tokenizer.Uid()); + + DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + + " from range(100)"); + + DataFrame output = Tokenizer.Transform(input); + Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index a90582584..18afccc87 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -8,7 +8,6 @@ using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.ML.Param; using Microsoft.Spark.Sql; -using Microsoft.Spark.Sql.Types; namespace Microsoft.Spark.ML.Feature { diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs new file mode 100644 index 000000000..6356ea53b --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs @@ -0,0 +1,113 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; + +namespace Microsoft.Spark.ML.Feature +{ + /// + /// A Maps a sequence of terms to their term frequencies using the + /// hashing trick. Currently we use Austin Appleby's MurmurHash 3 algorithm + /// (MurmurHash3_x86_32) to calculate the hash code value for the term object. 
Since a simple + /// modulo is used to transform the hash function to a column index, it is advisable to use a + /// power of two as the numFeatures parameter; otherwise the features will not be mapped evenly + /// to the columns. + /// + public class HashingTF : IJvmObjectReferenceProvider + { + + /// + /// Create a without any parameters + /// + public HashingTF() + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.HashingTF"); + } + + /// + /// Create a with a UID that is used to give the + /// a unique ID + /// + /// An immutable unique ID for the object and its derivatives. + public HashingTF(string uid) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.HashingTF", uid); + } + + internal HashingTF(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + private readonly JvmObjectReference _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Sets the column that the should read from + /// + /// The name of the column to as the source + /// + public HashingTF SetInputCol(string value) + { + return WrapAsHashingTF(_jvmObject.Invoke("setInputCol", value)); + } + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// The name of the new column + /// + /// + public HashingTF SetOutputCol(string value) + { + return WrapAsHashingTF(_jvmObject.Invoke("setOutputCol", value)); + } + + public HashingTF SetNumFeatures(int value) + { + return WrapAsHashingTF(_jvmObject.Invoke("setNumFeatures", value)); + } + + /// + /// Executes the and transforms the DataFrame to include the new + /// column or columns with the tokens. + /// + /// The DataFrame to add the tokens to + /// containing the original data and the tokens + public DataFrame Transform(DataFrame source) + { + return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); + } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static HashingTF WrapAsHashingTF(object obj) + { + return new HashingTF((JvmObjectReference)obj); + } + + /// + /// The uid that was used to create the . If no UID is passed in + /// when creating the then a random UID is created when the + /// is created. + /// + /// string UID identifying the + public string Uid() + { + return (string)_jvmObject.Invoke("uid"); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs new file mode 100644 index 000000000..663b887d0 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs @@ -0,0 +1,118 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; + +namespace Microsoft.Spark.ML.Feature +{ + /// + /// Inverse document frequency (IDF). The standard formulation is used: + /// idf = log((m + 1) / (d(t) + 1)), where m is the total number of documents and d(t) is + /// the number of documents that contain term t. + /// + /// This implementation supports filtering out terms which do not appear in a minimum number + /// of documents (controlled by the variable minDocFreq). 
For terms that are not in at least + /// minDocFreq documents, the IDF is found as 0, resulting in TF-IDFs of 0. + /// + public class IDF : IJvmObjectReferenceProvider + { + + /// + /// Create a without any parameters + /// + public IDF() + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.IDF"); + } + + /// + /// Create a with a UID that is used to give the + /// a unique ID + /// + /// An immutable unique ID for the object and its derivatives. + public IDF(string uid) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.IDF", uid); + } + + internal IDF(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + private readonly JvmObjectReference _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Sets the column that the should read from + /// + /// The name of the column to as the source + /// + public IDF SetInputCol(string value) + { + return WrapAsIDF(_jvmObject.Invoke("setInputCol", value)); + } + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// The name of the new column + /// + /// + public IDF SetOutputCol(string value) + { + return WrapAsIDF(_jvmObject.Invoke("setOutputCol", value)); + } + + /// + /// Minimum of documents in which a term should appear for filtering + /// + /// + /// + public IDF SetMinDocFreq(int value) + { + return WrapAsIDF(_jvmObject.Invoke("setMinDocFreq", value)); + } + + /// + /// Fits a model to the input data. + /// + /// The DataFrame to fit the model to + /// + public IDFModel Fit(DataFrame source) + { + return new IDFModel((JvmObjectReference)_jvmObject.Invoke("fit", source)); + } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static IDF WrapAsIDF(object obj) + { + return new IDF((JvmObjectReference)obj); + } + + /// + /// The uid that was used to create the . If no UID is passed in + /// when creating the then a random UID is created when the + /// is created. + /// + /// string UID identifying the + public string Uid() + { + return (string)_jvmObject.Invoke("uid"); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs new file mode 100644 index 000000000..d9cc13882 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs @@ -0,0 +1,105 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; + +namespace Microsoft.Spark.ML.Feature +{ + /// + /// A that converts the input string to lowercase and then splits it by + /// white spaces. + /// + public class IDFModel : IJvmObjectReferenceProvider + { + + /// + /// Create a without any parameters + /// + public IDFModel() + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.IDFModel"); + } + + /// + /// Create a with a UID that is used to give the + /// a unique ID + /// + /// An immutable unique ID for the object and its derivatives. 
+ public IDFModel(string uid) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.IDFModel", uid); + } + + internal IDFModel(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + private readonly JvmObjectReference _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Sets the column that the should read from and convert into + /// buckets + /// + /// The name of the column to as the source + /// + public IDFModel SetInputCol(string value) + { + return WrapAsIDFModel(_jvmObject.Invoke("setInputCol", value)); + } + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// The name of the new column which contains the tokens + /// + /// + public IDFModel SetOutputCol(string value) + { + return WrapAsIDFModel(_jvmObject.Invoke("setOutputCol", value)); + } + + /// + /// Executes the and transforms the DataFrame to include the new + /// column or columns with the tokens. + /// + /// The DataFrame to add the tokens to + /// containing the original data and the tokens + public DataFrame Transform(DataFrame source) + { + return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); + } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static IDFModel WrapAsIDFModel(object obj) + { + return new IDFModel((JvmObjectReference)obj); + } + + /// + /// The uid that was used to create the . If no UID is passed in + /// when creating the then a random UID is created when the + /// is created. + /// + /// string UID identifying the + public string Uid() + { + return (string)_jvmObject.Invoke("uid"); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs new file mode 100644 index 000000000..3b2d395e9 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs @@ -0,0 +1,104 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; + +namespace Microsoft.Spark.ML.Feature +{ + /// + /// A that converts the input string to lowercase and then splits it by + /// white spaces. + /// + public class Tokenizer : IJvmObjectReferenceProvider + { + + /// + /// Create a without any parameters + /// + public Tokenizer() + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.Tokenizer"); + } + + /// + /// Create a with a UID that is used to give the + /// a unique ID + /// + /// An immutable unique ID for the object and its derivatives. 
+ public Tokenizer(string uid) + { + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( + "org.apache.spark.ml.feature.Tokenizer", uid); + } + + internal Tokenizer(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; + } + + private readonly JvmObjectReference _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Sets the column that the should read from + /// + /// The name of the column to as the source + /// + public Tokenizer SetInputCol(string value) + { + return WrapAsTokenizer(_jvmObject.Invoke("setInputCol", value)); + } + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// The name of the new column + /// + /// + public Tokenizer SetOutputCol(string value) + { + return WrapAsTokenizer(_jvmObject.Invoke("setOutputCol", value)); + } + + /// + /// Executes the and transforms the DataFrame to include the new + /// column + /// + /// The DataFrame to transform + /// + public DataFrame Transform(DataFrame source) + { + return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); + } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static Tokenizer WrapAsTokenizer(object obj) + { + return new Tokenizer((JvmObjectReference)obj); + } + + /// + /// The uid that was used to create the . If no UID is passed in + /// when creating the then a random UID is created when the + /// is created. + /// + /// string UID identifying the + public string Uid() + { + return (string)_jvmObject.Invoke("uid"); + } + } +} From 3cc3f8def7f45ab021e500c79cb6545a1b1b8592 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Thu, 9 Jan 2020 23:40:45 +0000 Subject: [PATCH 12/47] removing step --- .../IpcTests/ML/Feature/HashingTFTests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 37de01c40..80f35e7bd 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -34,7 +34,6 @@ public void TestHashingTF() " as input_col"); DataFrame output = HashingTF.Transform(input); - Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); } } } From 6cfd0e4e9a2e8a54cf57fbe0dc514c3c53093896 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Thu, 9 Jan 2020 23:50:12 +0000 Subject: [PATCH 13/47] single test for IDF and IDFModel --- .../IpcTests/ML/Feature/IDFModelTests.cs | 20 +++++----- .../IpcTests/ML/Feature/IDFTests.cs | 40 ------------------- 2 files changed, 11 insertions(+), 49 deletions(-) delete mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 3c88f5872..06a4a3be0 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -23,19 +23,21 @@ public IDFModelTests(SparkFixture fixture) [Fact] public void TestIDFModel() { - IDF idf = new IDF("uid") - .SetMinDocFreq(2) - .SetInputCol("input_col") - .SetOutputCol("output_col"); + DataFrame sentenceData = + _spark.Sql("SELECT 0.0 as label, 'Hi I heard about 
Spark' as sentence"); + Tokenizer tokenizer = new Tokenizer().SetInputCol("sentence").SetOutputCol("words"); + DataFrame wordsData = tokenizer.Transform(sentenceData); - Assert.Equal("uid", idf.Uid()); + HashingTF hashingTF = new HashingTF() + .SetInputCol("words").SetOutputCol("rawFeatures").SetNumFeatures(20); - DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + - " as input_col"); + DataFrame featurizedData = hashingTF.Transform(wordsData); - IDFModel model = idf.Fit(input); - model.Transform(input); + IDF idf = new IDF().SetInputCol("rawFeatures").SetOutputCol("features"); + IDFModel idfModel = idf.Fit(featurizedData); + DataFrame rescaledData = idfModel.Transform(featurizedData); + } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs deleted file mode 100644 index c901e813a..000000000 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs +++ /dev/null @@ -1,40 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System; -using System.Collections.Generic; -using Microsoft.Spark.ML.Feature; -using Microsoft.Spark.Sql; -using Xunit; - -namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature -{ - [Collection("Spark E2E Tests")] - public class IDFTests - { - private readonly SparkSession _spark; - - public IDFTests(SparkFixture fixture) - { - _spark = fixture.Spark; - } - - [Fact] - public void TestIDF() - { - IDF idf = new IDF("uid") - .SetMinDocFreq(2) - .SetInputCol("input_col") - .SetOutputCol("output_col"); - - Assert.Equal("uid", idf.Uid()); - - DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + - " as input_col"); - - IDFModel model = idf.Fit(input); - - } - } -} From 633a843618ab8156af6c51b8f64ed4150196c0a7 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 07:49:05 +0000 Subject: [PATCH 14/47] SerDe to handle double[][] for Bucketizer --- .../IpcTests/ML/Feature/BucketizerTests.cs | 12 +++++++----- .../Interop/Ipc/PayloadHelper.cs | 15 +++++++++++++++ .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 18 +++++++----------- .../org/apache/spark/api/dotnet/SerDe.scala | 8 +++++++- .../org/apache/spark/api/dotnet/SerDe.scala | 8 +++++++- .../org/apache/spark/api/dotnet/SerDe.scala | 8 +++++++- 6 files changed, 50 insertions(+), 19 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 9ec077d23..7ee217eca 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -43,15 +43,17 @@ public void TestBucketizer() [Fact] public void TestBucketizer_MultipleColumns() { + double[][] splitsArray = new[] + { + new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}, + new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue} + }; + Bucketizer bucketizer = new Bucketizer() .SetInputCols(new List() {"input_col_a", "input_col_b"}) .SetOutputCols(new List() {"output_col_a", "output_col_b"}) .SetHandleInvalid("keep") - .SetSplitsArray(new[] - { - new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}, - new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue} - }); + .SetSplitsArray(splitsArray); 
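+            // A sketch, for illustration only, of the bucketing rule the splits
+            // encode: a value v lands in bucket i when splits[i] <= v < splits[i + 1],
+            // and the last bucket also includes its upper bound. Not used by the
+            // assertions below.
+            int BucketIndex(double[] splits, double v)
+            {
+                for (int i = 0; i < splits.Length - 1; i++)
+                {
+                    bool isLastBucket = i == splits.Length - 2;
+                    if (v >= splits[i] &&
+                        (v < splits[i + 1] || (isLastBucket && v == splits[i + 1])))
+                    {
+                        return i;
+                    }
+                }
+                return -1; // outside all splits; the real Bucketizer applies HandleInvalid
+            }
+            // e.g. BucketIndex(splitsArray[1], 5000.0) == 1, the [0.0, 10000.0) bucket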
Assert.Equal("keep", bucketizer.GetHandleInvalid()); diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs index 5bfeee865..06dcb8969 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs @@ -24,6 +24,7 @@ internal class PayloadHelper private static readonly byte[] s_doubleTypeId = new[] { (byte)'d' }; private static readonly byte[] s_jvmObjectTypeId = new[] { (byte)'j' }; private static readonly byte[] s_byteArrayTypeId = new[] { (byte)'r' }; + private static readonly byte[] s_doubleArrayArrayTypeId = new[] {(byte)'A'}; private static readonly byte[] s_arrayTypeId = new[] { (byte)'l' }; private static readonly byte[] s_dictionaryTypeId = new[] { (byte)'e' }; private static readonly byte[] s_rowArrTypeId = new[] { (byte)'R' }; @@ -135,6 +136,19 @@ internal static void ConvertArgsToBytes( SerDe.Write(destination, d); } break; + + case double[][] argDoubleArrayArray: + SerDe.Write(destination, s_doubleArrayArrayTypeId); + SerDe.Write(destination, argDoubleArrayArray.Length); + foreach (double[] doubleArray in argDoubleArrayArray) + { + SerDe.Write(destination, doubleArray.Length); + foreach (double d in doubleArray) + { + SerDe.Write(destination, d); + } + } + break; case IEnumerable argByteArrayEnumerable: SerDe.Write(destination, s_byteArrayTypeId); @@ -286,6 +300,7 @@ internal static byte[] GetTypeId(Type type) if (type == typeof(int[]) || type == typeof(long[]) || type == typeof(double[]) || + type == typeof(double[][]) || typeof(IEnumerable).IsAssignableFrom(type) || typeof(IEnumerable).IsAssignableFrom(type)) { diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index a90582584..af2041945 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -27,10 +27,7 @@ internal Bucketizer(JvmObjectReference jvmObject) { _jvmObject = jvmObject; } - - private readonly JvmObjectReference _jvmObject; - JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - + /// /// Create a without any parameters /// @@ -50,7 +47,10 @@ public Bucketizer(string uid) _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( "org.apache.spark.ml.feature.Bucketizer", uid); } - + + private readonly JvmObjectReference _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + /// /// Split points for splitting a single column into buckets. To split multiple columns use /// SetSplitsArray. 
You cannot use both SetSplits and SetSplitsArray at the same time @@ -80,12 +80,8 @@ public Bucketizer SetSplits(double[] value) /// public Bucketizer SetSplitsArray(double[][] value) { - DoubleArrayArrayParam doubleArrayArray = new DoubleArrayArrayParam(_jvmObject, - "setSplitsArray", - "wrapper for double[][] from csharp", value); - - return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", - doubleArrayArray.ReferenceValue)); + double[][][] wrappedValue = new[] {value}; + return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", wrappedValue)); } /// diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 6d1ba1077..169e244e1 100644 --- a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -118,6 +118,11 @@ object SerDe { (0 until len).map(_ => readDouble(in)).toArray } + def readDoubleArrArr(in: DataInputStream): Array[Array[Double]] = { + val len = readInt(in) + (0 until len).map(_ => readDoubleArr(in)).toArray + } + def readBooleanArr(in: DataInputStream): Array[Boolean] = { val len = readInt(in) (0 until len).map(_ => readBoolean(in)).toArray @@ -140,6 +145,7 @@ object SerDe { case 'g' => readLongArr(dis) case 'c' => readStringArr(dis) case 'd' => readDoubleArr(dis) + case 'A' => readDoubleArrArr(dis) case 'b' => readBooleanArr(dis) case 'j' => readStringArr(dis).map(x => JVMObjectTracker.getObject(x)) case 'r' => readBytesArr(dis) @@ -360,4 +366,4 @@ private object SerializationFormats { val BYTE = "byte" val STRING = "string" val ROW = "row" -} +} \ No newline at end of file diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 6d1ba1077..169e244e1 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -118,6 +118,11 @@ object SerDe { (0 until len).map(_ => readDouble(in)).toArray } + def readDoubleArrArr(in: DataInputStream): Array[Array[Double]] = { + val len = readInt(in) + (0 until len).map(_ => readDoubleArr(in)).toArray + } + def readBooleanArr(in: DataInputStream): Array[Boolean] = { val len = readInt(in) (0 until len).map(_ => readBoolean(in)).toArray @@ -140,6 +145,7 @@ object SerDe { case 'g' => readLongArr(dis) case 'c' => readStringArr(dis) case 'd' => readDoubleArr(dis) + case 'A' => readDoubleArrArr(dis) case 'b' => readBooleanArr(dis) case 'j' => readStringArr(dis).map(x => JVMObjectTracker.getObject(x)) case 'r' => readBytesArr(dis) @@ -360,4 +366,4 @@ private object SerializationFormats { val BYTE = "byte" val STRING = "string" val ROW = "row" -} +} \ No newline at end of file diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 6d1ba1077..169e244e1 100644 --- a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -118,6 +118,11 @@ object SerDe { (0 until len).map(_ => readDouble(in)).toArray } + def readDoubleArrArr(in: DataInputStream): 
Array[Array[Double]] = { + val len = readInt(in) + (0 until len).map(_ => readDoubleArr(in)).toArray + } + def readBooleanArr(in: DataInputStream): Array[Boolean] = { val len = readInt(in) (0 until len).map(_ => readBoolean(in)).toArray @@ -140,6 +145,7 @@ object SerDe { case 'g' => readLongArr(dis) case 'c' => readStringArr(dis) case 'd' => readDoubleArr(dis) + case 'A' => readDoubleArrArr(dis) case 'b' => readBooleanArr(dis) case 'j' => readStringArr(dis).map(x => JVMObjectTracker.getObject(x)) case 'r' => readBytesArr(dis) @@ -360,4 +366,4 @@ private object SerializationFormats { val BYTE = "byte" val STRING = "string" val ROW = "row" -} +} \ No newline at end of file From f4ecbb0106564d7835c3238cce32297a083da68e Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 08:25:29 +0000 Subject: [PATCH 15/47] remove DoubleArrayArrayParam --- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 1 - .../ML/Param/DoubleArrayArrayParam.cs | 38 ------------------- .../org/apache/spark/api/dotnet/SerDe.scala | 2 +- .../org/apache/spark/api/dotnet/SerDe.scala | 2 +- .../org/apache/spark/api/dotnet/SerDe.scala | 2 +- 5 files changed, 3 insertions(+), 42 deletions(-) delete mode 100644 src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index af2041945..380adf887 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -6,7 +6,6 @@ using System.Collections.Generic; using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; -using Microsoft.Spark.ML.Param; using Microsoft.Spark.Sql; using Microsoft.Spark.Sql.Types; diff --git a/src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs b/src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs deleted file mode 100644 index 7afe243c5..000000000 --- a/src/csharp/Microsoft.Spark/ML/Param/DoubleArrayArrayParam.cs +++ /dev/null @@ -1,38 +0,0 @@ -using System; -using Microsoft.Spark.Interop; -using Microsoft.Spark.Interop.Ipc; -using Newtonsoft.Json; - -namespace Microsoft.Spark.ML.Param -{ - /// - /// Internal class used to help the `Bucketizer` pass a double[][] into the JVM. - /// - class DoubleArrayArrayParam : IJvmObjectReferenceProvider - { - private readonly JvmObjectReference _jvmObject; - - public DoubleArrayArrayParam(object parent, string name, string doc, double[][] param) - { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.param.DoubleArrayArrayParam", - parent, name, doc); - - string json = JsonConvert.SerializeObject(param); - ReferenceValue = jsonDecode(json); - } - - private JvmObjectReference jsonDecode(string json) - { - return (JvmObjectReference)_jvmObject.Invoke("jsonDecode", json); - } - public JvmObjectReference Reference { get; } - - /// - /// This is the JVM version of the double[][] so that it can be used by the `Bucketizer`, to - /// get the double[][] across the SerDe this serializes as JSON and used jsonDecode on the - /// JVM side to get a double[][]. ReferenceValue is the double[][]. 
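The param class being removed here pushed a double[][] to the JVM by serializing it to JSON and calling jsonDecode on a DoubleArrayArrayParam; the SerDe changes in this patch replace that detour with a native binary framing. A hedged sketch of that framing, with BinaryWriter standing in for SerDe.Write (the real helper also handles the byte order the JVM's DataInputStream expects):

    using System.IO;

    static void WriteDoubleArrayArray(BinaryWriter destination, double[][] value)
    {
        destination.Write((byte)'A');        // type id, matching s_doubleArrayArrayTypeId
        destination.Write(value.Length);     // outer array length
        foreach (double[] inner in value)
        {
            destination.Write(inner.Length); // inner array length
            foreach (double d in inner)
            {
                destination.Write(d);        // each element as an IEEE-754 double
            }
        }
    }

The readDoubleArrArr added to each SerDe.scala is the mirror image: it reads the outer length, then one length-prefixed double array per entry.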
- /// - public JvmObjectReference ReferenceValue { get; } - } -} diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 169e244e1..2dca7fbdb 100644 --- a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -366,4 +366,4 @@ private object SerializationFormats { val BYTE = "byte" val STRING = "string" val ROW = "row" -} \ No newline at end of file +} diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 169e244e1..2dca7fbdb 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -366,4 +366,4 @@ private object SerializationFormats { val BYTE = "byte" val STRING = "string" val ROW = "row" -} \ No newline at end of file +} diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 169e244e1..2dca7fbdb 100644 --- a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -366,4 +366,4 @@ private object SerializationFormats { val BYTE = "byte" val STRING = "string" val ROW = "row" -} \ No newline at end of file +} From b3d4d0fc780df0e1df3690dc1620d7960be405ea Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 21:58:35 +0000 Subject: [PATCH 16/47] SerDe for double[][] --- .../Microsoft.Spark/Interop/Ipc/JvmBridge.cs | 9 +++ .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 69 ++++++++++++++++++- .../org/apache/spark/api/dotnet/SerDe.scala | 10 +++ .../org/apache/spark/api/dotnet/SerDe.scala | 10 +++ .../org/apache/spark/api/dotnet/SerDe.scala | 10 +++ 5 files changed, 107 insertions(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs index 961200ef3..f8b2e9648 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs @@ -364,6 +364,15 @@ private object ReadCollection(Stream s) doubleArray[itemIndex] = SerDe.ReadDouble(s); } returnValue = doubleArray; + break; + case 'A': + var doubleArrayArray = new double[numOfItemsInList][]; + for (int itemIndex = 0; itemIndex < numOfItemsInList; ++itemIndex) + { + doubleArrayArray[itemIndex] = ReadCollection(s) as double[]; + } + returnValue = doubleArrayArray; + break; case 'b': var boolArray = new bool[numOfItemsInList]; diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 380adf887..0c3acbe2b 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.Linq; using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Sql; @@ -50,6 +51,20 @@ public Bucketizer(string uid) private readonly JvmObjectReference _jvmObject; JvmObjectReference 
IJvmObjectReferenceProvider.Reference => _jvmObject; + /// + /// Split points for splitting a single column into buckets. To split multiple columns use + /// SetSplitsArray. You cannot use both SetSplits and SetSplitsArray at the same time + /// + /// Split points for mapping continuous features into buckets. With n+1 splits, there are n + /// buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last + /// bucket, which also includes y. The splits should be of length >= 3 and strictly + /// increasing. Values outside the splits specified will be treated as errors. + /// + public double[] GetSplits() + { + return (double[])_jvmObject.Invoke("getSplits"); + } + /// /// Split points for splitting a single column into buckets. To split multiple columns use /// SetSplitsArray. You cannot use both SetSplits and SetSplitsArray at the same time @@ -66,6 +81,16 @@ public Bucketizer SetSplits(double[] value) return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value)); } + /// + /// Split points fot splitting multiple columns into buckets. To split a single column use + /// SetSplits. You cannot use both SetSplits and SetSplitsArray at the same time. + /// + /// + public double[][] GetSplitsArray() + { + return (double[][])_jvmObject.Invoke("getSplitsArray"); + } + /// /// Split points fot splitting multiple columns into buckets. To split a single column use /// SetSplits. You cannot use both SetSplits and SetSplitsArray at the same time. @@ -83,6 +108,16 @@ public Bucketizer SetSplitsArray(double[][] value) return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", wrappedValue)); } + /// + /// Gets the column that the should read from and convert into + /// buckets + /// + /// + public string GetInputCol() + { + return (string)_jvmObject.Invoke("getInputCol"); + } + /// /// Sets the column that the should read from and convert into /// buckets @@ -93,6 +128,19 @@ public Bucketizer SetInputCol(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value)); } + + /// + /// Gets the columns that should read from and convert into + /// buckets. + /// + /// Each column is one set of buckets so if you have two input columns you can have two + /// sets of buckets and two output columns. + /// + /// + public IEnumerable GetInputCols() + { + return ((string[])(_jvmObject.Invoke("getInputCols"))).ToList(); + } /// /// Sets the columns that should read from and convert into @@ -107,7 +155,17 @@ public Bucketizer SetInputCols(IEnumerable value) { return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value)); } - + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + // + public string GetOutputCol() + { + return (string)_jvmObject.Invoke("getOutputCol"); + } + /// /// The will create a new column in the DataFrame, this is the /// name of the new column. @@ -119,6 +177,15 @@ public Bucketizer SetOutputCol(string value) return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value)); } + /// + /// The list of columns that the will create in the DataFrame. + /// + /// + public IEnumerable GetOutputCols() + { + return ((string[])_jvmObject.Invoke("getOutputCols")).ToList(); + } + /// /// The list of columns that the will create in the DataFrame. 
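+        /// <example>
+        /// Illustrative only (column names assumed):
+        /// <code>
+        /// Bucketizer bucketizer = new Bucketizer()
+        ///     .SetInputCols(new List&lt;string&gt; { "col_a", "col_b" })
+        ///     .SetOutputCols(new List&lt;string&gt; { "col_a_buckets", "col_b_buckets" });
+        /// // GetOutputCols() now returns ["col_a_buckets", "col_b_buckets"]
+        /// </code>
+        /// </example>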
/// diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 2dca7fbdb..ca5973b96 100644 --- a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -195,6 +195,7 @@ object SerDe { case "void" => dos.writeByte('n') case "character" => dos.writeByte('c') case "double" => dos.writeByte('d') + case "doublearray" => dos.writeByte('A') case "long" => dos.writeByte('g') case "integer" => dos.writeByte('i') case "logical" => dos.writeByte('b') @@ -258,6 +259,9 @@ object SerDe { case "[D" => writeType(dos, "list") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) + case "[[D" => + writeType(dos, "list") + writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) case "[Z" => writeType(dos, "list") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) @@ -343,6 +347,12 @@ object SerDe { value.foreach(v => out.writeDouble(v)) } + def writeDoubleArrArr(out: DataOutputStream, value: Array[Array[Double]]): Unit = { + writeType(out, "doublearray") + out.writeInt(value.length) + value.foreach(v => writeDoubleArr(out, v)) + } + def writeBooleanArr(out: DataOutputStream, value: Array[Boolean]): Unit = { writeType(out, "logical") out.writeInt(value.length) diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 2dca7fbdb..ca5973b96 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -195,6 +195,7 @@ object SerDe { case "void" => dos.writeByte('n') case "character" => dos.writeByte('c') case "double" => dos.writeByte('d') + case "doublearray" => dos.writeByte('A') case "long" => dos.writeByte('g') case "integer" => dos.writeByte('i') case "logical" => dos.writeByte('b') @@ -258,6 +259,9 @@ object SerDe { case "[D" => writeType(dos, "list") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) + case "[[D" => + writeType(dos, "list") + writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) case "[Z" => writeType(dos, "list") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) @@ -343,6 +347,12 @@ object SerDe { value.foreach(v => out.writeDouble(v)) } + def writeDoubleArrArr(out: DataOutputStream, value: Array[Array[Double]]): Unit = { + writeType(out, "doublearray") + out.writeInt(value.length) + value.foreach(v => writeDoubleArr(out, v)) + } + def writeBooleanArr(out: DataOutputStream, value: Array[Boolean]): Unit = { writeType(out, "logical") out.writeInt(value.length) diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 2dca7fbdb..ca5973b96 100644 --- a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -195,6 +195,7 @@ object SerDe { case "void" => dos.writeByte('n') case "character" => dos.writeByte('c') case "double" => dos.writeByte('d') + case "doublearray" => dos.writeByte('A') case "long" => dos.writeByte('g') case "integer" => dos.writeByte('i') case 
"logical" => dos.writeByte('b') @@ -258,6 +259,9 @@ object SerDe { case "[D" => writeType(dos, "list") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) + case "[[D" => + writeType(dos, "list") + writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) case "[Z" => writeType(dos, "list") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) @@ -343,6 +347,12 @@ object SerDe { value.foreach(v => out.writeDouble(v)) } + def writeDoubleArrArr(out: DataOutputStream, value: Array[Array[Double]]): Unit = { + writeType(out, "doublearray") + out.writeInt(value.length) + value.foreach(v => writeDoubleArr(out, v)) + } + def writeBooleanArr(out: DataOutputStream, value: Array[Boolean]): Unit = { writeType(out, "logical") out.writeInt(value.length) From 500e7ad691477fb3e711f13c731bc50e5ceb26bd Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 22:00:36 +0000 Subject: [PATCH 17/47] spacing as per other fields --- src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs index 06dcb8969..8b6977025 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs @@ -24,7 +24,7 @@ internal class PayloadHelper private static readonly byte[] s_doubleTypeId = new[] { (byte)'d' }; private static readonly byte[] s_jvmObjectTypeId = new[] { (byte)'j' }; private static readonly byte[] s_byteArrayTypeId = new[] { (byte)'r' }; - private static readonly byte[] s_doubleArrayArrayTypeId = new[] {(byte)'A'}; + private static readonly byte[] s_doubleArrayArrayTypeId = new[] {( byte)'A' }; private static readonly byte[] s_arrayTypeId = new[] { (byte)'l' }; private static readonly byte[] s_dictionaryTypeId = new[] { (byte)'e' }; private static readonly byte[] s_rowArrTypeId = new[] { (byte)'R' }; From 298f4ece6b36487fd83fc30d68dbecc48e6ccb2a Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 22:03:11 +0000 Subject: [PATCH 18/47] formatting --- .../src/main/scala/org/apache/spark/api/dotnet/SerDe.scala | 4 ++-- .../src/main/scala/org/apache/spark/api/dotnet/SerDe.scala | 4 ++-- .../src/main/scala/org/apache/spark/api/dotnet/SerDe.scala | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index ca5973b96..4a6b27a58 100644 --- a/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.3.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -260,8 +260,8 @@ object SerDe { writeType(dos, "list") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) case "[[D" => - writeType(dos, "list") - writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) + writeType(dos, "list") + writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) case "[Z" => writeType(dos, "list") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index ca5973b96..4a6b27a58 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ 
b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -260,8 +260,8 @@ object SerDe { writeType(dos, "list") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) case "[[D" => - writeType(dos, "list") - writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) + writeType(dos, "list") + writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) case "[Z" => writeType(dos, "list") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index ca5973b96..4a6b27a58 100644 --- a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -260,8 +260,8 @@ object SerDe { writeType(dos, "list") writeDoubleArr(dos, value.asInstanceOf[Array[Double]]) case "[[D" => - writeType(dos, "list") - writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) + writeType(dos, "list") + writeDoubleArrArr(dos, value.asInstanceOf[Array[Array[Double]]]) case "[Z" => writeType(dos, "list") writeBooleanArr(dos, value.asInstanceOf[Array[Boolean]]) From 72d36fd0fa85d665a2d475fbecf59cb8e629f260 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 22:09:21 +0000 Subject: [PATCH 19/47] adding getters to tests --- artifaa. | 0 build.sh | 0 eng/common/build.sh | 0 .../IpcTests/ML/Feature/BucketizerTests.cs | 8 ++++++++ 4 files changed, 8 insertions(+) create mode 100644 artifaa. mode change 100644 => 100755 build.sh mode change 100644 => 100755 eng/common/build.sh diff --git a/artifaa. b/artifaa. new file mode 100644 index 000000000..e69de29bb diff --git a/build.sh b/build.sh old mode 100644 new mode 100755 diff --git a/eng/common/build.sh b/eng/common/build.sh old mode 100644 new mode 100755 diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 7ee217eca..dbaec110c 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -38,6 +38,10 @@ public void TestBucketizer() DataFrame output = bucketizer.Transform(input); Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); + + Assert.IsType(bucketizer.GetInputCol()); + Assert.IsType(bucketizer.GetOutputCol()); + Assert.IsType(bucketizer.GetSplits()); } [Fact] @@ -64,6 +68,10 @@ public void TestBucketizer_MultipleColumns() DataFrame output = bucketizer.Transform(input); Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col_a")); Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col_b")); + + Assert.IsType>(bucketizer.GetInputCols()); + Assert.IsType>(bucketizer.GetOutputCols()); + Assert.IsType(bucketizer.GetSplitsArray()); } } } From 696186c84bafa2736a3b2ee136b1051ce36d6b89 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 13 Jan 2020 22:11:46 +0000 Subject: [PATCH 20/47] rollback --- artifaa. | 0 build.sh | 0 eng/common/build.sh | 0 3 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 artifaa. mode change 100755 => 100644 build.sh mode change 100755 => 100644 eng/common/build.sh diff --git a/artifaa. b/artifaa. 
deleted file mode 100644 index e69de29bb..000000000 diff --git a/build.sh b/build.sh old mode 100755 new mode 100644 diff --git a/eng/common/build.sh b/eng/common/build.sh old mode 100755 new mode 100644 From 33699ea327c39865e8caf1cb23925ed50a6a4b09 Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 15 Jan 2020 07:27:11 +0000 Subject: [PATCH 21/47] Apply suggestions from code review Co-Authored-By: Steve Suh --- .../IpcTests/ML/Feature/BucketizerTests.cs | 6 +++--- src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index dbaec110c..8b9a85aab 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -39,9 +39,9 @@ public void TestBucketizer() DataFrame output = bucketizer.Transform(input); Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); - Assert.IsType(bucketizer.GetInputCol()); - Assert.IsType(bucketizer.GetOutputCol()); - Assert.IsType(bucketizer.GetSplits()); + Assert.Equal("input_col", bucketizer.GetInputCol()); + Assert.Equal("output_col", bucketizer.GetOutputCol()); + Assert.Equal(expectedSplits, bucketizer.GetSplits()); } [Fact] diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 0c3acbe2b..7ace8c64e 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -134,7 +134,7 @@ public Bucketizer SetInputCol(string value) /// buckets. /// /// Each column is one set of buckets so if you have two input columns you can have two - /// sets of buckets and two output columns. + /// sets of buckets and two output columns. /// /// public IEnumerable GetInputCols() @@ -147,7 +147,7 @@ public IEnumerable GetInputCols() /// buckets. /// /// Each column is one set of buckets so if you have two input columns you can have two - /// sets of buckets and two output columns. + /// sets of buckets and two output columns. /// /// List of input columns to use as sources for buckets /// From 5b8060606d6831f92555b2bb17f089d58b4928cd Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 15 Jan 2020 07:38:01 +0000 Subject: [PATCH 22/47] Fixing comments after review --- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 34 +++++++------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 7ace8c64e..f62525a3f 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -52,14 +52,9 @@ public Bucketizer(string uid) JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// - /// Split points for splitting a single column into buckets. To split multiple columns use - /// SetSplitsArray. You cannot use both SetSplits and SetSplitsArray at the same time + /// Gets the splits that were set using SetSplits /// - /// Split points for mapping continuous features into buckets. With n+1 splits, there are n - /// buckets. A bucket defined by splits x,y holds values in the range [x,y) except the last - /// bucket, which also includes y. The splits should be of length >= 3 and strictly - /// increasing. 
Values outside the splits specified will be treated as errors. - /// + /// double[], the splits to be used to bucket the input column public double[] GetSplits() { return (double[])_jvmObject.Invoke("getSplits"); @@ -82,10 +77,9 @@ public Bucketizer SetSplits(double[] value) } /// - /// Split points fot splitting multiple columns into buckets. To split a single column use - /// SetSplits. You cannot use both SetSplits and SetSplitsArray at the same time. + /// Gets the splits that were set by SetSplitsArray /// - /// + /// double[][], the splits to be used to bucket the input columns public double[][] GetSplitsArray() { return (double[][])_jvmObject.Invoke("getSplitsArray"); @@ -110,9 +104,9 @@ public Bucketizer SetSplitsArray(double[][] value) /// /// Gets the column that the should read from and convert into - /// buckets + /// buckets. This would have been set by SetInputCol /// - /// + /// string, the input column public string GetInputCol() { return (string)_jvmObject.Invoke("getInputCol"); @@ -131,12 +125,9 @@ public Bucketizer SetInputCol(string value) /// /// Gets the columns that should read from and convert into - /// buckets. - /// - /// Each column is one set of buckets so if you have two input columns you can have two - /// sets of buckets and two output columns. + /// buckets. This is set by SetInputCol /// - /// + /// IEnumerable, list of input columns public IEnumerable GetInputCols() { return ((string[])(_jvmObject.Invoke("getInputCols"))).ToList(); @@ -157,10 +148,10 @@ public Bucketizer SetInputCols(IEnumerable value) } /// - /// The will create a new column in the DataFrame, this is the - /// name of the new column. + /// Gets the name of the column the output data will be written to. This is set by + /// SetInputCol /// - // + // string, the output column public string GetOutputCol() { return (string)_jvmObject.Invoke("getOutputCol"); @@ -179,8 +170,9 @@ public Bucketizer SetOutputCol(string value) /// /// The list of columns that the will create in the DataFrame. 
+ /// This is set by SetOutputCols /// - /// + /// IEnumerable, list of output columns public IEnumerable GetOutputCols() { return ((string[])_jvmObject.Invoke("getOutputCols")).ToList(); From 6c12e6aa808fcc73f156433c31e4271687cc642b Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 15 Jan 2020 07:56:54 +0000 Subject: [PATCH 23/47] fixes after review --- .../IpcTests/ML/Feature/BucketizerTests.cs | 55 +++++++++++-------- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 4 +- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index 8b9a85aab..a3578fc33 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; +using System.Security.Cryptography; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; using Xunit; @@ -23,44 +24,54 @@ public BucketizerTests(SparkFixture fixture) [Fact] public void TestBucketizer() { - Bucketizer bucketizer = new Bucketizer("uid") - .SetInputCol("input_col") - .SetOutputCol("output_col") - .SetHandleInvalid("skip") - .SetSplits(new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}); + double[] expectedSplits = new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}; - Assert.Equal("skip", - bucketizer.GetHandleInvalid()); + string expectedHandle = "skip"; + string expectedUid = "uid"; + string expectedInputCol = "input_col"; + string expectedOutputCol = "output_col"; + + Bucketizer bucketizer = new Bucketizer(expectedUid) + .SetInputCol(expectedInputCol) + .SetOutputCol(expectedOutputCol) + .SetHandleInvalid(expectedHandle) + .SetSplits(expectedSplits); + + Assert.Equal(expectedHandle, bucketizer.GetHandleInvalid()); - Assert.Equal("uid", bucketizer.Uid()); + Assert.Equal(expectedUid, bucketizer.Uid()); DataFrame input = _spark.Sql("SELECT ID as input_col from range(100)"); DataFrame output = bucketizer.Transform(input); - Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); + Assert.Contains(output.Schema().Fields, (f => f.Name == expectedOutputCol)); - Assert.Equal("input_col", bucketizer.GetInputCol()); - Assert.Equal("output_col", bucketizer.GetOutputCol()); + Assert.Equal(expectedInputCol, bucketizer.GetInputCol()); + Assert.Equal(expectedOutputCol, bucketizer.GetOutputCol()); Assert.Equal(expectedSplits, bucketizer.GetSplits()); } [Fact] public void TestBucketizer_MultipleColumns() { - double[][] splitsArray = new[] + double[][] expectedSplitsArray = new[] { new[] {Double.MinValue, 0.0, 10.0, 50.0, Double.MaxValue}, new[] {Double.MinValue, 0.0, 10000.0, Double.MaxValue} }; - + + string expectedHandle = "keep"; + + List expectedInputCols = new List() {"input_col_a", "input_col_b"}; + List expectedOutputCols = new List() {"output_col_a", "output_col_b"}; + Bucketizer bucketizer = new Bucketizer() - .SetInputCols(new List() {"input_col_a", "input_col_b"}) - .SetOutputCols(new List() {"output_col_a", "output_col_b"}) - .SetHandleInvalid("keep") - .SetSplitsArray(splitsArray); + .SetInputCols(expectedInputCols) + .SetOutputCols(expectedOutputCols) + .SetHandleInvalid(expectedHandle) + .SetSplitsArray(expectedSplitsArray); - Assert.Equal("keep", - bucketizer.GetHandleInvalid()); + Assert.Equal(expectedHandle, bucketizer.GetHandleInvalid()); DataFrame input = _spark.Sql("SELECT ID as input_col_a, ID as 
input_col_b from range(100)"); @@ -69,9 +80,9 @@ public void TestBucketizer_MultipleColumns() Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col_a")); Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col_b")); - Assert.IsType>(bucketizer.GetInputCols()); - Assert.IsType>(bucketizer.GetOutputCols()); - Assert.IsType(bucketizer.GetSplitsArray()); + Assert.Equal(expectedInputCols, bucketizer.GetInputCols()); + Assert.Equal(expectedOutputCols, bucketizer.GetOutputCols()); + Assert.Equal(expectedSplitsArray, bucketizer.GetSplitsArray()); } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index f62525a3f..1d5e047d2 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -127,7 +127,7 @@ public Bucketizer SetInputCol(string value) /// Gets the columns that should read from and convert into /// buckets. This is set by SetInputCol /// - /// IEnumerable, list of input columns + /// IEnumerable<string>, list of input columns public IEnumerable GetInputCols() { return ((string[])(_jvmObject.Invoke("getInputCols"))).ToList(); @@ -172,7 +172,7 @@ public Bucketizer SetOutputCol(string value) /// The list of columns that the will create in the DataFrame. /// This is set by SetOutputCols /// - /// IEnumerable, list of output columns + /// IEnumerable<string>, list of output columns public IEnumerable GetOutputCols() { return ((string[])_jvmObject.Invoke("getOutputCols")).ToList(); From dc7bf4b05ea36f16ccd31e8a772c5b53c8183563 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 15 Jan 2020 08:15:58 +0000 Subject: [PATCH 24/47] wip --- .../IpcTests/ML/Feature/HashingTFTests.cs | 4 +- .../Microsoft.Spark/ML/Feature/HashingTF.cs | 60 +++++++++++++------ 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 80f35e7bd..3098c92dc 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -23,12 +23,12 @@ public HashingTFTests(SparkFixture fixture) [Fact] public void TestHashingTF() { - HashingTF HashingTF = new HashingTF("uid") + HashingTF HashingTF = new HashingTF(100) .SetNumFeatures(10) .SetInputCol("input_col") .SetOutputCol("output_col"); - Assert.Equal("uid", HashingTF.Uid()); + Assert.Equal(10, HashingTF.GetNumFeatures()); DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + " as input_col"); diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs index 6356ea53b..58d097328 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs @@ -33,12 +33,12 @@ public HashingTF() /// /// Create a with a UID that is used to give the /// a unique ID + /// numFeatures number of features (default: 2^20^) /// - /// An immutable unique ID for the object and its derivatives. 
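For context on the constructor change above: on the JVM side HashingTF implements the hashing trick, mapping each term into one of numFeatures buckets and counting hits per bucket. A hedged C# illustration of the idea (Spark uses MurmurHash3 on the JVM; string.GetHashCode is only a stand-in, and the method name is invented for this sketch):

    using System.Collections.Generic;

    static double[] HashedTermFrequencies(IEnumerable<string> terms, int numFeatures)
    {
        var vector = new double[numFeatures];
        foreach (string term in terms)
        {
            // non-negative modulo keeps the index inside the feature vector
            int index = ((term.GetHashCode() % numFeatures) + numFeatures) % numFeatures;
            vector[index] += 1.0; // in binary mode this would be set to 1.0 instead
        }
        return vector;
    }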
- public HashingTF(string uid) + public HashingTF(int numFeatures) { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.HashingTF", uid); + "org.apache.spark.ml.feature.HashingTF", numFeatures); } internal HashingTF(JvmObjectReference jvmObject) @@ -49,6 +49,15 @@ internal HashingTF(JvmObjectReference jvmObject) private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + /// + /// Gets the column that the should read from + /// + /// string, the name of the input column + public string GetInputCol() + { + return (string)_jvmObject.Invoke("getInputCol"); + } + /// /// Sets the column that the should read from /// @@ -63,19 +72,43 @@ public HashingTF SetInputCol(string value) /// The will create a new column in the DataFrame, this is the /// name of the new column. /// - /// The name of the new column - /// + /// string, the name of the output col + public string GetOutputCol() + { + return (string)_jvmObject.Invoke("getOutputCol"); + } + + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// The name of the new column /// public HashingTF SetOutputCol(string value) { return WrapAsHashingTF(_jvmObject.Invoke("setOutputCol", value)); } - public HashingTF SetNumFeatures(int value) + /// + /// Gets the number of features that should be used + /// + /// + public int NumFeatures() + { + return (int)_jvmObject.Invoke("NumFeatures"); + } + + /// + /// If true, term frequency vector will be binary such that non-zero term counts will be + /// set to 1, default: false + /// + /// Term frequency vector, default: false + /// + public HashingTF SetBinary(bool value) { - return WrapAsHashingTF(_jvmObject.Invoke("setNumFeatures", value)); + return WrapAsHashingTF(_jvmObject.Invoke("setBinary", value)); } - + /// /// Executes the and transforms the DataFrame to include the new /// column or columns with the tokens. @@ -98,16 +131,5 @@ private static HashingTF WrapAsHashingTF(object obj) { return new HashingTF((JvmObjectReference)obj); } - - /// - /// The uid that was used to create the . If no UID is passed in - /// when creating the then a random UID is created when the - /// is created. 
- /// - /// string UID identifying the - public string Uid() - { - return (string)_jvmObject.Invoke("uid"); - } } } From 283f8ea594632c0f6f73cae57bb9c9f2073df344 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Wed, 15 Jan 2020 23:00:16 +0000 Subject: [PATCH 25/47] Hashing TF from ml not mllib --- .../IpcTests/ML/Feature/HashingTFTests.cs | 31 ++++++-- .../Microsoft.Spark/ML/Feature/HashingTF.cs | 77 +++++++++++++++---- 2 files changed, 89 insertions(+), 19 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 3098c92dc..398eab47c 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -4,8 +4,10 @@ using System; using System.Collections.Generic; +using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; +using Microsoft.Spark.Sql.Types; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature @@ -23,17 +25,34 @@ public HashingTFTests(SparkFixture fixture) [Fact] public void TestHashingTF() { - HashingTF HashingTF = new HashingTF(100) - .SetNumFeatures(10) - .SetInputCol("input_col") - .SetOutputCol("output_col"); + string expectedInputCol = "input_col"; + string expectedOutputCol = "output_col"; + int expectedFeatures = 10; + + HashingTF hashingTf = new HashingTF("my-unique-id") + .SetNumFeatures(expectedFeatures) + .SetInputCol(expectedInputCol) + .SetOutputCol(expectedOutputCol); - Assert.Equal(10, HashingTF.GetNumFeatures()); + Assert.Equal(expectedFeatures, hashingTf.GetNumFeatures()); + Assert.Equal(expectedInputCol, hashingTf.GetInputCol()); + Assert.Equal(expectedOutputCol, hashingTf.GetOutputCol()); DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + " as input_col"); - DataFrame output = HashingTF.Transform(input); + DataFrame output = hashingTf.Transform(input); + Assert.Contains(expectedOutputCol, output.Columns()); + + using (var tempDirectory = new TemporaryDirectory()) + { + hashingTf.Save(tempDirectory.Path); + var loadedHashingTf = HashingTF.Load(tempDirectory.Path); + Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid()); + } + + hashingTf.SetBinary(true); + Assert.True(hashingTf.GetBinary()); } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs index 58d097328..b0d8fc078 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs @@ -7,6 +7,7 @@ using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Sql; +using Microsoft.Spark.Sql.Types; namespace Microsoft.Spark.ML.Feature { @@ -27,18 +28,18 @@ public class HashingTF : IJvmObjectReferenceProvider public HashingTF() { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.HashingTF"); + _javaClassName); } /// /// Create a with a UID that is used to give the /// a unique ID - /// numFeatures number of features (default: 2^20^) + /// unique identifier /// - public HashingTF(int numFeatures) + public HashingTF(string uid) { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.HashingTF", numFeatures); + _javaClassName, uid); } internal HashingTF(JvmObjectReference jvmObject) @@ -47,8 +48,51 @@ internal HashingTF(JvmObjectReference jvmObject) } private readonly JvmObjectReference 
_jvmObject; + private const string _javaClassName = "org.apache.spark.ml.feature.HashingTF"; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + /// + /// Loads the that was previously saved using Save + /// + /// + /// + public static HashingTF Load(string path) + { + return WrapAsHashingTF(SparkEnvironment.JvmBridge.CallStaticJavaMethod( + _javaClassName, + "load", path)); + } + + /// + /// Saves the so that it can be loaded later using Load + /// + /// + /// + public HashingTF Save(string path) + { + return WrapAsHashingTF(_jvmObject.Invoke("save", path)); + } + + /// + /// Gets the binary toggle that controls term frequency counts + /// + /// + public bool GetBinary() + { + return (bool)_jvmObject.Invoke("getBinary"); + } + + /// + /// Binary toggle to control term frequency counts. + /// If true, all non-zero counts are set to 1. This is useful for discrete probabilistic + /// models that model binary events rather than integer counts + /// + /// binary toggle, default is false + public HashingTF SetBinary(bool value) + { + return WrapAsHashingTF(_jvmObject.Invoke("setBinary", value)); + } + /// /// Gets the column that the should read from /// @@ -92,21 +136,28 @@ public HashingTF SetOutputCol(string value) /// /// Gets the number of features that should be used /// - /// - public int NumFeatures() + /// int + public int GetNumFeatures() { - return (int)_jvmObject.Invoke("NumFeatures"); + return (int)_jvmObject.Invoke("getNumFeatures"); + } + + /// + /// Sets the number of features that should be used + /// + /// + public HashingTF SetNumFeatures(int value) + { + return WrapAsHashingTF(_jvmObject.Invoke("setNumFeatures", value)); } /// - /// If true, term frequency vector will be binary such that non-zero term counts will be - /// set to 1, default: false + /// An immutable unique ID for the object and its derivatives. 
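+        /// <example>
+        /// Illustrative only (path assumed); the uid survives a save/load round
+        /// trip, which the E2E tests later in this series assert:
+        /// <code>
+        /// hashingTF.Save("/tmp/hashingTF");
+        /// HashingTF loaded = HashingTF.Load("/tmp/hashingTF");
+        /// // loaded.Uid() equals hashingTF.Uid()
+        /// </code>
+        /// </example>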
/// - /// Term frequency vector, default: false - /// - public HashingTF SetBinary(bool value) + /// string + public string Uid() { - return WrapAsHashingTF(_jvmObject.Invoke("setBinary", value)); + return (string)_jvmObject.Invoke("uid"); } /// From 9d0f7ea7df5a3ef3a34fc2bd34b0270c8f34ebea Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Thu, 16 Jan 2020 21:12:47 +0000 Subject: [PATCH 26/47] tests for HashingTF --- .../IpcTests/ML/Feature/HashingTFTests.cs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 398eab47c..ce20736b7 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -4,6 +4,8 @@ using System; using System.Collections.Generic; +using System.IO; +using System.Linq; using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; @@ -28,6 +30,8 @@ public void TestHashingTF() string expectedInputCol = "input_col"; string expectedOutputCol = "output_col"; int expectedFeatures = 10; + + Assert.IsType(new HashingTF()); HashingTF hashingTf = new HashingTF("my-unique-id") .SetNumFeatures(expectedFeatures) @@ -42,12 +46,13 @@ public void TestHashingTF() " as input_col"); DataFrame output = hashingTf.Transform(input); - Assert.Contains(expectedOutputCol, output.Columns()); + DataFrame outputColumn = output.Select(expectedOutputCol); using (var tempDirectory = new TemporaryDirectory()) { - hashingTf.Save(tempDirectory.Path); - var loadedHashingTf = HashingTF.Load(tempDirectory.Path); + var bucketPath = Path.Join(tempDirectory.Path, "bucket"); + hashingTf.Save(bucketPath); + var loadedHashingTf = HashingTF.Load(bucketPath); Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid()); } From 107e01b7267afc1cf1661c3bd79574881403a71b Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Thu, 16 Jan 2020 22:12:21 +0000 Subject: [PATCH 27/47] adding tests --- .../IpcTests/ML/Feature/BucketizerTests.cs | 10 ++++ .../IpcTests/ML/Feature/HashingTFTests.cs | 6 +- .../IpcTests/ML/Feature/IDFModelTests.cs | 29 +++++++++- .../IpcTests/ML/Feature/IDFTests.cs | 50 +++++++++++++++++ .../IpcTests/ML/Feature/TokenizerTests.cs | 35 +++++++++--- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 29 ++++++++-- src/csharp/Microsoft.Spark/ML/Feature/IDF.cs | 56 ++++++++++++++++++- .../Microsoft.Spark/ML/Feature/IDFModel.cs | 46 ++++++++++++++- .../Microsoft.Spark/ML/Feature/Tokenizer.cs | 49 ++++++++++++++-- .../Microsoft.Spark/Microsoft.Spark.csproj | 5 +- 10 files changed, 283 insertions(+), 32 deletions(-) create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index a3578fc33..08282320e 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -4,7 +4,9 @@ using System; using System.Collections.Generic; +using System.IO; using System.Security.Cryptography; +using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; using Xunit; @@ -49,6 +51,14 @@ public void TestBucketizer() Assert.Equal(expectedInputCol, bucketizer.GetInputCol()); Assert.Equal(expectedOutputCol, 
bucketizer.GetOutputCol());
             Assert.Equal(expectedSplits, bucketizer.GetSplits());
+
+            using (var tempDirectory = new TemporaryDirectory())
+            {
+                var savePath = Path.Join(tempDirectory.Path, "bucket");
+                bucketizer.Save(savePath);
+                var loadedBucketizer = Bucketizer.Load(savePath);
+                Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid());
+            }
         }
 
         [Fact]
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
index ce20736b7..9484fb30f 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
@@ -50,9 +50,9 @@ public void TestHashingTF()
 
         using (var tempDirectory = new TemporaryDirectory())
         {
-            var bucketPath = Path.Join(tempDirectory.Path, "bucket");
-            hashingTf.Save(bucketPath);
-            var loadedHashingTf = HashingTF.Load(bucketPath);
+            var savePath = Path.Join(tempDirectory.Path, "hashingTF");
+            hashingTf.Save(savePath);
+            var loadedHashingTf = HashingTF.Load(savePath);
             Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid());
         }
 
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
index 06a4a3be0..cd55e90a7 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
@@ -4,6 +4,8 @@
 using System;
 using System.Collections.Generic;
+using System.IO;
+using Microsoft.Spark.E2ETest.Utils;
 using Microsoft.Spark.ML.Feature;
 using Microsoft.Spark.Sql;
 using Xunit;
@@ -23,21 +25,42 @@ public IDFModelTests(SparkFixture fixture)
         [Fact]
         public void TestIDFModel()
         {
+            int expectedDocFrequency = 1980;
+            string expectedInputCol = "rawFeatures";
+            string expectedOutputCol = "features";
+
+
             DataFrame sentenceData =
                 _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");
 
             Tokenizer tokenizer = new Tokenizer().SetInputCol("sentence").SetOutputCol("words");
             DataFrame wordsData = tokenizer.Transform(sentenceData);
 
             HashingTF hashingTF = new HashingTF()
-                .SetInputCol("words").SetOutputCol("rawFeatures").SetNumFeatures(20);
+                .SetInputCol("words")
+                .SetOutputCol(expectedInputCol)
+                .SetNumFeatures(20);
 
             DataFrame featurizedData = hashingTF.Transform(wordsData);
-
-            IDF idf = new IDF().SetInputCol("rawFeatures").SetOutputCol("features");
+
+            IDF idf = new IDF()
+                .SetInputCol(expectedInputCol)
+                .SetOutputCol(expectedOutputCol)
+                .SetMinDocFreq(expectedDocFrequency);
+
             IDFModel idfModel = idf.Fit(featurizedData);
 
             DataFrame rescaledData = idfModel.Transform(featurizedData);
+
+            Assert.Equal(expectedInputCol, idf.GetInputCol());
+            Assert.Equal(expectedOutputCol, idf.GetOutputCol());
+
+            Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq());
+
+            using (var tempDirectory = new TemporaryDirectory())
+            {
+                var modelPath = Path.Join(tempDirectory.Path, "idfModel");
+                idfModel.Save(modelPath);
+            }
         }
     }
 }
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
new file mode 100644
index 000000000..fe7f6d30c
--- /dev/null
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
@@ -0,0 +1,50 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.IO; +using Microsoft.Spark.E2ETest.Utils; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class IDFTests + { + private readonly SparkSession _spark; + + public IDFTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void TestIDFModel() + { + string expectedInputCol = "rawFeatures"; + string expectedOutputCol = "features"; + int expectedDocFrequency = 100; + + IDF idf = new IDF() + .SetInputCol(expectedInputCol) + .SetOutputCol(expectedOutputCol) + .SetMinDocFreq(expectedDocFrequency); + + Assert.Equal(expectedInputCol, idf.GetInputCol()); + Assert.Equal(expectedOutputCol, idf.GetOutputCol()); + Assert.Equal(expectedDocFrequency, idf.GetMinDocFreq()); + + using (var tempDirectory = new TemporaryDirectory()) + { + var savePath = Path.Join(tempDirectory.Path, "IDF"); + idf.Save(savePath); + var loadedIdf = IDF.Load(savePath); + Assert.Equal(idf.Uid(), loadedIdf.Uid()); + } + } + } +} diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index 19eb9216f..2a2d9139f 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -4,6 +4,8 @@ using System; using System.Collections.Generic; +using System.IO; +using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; using Xunit; @@ -23,17 +25,34 @@ public TokenizerTests(SparkFixture fixture) [Fact] public void TestTokenizer() { - Tokenizer Tokenizer = new Tokenizer("uid") - .SetInputCol("input_col") - .SetOutputCol("output_col"); - - Assert.Equal("uid", Tokenizer.Uid()); - + string expectedUid = "theUid"; + string expectedInputCol = "input_col"; + string expectedOutputCol = "output_col"; + DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + " from range(100)"); + + Tokenizer tokenizer = new Tokenizer(expectedUid); + + tokenizer + .SetInputCol(expectedInputCol) + .SetOutputCol(expectedOutputCol); + + DataFrame output = tokenizer.Transform(input); + Assert.Contains(output.Schema().Fields, (f => f.Name == expectedOutputCol)); - DataFrame output = Tokenizer.Transform(input); - Assert.Contains(output.Schema().Fields, (f => f.Name == "output_col")); + Assert.Equal(expectedInputCol, tokenizer.GetInputCol()); + Assert.Equal(expectedOutputCol, tokenizer.GetOutputCol()); + + using (var tempDirectory = new TemporaryDirectory()) + { + var savePath = Path.Join(tempDirectory.Path, "Tokenizer"); + tokenizer.Save(savePath); + var loadedIdf = Tokenizer.Load(savePath); + Assert.Equal(tokenizer.Uid(), loadedIdf.Uid()); + } + + Assert.Equal(expectedUid, tokenizer.Uid()); } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index ce1436fe4..2db86f5cc 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -32,8 +32,7 @@ internal Bucketizer(JvmObjectReference jvmObject) /// public Bucketizer() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.Bucketizer"); + _jvmObject = 
SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); } /// @@ -43,11 +42,11 @@ public Bucketizer() /// An immutable unique ID for the object and its derivatives. public Bucketizer(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.Bucketizer", uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); } private readonly JvmObjectReference _jvmObject; + private const string JavaClassName = "org.apache.spark.ml.feature.Bucketizer"; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// @@ -243,5 +242,27 @@ public Bucketizer SetHandleInvalid(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString())); } + + /// + /// Loads the that was previously saved using Save + /// + /// + /// + public static Bucketizer Load(string path) + { + return WrapAsBucketizer(SparkEnvironment.JvmBridge.CallStaticJavaMethod( + JavaClassName, + "load", path)); + } + + /// + /// Saves the so that it can be loaded later using Load + /// + /// + /// + public Bucketizer Save(string path) + { + return WrapAsBucketizer(_jvmObject.Invoke("save", path)); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs index 663b887d0..5f9e376a0 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs @@ -21,14 +21,13 @@ namespace Microsoft.Spark.ML.Feature /// public class IDF : IJvmObjectReferenceProvider { - /// /// Create a without any parameters /// public IDF() { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.IDF"); + JavaClassName); } /// @@ -39,7 +38,7 @@ public IDF() public IDF(string uid) { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.IDF", uid); + JavaClassName, uid); } internal IDF(JvmObjectReference jvmObject) @@ -47,9 +46,20 @@ internal IDF(JvmObjectReference jvmObject) _jvmObject = jvmObject; } + private const string JavaClassName = "org.apache.spark.ml.feature.IDF"; + private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + /// + /// Gets the column that the should read from + /// + /// string, input column + public string GetInputCol() + { + return (string)(_jvmObject.Invoke("getInputCol")); + } + /// /// Sets the column that the should read from /// @@ -60,6 +70,16 @@ public IDF SetInputCol(string value) return WrapAsIDF(_jvmObject.Invoke("setInputCol", value)); } + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// string, the output column + public string GetOutputCol() + { + return (string)(_jvmObject.Invoke("getOutputCol")); + } + /// /// The will create a new column in the DataFrame, this is the /// name of the new column. 
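For context between these hunks, a minimal usage sketch of the paired getters and fluent setters this diff adds to IDF; the column names, the frequency threshold, and the surrounding SparkSession are illustrative assumptions, not taken from the patch:

// Sketch only: assumes a running SparkSession and illustrative column names.
IDF idf = new IDF()
    .SetInputCol("rawFeatures")    // e.g. the output column of a HashingTF
    .SetOutputCol("features")
    .SetMinDocFreq(2);             // ignore terms seen in fewer than 2 documents

// Each Set* call is forwarded over the JVM bridge; the matching Get* call
// reads the same parameter back from the underlying Scala IDF object.
string inputCol = idf.GetInputCol();    // "rawFeatures"
int minDocFreq = idf.GetMinDocFreq();   // 2
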
@@ -72,6 +92,15 @@ public IDF SetOutputCol(string value) return WrapAsIDF(_jvmObject.Invoke("setOutputCol", value)); } + /// + /// Minimum of documents in which a term should appear for filtering + /// + /// int + public int GetMinDocFreq() + { + return (int)_jvmObject.Invoke("getMinDocFreq"); + } + /// /// Minimum of documents in which a term should appear for filtering /// @@ -114,5 +143,26 @@ public string Uid() { return (string)_jvmObject.Invoke("uid"); } + + /// + /// Loads the that was previously saved using Save + /// + /// + /// + public static IDF Load(string path) + { + return WrapAsIDF( + SparkEnvironment.JvmBridge.CallStaticJavaMethod(JavaClassName, "load", path)); + } + + /// + /// Saves the so that it can be loaded later using Load + /// + /// + /// + public IDF Save(string path) + { + return WrapAsIDF(_jvmObject.Invoke("save", path)); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs index d9cc13882..0ee54564c 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs @@ -23,7 +23,7 @@ public class IDFModel : IJvmObjectReferenceProvider public IDFModel() { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.IDFModel"); + JavaClassName); } /// @@ -34,7 +34,7 @@ public IDFModel() public IDFModel(string uid) { _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.IDFModel", uid); + JavaClassName, uid); } internal IDFModel(JvmObjectReference jvmObject) @@ -44,7 +44,18 @@ internal IDFModel(JvmObjectReference jvmObject) private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - + + private const string JavaClassName = "org.apache.spark.ml.feature.IDFModel"; + + /// + /// Gets the column that the should read from + /// + /// string, input column + public string GetInputCol() + { + return (string)(_jvmObject.Invoke("getInputCol")); + } + /// /// Sets the column that the should read from and convert into /// buckets @@ -56,6 +67,16 @@ public IDFModel SetInputCol(string value) return WrapAsIDFModel(_jvmObject.Invoke("setInputCol", value)); } + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// string, the output column + public string GetOutputCol() + { + return (string)(_jvmObject.Invoke("getOutputCol")); + } + /// /// The will create a new column in the DataFrame, this is the /// name of the new column. @@ -68,6 +89,15 @@ public IDFModel SetOutputCol(string value) return WrapAsIDFModel(_jvmObject.Invoke("setOutputCol", value)); } + /// + /// Minimum of documents in which a term should appear for filtering + /// + /// int + public int GetMinDocFreq() + { + return (int)_jvmObject.Invoke("getMinDocFreq"); + } + /// /// Executes the and transforms the DataFrame to include the new /// column or columns with the tokens. 
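As a sketch of the flow these IDFModel accessors support, mirroring the IDFModelTests above, fitting and applying the model looks like this; `featurizedData` and the save path are placeholders rather than values from the patch:

// Sketch only: featurizedData is any DataFrame with a "rawFeatures" vector column.
IDFModel idfModel = idf.Fit(featurizedData);            // estimator -> fitted model
DataFrame rescaledData = idfModel.Transform(featurizedData);

// The fitted model exposes the parameters it was trained with.
int minDocFreq = idfModel.GetMinDocFreq();

// Persist the model for later use; the path is illustrative.
idfModel.Save("/tmp/idf-model");
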
@@ -101,5 +131,15 @@ public string Uid() { return (string)_jvmObject.Invoke("uid"); } + + /// + /// Saves the so that it can be loaded later using Load + /// + /// + /// + public IDFModel Save(string path) + { + return WrapAsIDFModel(_jvmObject.Invoke("save", path)); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs index 3b2d395e9..c4ee596a9 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs @@ -22,8 +22,7 @@ public class Tokenizer : IJvmObjectReferenceProvider /// public Tokenizer() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.Tokenizer"); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); } /// @@ -33,8 +32,7 @@ public Tokenizer() /// An immutable unique ID for the object and its derivatives. public Tokenizer(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.Tokenizer", uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); } internal Tokenizer(JvmObjectReference jvmObject) @@ -45,6 +43,17 @@ internal Tokenizer(JvmObjectReference jvmObject) private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + private const string JavaClassName = "org.apache.spark.ml.feature.Tokenizer"; + + /// + /// Gets the column that the should read from + /// + /// string, input column + public string GetInputCol() + { + return (string)(_jvmObject.Invoke("getInputCol")); + } + /// /// Sets the column that the should read from /// @@ -55,6 +64,16 @@ public Tokenizer SetInputCol(string value) return WrapAsTokenizer(_jvmObject.Invoke("setInputCol", value)); } + /// + /// The will create a new column in the DataFrame, this is the + /// name of the new column. + /// + /// string, the output column + public string GetOutputCol() + { + return (string)(_jvmObject.Invoke("getOutputCol")); + } + /// /// The will create a new column in the DataFrame, this is the /// name of the new column. 
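The next hunk adds Load and Save to Tokenizer; the round trip the E2E tests rely on looks roughly like this, where the uid and the save path are illustrative:

// Sketch only: persist a configured Tokenizer and read it back.
Tokenizer tokenizer = new Tokenizer("theUid")
    .SetInputCol("input_col")
    .SetOutputCol("output_col");

tokenizer.Save("/tmp/tokenizer");
Tokenizer loadedTokenizer = Tokenizer.Load("/tmp/tokenizer");
// The uid is persisted with the metadata, so loadedTokenizer.Uid() == "theUid".
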
@@ -100,5 +119,27 @@ public string Uid() { return (string)_jvmObject.Invoke("uid"); } + + + /// + /// Loads the that was previously saved using Save + /// + /// + /// + public static Tokenizer Load(string path) + { + return WrapAsTokenizer( + SparkEnvironment.JvmBridge.CallStaticJavaMethod(JavaClassName, "load", path)); + } + + /// + /// Saves the so that it can be loaded later using Load + /// + /// + /// + public Tokenizer Save(string path) + { + return WrapAsTokenizer(_jvmObject.Invoke("save", path)); + } } } diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index d473408b1..01a6fd7ec 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -32,10 +32,7 @@ - + From d85ca33540a0ee4e8bc139b88f8c7decf0000e11 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 21:52:59 +0000 Subject: [PATCH 28/47] removing project, in spark main project --- .../Microsoft.Spark.Extensions.ML/Class1.cs | 12 ------------ .../Microsoft.Spark.Extensions.ML.csproj | 7 ------- src/csharp/Microsoft.Spark.sln | 7 ------- 3 files changed, 26 deletions(-) delete mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs delete mode 100644 src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs b/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs deleted file mode 100644 index 5874db8d0..000000000 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Class1.cs +++ /dev/null @@ -1,12 +0,0 @@ -using System; - -namespace Microsoft.Spark.Extensions.ML -{ - public class Pipeline where T : new() - { - public T Load(string path) - { - return new T(); - } - } -} diff --git a/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj b/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj deleted file mode 100644 index 27560206d..000000000 --- a/src/csharp/Extensions/Microsoft.Spark.Extensions.ML/Microsoft.Spark.Extensions.ML.csproj +++ /dev/null @@ -1,7 +0,0 @@ - - - - netstandard2.0 - - - diff --git a/src/csharp/Microsoft.Spark.sln b/src/csharp/Microsoft.Spark.sln index 4b76eb777..b31c377c7 100644 --- a/src/csharp/Microsoft.Spark.sln +++ b/src/csharp/Microsoft.Spark.sln @@ -33,8 +33,6 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions. 
EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Extensions.Delta.E2ETest", "Extensions\Microsoft.Spark.Extensions.Delta.E2ETest\Microsoft.Spark.Extensions.Delta.E2ETest.csproj", "{206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Microsoft.Spark.Extensions.ML", "Extensions\Microsoft.Spark.Extensions.ML\Microsoft.Spark.Extensions.ML.csproj", "{38672397-3BC7-4818-A84A-7EE1618311CA}" -EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -85,10 +83,6 @@ Global {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}.Debug|Any CPU.Build.0 = Debug|Any CPU {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}.Release|Any CPU.ActiveCfg = Release|Any CPU {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63}.Release|Any CPU.Build.0 = Release|Any CPU - {38672397-3BC7-4818-A84A-7EE1618311CA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {38672397-3BC7-4818-A84A-7EE1618311CA}.Debug|Any CPU.Build.0 = Debug|Any CPU - {38672397-3BC7-4818-A84A-7EE1618311CA}.Release|Any CPU.ActiveCfg = Release|Any CPU - {38672397-3BC7-4818-A84A-7EE1618311CA}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -98,7 +92,6 @@ Global {4E379DB3-7741-43C2-B32D-17AD96FEA7D0} = {C8C53525-4FEB-4B5B-91A2-619566C72F3E} {2048446B-45AB-4304-B230-50EDF6E8E6A4} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} {206E16CA-ED59-4F5E-8EA1-9BB7BEEACB63} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} - {38672397-3BC7-4818-A84A-7EE1618311CA} = {71A19F75-8279-40AB-BEA0-7D4B153FC416} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {FD15FFDB-EA1B-436F-841D-3386DDF94538} From c15ad6b018322487a69dd887234dd343ae3c9d5c Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:02:26 +0000 Subject: [PATCH 29/47] merge --- .../Microsoft.Spark/Interop/Ipc/JvmBridge.cs | 4 ---- .../Interop/Ipc/PayloadHelper.cs | 4 ---- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 19 ++----------------- 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs index 0bd02aa7e..887e8304f 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/JvmBridge.cs @@ -372,10 +372,6 @@ private object ReadCollection(Stream s) doubleArrayArray[itemIndex] = ReadCollection(s) as double[]; } returnValue = doubleArrayArray; -<<<<<<< HEAD - -======= ->>>>>>> 739688e1906d209f9fef9d5078a529ce3f1746ce break; case 'b': var boolArray = new bool[numOfItemsInList]; diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs index eeedf7bf4..e1771405d 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/PayloadHelper.cs @@ -24,11 +24,7 @@ internal class PayloadHelper private static readonly byte[] s_doubleTypeId = new[] { (byte)'d' }; private static readonly byte[] s_jvmObjectTypeId = new[] { (byte)'j' }; private static readonly byte[] s_byteArrayTypeId = new[] { (byte)'r' }; -<<<<<<< HEAD - private static readonly byte[] s_doubleArrayArrayTypeId = new[] {( byte)'A' }; -======= private static readonly byte[] s_doubleArrayArrayTypeId = new[] { ( byte)'A' }; ->>>>>>> 739688e1906d209f9fef9d5078a529ce3f1746ce private static readonly byte[] s_arrayTypeId = new[] { (byte)'l' }; private static readonly byte[] 
s_dictionaryTypeId = new[] { (byte)'e' }; private static readonly byte[] s_rowArrTypeId = new[] { (byte)'R' }; diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 70339d03a..1198e6a98 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -27,18 +27,13 @@ internal Bucketizer(JvmObjectReference jvmObject) { _jvmObject = jvmObject; } - + /// /// Create a without any parameters /// public Bucketizer() { -<<<<<<< HEAD _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); -======= - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.Bucketizer"); ->>>>>>> 739688e1906d209f9fef9d5078a529ce3f1746ce } /// @@ -48,19 +43,12 @@ public Bucketizer() /// An immutable unique ID for the object and its derivatives. public Bucketizer(string uid) { -<<<<<<< HEAD _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); } private readonly JvmObjectReference _jvmObject; private const string JavaClassName = "org.apache.spark.ml.feature.Bucketizer"; -======= - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - "org.apache.spark.ml.feature.Bucketizer", uid); - } - - private readonly JvmObjectReference _jvmObject; ->>>>>>> 739688e1906d209f9fef9d5078a529ce3f1746ce + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// @@ -198,7 +186,6 @@ public Bucketizer SetOutputCols(List value) { return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value)); } -<<<<<<< HEAD /// /// Loads the that was previously saved using Save @@ -220,8 +207,6 @@ public Bucketizer Save(string path) { return WrapAsBucketizer(_jvmObject.Invoke("save", path)); } -======= ->>>>>>> 739688e1906d209f9fef9d5078a529ce3f1746ce /// /// Executes the and transforms the DataFrame to include the new From 5c358d1eb9ee10bba47dd25b3c405e1ae1849580 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:18:14 +0000 Subject: [PATCH 30/47] testing --- .../IpcTests/ML/Feature/BucketizerTests.cs | 2 ++ .../IpcTests/ML/Feature/IDFModelTests.cs | 12 ++++++++---- .../IpcTests/ML/Feature/TokenizerTests.cs | 6 ++---- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index a07bd9172..5fc52261e 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -4,6 +4,8 @@ using System; using System.Collections.Generic; +using System.IO; +using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; using Xunit; diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 97c08262a..896df31f5 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -32,7 +32,11 @@ public void TestIDFModel() DataFrame sentenceData = _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence"); - var tokenizer = new Tokenizer().SetInputCol("sentence").SetOutputCol("words"); + + var tokenizer = new Tokenizer() + .SetInputCol("sentence") + .SetOutputCol("words"); + DataFrame wordsData = 
tokenizer.Transform(sentenceData); var hashingTF = new HashingTF() @@ -50,9 +54,9 @@ public void TestIDFModel() var idfModel = idf.Fit(featurizedData); DataFrame rescaledData = idfModel.Transform(featurizedData); - - Assert.Equal(expectedInputCol, idf.GetInputCol()); - Assert.Equal(expectedOutputCol, idf.GetOutputCol()); + + Assert.Equal(expectedInputCol, idfModel.GetInputCol()); + Assert.Equal(expectedOutputCol, idfModel.GetOutputCol()); Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq()); diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index 9de3e2f8d..b34a8a2d9 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -32,15 +32,13 @@ public void TestTokenizer() DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + " from range(100)"); - var tokenizer = new Tokenizer(expectedUid); - - tokenizer + var tokenizer = new Tokenizer(expectedUid) .SetInputCol(expectedInputCol) .SetOutputCol(expectedOutputCol); DataFrame output = tokenizer.Transform(input); + Assert.Contains(output.Schema().Fields, (f => f.Name == expectedOutputCol)); - Assert.Equal(expectedInputCol, tokenizer.GetInputCol()); Assert.Equal(expectedOutputCol, tokenizer.GetOutputCol()); From 9234dba3cbc4d6a247d8e549eb4f34fbec43f3c1 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:21:02 +0000 Subject: [PATCH 31/47] formatting --- .../Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 896df31f5..55946abb4 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -28,7 +28,6 @@ public void TestIDFModel() var expectedDocFrequency = 1980; var expectedInputCol = "rawFeatures"; var expectedOutputCol = "features"; - DataFrame sentenceData = _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence"); From a524396d9822842d1f78c80da31f8634be6f3028 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:29:21 +0000 Subject: [PATCH 32/47] tidying: --- .../IpcTests/ML/Feature/IDFModelTests.cs | 1 - src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs | 1 - src/csharp/Microsoft.Spark/RDD.cs | 2 +- .../src/main/scala/org/apache/spark/api/dotnet/SerDe.scala | 4 ++-- .../src/main/scala/org/apache/spark/api/dotnet/SerDe.scala | 4 ++-- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 55946abb4..27e7b3a2d 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -56,7 +56,6 @@ public void TestIDFModel() Assert.Equal(expectedInputCol, idfModel.GetInputCol()); Assert.Equal(expectedOutputCol, idfModel.GetOutputCol()); - Assert.Equal(expectedDocFrequency, idfModel.GetMinDocFreq()); using (var tempDirectory = new TemporaryDirectory()) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs index c4ee596a9..566885a0c 
100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs @@ -120,7 +120,6 @@ public string Uid() return (string)_jvmObject.Invoke("uid"); } - /// /// Loads the that was previously saved using Save /// diff --git a/src/csharp/Microsoft.Spark/RDD.cs b/src/csharp/Microsoft.Spark/RDD.cs index 556884560..7eda57c61 100644 --- a/src/csharp/Microsoft.Spark/RDD.cs +++ b/src/csharp/Microsoft.Spark/RDD.cs @@ -102,7 +102,7 @@ internal RDD( _prevSerializedMode = prevSerializedMode; } - JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// /// Persist this RDD with the default storage level (MEMORY_ONLY). diff --git a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 4a6b27a58..7a77af9b7 100644 --- a/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-2.4.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -119,8 +119,8 @@ object SerDe { } def readDoubleArrArr(in: DataInputStream): Array[Array[Double]] = { - val len = readInt(in) - (0 until len).map(_ => readDoubleArr(in)).toArray + val len = readInt(in) + (0 until len).map(_ => readDoubleArr(in)).toArray } def readBooleanArr(in: DataInputStream): Array[Boolean] = { diff --git a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala index 4a6b27a58..7a77af9b7 100644 --- a/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala +++ b/src/scala/microsoft-spark-3.0.x/src/main/scala/org/apache/spark/api/dotnet/SerDe.scala @@ -119,8 +119,8 @@ object SerDe { } def readDoubleArrArr(in: DataInputStream): Array[Array[Double]] = { - val len = readInt(in) - (0 until len).map(_ => readDoubleArr(in)).toArray + val len = readInt(in) + (0 until len).map(_ => readDoubleArr(in)).toArray } def readBooleanArr(in: DataInputStream): Array[Boolean] = { From fa9c065b5709799eaa45244d6ddf0921eb95a94e Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:31:56 +0000 Subject: [PATCH 33/47] removing change --- src/csharp/Microsoft.Spark/Microsoft.Spark.csproj | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index 01a6fd7ec..3bfdd951a 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -32,7 +32,10 @@ - + From 13adf7b21529df51a7b4d2e6aa67cf2bc63b0f17 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:33:05 +0000 Subject: [PATCH 34/47] removing change --- src/csharp/Microsoft.Spark/Microsoft.Spark.csproj | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index 3bfdd951a..d473408b1 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -32,9 +32,9 @@ - From 9147c121a5b6f65761781faaa19a08670f5a658d Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:42:33 +0000 Subject: [PATCH 35/47] docs --- .../Microsoft.Spark/ML/Feature/HashingTF.cs | 24 
+++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs index b0d8fc078..f04f5314d 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs @@ -27,8 +27,7 @@ public class HashingTF : IJvmObjectReferenceProvider /// public HashingTF() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - _javaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(_javaClassName); } /// @@ -38,8 +37,7 @@ public HashingTF() /// public HashingTF(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - _javaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(_javaClassName, uid); } internal HashingTF(JvmObjectReference jvmObject) @@ -58,9 +56,8 @@ internal HashingTF(JvmObjectReference jvmObject) /// public static HashingTF Load(string path) { - return WrapAsHashingTF(SparkEnvironment.JvmBridge.CallStaticJavaMethod( - _javaClassName, - "load", path)); + return WrapAsHashingTF( + SparkEnvironment.JvmBridge.CallStaticJavaMethod(_javaClassName,"load", path)); } /// @@ -76,7 +73,7 @@ public HashingTF Save(string path) /// /// Gets the binary toggle that controls term frequency counts /// - /// + /// bool public bool GetBinary() { return (bool)_jvmObject.Invoke("getBinary"); @@ -113,8 +110,8 @@ public HashingTF SetInputCol(string value) } /// - /// The will create a new column in the DataFrame, this is the - /// name of the new column. + /// The will create a new column in the , + /// this is the name of the new column. /// /// string, the name of the output col public string GetOutputCol() @@ -123,8 +120,8 @@ public string GetOutputCol() } /// - /// The will create a new column in the DataFrame, this is the - /// name of the new column. + /// The will create a new column in the , + /// this is the name of the new column. /// /// The name of the new column /// @@ -145,6 +142,7 @@ public int GetNumFeatures() /// /// Sets the number of features that should be used /// + /// int /// public HashingTF SetNumFeatures(int value) { @@ -164,7 +162,7 @@ public string Uid() /// Executes the and transforms the DataFrame to include the new /// column or columns with the tokens. 
/// - /// The DataFrame to add the tokens to + /// The to add the tokens to /// containing the original data and the tokens public DataFrame Transform(DataFrame source) { From 90937a8dac64a1399dd4715680489f8e3fb5f730 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Sat, 18 Jan 2020 22:50:16 +0000 Subject: [PATCH 36/47] formatting --- .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 24 ++++++------- src/csharp/Microsoft.Spark/ML/Feature/IDF.cs | 32 ++++++++--------- .../Microsoft.Spark/ML/Feature/IDFModel.cs | 34 +++++++++---------- .../Microsoft.Spark/ML/Feature/Tokenizer.cs | 24 ++++++------- 4 files changed, 56 insertions(+), 58 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index 1198e6a98..a43981b27 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -220,18 +220,6 @@ public DataFrame Transform(DataFrame source) return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); } - /// - /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet - /// - /// - /// The to convert into a dotnet - /// - /// - private static Bucketizer WrapAsBucketizer(object obj) - { - return new Bucketizer((JvmObjectReference)obj); - } - /// /// The uid that was used to create the . If no UID is passed in /// when creating the then a random UID is created when the @@ -264,5 +252,17 @@ public Bucketizer SetHandleInvalid(string value) { return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString())); } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static Bucketizer WrapAsBucketizer(object obj) + { + return new Bucketizer((JvmObjectReference)obj); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs index 5f9e376a0..f30e15d72 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs @@ -26,8 +26,7 @@ public class IDF : IJvmObjectReferenceProvider /// public IDF() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - JavaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); } /// @@ -37,8 +36,7 @@ public IDF() /// An immutable unique ID for the object and its derivatives. public IDF(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor( - JavaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); } internal IDF(JvmObjectReference jvmObject) @@ -114,25 +112,13 @@ public IDF SetMinDocFreq(int value) /// /// Fits a model to the input data. /// - /// The DataFrame to fit the model to + /// The to fit the model to /// public IDFModel Fit(DataFrame source) { return new IDFModel((JvmObjectReference)_jvmObject.Invoke("fit", source)); } - /// - /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet - /// - /// - /// The to convert into a dotnet - /// - /// - private static IDF WrapAsIDF(object obj) - { - return new IDF((JvmObjectReference)obj); - } - /// /// The uid that was used to create the . 
If no UID is passed in /// when creating the then a random UID is created when the @@ -164,5 +150,17 @@ public IDF Save(string path) { return WrapAsIDF(_jvmObject.Invoke("save", path)); } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static IDF WrapAsIDF(object obj) + { + return new IDF((JvmObjectReference)obj); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs index 3953ce58c..702a45aec 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs @@ -66,8 +66,8 @@ public IDFModel SetInputCol(string value) } /// - /// The will create a new column in the DataFrame, this is the - /// name of the new column. + /// The will create a new column in the , + /// this is the name of the new column. /// /// string, the output column public string GetOutputCol() @@ -97,28 +97,16 @@ public int GetMinDocFreq() } /// - /// Executes the and transforms the DataFrame to include the new - /// column or columns with the tokens. + /// Executes the and transforms the to + /// include the new column or columns with the tokens. /// - /// The DataFrame to add the tokens to + /// The to add the tokens to /// containing the original data and the tokens public DataFrame Transform(DataFrame source) { return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); } - /// - /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet - /// - /// - /// The to convert into a dotnet - /// - /// - private static IDFModel WrapAsIDFModel(object obj) - { - return new IDFModel((JvmObjectReference)obj); - } - /// /// The uid that was used to create the . If no UID is passed in /// when creating the then a random UID is created when the @@ -139,5 +127,17 @@ public IDFModel Save(string path) { return WrapAsIDFModel(_jvmObject.Invoke("save", path)); } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static IDFModel WrapAsIDFModel(object obj) + { + return new IDFModel((JvmObjectReference)obj); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs index 566885a0c..35e86b039 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs @@ -97,18 +97,6 @@ public DataFrame Transform(DataFrame source) return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source)); } - /// - /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet - /// - /// - /// The to convert into a dotnet - /// - /// - private static Tokenizer WrapAsTokenizer(object obj) - { - return new Tokenizer((JvmObjectReference)obj); - } - /// /// The uid that was used to create the . 
If no UID is passed in /// when creating the then a random UID is created when the @@ -140,5 +128,17 @@ public Tokenizer Save(string path) { return WrapAsTokenizer(_jvmObject.Invoke("save", path)); } + + /// + /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet + /// + /// + /// The to convert into a dotnet + /// + /// + private static Tokenizer WrapAsTokenizer(object obj) + { + return new Tokenizer((JvmObjectReference)obj); + } } } From adca1d675e3f1ec5cfa59f1f9c72bec455ecce1d Mon Sep 17 00:00:00 2001 From: Ed Elliott Date: Wed, 5 Feb 2020 22:32:31 +0000 Subject: [PATCH 37/47] Apply suggestions from code review Co-Authored-By: elvaliuliuliu <47404285+elvaliuliuliu@users.noreply.github.com> --- .../IpcTests/ML/Feature/HashingTFTests.cs | 3 ++- .../IpcTests/ML/Feature/TokenizerTests.cs | 3 ++- src/csharp/Microsoft.Spark/ML/Feature/IDF.cs | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index f8aa7befa..390faaeb4 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -42,7 +42,8 @@ public void TestHashingTF() Assert.Equal(expectedInputCol, hashingTf.GetInputCol()); Assert.Equal(expectedOutputCol, hashingTf.GetOutputCol()); - DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + + DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + + " as input_col"); " as input_col"); DataFrame output = hashingTf.Transform(input); diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index b34a8a2d9..59255e149 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -29,7 +29,8 @@ public void TestTokenizer() var expectedInputCol = "input_col"; var expectedOutputCol = "output_col"; - DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + + DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + + " from range(100)"); " from range(100)"); var tokenizer = new Tokenizer(expectedUid) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs index f30e15d72..7873b085a 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs @@ -12,7 +12,7 @@ namespace Microsoft.Spark.ML.Feature { /// /// Inverse document frequency (IDF). The standard formulation is used: - /// idf = log((m + 1) / (d(t) + 1)), where m is the total number of documents and d(t) is + /// idf = log((m + 1) / (d(t) + 1)), where m is the total number of documents and d(t) is /// the number of documents that contain term t. 
/// /// This implementation supports filtering out terms which do not appear in a minimum number From 44a4bb51f487daefd1d66192afacff18274e263e Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 7 Feb 2020 20:36:41 +0000 Subject: [PATCH 38/47] adding datatype udf where sqlType is available --- .../IpcTests/ML/Feature/HashingTFTests.cs | 7 +++---- .../IpcTests/ML/Feature/TokenizerTests.cs | 1 - src/csharp/Microsoft.Spark/Microsoft.Spark.csproj | 5 +---- src/csharp/Microsoft.Spark/Sql/Types/DataType.cs | 8 ++++++++ 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 390faaeb4..aaf12af02 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -3,13 +3,11 @@ // See the LICENSE file in the project root for more information. using System; -using System.Collections.Generic; using System.IO; using System.Linq; using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; -using Microsoft.Spark.Sql.Types; using Xunit; namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature @@ -44,11 +42,12 @@ public void TestHashingTF() DataFrame input = _spark.Sql("SELECT array('this', 'is', 'a', 'string', 'a', 'a')" + " as input_col"); - " as input_col"); DataFrame output = hashingTf.Transform(input); - DataFrame outputColumn = output.Select(expectedOutputCol); + DataFrame outputVector = output.Select(expectedOutputCol); + Assert.Contains(expectedOutputCol, outputVector.Columns()); + using (var tempDirectory = new TemporaryDirectory()) { var savePath = Path.Join(tempDirectory.Path, "hashingTF"); diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index 59255e149..3c99fa1e3 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -31,7 +31,6 @@ public void TestTokenizer() DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + " from range(100)"); - " from range(100)"); var tokenizer = new Tokenizer(expectedUid) .SetInputCol(expectedInputCol) diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index d473408b1..01a6fd7ec 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -32,10 +32,7 @@ - + diff --git a/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs b/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs index 83bd1770f..a82babbf5 100644 --- a/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs +++ b/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs @@ -160,6 +160,14 @@ internal static DataType ParseDataType(JToken json) } else if (typeName == "udt") { + if (typeJObject.TryGetValue("class", out JToken classToken)) + { + if (typeJObject.TryGetValue("sqlType", out JToken sqlTypeToken)) + { + return new StructType(sqlTypeToken as JObject); + } + } + throw new NotImplementedException(); } } From 15bae3eeb5c94f9b25f41d64c028cc7f5d21dc6a Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 2 Mar 2020 21:53:29 +0000 Subject: [PATCH 39/47] feedback from review --- .../IpcTests/ML/Feature/BucketizerTests.cs | 5 +++-- 
.../IpcTests/ML/Feature/HashingTFTests.cs | 11 +++++----- .../IpcTests/ML/Feature/IDFModelTests.cs | 13 +++++++----- .../IpcTests/ML/Feature/IDFTests.cs | 11 +++++----- .../IpcTests/ML/Feature/TokenizerTests.cs | 13 ++++++------ .../Microsoft.Spark/ML/Feature/Bucketizer.cs | 21 ++++++++++--------- .../Microsoft.Spark/ML/Feature/HashingTF.cs | 21 +++++++++++++------ src/csharp/Microsoft.Spark/ML/Feature/IDF.cs | 10 ++++----- .../Microsoft.Spark/ML/Feature/IDFModel.cs | 21 ++++++++++++++----- .../Microsoft.Spark/ML/Feature/Tokenizer.cs | 11 +++++----- .../Microsoft.Spark/Sql/Types/DataType.cs | 2 +- 11 files changed, 84 insertions(+), 55 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs index a8b2c1c20..10b48e634 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs @@ -52,9 +52,10 @@ public void TestBucketizer() using (var tempDirectory = new TemporaryDirectory()) { - var savePath = Path.Join(tempDirectory.Path, "bucket"); + string savePath = Path.Join(tempDirectory.Path, "bucket"); bucketizer.Save(savePath); - var loadedBucketizer = Bucketizer.Load(savePath); + + Bucketizer loadedBucketizer = Bucketizer.Load(savePath); Assert.Equal(bucketizer.Uid(), loadedBucketizer.Uid()); } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs index 20beb5be2..106bca3f2 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs @@ -23,9 +23,9 @@ public HashingTFTests(SparkFixture fixture) [Fact] public void TestHashingTF() { - var expectedInputCol = "input_col"; - var expectedOutputCol = "output_col"; - var expectedFeatures = 10; + string expectedInputCol = "input_col"; + string expectedOutputCol = "output_col"; + int expectedFeatures = 10; Assert.IsType(new HashingTF()); @@ -48,9 +48,10 @@ public void TestHashingTF() using (var tempDirectory = new TemporaryDirectory()) { - var savePath = Path.Join(tempDirectory.Path, "hashingTF"); + string savePath = Path.Join(tempDirectory.Path, "hashingTF"); hashingTf.Save(savePath); - var loadedHashingTf = HashingTF.Load(savePath); + + HashingTF loadedHashingTf = HashingTF.Load(savePath); Assert.Equal(hashingTf.Uid(), loadedHashingTf.Uid()); } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs index 8062c66c2..c695f8515 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs @@ -23,9 +23,9 @@ public IDFModelTests(SparkFixture fixture) [Fact] public void TestIDFModel() { - var expectedDocFrequency = 1980; - var expectedInputCol = "rawFeatures"; - var expectedOutputCol = "features"; + int expectedDocFrequency = 1980; + string expectedInputCol = "rawFeatures"; + string expectedOutputCol = "features"; DataFrame sentenceData = _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence"); @@ -48,7 +48,7 @@ public void TestIDFModel() .SetOutputCol(expectedOutputCol) .SetMinDocFreq(expectedDocFrequency); - var idfModel = idf.Fit(featurizedData); + IDFModel idfModel = idf.Fit(featurizedData); DataFrame 
rescaledData = idfModel.Transform(featurizedData); Assert.Contains(expectedOutputCol, rescaledData.Columns()); @@ -59,8 +59,11 @@ public void TestIDFModel() using (var tempDirectory = new TemporaryDirectory()) { - var modelPath = Path.Join(tempDirectory.Path, "ideModel"); + string modelPath = Path.Join(tempDirectory.Path, "ideModel"); idfModel.Save(modelPath); + + IDFModel loadedModel = IDFModel.Load(modelPath); + Assert.Equal(idfModel.Uid(), loadedModel.Uid()); } } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs index c556a37e3..944d2b24b 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs @@ -23,9 +23,9 @@ public IDFTests(SparkFixture fixture) [Fact] public void TestIDFModel() { - var expectedInputCol = "rawFeatures"; - var expectedOutputCol = "features"; - var expectedDocFrequency = 100; + string expectedInputCol = "rawFeatures"; + string expectedOutputCol = "features"; + int expectedDocFrequency = 100; var idf = new IDF() .SetInputCol(expectedInputCol) @@ -38,9 +38,10 @@ public void TestIDFModel() using (var tempDirectory = new TemporaryDirectory()) { - var savePath = Path.Join(tempDirectory.Path, "IDF"); + string savePath = Path.Join(tempDirectory.Path, "IDF"); idf.Save(savePath); - var loadedIdf = IDF.Load(savePath); + + IDF loadedIdf = IDF.Load(savePath); Assert.Equal(idf.Uid(), loadedIdf.Uid()); } } diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs index 0b6611437..0039d9e07 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs @@ -23,9 +23,9 @@ public TokenizerTests(SparkFixture fixture) [Fact] public void TestTokenizer() { - var expectedUid = "theUid"; - var expectedInputCol = "input_col"; - var expectedOutputCol = "output_col"; + string expectedUid = "theUid"; + string expectedInputCol = "input_col"; + string expectedOutputCol = "output_col"; DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" + " from range(100)"); @@ -42,10 +42,11 @@ public void TestTokenizer() using (var tempDirectory = new TemporaryDirectory()) { - var savePath = Path.Join(tempDirectory.Path, "Tokenizer"); + string savePath = Path.Join(tempDirectory.Path, "Tokenizer"); tokenizer.Save(savePath); - var loadedIdf = Tokenizer.Load(savePath); - Assert.Equal(tokenizer.Uid(), loadedIdf.Uid()); + + Tokenizer loadedTokenizer = Tokenizer.Load(savePath); + Assert.Equal(tokenizer.Uid(), loadedTokenizer.Uid()); } Assert.Equal(expectedUid, tokenizer.Uid()); diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs index a43981b27..d870baf66 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs @@ -8,7 +8,6 @@ using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Sql; -using Microsoft.Spark.Sql.Types; namespace Microsoft.Spark.ML.Feature { @@ -23,17 +22,15 @@ namespace Microsoft.Spark.ML.Feature /// public class Bucketizer : IJvmObjectReferenceProvider { - internal Bucketizer(JvmObjectReference jvmObject) - { - _jvmObject = jvmObject; - } + private static readonly string s_bucketizerClassName = + 
"org.apache.spark.ml.feature.Bucketizer"; /// /// Create a without any parameters /// public Bucketizer() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_bucketizerClassName); } /// @@ -43,12 +40,16 @@ public Bucketizer() /// An immutable unique ID for the object and its derivatives. public Bucketizer(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_bucketizerClassName, uid); + } + + internal Bucketizer(JvmObjectReference jvmObject) + { + _jvmObject = jvmObject; } private readonly JvmObjectReference _jvmObject; - private const string JavaClassName = "org.apache.spark.ml.feature.Bucketizer"; - + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// @@ -195,7 +196,7 @@ public Bucketizer SetOutputCols(List value) public static Bucketizer Load(string path) { return WrapAsBucketizer( - SparkEnvironment.JvmBridge.CallStaticJavaMethod(JavaClassName,"load", path)); + SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_bucketizerClassName,"load", path)); } /// diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs index 6471ca563..9aef51934 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs @@ -21,12 +21,15 @@ namespace Microsoft.Spark.ML.Feature /// public class HashingTF : IJvmObjectReferenceProvider { + private static readonly string s_hashingTfClassName = + "org.apache.spark.ml.feature.HashingTF"; + /// /// Create a without any parameters /// public HashingTF() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(_javaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_hashingTfClassName); } /// @@ -36,7 +39,7 @@ public HashingTF() /// public HashingTF(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(_javaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_hashingTfClassName, uid); } internal HashingTF(JvmObjectReference jvmObject) @@ -45,7 +48,7 @@ internal HashingTF(JvmObjectReference jvmObject) } private readonly JvmObjectReference _jvmObject; - private const string _javaClassName = "org.apache.spark.ml.feature.HashingTF"; + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; /// @@ -56,7 +59,7 @@ internal HashingTF(JvmObjectReference jvmObject) public static HashingTF Load(string path) { return WrapAsHashingTF( - SparkEnvironment.JvmBridge.CallStaticJavaMethod(_javaClassName,"load", path)); + SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_hashingTfClassName,"load", path)); } /// @@ -130,7 +133,10 @@ public HashingTF SetOutputCol(string value) } /// - /// Gets the number of features that should be used + /// Gets the number of features that should be used. Since a simple modulo is used to + /// transform the hash function to a column index, it is advisable to use a power of two + /// as the numFeatures parameter; otherwise the features will not be mapped evenly to the + /// columns. /// /// int public int GetNumFeatures() @@ -139,7 +145,10 @@ public int GetNumFeatures() } /// - /// Sets the number of features that should be used + /// Sets the number of features that should be used. 
Since a simple modulo is used to + /// transform the hash function to a column index, it is advisable to use a power of two as + /// the numFeatures parameter; otherwise the features will not be mapped evenly to the + /// columns. /// /// int /// diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs index b774b4257..67fec0890 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs @@ -19,12 +19,14 @@ namespace Microsoft.Spark.ML.Feature /// public class IDF : IJvmObjectReferenceProvider { + private static readonly string s_IDFClassName = "org.apache.spark.ml.feature.IDF"; + /// /// Create a without any parameters /// public IDF() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFClassName); } /// @@ -34,7 +36,7 @@ public IDF() /// An immutable unique ID for the object and its derivatives. public IDF(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFClassName, uid); } internal IDF(JvmObjectReference jvmObject) @@ -42,8 +44,6 @@ internal IDF(JvmObjectReference jvmObject) _jvmObject = jvmObject; } - private const string JavaClassName = "org.apache.spark.ml.feature.IDF"; - private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; @@ -136,7 +136,7 @@ public string Uid() public static IDF Load(string path) { return WrapAsIDF( - SparkEnvironment.JvmBridge.CallStaticJavaMethod(JavaClassName, "load", path)); + SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_IDFClassName, "load", path)); } /// diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs index aeaedd182..c222e1ada 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs @@ -14,13 +14,15 @@ namespace Microsoft.Spark.ML.Feature /// public class IDFModel : IJvmObjectReferenceProvider { - + private static readonly string s_IDFModelClassName = + "org.apache.spark.ml.feature.IDFModel"; + /// /// Create a without any parameters /// public IDFModel() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFModelClassName); } /// @@ -30,7 +32,7 @@ public IDFModel() /// An immutable unique ID for the object and its derivatives. 
public IDFModel(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_IDFModelClassName, uid); } internal IDFModel(JvmObjectReference jvmObject) @@ -41,8 +43,6 @@ internal IDFModel(JvmObjectReference jvmObject) private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - private const string JavaClassName = "org.apache.spark.ml.feature.IDFModel"; - /// /// Gets the column that the should read from /// @@ -116,6 +116,17 @@ public string Uid() return (string)_jvmObject.Invoke("uid"); } + /// + /// Loads the that was previously saved using Save + /// + /// + /// + public static IDFModel Load(string path) + { + return WrapAsIDFModel( + SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_IDFModelClassName, "load", path)); + } + /// /// Saves the so that it can be loaded later using Load /// diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs index ab2e0ec76..904978a45 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs @@ -14,12 +14,15 @@ namespace Microsoft.Spark.ML.Feature /// public class Tokenizer : IJvmObjectReferenceProvider { + private static readonly string s_tokenizerClassName = + "org.apache.spark.ml.feature.Tokenizer"; + /// /// Create a without any parameters /// public Tokenizer() { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_tokenizerClassName); } /// @@ -29,7 +32,7 @@ public Tokenizer() /// An immutable unique ID for the object and its derivatives. public Tokenizer(string uid) { - _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(JavaClassName, uid); + _jvmObject = SparkEnvironment.JvmBridge.CallConstructor(s_tokenizerClassName, uid); } internal Tokenizer(JvmObjectReference jvmObject) @@ -39,8 +42,6 @@ internal Tokenizer(JvmObjectReference jvmObject) private readonly JvmObjectReference _jvmObject; JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - - private const string JavaClassName = "org.apache.spark.ml.feature.Tokenizer"; /// /// Gets the column that the should read from @@ -113,7 +114,7 @@ public string Uid() public static Tokenizer Load(string path) { return WrapAsTokenizer( - SparkEnvironment.JvmBridge.CallStaticJavaMethod(JavaClassName, "load", path)); + SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_tokenizerClassName, "load", path)); } /// diff --git a/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs b/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs index a82babbf5..20698cace 100644 --- a/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs +++ b/src/csharp/Microsoft.Spark/Sql/Types/DataType.cs @@ -164,7 +164,7 @@ internal static DataType ParseDataType(JToken json) { if (typeJObject.TryGetValue("sqlType", out JToken sqlTypeToken)) { - return new StructType(sqlTypeToken as JObject); + return new StructType((JObject)sqlTypeToken); } } From 64066a5074e3d2b30936235dc31f6cfd116da7a3 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 2 Mar 2020 22:54:20 +0000 Subject: [PATCH 40/47] fixes from feedback --- .../IpcTests/ML/Feature/HashingTFTests.cs | 8 ++++++++ src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 3 ++- src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs | 3 ++- src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs | 3 ++- 
From 64066a5074e3d2b30936235dc31f6cfd116da7a3 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Mon, 2 Mar 2020 22:54:20 +0000
Subject: [PATCH 40/47] fixes from feedback

---
 .../IpcTests/ML/Feature/HashingTFTests.cs           |  8 ++++++++
 src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs |  3 ++-
 src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs   |  3 ++-
 src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs  |  3 ++-
 .../Microsoft.Spark/Sql/Types/ComplexTypes.cs       | 13 ++++++++++++-
 5 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
index 106bca3f2..63bc54bdf 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
@@ -2,7 +2,10 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 // See the LICENSE file in the project root for more information.

+using System;
+using System.Collections.Generic;
 using System.IO;
+using System.Linq;
 using Microsoft.Spark.E2ETest.Utils;
 using Microsoft.Spark.ML.Feature;
 using Microsoft.Spark.Sql;
@@ -57,6 +60,11 @@ public void TestHashingTF()

             hashingTf.SetBinary(true);
             Assert.True(hashingTf.GetBinary());
+
+            IEnumerable<Row> vectors = outputVector.Collect();
+            Row row = vectors.First();
+            Assert.Equal(1.0, ((row.Values[0] as Row).Values[3] as object[])[1]);
+
         }
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
index d870baf66..32abf176b 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
@@ -196,7 +196,8 @@ public Bucketizer SetOutputCols(List<string> value)
         public static Bucketizer Load(string path)
         {
             return WrapAsBucketizer(
-                SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_bucketizerClassName,"load", path));
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
+                    s_bucketizerClassName,"load", path));
         }

         /// <summary>
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
index c222e1ada..0b2a1e802 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
@@ -124,7 +124,8 @@ public string Uid()
         public static IDFModel Load(string path)
         {
             return WrapAsIDFModel(
-                SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_IDFModelClassName, "load", path));
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
+                    s_IDFModelClassName, "load", path));
         }

         /// <summary>
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
index 904978a45..b69712227 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
@@ -114,7 +114,7 @@ public string Uid()
         public static Tokenizer Load(string path)
         {
             return WrapAsTokenizer(
-                SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_tokenizerClassName, "load", path));
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
+                    s_tokenizerClassName, "load", path));
         }

         /// <summary>
diff --git a/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs b/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
index 2b65ea6d1..909266133 100644
--- a/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
+++ b/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
@@ -3,7 +3,9 @@
 // See the LICENSE file in the project root for more information.

 using System;
+using System.Collections;
 using System.Collections.Generic;
+using System.Diagnostics;
 using System.Linq;
 using Microsoft.Spark.Interop.Ipc;
 using Newtonsoft.Json.Linq;
@@ -71,7 +73,16 @@ private DataType FromJson(JObject json)

         internal override bool NeedConversion() => true;

-        internal override object FromInternal(object obj) => throw new NotImplementedException();
+        internal override object FromInternal(object obj)
+        {
+            switch (obj)
+            {
+                case ArrayList objArrayList:
+                    return objArrayList.ToArray();
+            }
+
+            throw new NotImplementedException();
+        }
     }

From 37cf616918733239683c975cb2d5efb85c137f9c Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Tue, 3 Mar 2020 20:50:29 +0000
Subject: [PATCH 41/47] reverting fix for ArrayType

---
 .../IpcTests/ML/Feature/HashingTFTests.cs            |  5 -----
 src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs | 11 +----------
 2 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
index 63bc54bdf..a6d9952da 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
@@ -60,11 +60,6 @@ public void TestHashingTF()

             hashingTf.SetBinary(true);
             Assert.True(hashingTf.GetBinary());
-
-            IEnumerable<Row> vectors = outputVector.Collect();
-            Row row = vectors.First();
-            Assert.Equal(1.0, ((row.Values[0] as Row).Values[3] as object[])[1]);
-
         }
     }
 }
diff --git a/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs b/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
index 909266133..c99b141b9 100644
--- a/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
+++ b/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
@@ -73,16 +73,7 @@ private DataType FromJson(JObject json)

         internal override bool NeedConversion() => true;

-        internal override object FromInternal(object obj)
-        {
-            switch (obj)
-            {
-                case ArrayList objArrayList:
-                    return objArrayList.ToArray();
-            }
-
-            throw new NotImplementedException();
-        }
+        internal override object FromInternal(object obj) => throw new NotImplementedException();
     }
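Patch 41 backs the ArrayType.FromInternal change out again, leaving the NotImplementedException in place until array conversion is handled properly. For reference, the shape of the conversion that patch 40 attempted, restated as a standalone hedged sketch rather than anything that ships in this series:

    using System;
    using System.Collections;

    internal static class ArrayConversionSketch
    {
        // Collected array columns arrive from the JVM side as ArrayList;
        // the attempted fix surfaced them to callers as object[].
        internal static object FromInternal(object obj)
        {
            switch (obj)
            {
                case ArrayList objArrayList:
                    return objArrayList.ToArray(); // object[] copy of the elements
            }

            throw new NotImplementedException();
        }
    }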
From cd07e5682ed962f4d424fde3a296de04a57fda1d Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Tue, 3 Mar 2020 21:08:02 +0000
Subject: [PATCH 42/47] params comments

---
 src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 4 ++--
 src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs  | 4 ++--
 src/csharp/Microsoft.Spark/ML/Feature/IDF.cs        | 9 ++++-----
 src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs   | 4 ++--
 src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs  | 7 +++----
 5 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
index 32abf176b..9f68546be 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
@@ -191,7 +191,7 @@ public Bucketizer SetOutputCols(List<string> value)
         /// <summary>
         /// Loads the <see cref="Bucketizer"/> that was previously saved using Save
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path the previous <see cref="Bucketizer"/> was saved to</param>
         /// <returns></returns>
         public static Bucketizer Load(string path)
         {
@@ -203,7 +203,7 @@ public static Bucketizer Load(string path)
         /// <summary>
         /// Saves the <see cref="Bucketizer"/> so that it can be loaded later using Load
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path to save the <see cref="Bucketizer"/> to</param>
         /// <returns></returns>
         public Bucketizer Save(string path)
         {
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
index 9aef51934..77eb1b2e7 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
@@ -54,7 +54,7 @@ internal HashingTF(JvmObjectReference jvmObject)
         /// <summary>
         /// Loads the <see cref="HashingTF"/> that was previously saved using Save
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path the previous <see cref="HashingTF"/> was saved to</param>
         /// <returns></returns>
         public static HashingTF Load(string path)
         {
@@ -65,7 +65,7 @@ public static HashingTF Load(string path)
         /// <summary>
         /// Saves the <see cref="HashingTF"/> so that it can be loaded later using Load
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path to save the <see cref="HashingTF"/> to</param>
         /// <returns></returns>
         public HashingTF Save(string path)
         {
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
index 67fec0890..fe92b1e23 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
@@ -80,8 +80,7 @@ public string GetOutputCol()
         /// The <see cref="IDF"/> will create a new column in the DataFrame, this is the
         /// name of the new column.
         /// </summary>
-        /// <param name="value">The name of the new column
-        /// </param>
+        /// <param name="value">The name of the new column</param>
         /// <returns></returns>
         public IDF SetOutputCol(string value)
         {
@@ -100,7 +99,7 @@ public int GetMinDocFreq()
         /// <summary>
         /// Minimum of documents in which a term should appear for filtering
         /// </summary>
-        /// <param name="value"></param>
+        /// <param name="value">int, the minimum of documents a term should appear in</param>
         /// <returns></returns>
         public IDF SetMinDocFreq(int value)
         {
@@ -131,7 +130,7 @@ public string Uid()
         /// <summary>
         /// Loads the <see cref="IDF"/> that was previously saved using Save
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path the previous <see cref="IDF"/> was saved to</param>
         /// <returns></returns>
         public static IDF Load(string path)
         {
@@ -142,7 +141,7 @@ public static IDF Load(string path)
         /// <summary>
         /// Saves the <see cref="IDF"/> so that it can be loaded later using Load
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path to save the <see cref="IDF"/> to</param>
         /// <returns></returns>
         public IDF Save(string path)
         {
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
index 0b2a1e802..20cc6a886 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
@@ -119,7 +119,7 @@ public string Uid()
         /// <summary>
         /// Loads the <see cref="IDFModel"/> that was previously saved using Save
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path the previous <see cref="IDFModel"/> was saved to</param>
         /// <returns></returns>
         public static IDFModel Load(string path)
         {
@@ -131,7 +131,7 @@ public static IDFModel Load(string path)
         /// <summary>
         /// Saves the <see cref="IDFModel"/> so that it can be loaded later using Load
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path to save the <see cref="IDFModel"/> to</param>
         /// <returns></returns>
         public IDFModel Save(string path)
         {
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
index b69712227..cfd605f33 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
@@ -76,8 +76,7 @@ public string GetOutputCol()
         /// The <see cref="Tokenizer"/> will create a new column in the DataFrame, this is the
         /// name of the new column.
         /// </summary>
-        /// <param name="value">The name of the new column
-        /// </param>
+        /// <param name="value">The name of the new column</param>
         /// <returns></returns>
         public Tokenizer SetOutputCol(string value)
         {
@@ -109,7 +108,7 @@ public string Uid()
         /// <summary>
         /// Loads the <see cref="Tokenizer"/> that was previously saved using Save
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path the previous <see cref="Tokenizer"/> was saved to</param>
         /// <returns></returns>
         public static Tokenizer Load(string path)
         {
@@ -121,7 +120,7 @@ public static Tokenizer Load(string path)
         /// <summary>
         /// Saves the <see cref="Tokenizer"/> so that it can be loaded later using Load
         /// </summary>
-        /// <param name="path"></param>
+        /// <param name="path">The path to save the <see cref="Tokenizer"/> to</param>
         /// <returns></returns>
         public Tokenizer Save(string path)
         {
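With the param text filled in, the documented surface reads naturally as a call chain. A hedged usage sketch of the single-column Bucketizer API (the column names and the input DataFrame are invented for illustration; the split values mirror those used in BucketizerTests):

    // Bucket a numeric column into four ranges; each row's bucket index
    // is written to the "bucket" output column.
    Bucketizer bucketizer = new Bucketizer()
        .SetSplits(new[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue })
        .SetInputCol("raw_value")       // hypothetical input column
        .SetOutputCol("bucket");
    DataFrame bucketed = bucketizer.Transform(input);   // input: assumed DataFrame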
From 573fc1a38aea7e20e1686b8b9ade0af0d5c5066a Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Mon, 23 Mar 2020 21:41:59 +0000
Subject: [PATCH 43/47] formatting and comments from feedback

---
 .../IpcTests/ML/Feature/BucketizerTests.cs    |  2 +-
 .../IpcTests/ML/Feature/HashingTFTests.cs     |  2 +-
 .../IpcTests/ML/Feature/IDFModelTests.cs      |  6 +--
 .../Microsoft.Spark/ML/Feature/Bucketizer.cs  | 37 +++++++-----------
 .../Microsoft.Spark/ML/Feature/HashingTF.cs   | 38 ++++++++-----------
 src/csharp/Microsoft.Spark/ML/Feature/IDF.cs  | 31 ++++++---------
 .../Microsoft.Spark/ML/Feature/IDFModel.cs    | 28 +++++---------
 .../Microsoft.Spark/ML/Feature/Tokenizer.cs   | 29 ++++++--------
 .../Microsoft.Spark/Microsoft.Spark.csproj    |  5 ++-
 .../Microsoft.Spark/Sql/Types/ComplexTypes.cs |  2 -
 10 files changed, 70 insertions(+), 110 deletions(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs
index 10b48e634..11037bc6d 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/BucketizerTests.cs
@@ -63,7 +63,7 @@ public void TestBucketizer()
         [Fact]
         public void TestBucketizer_MultipleColumns()
         {
-            var expectedSplitsArray = new[]
+            var expectedSplitsArray = new double[][]
             {
                 new[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue},
                 new[] { double.MinValue, 0.0, 10000.0, double.MaxValue}
             };
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
index a6d9952da..7b6882bea 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/HashingTFTests.cs
@@ -32,7 +32,7 @@ public void TestHashingTF()

             Assert.IsType<HashingTF>(new HashingTF());

-            var hashingTf = new HashingTF("my-unique-id")
+            HashingTF hashingTf = new HashingTF("my-unique-id")
                 .SetNumFeatures(expectedFeatures)
                 .SetInputCol(expectedInputCol)
                 .SetOutputCol(expectedOutputCol);
diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
index c695f8515..314030ca7 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
@@ -30,20 +30,20 @@ public void TestIDFModel()
             DataFrame sentenceData =
                 _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

-            var tokenizer = new Tokenizer()
+            Tokenizer tokenizer = new Tokenizer()
                 .SetInputCol("sentence")
                 .SetOutputCol("words");

             DataFrame wordsData = tokenizer.Transform(sentenceData);

-            var hashingTF = new HashingTF()
+            HashingTF hashingTF = new HashingTF()
                 .SetInputCol("words")
                 .SetOutputCol(expectedInputCol)
                 .SetNumFeatures(20);

             DataFrame featurizedData = hashingTF.Transform(wordsData);

-            var idf = new IDF()
+            IDF idf = new IDF()
                 .SetInputCol(expectedInputCol)
                 .SetOutputCol(expectedOutputCol)
                 .SetMinDocFreq(expectedDocFrequency);
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
index 9f68546be..02561b0a1 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
@@ -24,6 +24,8 @@ public class Bucketizer : IJvmObjectReferenceProvider
     {
         private static readonly string s_bucketizerClassName =
             "org.apache.spark.ml.feature.Bucketizer";
+
+        private readonly JvmObjectReference _jvmObject;

         /// <summary>
         /// Create a <see cref="Bucketizer"/> without any parameters
@@ -47,9 +49,7 @@ internal Bucketizer(JvmObjectReference jvmObject)
         {
             _jvmObject = jvmObject;
         }
-
-        private readonly JvmObjectReference _jvmObject;
-
+
         JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

         /// <summary>
@@ -71,7 +71,7 @@ public double[] GetSplits()
         /// bucket, which also includes y. The splits should be of length >= 3 and strictly
         /// increasing. Values outside the splits specified will be treated as errors.
         /// </param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetSplits(double[] value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setSplits", value));
@@ -96,7 +96,7 @@ public double[][] GetSplitsArray()
         /// by splits x,y holds values in the range [x,y) except the last bucket, which also
         /// includes y. The splits should be of length >= 3 and strictly increasing.
         /// Values outside the splits specified will be treated as errors.</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetSplitsArray(double[][] value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setSplitsArray", (object)value));
@@ -117,7 +117,7 @@ public string GetInputCol()
         /// buckets
         /// </summary>
         /// <param name="value">The name of the column to as the source of the buckets</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetInputCol(string value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setInputCol", value));
@@ -141,7 +141,7 @@ public IEnumerable<string> GetInputCols()
         /// sets of buckets and two output columns.
         /// </summary>
         /// <param name="value">List of input columns to use as sources for buckets</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetInputCols(IEnumerable<string> value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setInputCols", value));
@@ -162,7 +162,7 @@ public string GetOutputCol()
         /// name of the new column.
         /// </summary>
         /// <param name="value">The name of the new column which contains the bucket ID</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetOutputCol(string value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setOutputCol", value));
@@ -182,7 +182,7 @@ public IEnumerable<string> GetOutputCols()
         /// The list of columns that the <see cref="Bucketizer"/> will create in the DataFrame.
         /// </summary>
         /// <param name="value">List of column names which will contain the bucket ID</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetOutputCols(List<string> value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setOutputCols", value));
@@ -192,7 +192,7 @@ public Bucketizer SetOutputCols(List<string> value)
         /// Loads the <see cref="Bucketizer"/> that was previously saved using Save
         /// </summary>
         /// <param name="path">The path the previous <see cref="Bucketizer"/> was saved to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public static Bucketizer Load(string path)
         {
             return WrapAsBucketizer(
@@ -204,7 +204,7 @@ public static Bucketizer Load(string path)
         /// Saves the <see cref="Bucketizer"/> so that it can be loaded later using Load
         /// </summary>
         /// <param name="path">The path to save the <see cref="Bucketizer"/> to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer Save(string path)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("save", path));
@@ -249,22 +249,13 @@ public string GetHandleInvalid()
         /// Choices are "skip", "error" or "keep". Default is "error"
         /// </summary>
         /// <param name="value">"skip", "error" or "keep"</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Bucketizer"/> object</returns>
         public Bucketizer SetHandleInvalid(string value)
         {
             return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString()));
         }

-        /// <summary>
-        /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet
-        /// <see cref="Bucketizer"/>
-        /// </summary>
-        /// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
-        /// <see cref="Bucketizer"/></param>
-        /// <returns></returns>
-        private static Bucketizer WrapAsBucketizer(object obj)
-        {
-            return new Bucketizer((JvmObjectReference)obj);
-        }
+        private static Bucketizer WrapAsBucketizer(object obj)
+            => new Bucketizer((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
index 77eb1b2e7..5fa774e00 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
@@ -24,6 +24,8 @@ public class HashingTF : IJvmObjectReferenceProvider
         private static readonly string s_hashingTfClassName =
             "org.apache.spark.ml.feature.HashingTF";

+        private readonly JvmObjectReference _jvmObject;
+
         /// <summary>
         /// Create a <see cref="HashingTF"/> without any parameters
         /// </summary>
@@ -47,26 +49,25 @@ internal HashingTF(JvmObjectReference jvmObject)
             _jvmObject = jvmObject;
         }

-        private readonly JvmObjectReference _jvmObject;
-
         JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

         /// <summary>
         /// Loads the <see cref="HashingTF"/> that was previously saved using Save
         /// </summary>
         /// <param name="path">The path the previous <see cref="HashingTF"/> was saved to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
         public static HashingTF Load(string path)
         {
             return WrapAsHashingTF(
-                SparkEnvironment.JvmBridge.CallStaticJavaMethod(s_hashingTfClassName,"load", path));
+                SparkEnvironment.JvmBridge.CallStaticJavaMethod(
+                    s_hashingTfClassName, "load", path));
         }

         /// <summary>
         /// Saves the <see cref="HashingTF"/> so that it can be loaded later using Load
         /// </summary>
         /// <param name="path">The path to save the <see cref="HashingTF"/> to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
         public HashingTF Save(string path)
         {
             return WrapAsHashingTF(_jvmObject.Invoke("save", path));
@@ -75,7 +76,7 @@ public HashingTF Save(string path)
         /// <summary>
         /// Gets the binary toggle that controls term frequency counts
         /// </summary>
-        /// <returns>bool</returns>
+        /// <returns>bool showing term frequency counts</returns>
         public bool GetBinary()
         {
             return (bool)_jvmObject.Invoke("getBinary");
@@ -105,7 +106,7 @@ public string GetInputCol()
         /// Sets the column that the <see cref="HashingTF"/> should read from
         /// </summary>
         /// <param name="value">The name of the column to as the source</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
         public HashingTF SetInputCol(string value)
         {
             return WrapAsHashingTF(_jvmObject.Invoke("setInputCol", value));
@@ -126,7 +127,7 @@ public string GetOutputCol()
         /// this is the name of the new column.
         /// </summary>
         /// <param name="value">The name of the new column</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
         public HashingTF SetOutputCol(string value)
         {
             return WrapAsHashingTF(_jvmObject.Invoke("setOutputCol", value));
@@ -138,7 +139,7 @@ public HashingTF SetOutputCol(string value)
         /// as the numFeatures parameter; otherwise the features will not be mapped evenly to the
         /// columns.
         /// </summary>
-        /// <returns>int</returns>
+        /// <returns>int, the number of features to be used</returns>
         public int GetNumFeatures()
         {
             return (int)_jvmObject.Invoke("getNumFeatures");
@@ -151,7 +152,7 @@ public int GetNumFeatures()
         /// columns.
         /// </summary>
         /// <param name="value">int</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="HashingTF"/> object</returns>
         public HashingTF SetNumFeatures(int value)
         {
             return WrapAsHashingTF(_jvmObject.Invoke("setNumFeatures", value));
@@ -160,7 +161,7 @@ public HashingTF SetNumFeatures(int value)
         /// <summary>
         /// An immutable unique ID for the object and its derivatives.
         /// </summary>
-        /// <returns>string</returns>
+        /// <returns>string, unique ID for the object</returns>
         public string Uid()
         {
             return (string)_jvmObject.Invoke("uid");
@@ -176,17 +177,8 @@ public DataFrame Transform(DataFrame source)
         {
             return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
         }
-
-        /// <summary>
-        /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet
-        /// <see cref="HashingTF"/>
-        /// </summary>
-        /// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
-        /// <see cref="HashingTF"/></param>
-        /// <returns></returns>
-        private static HashingTF WrapAsHashingTF(object obj)
-        {
-            return new HashingTF((JvmObjectReference)obj);
-        }
+
+        private static HashingTF WrapAsHashingTF(object obj)
+            => new HashingTF((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
index fe92b1e23..5c2259aaf 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDF.cs
@@ -21,6 +21,8 @@ public class IDF : IJvmObjectReferenceProvider
     {
         private static readonly string s_IDFClassName = "org.apache.spark.ml.feature.IDF";

+        private readonly JvmObjectReference _jvmObject;
+
         /// <summary>
         /// Create a <see cref="IDF"/> without any parameters
         /// </summary>
@@ -43,8 +45,7 @@ internal IDF(JvmObjectReference jvmObject)
         {
             _jvmObject = jvmObject;
         }
-
-        private readonly JvmObjectReference _jvmObject;
+
         JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

         /// <summary>
@@ -60,7 +61,7 @@ public string GetInputCol()
         /// Sets the column that the <see cref="IDF"/> should read from
         /// </summary>
         /// <param name="value">The name of the column to as the source</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDF"/> object</returns>
         public IDF SetInputCol(string value)
         {
             return WrapAsIDF(_jvmObject.Invoke("setInputCol", value));
@@ -81,7 +82,7 @@ public string GetOutputCol()
         /// name of the new column.
         /// </summary>
         /// <param name="value">The name of the new column</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDF"/> object</returns>
         public IDF SetOutputCol(string value)
         {
             return WrapAsIDF(_jvmObject.Invoke("setOutputCol", value));
@@ -90,7 +91,7 @@ public IDF SetOutputCol(string value)
         /// <summary>
         /// Minimum of documents in which a term should appear for filtering
         /// </summary>
-        /// <returns>int</returns>
+        /// <returns>int, minimum number of documents in which a term should appear</returns>
         public int GetMinDocFreq()
         {
             return (int)_jvmObject.Invoke("getMinDocFreq");
@@ -100,7 +101,7 @@ public int GetMinDocFreq()
         /// Minimum of documents in which a term should appear for filtering
         /// </summary>
         /// <param name="value">int, the minimum of documents a term should appear in</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDF"/> object</returns>
         public IDF SetMinDocFreq(int value)
         {
             return WrapAsIDF(_jvmObject.Invoke("setMinDocFreq", value));
@@ -110,7 +111,7 @@ public IDF SetMinDocFreq(int value)
         /// Fits a model to the input data.
         /// </summary>
         /// <param name="source">The <see cref="DataFrame"/> to fit the model to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDFModel"/> object</returns>
         public IDFModel Fit(DataFrame source)
         {
             return new IDFModel((JvmObjectReference)_jvmObject.Invoke("fit", source));
@@ -131,7 +132,7 @@ public string Uid()
         /// Loads the <see cref="IDF"/> that was previously saved using Save
         /// </summary>
         /// <param name="path">The path the previous <see cref="IDF"/> was saved to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDF"/> object, loaded from path</returns>
         public static IDF Load(string path)
         {
             return WrapAsIDF(
@@ -142,22 +143,12 @@ public static IDF Load(string path)
         /// Saves the <see cref="IDF"/> so that it can be loaded later using Load
         /// </summary>
         /// <param name="path">The path to save the <see cref="IDF"/> to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDF"/> object</returns>
         public IDF Save(string path)
         {
             return WrapAsIDF(_jvmObject.Invoke("save", path));
         }

-        /// <summary>
-        /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet
-        /// <see cref="IDF"/>
-        /// </summary>
-        /// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
-        /// <see cref="IDF"/></param>
-        /// <returns></returns>
-        private static IDF WrapAsIDF(object obj)
-        {
-            return new IDF((JvmObjectReference)obj);
-        }
+        private static IDF WrapAsIDF(object obj) => new IDF((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
index 20cc6a886..16bccb50a 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
@@ -16,6 +16,8 @@ public class IDFModel : IJvmObjectReferenceProvider
     {
         private static readonly string s_IDFModelClassName =
             "org.apache.spark.ml.feature.IDFModel";
+
+        private readonly JvmObjectReference _jvmObject;

         /// <summary>
         /// Create a <see cref="IDFModel"/> without any parameters
@@ -39,8 +41,7 @@ internal IDFModel(JvmObjectReference jvmObject)
         {
             _jvmObject = jvmObject;
         }
-
-        private readonly JvmObjectReference _jvmObject;
+
         JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

         /// <summary>
@@ -57,7 +58,7 @@ public string GetInputCol()
         /// buckets
         /// </summary>
         /// <param name="value">The name of the column to as the source</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDFModel"/> object</returns>
         public IDFModel SetInputCol(string value)
         {
             return WrapAsIDFModel(_jvmObject.Invoke("setInputCol", value));
@@ -79,7 +80,7 @@ public string GetOutputCol()
         /// </summary>
         /// <param name="value">The name of the new column which contains the tokens
         /// </param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDFModel"/> object</returns>
         public IDFModel SetOutputCol(string value)
         {
             return WrapAsIDFModel(_jvmObject.Invoke("setOutputCol", value));
@@ -88,7 +89,7 @@ public IDFModel SetOutputCol(string value)
         /// <summary>
         /// Minimum of documents in which a term should appear for filtering
         /// </summary>
-        /// <returns>int</returns>
+        /// <returns>int, minimum number of documents a term should appear</returns>
         public int GetMinDocFreq()
         {
             return (int)_jvmObject.Invoke("getMinDocFreq");
@@ -120,7 +121,7 @@ public string Uid()
         /// Loads the <see cref="IDFModel"/> that was previously saved using Save
         /// </summary>
         /// <param name="path">The path the previous <see cref="IDFModel"/> was saved to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDFModel"/> object, loaded from path</returns>
         public static IDFModel Load(string path)
         {
             return WrapAsIDFModel(
@@ -132,7 +133,7 @@ public static IDFModel Load(string path)
         /// Saves the <see cref="IDFModel"/> so that it can be loaded later using Load
         /// </summary>
         /// <param name="path">The path to save the <see cref="IDFModel"/> to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="IDFModel"/> object</returns>
         public IDFModel Save(string path)
         {
             return WrapAsIDFModel(_jvmObject.Invoke("save", path));
         }

-        /// <summary>
-        /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet
-        /// <see cref="IDFModel"/>
-        /// </summary>
-        /// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
-        /// <see cref="IDFModel"/></param>
-        /// <returns></returns>
-        private static IDFModel WrapAsIDFModel(object obj)
-        {
-            return new IDFModel((JvmObjectReference)obj);
-        }
+        private static IDFModel WrapAsIDFModel(object obj)
+            => new IDFModel((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
index cfd605f33..4a323cb93 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
@@ -17,6 +17,8 @@ public class Tokenizer : IJvmObjectReferenceProvider
         private static readonly string s_tokenizerClassName =
             "org.apache.spark.ml.feature.Tokenizer";

+        private readonly JvmObjectReference _jvmObject;
+
         /// <summary>
         /// Create a <see cref="Tokenizer"/> without any parameters
         /// </summary>
@@ -39,8 +41,7 @@ internal Tokenizer(JvmObjectReference jvmObject)
         {
             _jvmObject = jvmObject;
         }
-
-        private readonly JvmObjectReference _jvmObject;
+
         JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject;

         /// <summary>
@@ -56,7 +57,7 @@ public string GetInputCol()
         /// Sets the column that the <see cref="Tokenizer"/> should read from
         /// </summary>
         /// <param name="value">The name of the column to as the source</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Tokenizer"/> object</returns>
         public Tokenizer SetInputCol(string value)
         {
             return WrapAsTokenizer(_jvmObject.Invoke("setInputCol", value));
@@ -77,7 +78,7 @@ public string GetOutputCol()
         /// name of the new column.
         /// </summary>
         /// <param name="value">The name of the new column</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Tokenizer"/> object</returns>
         public Tokenizer SetOutputCol(string value)
         {
             return WrapAsTokenizer(_jvmObject.Invoke("setOutputCol", value));
@@ -88,7 +89,8 @@ public Tokenizer SetOutputCol(string value)
         /// column
         /// </summary>
         /// <param name="source">The DataFrame to transform</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="DataFrame"/> object with the source
+        /// transformed</returns>
         public DataFrame Transform(DataFrame source)
         {
             return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
@@ -109,7 +111,7 @@ public string Uid()
         /// Loads the <see cref="Tokenizer"/> that was previously saved using Save
         /// </summary>
         /// <param name="path">The path the previous <see cref="Tokenizer"/> was saved to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Tokenizer"/> object, loaded from path</returns>
         public static Tokenizer Load(string path)
         {
             return WrapAsTokenizer(
@@ -121,7 +123,7 @@ public static Tokenizer Load(string path)
         /// Saves the <see cref="Tokenizer"/> so that it can be loaded later using Load
         /// </summary>
         /// <param name="path">The path to save the <see cref="Tokenizer"/> to</param>
-        /// <returns></returns>
+        /// <returns>New <see cref="Tokenizer"/> object</returns>
         public Tokenizer Save(string path)
         {
             return WrapAsTokenizer(_jvmObject.Invoke("save", path));
         }

-        /// <summary>
-        /// The reference we get back from each call isn't usable unless we wrap it in a new dotnet
-        /// <see cref="Tokenizer"/>
-        /// </summary>
-        /// <param name="obj">The <see cref="JvmObjectReference"/> to convert into a dotnet
-        /// <see cref="Tokenizer"/></param>
-        /// <returns></returns>
-        private static Tokenizer WrapAsTokenizer(object obj)
-        {
-            return new Tokenizer((JvmObjectReference)obj);
-        }
+        private static Tokenizer WrapAsTokenizer(object obj)
+            => new Tokenizer((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
index 35488668d..6520c9505 100644
--- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
+++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj
@@ -33,7 +33,10 @@
-
+
diff --git a/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs b/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
index c99b141b9..2b65ea6d1 100644
--- a/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
+++ b/src/csharp/Microsoft.Spark/Sql/Types/ComplexTypes.cs
@@ -3,9 +3,7 @@
 // See the LICENSE file in the project root for more information.

 using System;
-using System.Collections;
 using System.Collections.Generic;
-using System.Diagnostics;
 using System.Linq;
 using Microsoft.Spark.Interop.Ipc;
 using Newtonsoft.Json.Linq;
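The refactored IDFModelTests above doubles as the canonical usage pattern: Tokenizer feeds HashingTF, and IDF.Fit turns the hashed counts into an IDFModel. Restated outside the test harness as a hedged sketch; only _spark is assumed, and the final IDFModel.Transform call is an assumption that the model follows the same Transform convention as the other wrappers:

    DataFrame sentenceData =
        _spark.Sql("SELECT 0.0 as label, 'Hi I heard about Spark' as sentence");

    // Split the sentence column into a words column.
    Tokenizer tokenizer = new Tokenizer()
        .SetInputCol("sentence")
        .SetOutputCol("words");
    DataFrame wordsData = tokenizer.Transform(sentenceData);

    // Hash the words into a fixed-width term-frequency vector.
    HashingTF hashingTF = new HashingTF()
        .SetInputCol("words")
        .SetOutputCol("rawFeatures")
        .SetNumFeatures(20);
    DataFrame featurizedData = hashingTF.Transform(wordsData);

    // Fit the IDF weighting and apply it to the hashed features.
    IDF idf = new IDF()
        .SetInputCol("rawFeatures")
        .SetOutputCol("features");
    IDFModel idfModel = idf.Fit(featurizedData);              // training pass
    DataFrame rescaled = idfModel.Transform(featurizedData);  // assumed API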
From 4cd86e38e14922ad39b5834a1ed047d89f345345 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Mon, 23 Mar 2020 21:50:56 +0000
Subject: [PATCH 44/47] typo ideModel and not idfModel

---
 .../IpcTests/ML/Feature/IDFModelTests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
index 314030ca7..623b7322c 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFModelTests.cs
@@ -59,7 +59,7 @@ public void TestIDFModel()

             using (var tempDirectory = new TemporaryDirectory())
             {
-                string modelPath = Path.Join(tempDirectory.Path, "ideModel");
+                string modelPath = Path.Join(tempDirectory.Path, "idfModel");
                 idfModel.Save(modelPath);

                 IDFModel loadedModel = IDFModel.Load(modelPath);

From 57729ee275730b52998b0064e100146548a95834 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Mon, 23 Mar 2020 21:51:45 +0000
Subject: [PATCH 45/47] can't use var here

---
 .../Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
index 944d2b24b..3dea63de7 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/IDFTests.cs
@@ -27,7 +27,7 @@ public void TestIDFModel()
             string expectedOutputCol = "features";
             int expectedDocFrequency = 100;

-            var idf = new IDF()
+            IDF idf = new IDF()
                 .SetInputCol(expectedInputCol)
                 .SetOutputCol(expectedOutputCol)
                 .SetMinDocFreq(expectedDocFrequency);

From da7660eb6725b96b8c820206b0ed486a79203a64 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Mon, 23 Mar 2020 21:53:13 +0000
Subject: [PATCH 46/47] can't use var here

---
 .../IpcTests/ML/Feature/TokenizerTests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs
index 0039d9e07..8cdb4e03a 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/TokenizerTests.cs
@@ -30,7 +30,7 @@ public void TestTokenizer()
             DataFrame input = _spark.Sql("SELECT 'hello I AM a string TO, TOKENIZE' as input_col" +
                 " from range(100)");

-            var tokenizer = new Tokenizer(expectedUid)
+            Tokenizer tokenizer = new Tokenizer(expectedUid)
                 .SetInputCol(expectedInputCol)
                 .SetOutputCol(expectedOutputCol);
From 22ff5e53f22560bdf9face72d219b6db36a360bb Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Wed, 25 Mar 2020 07:13:20 +0000
Subject: [PATCH 47/47] formatting from feedback

---
 src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs | 9 +++++----
 src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs  | 8 ++++----
 src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs   | 6 +++---
 src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs  | 9 +++++----
 4 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
index 02561b0a1..823f13c1a 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Bucketizer.cs
@@ -215,8 +215,9 @@ public Bucketizer Save(string path)
         /// column or columns with the bucketed data.
         /// </summary>
         /// <param name="source">The DataFrame to add the bucketed data to</param>
-        /// <returns><see cref="DataFrame"/> containing the original data and the new bucketed
-        /// columns</returns>
+        /// <returns>
+        /// <see cref="DataFrame"/> containing the original data and the new bucketed columns
+        /// </returns>
         public DataFrame Transform(DataFrame source)
         {
             return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
@@ -255,7 +256,7 @@ public Bucketizer SetHandleInvalid(string value)
             return WrapAsBucketizer(_jvmObject.Invoke("setHandleInvalid", value.ToString()));
         }

-        private static Bucketizer WrapAsBucketizer(object obj)
-            => new Bucketizer((JvmObjectReference)obj);
+        private static Bucketizer WrapAsBucketizer(object obj) =>
+            new Bucketizer((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
index 5fa774e00..50b4fe04a 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/HashingTF.cs
@@ -76,7 +76,7 @@ public HashingTF Save(string path)
         /// <summary>
         /// Gets the binary toggle that controls term frequency counts
         /// </summary>
-        /// <returns>bool showing term frequency counts</returns>
+        /// <returns>Flag showing whether the binary toggle is on or off</returns>
         public bool GetBinary()
         {
             return (bool)_jvmObject.Invoke("getBinary");
@@ -139,7 +139,7 @@ public HashingTF SetOutputCol(string value)
         /// as the numFeatures parameter; otherwise the features will not be mapped evenly to the
         /// columns.
         /// </summary>
-        /// <returns>int, the number of features to be used</returns>
+        /// <returns>The number of features to be used</returns>
         public int GetNumFeatures()
         {
             return (int)_jvmObject.Invoke("getNumFeatures");
@@ -178,7 +178,7 @@ public DataFrame Transform(DataFrame source)
             return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
         }

-        private static HashingTF WrapAsHashingTF(object obj)
-            => new HashingTF((JvmObjectReference)obj);
+        private static HashingTF WrapAsHashingTF(object obj) =>
+            new HashingTF((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
index 16bccb50a..4fc8a4f30 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/IDFModel.cs
@@ -89,7 +89,7 @@ public IDFModel SetOutputCol(string value)
         /// <summary>
         /// Minimum of documents in which a term should appear for filtering
         /// </summary>
-        /// <returns>int, minimum number of documents a term should appear</returns>
+        /// <returns>Minimum number of documents a term should appear</returns>
         public int GetMinDocFreq()
         {
             return (int)_jvmObject.Invoke("getMinDocFreq");
@@ -139,7 +139,7 @@ public IDFModel Save(string path)
             return WrapAsIDFModel(_jvmObject.Invoke("save", path));
         }

-        private static IDFModel WrapAsIDFModel(object obj)
-            => new IDFModel((JvmObjectReference)obj);
+        private static IDFModel WrapAsIDFModel(object obj) =>
+            new IDFModel((JvmObjectReference)obj);
     }
 }
diff --git a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
index 4a323cb93..c411309dc 100644
--- a/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
+++ b/src/csharp/Microsoft.Spark/ML/Feature/Tokenizer.cs
@@ -89,8 +89,9 @@ public Tokenizer SetOutputCol(string value)
         /// column
         /// </summary>
         /// <param name="source">The DataFrame to transform</param>
-        /// <returns>New <see cref="DataFrame"/> object with the source
-        /// transformed</returns>
+        /// <returns>
+        /// New <see cref="DataFrame"/> object with the source transformed
+        /// </returns>
         public DataFrame Transform(DataFrame source)
         {
             return new DataFrame((JvmObjectReference)_jvmObject.Invoke("transform", source));
@@ -129,7 +130,7 @@ public Tokenizer Save(string path)
             return WrapAsTokenizer(_jvmObject.Invoke("save", path));
         }

-        private static Tokenizer WrapAsTokenizer(object obj)
-            => new Tokenizer((JvmObjectReference)obj);
+        private static Tokenizer WrapAsTokenizer(object obj) =>
+            new Tokenizer((JvmObjectReference)obj);
     }
 }
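The multi-column path exercised by TestBucketizer_MultipleColumns rounds out the API: one Bucketizer can bucket several columns in a single Transform, each against its own splits. A hedged sketch to close; the DataFrame input and its two numeric columns are assumed, while the splits mirror the test's expectedSplitsArray:

    // Two columns bucketed in one pass, each with its own split points.
    Bucketizer bucketizer = new Bucketizer()
        .SetSplitsArray(new double[][]
        {
            new[] { double.MinValue, 0.0, 10.0, 50.0, double.MaxValue },
            new[] { double.MinValue, 0.0, 10000.0, double.MaxValue }
        })
        .SetInputCols(new List<string> { "values_a", "values_b" })   // assumed columns
        .SetOutputCols(new List<string> { "bucket_a", "bucket_b" });
    DataFrame bucketed = bucketizer.Transform(input);                // input: assumed DataFrame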