From cf9cf89189472ee7b85833ce24d138ec387ead28 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 14 Apr 2020 12:18:45 -0700 Subject: [PATCH 01/66] Broadcast encryption support plus test --- .../IpcTests/BroadcastTests.cs | 22 ++++++++++++++++++ .../Processor/BroadcastVariableProcessor.cs | 19 ++++++++++++--- src/csharp/Microsoft.Spark/Broadcast.cs | 23 ++++++++++++++----- 3 files changed, 55 insertions(+), 9 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 000c8f27e..4611245a0 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -33,6 +33,28 @@ public BroadcastTests(SparkFixture fixture) _df = _spark.CreateDataFrame(new[] { "hello", "world" }); } + /// + /// Test Broadcast support by using multiple broadcast variables in a UDF with + /// encryption enabled. + /// + [Fact] + public void TestMultipleBroadcastWithEncryption() + { + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); + var obj1 = new TestBroadcastVariable(1, "first"); + var obj2 = new TestBroadcastVariable(2, "second"); + Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + Broadcast bc2 = _spark.SparkContext.Broadcast(obj2); + + Func udf = Udf( + str => $"{str} {bc1.Value().StringValue} and {bc2.Value().StringValue}"); + + var expected = new string[] { "hello first and second", "world first and second" }; + + string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + Assert.Equal(expected, actual); + } + /// /// Test Broadcast support by using multiple broadcast variables in a UDF. /// diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs index 41c817d02..0016ad883 100644 --- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs +++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs @@ -4,8 +4,10 @@ using System; using System.IO; +using System.Net; using System.Runtime.Serialization.Formatters.Binary; using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Network; namespace Microsoft.Spark.Worker.Processor { @@ -25,6 +27,7 @@ internal BroadcastVariableProcessor(Version version) internal BroadcastVariables Process(Stream stream) { var broadcastVars = new BroadcastVariables(); + ISocketWrapper socket = null; if (_version >= new Version(Versions.V2_3_2)) { @@ -37,7 +40,11 @@ internal BroadcastVariables Process(Stream stream) { broadcastVars.DecryptionServerPort = SerDe.ReadInt32(stream); broadcastVars.Secret = SerDe.ReadString(stream); - // TODO: Handle the authentication. 
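+ // Connect back to the JVM-side decryption server on loopback, using the port and secret read above; the secret is used to authenticate this connection before the broadcast values are read.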
+ socket = SocketFactory.CreateSocket(); + socket.Connect( + IPAddress.Loopback, + broadcastVars.DecryptionServerPort, + broadcastVars.Secret); } var formatter = new BinaryFormatter(); @@ -48,8 +55,14 @@ internal BroadcastVariables Process(Stream stream) { if (broadcastVars.DecryptionServerNeeded) { - throw new NotImplementedException( - "broadcastDecryptionServer is not implemented."); + var readBid = SerDe.ReadInt64(socket.InputStream); + if (bid != readBid) + { + throw new Exception($"Encrypted broadcast id {readBid} does not " + + $"match regular stream broadcast id {bid}"); + } + object value = formatter.Deserialize(socket.InputStream); + BroadcastRegistry.Add(bid, value); } else { diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index 20ae5c869..4e078e31d 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -2,10 +2,12 @@ using System.Collections.Concurrent; using System.Collections.Generic; using System.IO; +using System.Net; using System.Runtime.Serialization; using System.Runtime.Serialization.Formatters.Binary; using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Network; using Microsoft.Spark.Services; @@ -169,20 +171,29 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( bool encryptionEnabled = bool.Parse( sc.GetConf().Get("spark.io.encryption.enabled", "false")); + var pythonBroadcast = (JvmObjectReference)javaSparkContext.Jvm.CallStaticJavaMethod( + "org.apache.spark.api.python.PythonRDD", + "setupBroadcast", + _path); + if (encryptionEnabled) { - throw new NotImplementedException("Broadcast encryption is not supported yet."); + var pair = (JvmObjectReference[])pythonBroadcast.Invoke("setupEncryptionServer"); + + using ISocketWrapper socket = SocketFactory.CreateSocket(); + socket.Connect( + IPAddress.Loopback, + (int)pair[0].Invoke("intValue"), + (string)pair[1].Invoke("toString")); + Dump(value, socket.OutputStream); + socket.OutputStream.Flush(); + pythonBroadcast.Invoke("waitTillDataReceived"); } else { WriteToFile(value); } - var pythonBroadcast = (JvmObjectReference)javaSparkContext.Jvm.CallStaticJavaMethod( - "org.apache.spark.api.python.PythonRDD", - "setupBroadcast", - _path); - return (JvmObjectReference)javaSparkContext.Invoke("broadcast", pythonBroadcast); } From 03b79393e71910a33a39864e563fcbeb2de56658 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sun, 19 Apr 2020 22:31:05 -0700 Subject: [PATCH 02/66] Adding section for UDF serialization --- docs/broadcast-guide.md | 92 +++++++++++++++++++++ docs/udf-guide.md | 172 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 264 insertions(+) create mode 100644 docs/broadcast-guide.md create mode 100644 docs/udf-guide.md diff --git a/docs/broadcast-guide.md b/docs/broadcast-guide.md new file mode 100644 index 000000000..4286c569e --- /dev/null +++ b/docs/broadcast-guide.md @@ -0,0 +1,92 @@ +# Guide to using Broadcast Variables + +This is a guide to show how to use broadcast variables in .NET for Apache Spark. + +## What are Broadcast Variables + +[Broadcast variables in Apache Spark](https://spark.apache.org/docs/2.2.0/rdd-programming-guide.html#broadcast-variables) are a mechanism for sharing variables across executors that are meant to be read-only. They allow the programmer to keep a read-only variable cached on each machine rather than shipping a copy of it with tasks. 
They can be used, for example, to give every node a copy of a large input dataset in an efficient manner.
+
+### How to use broadcast variables in .NET for Apache Spark
+
+Broadcast variables are created from a variable `v` by calling `SparkContext.Broadcast(v)`. The broadcast variable is a wrapper around `v`, and its value can be accessed by calling the `Value()` method on it.
+
+Example:
+
+```csharp
+string v = "Variable to be broadcasted";
+Broadcast<string> bv = SparkContext.Broadcast(v);
+
+// Using the broadcast variable in a UDF:
+Func<Column, Column> udf = Udf<string, string>(
+    str => $"{str}: {bv.Value()}");
+```
+
+The type of the broadcast variable is captured by using generics in C#, as can be seen in the above example.
+
+### Deleting broadcast variables
+
+A broadcast variable can be deleted from all executors by calling the `Destroy()` method on it.
+
+```csharp
+// Destroying the broadcast variable bv:
+bv.Destroy();
+```
+
+> Note: `Destroy` deletes all data and metadata related to the broadcast variable. Use it with caution: once a broadcast variable has been destroyed, it cannot be used again.
+
+#### Caveat of using Destroy
+
+One important thing to keep in mind while using broadcast variables in UDFs is to limit the scope of the variable to only the UDF that references it. The [guide to using UDFs](udf-guide.md) describes this behavior in detail. This is especially crucial when calling `Destroy` on the broadcast variable. If the destroyed broadcast variable is visible to or accessible from other UDFs, it gets picked up for serialization by all those UDFs, even if they never reference it. This throws an error, as .NET for Apache Spark is not able to serialize the destroyed broadcast variable.
+
+Example to demonstrate:
+
+```csharp
+string v = "Variable to be broadcasted";
+Broadcast<string> bv = SparkContext.Broadcast(v);
+
+// Using the broadcast variable in a UDF:
+Func<Column, Column> udf1 = Udf<string, string>(
+    str => $"{str}: {bv.Value()}");
+
+// Destroying bv
+bv.Destroy();
+
+// Calling udf1 after destroying bv throws the following expected exception:
+// org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed
+df.Select(udf1(df["_1"])).Show();
+
+// Different UDF udf2 that is not referencing bv
+Func<Column, Column> udf2 = Udf<string, string>(
+    str => $"{str}: not referencing broadcast variable");
+
+// Calling udf2 throws the following (unexpected) exception:
+// [Error] [JvmBridge] org.apache.spark.SparkException: Task not serializable
+df.Select(udf2(df["_1"])).Show();
+```
+
+The recommended way to implement the desired behavior:
+
+```csharp
+string v = "Variable to be broadcasted";
+// Restricting the visibility of bv to only the UDF referencing it
+{
+    Broadcast<string> bv = SparkContext.Broadcast(v);
+
+    // Using the broadcast variable in a UDF:
+    Func<Column, Column> udf1 = Udf<string, string>(
+        str => $"{str}: {bv.Value()}");
+
+    // Destroying bv
+    bv.Destroy();
+}
+
+// Different UDF udf2 that is not referencing bv
+Func<Column, Column> udf2 = Udf<string, string>(
+    str => $"{str}: not referencing broadcast variable");
+
+// Calling udf2 works fine as expected
+df.Select(udf2(df["_1"])).Show();
+```
+
+This ensures that destroying `bv` does not affect calls to `udf2` through unexpected serialization behavior.
+
+Broadcast variables are very useful for transmitting read-only data to all executors: the data is sent only once, which gives a large performance benefit compared with local variables that are shipped to the executors with each task.
Please refer to the [official documentation](https://spark.apache.org/docs/2.2.0/rdd-programming-guide.html#broadcast-variables) to get a deeper understanding of broadcast variables and why they are used.
\ No newline at end of file
diff --git a/docs/udf-guide.md b/docs/udf-guide.md
new file mode 100644
index 000000000..bb308815d
--- /dev/null
+++ b/docs/udf-guide.md
@@ -0,0 +1,172 @@
+# Guide to User-Defined Functions (UDFs)
+
+This is a guide to show how to use UDFs in .NET for Apache Spark.
+
+## What are UDFs
+
+[User-Defined Functions (UDFs)](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/expressions/UserDefinedFunction.html) are a feature of Spark that allow developers to use custom functions to extend the system's built-in functionality. They transform values from a single row within a table to produce a single corresponding output value per row based on the logic defined in the UDF.
+
+Let's take the following as an example of a UDF definition:
+
+```csharp
+string s1 = "hello";
+Func<Column, Column> udf = Udf<string, string>(
+    str => $"{s1} {str}");
+```
+The UDF defined above takes a `string` as input (in the form of a [Column](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Sql/Column.cs#L14) of a [DataFrame](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Sql/DataFrame.cs#L24)) and returns a `string` with `hello` prepended to the input.
+
+As a sample DataFrame, let's take the following DataFrame `df`:
+
+```text
++-------+
+|   name|
++-------+
+|Michael|
+|   Andy|
+| Justin|
++-------+
+```
+
+Now let's apply the above defined `udf` to the DataFrame `df`:
+
+```csharp
+DataFrame udfResult = df.Select(udf(df["name"]));
+```
+
+This would return the following as the DataFrame `udfResult`:
+
+```text
++-------------+
+|         name|
++-------------+
+|hello Michael|
+|   hello Andy|
+| hello Justin|
++-------------+
+```
+To get a better understanding of how to implement UDFs, please take a look at the [UDF helper functions](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Sql/Functions.cs#L3616) and some [test examples](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs#L49).
+
+## UDF serialization
+
+Since UDFs are functions that need to be executed on the workers, they have to be serialized and sent to the workers as part of the payload from the driver. This involves serializing the [delegate](https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/delegates/), which is a reference to the method, along with its [target](https://docs.microsoft.com/en-us/dotnet/api/system.delegate.target?view=netframework-4.8), which is the class instance on which the current delegate invokes the instance method. Please take a look at this [code](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Utils/CommandSerDe.cs#L149) to get a better understanding of how UDF serialization is done.
+
+## Good to know while implementing UDFs
+
+One behavior to be aware of while implementing UDFs in .NET for Apache Spark is how the target of the UDF gets serialized. .NET for Apache Spark uses .NET Core, which does not support serializing delegates, so serialization is instead done by using reflection on the target where the delegate is defined. When multiple delegates are defined in a common scope, they share a single closure, and that closure becomes the target that reflection serializes. Let's take an example to illustrate what that means.
+
+The following code snippet defines two string variables that are referenced by two function delegates, which simply return the respective strings as their result:
+
+```csharp
+using System;
+
+public class C {
+    public void M() {
+        string s1 = "s1";
+        string s2 = "s2";
+        Func<string, string> a = str => s1;
+        Func<string, string> b = str => s2;
+    }
+}
+```
+
+The compiler generates the following decompiled C# for the above code (credit: [sharplab.io](https://sharplab.io)):
+
+```csharp
+public class C
+{
+    [CompilerGenerated]
+    private sealed class <>c__DisplayClass0_0
+    {
+        public string s1;
+
+        public string s2;
+
+        internal string <M>b__0(string str)
+        {
+            return s1;
+        }
+
+        internal string <M>b__1(string str)
+        {
+            return s2;
+        }
+    }
+
+    public void M()
+    {
+        <>c__DisplayClass0_0 <>c__DisplayClass0_ = new <>c__DisplayClass0_0();
+        <>c__DisplayClass0_.s1 = "s1";
+        <>c__DisplayClass0_.s2 = "s2";
+        Func<string, string> func = new Func<string, string>(<>c__DisplayClass0_.<M>b__0);
+        Func<string, string> func2 = new Func<string, string>(<>c__DisplayClass0_.<M>b__1);
+    }
+}
+```
+As can be seen in the decompiled code above, both `func` and `func2` share the same closure `<>c__DisplayClass0_0`, which is the target that is serialized when serializing the delegates `func` and `func2`. Hence, even though `Func<string, string> a` references only `s1`, `s2` also gets serialized when sending the bytes over to the workers.
+
+This can lead to some unexpected behaviors at runtime (like in the case of using [broadcast variables](broadcast-guide.md)), which is why we recommend restricting the visibility of the variables used in a function to that function's scope.
+
+Taking the above example, the recommended way to write the user code to achieve the desired behavior is:
+
+```csharp
+using System;
+
+public class C {
+    public void M() {
+        {
+            string s1 = "s1";
+            Func<string, string> a = str => s1;
+        }
+        {
+            string s2 = "s2";
+            Func<string, string> b = str => s2;
+        }
+    }
+}
+```
+
+The compiler generates the following decompiled C# for the above code (credit: [sharplab.io](https://sharplab.io)):
+
+```csharp
+public class C
+{
+    [CompilerGenerated]
+    private sealed class <>c__DisplayClass0_0
+    {
+        public string s1;
+
+        internal string <M>b__0(string str)
+        {
+            return s1;
+        }
+    }
+
+    [CompilerGenerated]
+    private sealed class <>c__DisplayClass0_1
+    {
+        public string s2;
+
+        internal string <M>b__1(string str)
+        {
+            return s2;
+        }
+    }
+
+    public void M()
+    {
+        <>c__DisplayClass0_0 <>c__DisplayClass0_ = new <>c__DisplayClass0_0();
+        <>c__DisplayClass0_.s1 = "s1";
+        Func<string, string> func = new Func<string, string>(<>c__DisplayClass0_.<M>b__0);
+        <>c__DisplayClass0_1 <>c__DisplayClass0_2 = new <>c__DisplayClass0_1();
+        <>c__DisplayClass0_2.s2 = "s2";
+        Func<string, string> func2 = new Func<string, string>(<>c__DisplayClass0_2.<M>b__1);
+    }
+}
+```
+
+Here we see that `func` and `func2` no longer share a closure and have their own separate closures, `<>c__DisplayClass0_0` and `<>c__DisplayClass0_1` respectively. When one of them is used as the serialization target, nothing other than the variables it references gets serialized for the delegate.
+
+The above behavior is important to keep in mind when implementing multiple UDFs in a common scope.
+To learn more about UDFs in general, please review the following articles that explain UDFs and how to use them: [UDFs in Databricks (Scala)](https://docs.databricks.com/spark/latest/spark-sql/udf-scala.html), [Spark UDFs and some gotchas](https://medium.com/@achilleus/spark-udfs-we-can-use-them-but-should-we-use-them-2c5a561fde6d).
\ No newline at end of file

From 4ef693dbf7616b738a6ae70d1e9dc8c12dd8e5d3 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Sun, 19 Apr 2020 22:32:56 -0700
Subject: [PATCH 03/66] removing guides from master

---
 docs/broadcast-guide.md |  92 ---------------------
 docs/udf-guide.md       | 172 ----------------------------------------
 2 files changed, 264 deletions(-)
 delete mode 100644 docs/broadcast-guide.md
 delete mode 100644 docs/udf-guide.md

diff --git a/docs/broadcast-guide.md b/docs/broadcast-guide.md
deleted file mode 100644
index 4286c569e..000000000
--- a/docs/broadcast-guide.md
+++ /dev/null
@@ -1,92 +0,0 @@
-# Guide to using Broadcast Variables
-
-This is a guide to show how to use broadcast variables in .NET for Apache Spark.
-
-## What are Broadcast Variables
-
-[Broadcast variables in Apache Spark](https://spark.apache.org/docs/2.2.0/rdd-programming-guide.html#broadcast-variables) are a mechanism for sharing variables across executors that are meant to be read-only. They allow the programmer to keep a read-only variable cached on each machine rather than shipping a copy of it with tasks. They can be used, for example, to give every node a copy of a large input dataset in an efficient manner.
-
-### How to use broadcast variables in .NET for Apache Spark
-
-Broadcast variables are created from a variable `v` by calling `SparkContext.Broadcast(v)`. The broadcast variable is a wrapper around `v`, and its value can be accessed by calling the `Value()` method on it.
-
-Example:
-
-```csharp
-string v = "Variable to be broadcasted";
-Broadcast<string> bv = SparkContext.Broadcast(v);
-
-// Using the broadcast variable in a UDF:
-Func<Column, Column> udf = Udf<string, string>(
-    str => $"{str}: {bv.Value()}");
-```
-
-The type of the broadcast variable is captured by using generics in C#, as can be seen in the above example.
-
-### Deleting broadcast variables
-
-A broadcast variable can be deleted from all executors by calling the `Destroy()` method on it.
-
-```csharp
-// Destroying the broadcast variable bv:
-bv.Destroy();
-```
-
-> Note: `Destroy` deletes all data and metadata related to the broadcast variable. Use it with caution: once a broadcast variable has been destroyed, it cannot be used again.
-
-#### Caveat of using Destroy
-
-One important thing to keep in mind while using broadcast variables in UDFs is to limit the scope of the variable to only the UDF that references it. The [guide to using UDFs](udf-guide.md) describes this behavior in detail. This is especially crucial when calling `Destroy` on the broadcast variable. If the destroyed broadcast variable is visible to or accessible from other UDFs, it gets picked up for serialization by all those UDFs, even if they never reference it. This throws an error, as .NET for Apache Spark is not able to serialize the destroyed broadcast variable.
-
-Example to demonstrate:
-
-```csharp
-string v = "Variable to be broadcasted";
-Broadcast<string> bv = SparkContext.Broadcast(v);
-
-// Using the broadcast variable in a UDF:
-Func<Column, Column> udf1 = Udf<string, string>(
-    str => $"{str}: {bv.Value()}");
-
-// Destroying bv
-bv.Destroy();
-
-// Calling udf1 after destroying bv throws the following expected exception:
-// org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed
-df.Select(udf1(df["_1"])).Show();
-
-// Different UDF udf2 that is not referencing bv
-Func<Column, Column> udf2 = Udf<string, string>(
-    str => $"{str}: not referencing broadcast variable");
-
-// Calling udf2 throws the following (unexpected) exception:
-// [Error] [JvmBridge] org.apache.spark.SparkException: Task not serializable
-df.Select(udf2(df["_1"])).Show();
-```
-
-The recommended way to implement the desired behavior:
-
-```csharp
-string v = "Variable to be broadcasted";
-// Restricting the visibility of bv to only the UDF referencing it
-{
-    Broadcast<string> bv = SparkContext.Broadcast(v);
-
-    // Using the broadcast variable in a UDF:
-    Func<Column, Column> udf1 = Udf<string, string>(
-        str => $"{str}: {bv.Value()}");
-
-    // Destroying bv
-    bv.Destroy();
-}
-
-// Different UDF udf2 that is not referencing bv
-Func<Column, Column> udf2 = Udf<string, string>(
-    str => $"{str}: not referencing broadcast variable");
-
-// Calling udf2 works fine as expected
-df.Select(udf2(df["_1"])).Show();
-```
-
-This ensures that destroying `bv` does not affect calls to `udf2` through unexpected serialization behavior.
-
-Broadcast variables are very useful for transmitting read-only data to all executors: the data is sent only once, which gives a large performance benefit compared with local variables that are shipped to the executors with each task. Please refer to the [official documentation](https://spark.apache.org/docs/2.2.0/rdd-programming-guide.html#broadcast-variables) to get a deeper understanding of broadcast variables and why they are used.
\ No newline at end of file
diff --git a/docs/udf-guide.md b/docs/udf-guide.md
deleted file mode 100644
index bb308815d..000000000
--- a/docs/udf-guide.md
+++ /dev/null
@@ -1,172 +0,0 @@
-# Guide to User-Defined Functions (UDFs)
-
-This is a guide to show how to use UDFs in .NET for Apache Spark.
-
-## What are UDFs
-
-[User-Defined Functions (UDFs)](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/expressions/UserDefinedFunction.html) are a feature of Spark that allow developers to use custom functions to extend the system's built-in functionality. They transform values from a single row within a table to produce a single corresponding output value per row based on the logic defined in the UDF.
-
-Let's take the following as an example of a UDF definition:
-
-```csharp
-string s1 = "hello";
-Func<Column, Column> udf = Udf<string, string>(
-    str => $"{s1} {str}");
-```
-The UDF defined above takes a `string` as input (in the form of a [Column](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Sql/Column.cs#L14) of a [DataFrame](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Sql/DataFrame.cs#L24)) and returns a `string` with `hello` prepended to the input.
-
-As a sample DataFrame, let's take the following DataFrame `df`:
-
-```text
-+-------+
-|   name|
-+-------+
-|Michael|
-|   Andy|
-| Justin|
-+-------+
-```
-
-Now let's apply the above defined `udf` to the DataFrame `df`:
-
-```csharp
-DataFrame udfResult = df.Select(udf(df["name"]));
-```
-
-This would return the following as the DataFrame `udfResult`:
-
-```text
-+-------------+
-|         name|
-+-------------+
-|hello Michael|
-|   hello Andy|
-| hello Justin|
-+-------------+
-```
-To get a better understanding of how to implement UDFs, please take a look at the [UDF helper functions](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Sql/Functions.cs#L3616) and some [test examples](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs#L49).
-
-## UDF serialization
-
-Since UDFs are functions that need to be executed on the workers, they have to be serialized and sent to the workers as part of the payload from the driver. This involves serializing the [delegate](https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/delegates/), which is a reference to the method, along with its [target](https://docs.microsoft.com/en-us/dotnet/api/system.delegate.target?view=netframework-4.8), which is the class instance on which the current delegate invokes the instance method. Please take a look at this [code](https://github.com/dotnet/spark/blob/master/src/csharp/Microsoft.Spark/Utils/CommandSerDe.cs#L149) to get a better understanding of how UDF serialization is done.
-
-## Good to know while implementing UDFs
-
-One behavior to be aware of while implementing UDFs in .NET for Apache Spark is how the target of the UDF gets serialized. .NET for Apache Spark uses .NET Core, which does not support serializing delegates, so serialization is instead done by using reflection on the target where the delegate is defined. When multiple delegates are defined in a common scope, they share a single closure, and that closure becomes the target that reflection serializes. Let's take an example to illustrate what that means.
-
-The following code snippet defines two string variables that are referenced by two function delegates, which simply return the respective strings as their result:
-
-```csharp
-using System;
-
-public class C {
-    public void M() {
-        string s1 = "s1";
-        string s2 = "s2";
-        Func<string, string> a = str => s1;
-        Func<string, string> b = str => s2;
-    }
-}
-```
-
-The compiler generates the following decompiled C# for the above code (credit: [sharplab.io](https://sharplab.io)):
-
-```csharp
-public class C
-{
-    [CompilerGenerated]
-    private sealed class <>c__DisplayClass0_0
-    {
-        public string s1;
-
-        public string s2;
-
-        internal string <M>b__0(string str)
-        {
-            return s1;
-        }
-
-        internal string <M>b__1(string str)
-        {
-            return s2;
-        }
-    }
-
-    public void M()
-    {
-        <>c__DisplayClass0_0 <>c__DisplayClass0_ = new <>c__DisplayClass0_0();
-        <>c__DisplayClass0_.s1 = "s1";
-        <>c__DisplayClass0_.s2 = "s2";
-        Func<string, string> func = new Func<string, string>(<>c__DisplayClass0_.<M>b__0);
-        Func<string, string> func2 = new Func<string, string>(<>c__DisplayClass0_.<M>b__1);
-    }
-}
-```
-As can be seen in the decompiled code above, both `func` and `func2` share the same closure `<>c__DisplayClass0_0`, which is the target that is serialized when serializing the delegates `func` and `func2`. Hence, even though `Func<string, string> a` references only `s1`, `s2` also gets serialized when sending the bytes over to the workers.
-
-This can lead to some unexpected behaviors at runtime (like in the case of using [broadcast variables](broadcast-guide.md)), which is why we recommend restricting the visibility of the variables used in a function to that function's scope.
-
-Taking the above example, the recommended way to write the user code to achieve the desired behavior is:
-
-```csharp
-using System;
-
-public class C {
-    public void M() {
-        {
-            string s1 = "s1";
-            Func<string, string> a = str => s1;
-        }
-        {
-            string s2 = "s2";
-            Func<string, string> b = str => s2;
-        }
-    }
-}
-```
-
-The compiler generates the following decompiled C# for the above code (credit: [sharplab.io](https://sharplab.io)):
-
-```csharp
-public class C
-{
-    [CompilerGenerated]
-    private sealed class <>c__DisplayClass0_0
-    {
-        public string s1;
-
-        internal string <M>b__0(string str)
-        {
-            return s1;
-        }
-    }
-
-    [CompilerGenerated]
-    private sealed class <>c__DisplayClass0_1
-    {
-        public string s2;
-
-        internal string <M>b__1(string str)
-        {
-            return s2;
-        }
-    }
-
-    public void M()
-    {
-        <>c__DisplayClass0_0 <>c__DisplayClass0_ = new <>c__DisplayClass0_0();
-        <>c__DisplayClass0_.s1 = "s1";
-        Func<string, string> func = new Func<string, string>(<>c__DisplayClass0_.<M>b__0);
-        <>c__DisplayClass0_1 <>c__DisplayClass0_2 = new <>c__DisplayClass0_1();
-        <>c__DisplayClass0_2.s2 = "s2";
-        Func<string, string> func2 = new Func<string, string>(<>c__DisplayClass0_2.<M>b__1);
-    }
-}
-```
-
-Here we see that `func` and `func2` no longer share a closure and have their own separate closures, `<>c__DisplayClass0_0` and `<>c__DisplayClass0_1` respectively. When one of them is used as the serialization target, nothing other than the variables it references gets serialized for the delegate.
-
-The above behavior is important to keep in mind when implementing multiple UDFs in a common scope.
-To learn more about UDFs in general, please review the following articles that explain UDFs and how to use them: [UDFs in Databricks (Scala)](https://docs.databricks.com/spark/latest/spark-sql/udf-scala.html), [Spark UDFs and some gotchas](https://medium.com/@achilleus/spark-udfs-we-can-use-them-but-should-we-use-them-2c5a561fde6d).
\ No newline at end of file From 79f6a6f3d65e354ab9811426833680c4dcaaf0d7 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 23 Jun 2020 16:01:39 -0700 Subject: [PATCH 04/66] Adding ChunkedStream --- src/csharp/Microsoft.Spark/Broadcast.cs | 1 + src/csharp/Microsoft.Spark/ChunkedStream.cs | 24 +++++++++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 src/csharp/Microsoft.Spark/ChunkedStream.cs diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index 4e078e31d..b4907eaf2 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -185,6 +185,7 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( IPAddress.Loopback, (int)pair[0].Invoke("intValue"), (string)pair[1].Invoke("toString")); + ChunkedStream bdrcstChunked = new ChunkedStream(socket.OutputStream, 8192); Dump(value, socket.OutputStream); socket.OutputStream.Flush(); pythonBroadcast.Invoke("waitTillDataReceived"); diff --git a/src/csharp/Microsoft.Spark/ChunkedStream.cs b/src/csharp/Microsoft.Spark/ChunkedStream.cs new file mode 100644 index 000000000..821f69211 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ChunkedStream.cs @@ -0,0 +1,24 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Net.Sockets; +using System.Text; +using Microsoft.Spark.Interop.Ipc; + +namespace Microsoft.Spark +{ + public class ChunkedStream + { + private readonly int _bufferSize; + private byte[] _buffer; + //private int _currentPos; + //private Stream _wrapped; + + internal ChunkedStream(Stream wrapped, int bufferSize) + { + _bufferSize = bufferSize; + _buffer = new byte[_bufferSize]; + Console.WriteLine("here"); + } + } +} From ce135f01411bad5ff0037420b5590b66028e8c2e Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Fri, 3 Jul 2020 01:00:31 -0700 Subject: [PATCH 05/66] pushing latest changes --- src/csharp/Microsoft.Spark/ChunkedStream.cs | 31 +++++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ChunkedStream.cs b/src/csharp/Microsoft.Spark/ChunkedStream.cs index 821f69211..c6b2e2a2e 100644 --- a/src/csharp/Microsoft.Spark/ChunkedStream.cs +++ b/src/csharp/Microsoft.Spark/ChunkedStream.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.IO; +using System.Linq; using System.Net.Sockets; using System.Text; using Microsoft.Spark.Interop.Ipc; @@ -11,14 +12,38 @@ public class ChunkedStream { private readonly int _bufferSize; private byte[] _buffer; - //private int _currentPos; - //private Stream _wrapped; + private int _currentPos; + private Stream _wrapped; internal ChunkedStream(Stream wrapped, int bufferSize) { _bufferSize = bufferSize; _buffer = new byte[_bufferSize]; - Console.WriteLine("here"); + _currentPos = 0; + _wrapped = wrapped; + } + + public void Write(byte[] bytes) + { + int bytePos = 0; + int bytesRemaining = bytes.Length; + while (bytesRemaining > 0) + { + int newPos = bytesRemaining + _currentPos; + if (newPos < _bufferSize) + { + Array.Copy(_buffer, _currentPos, bytes, bytePos, bytesRemaining); + _currentPos = newPos; + bytesRemaining = 0; + } + else + { + // fill the buffer, send the length then the contents, and start filling again + int spaceLeft = _bufferSize - _currentPos; + int newBytePos = bytePos + spaceLeft; + Array.Copy(_buffer, _currentPos, bytes, bytePos, spaceLeft); + } + } } } } From f8f0420f9da84af0f065e99d59f2c0812b49e573 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 7 Jul 2020 
08:09:53 -0700 Subject: [PATCH 06/66] Adding chunkedstream logic --- .../IpcTests/BroadcastTests.cs | 1 + src/csharp/Microsoft.Spark/Broadcast.cs | 5 +- src/csharp/Microsoft.Spark/ChunkedStream.cs | 47 +++++++++++++++++-- 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index d233a0cd3..41de9c6e3 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -41,6 +41,7 @@ public void TestMultipleBroadcastWithEncryption() _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); var obj1 = new TestBroadcastVariable(1, "first"); var obj2 = new TestBroadcastVariable(2, "second"); + Broadcast bc = _spark.SparkContext.Broadcast(5); Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); Broadcast bc2 = _spark.SparkContext.Broadcast(obj2); diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index 04cdc2c7a..86eac0738 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -187,8 +187,9 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( (int)pair[0].Invoke("intValue"), (string)pair[1].Invoke("toString")); ChunkedStream bdrcstChunked = new ChunkedStream(socket.OutputStream, 8192); - Dump(value, socket.OutputStream); - socket.OutputStream.Flush(); + bdrcstChunked.Write(value); + bdrcstChunked.Close(); + //socket.OutputStream.Flush(); pythonBroadcast.Invoke("waitTillDataReceived"); } else diff --git a/src/csharp/Microsoft.Spark/ChunkedStream.cs b/src/csharp/Microsoft.Spark/ChunkedStream.cs index c6b2e2a2e..a5ec88f90 100644 --- a/src/csharp/Microsoft.Spark/ChunkedStream.cs +++ b/src/csharp/Microsoft.Spark/ChunkedStream.cs @@ -3,6 +3,8 @@ using System.IO; using System.Linq; using System.Net.Sockets; +using System.Runtime.Serialization.Formatters.Binary; +using System.Security.Cryptography; using System.Text; using Microsoft.Spark.Interop.Ipc; @@ -23,8 +25,27 @@ internal ChunkedStream(Stream wrapped, int bufferSize) _wrapped = wrapped; } - public void Write(byte[] bytes) + internal void WriteInt(int value, Stream stream) { + byte[] bytes = BitConverter.GetBytes(value); + if (BitConverter.IsLittleEndian) + Array.Reverse(bytes); + stream.Write(bytes, 0, bytes.Length); + } + + internal byte[] ConvertToByteArray(object value) + { + var formatter = new BinaryFormatter(); + using (var ms = new MemoryStream()) + { + formatter.Serialize(ms, value); + return ms.ToArray(); + } + } + + public void Write(object value) + { + byte[] bytes = ConvertToByteArray(value); int bytePos = 0; int bytesRemaining = bytes.Length; while (bytesRemaining > 0) @@ -32,18 +53,36 @@ public void Write(byte[] bytes) int newPos = bytesRemaining + _currentPos; if (newPos < _bufferSize) { - Array.Copy(_buffer, _currentPos, bytes, bytePos, bytesRemaining); + Array.Copy(bytes, bytePos, _buffer, _currentPos, bytesRemaining); _currentPos = newPos; bytesRemaining = 0; } else { - // fill the buffer, send the length then the contents, and start filling again + // Fill the buffer, send the length then the contents, and start filling again. 
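+ // Each chunk is framed as a 4-byte big-endian length (written by WriteInt) followed by the chunk bytes; Close() writes a length of -1 to mark the end of the stream.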
                int spaceLeft = _bufferSize - _currentPos;
                int newBytePos = bytePos + spaceLeft;
-                Array.Copy(_buffer, _currentPos, bytes, bytePos, spaceLeft);
+                Array.Copy(bytes, bytePos, _buffer, _currentPos, spaceLeft);
+                WriteInt(_bufferSize, _wrapped);
+                _wrapped.Write(_buffer, 0, _bufferSize);
+                bytesRemaining -= spaceLeft;
+                bytePos = newBytePos;
+                _currentPos = 0;
            }
        }
    }
+
+    public void Close()
+    {
+        // If there is anything left in the buffer, write it out first.
+        if (_currentPos > 0)
+        {
+            WriteInt(_currentPos, _wrapped);
+            _wrapped.Write(_buffer, 0, _currentPos);
+        }
+        // -1 length indicates to the receiving end that we're done.
+        WriteInt(-1, _wrapped);
+        _wrapped.Close();
+    }
 }
}

From 6bab99604db5cc8b8528b54216085afb96cbaff7 Mon Sep 17 00:00:00 2001
From: GOEddieUK
Date: Mon, 27 Jul 2020 21:10:51 +0100
Subject: [PATCH 07/66] CountVectorizer

---
 .../ML/Feature/CountVectorizerModelTests.cs   |  73 +++++++
 .../ML/Feature/CountVectorizerTests.cs        |  70 +++++++
 .../ML/Feature/CountVectorizer.cs             | 195 ++++++++++++++++++
 .../ML/Feature/CountVectorizerModel.cs        | 170 +++++++++++++++
 4 files changed, 508 insertions(+)
 create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs
 create mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs
 create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs
 create mode 100644 src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs
new file mode 100644
index 000000000..3c3132dd9
--- /dev/null
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs
@@ -0,0 +1,73 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+ +using System; +using System.Collections.Generic; +using System.IO; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class CountVectorizerModelTests + { + private readonly SparkSession _spark; + + public CountVectorizerModelTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void Test_CountVectorizerModel() + { + DataFrame input = _spark.Sql("SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " + + "'TOKENIZE') as input from range(100)"); + + const string inputColumn = "input"; + const string outputColumn = "output"; + const double minTf = 10.0; + const bool binary = false; + + List vocabulary = new List() + { + "hello", + "I", + "AM", + "TO", + "TOKENIZE" + }; + + var countVectorizerModel = new CountVectorizerModel(vocabulary); + + Assert.IsType(new CountVectorizerModel("my-uid", vocabulary)); + + countVectorizerModel = countVectorizerModel + .SetInputCol(inputColumn) + .SetOutputCol(outputColumn) + .SetMinTF(minTf) + .SetBinary(binary); + + Assert.Equal(inputColumn, countVectorizerModel.GetInputCol()); + Assert.Equal(outputColumn, countVectorizerModel.GetOutputCol()); + Assert.Equal(minTf, countVectorizerModel.GetMinTF()); + Assert.Equal(binary, countVectorizerModel.GetBinary()); + using (var tempDirectory = new TemporaryDirectory()) + { + string savePath = Path.Join(tempDirectory.Path, "countVectorizerModel"); + countVectorizerModel.Save(savePath); + + CountVectorizerModel loadedModel = CountVectorizerModel.Load(savePath); + Assert.Equal(countVectorizerModel.Uid(), loadedModel.Uid()); + } + + Assert.IsType(countVectorizerModel.GetVocabSize()); + Assert.NotEmpty(countVectorizerModel.ExplainParams()); + Assert.NotEmpty(countVectorizerModel.ToString()); + } + } +} diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs new file mode 100644 index 000000000..d54bfe376 --- /dev/null +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs @@ -0,0 +1,70 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+ +using System; +using System.IO; +using Microsoft.Spark.ML.Feature; +using Microsoft.Spark.Sql; +using Microsoft.Spark.UnitTest.TestUtils; +using Xunit; + +namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature +{ + [Collection("Spark E2E Tests")] + public class CountVectorizerTests + { + private readonly SparkSession _spark; + + public CountVectorizerTests(SparkFixture fixture) + { + _spark = fixture.Spark; + } + + [Fact] + public void Test_CountVectorizer() + { + DataFrame input = _spark.Sql("SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " + + "'TOKENIZE') as input from range(100)"); + + const string inputColumn = "input"; + const string outputColumn = "output"; + const double minDf = 1; + const double maxDf = 100; + const double minTf = 10; + const int vocabSize = 10000; + const bool binary = false; + + var countVectorizer = new CountVectorizer(); + + countVectorizer + .SetInputCol(inputColumn) + .SetOutputCol(outputColumn) + .SetMinDF(minDf) + .SetMaxDF(maxDf) + .SetMinTF(minTf) + .SetVocabSize(vocabSize); + + Assert.IsType(countVectorizer.Fit(input)); + Assert.Equal(inputColumn, countVectorizer.GetInputCol()); + Assert.Equal(outputColumn, countVectorizer.GetOutputCol()); + Assert.Equal(minDf, countVectorizer.GetMinDF()); + Assert.Equal(maxDf, countVectorizer.GetMaxDF()); + Assert.Equal(minTf, countVectorizer.GetMinTF()); + Assert.Equal(vocabSize, countVectorizer.GetVocabSize()); + Assert.Equal(binary, countVectorizer.GetBinary()); + + using (var tempDirectory = new TemporaryDirectory()) + { + string savePath = Path.Join(tempDirectory.Path, "countVectorizer"); + countVectorizer.Save(savePath); + + CountVectorizer loadedVectorizer = CountVectorizer.Load(savePath); + Assert.Equal(countVectorizer.Uid(), loadedVectorizer.Uid()); + } + + Assert.NotEmpty(countVectorizer.ExplainParams()); + Assert.NotEmpty(countVectorizer.ToString()); + } + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs new file mode 100644 index 000000000..41e0dbdd0 --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs @@ -0,0 +1,195 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; +using Microsoft.Spark.Sql; + +namespace Microsoft.Spark.ML.Feature +{ + public class CountVectorizer : FeatureBase, IJvmObjectReferenceProvider + { + private static readonly string s_countVectorizerClassName = + "org.apache.spark.ml.feature.CountVectorizer"; + + /// + /// Create a without any parameters + /// + public CountVectorizer() : base(s_countVectorizerClassName) + { + } + + /// + /// Create a with a UID that is used to give the + /// a unique ID + /// + /// An immutable unique ID for the object and its derivatives. + public CountVectorizer(string uid) : base(s_countVectorizerClassName, uid) + { + } + + internal CountVectorizer(JvmObjectReference jvmObject) : base(jvmObject) + { + } + + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// Fits a model to the input data. + /// The to fit the model to. 
+ /// + public CountVectorizerModel Fit(DataFrame dataFrame) => + new CountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("fit", dataFrame)); + + /// + /// Loads the that was previously saved using Save + /// + /// + /// The path the previous was saved to + /// + /// New object + public static CountVectorizer Load(string path) => + WrapAsType((JvmObjectReference) + SparkEnvironment.JvmBridge.CallStaticJavaMethod( + s_countVectorizerClassName,"load", path)); + + /// + /// Gets the binary toggle to control the output vector values. If True, all nonzero counts + /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic + /// models that model binary events rather than integer counts. Default: false + /// + /// boolean + public bool GetBinary() => (bool)_jvmObject.Invoke("getBinary"); + + /// + /// Sets the binary toggle to control the output vector values. If True, all nonzero counts + /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic + /// models that model binary events rather than integer counts. Default: false + /// + /// Turn the binary toggle on or off + /// with the new binary toggle value set + public CountVectorizer SetBinary(bool value) => + WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setBinary", value)); + + private static CountVectorizer WrapAsCountVectorizer(object obj) => + new CountVectorizer((JvmObjectReference)obj); + + /// + /// Gets the column that the should read from and convert + /// into buckets. This would have been set by SetInputCol + /// + /// string, the input column + public string GetInputCol() => _jvmObject.Invoke("getInputCol") as string; + + /// + /// Sets the column that the should read from. + /// + /// The name of the column to as the source. + /// with the input column set + public CountVectorizer SetInputCol(string value) => + WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setInputCol", value)); + + /// + /// The will create a new column in the DataFrame, this is + /// the name of the new column. + /// + /// The name of the output column. + public string GetOutputCol() => _jvmObject.Invoke("getOutputCol") as string; + + /// + /// The will create a new column in the DataFrame, this + /// is the name of the new column. + /// + /// The name of the output column which will be created. + /// New with the output column set + public CountVectorizer SetOutputCol(string value) => + WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setOutputCol", value)); + + /// + /// Gets the maximum number of different documents a term could appear in to be included in + /// the vocabulary. A term that appears more than the threshold will be ignored. If this is + /// an integer greater than or equal to 1, this specifies the maximum number of documents + /// the term could appear in; if this is a double in [0,1), then this specifies the maximum + /// fraction of documents the term could appear in. + /// + /// The maximum document term frequency + public double GetMaxDF() => (double)_jvmObject.Invoke("getMaxDF"); + + /// + /// Sets the maximum number of different documents a term could appear in to be included in + /// the vocabulary. A term that appears more than the threshold will be ignored. If this is + /// an integer greater than or equal to 1, this specifies the maximum number of documents + /// the term could appear in; if this is a double in [0,1), then this specifies the maximum + /// fraction of documents the term could appear in. 
+ /// + /// The maximum document term frequency + /// New with the max df value set + public CountVectorizer SetMaxDF(double value) => + WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMaxDF", value)); + + /// + /// Gets the minimum number of different documents a term must appear in to be included in + /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the + /// number of documents the term must appear in; if this is a double in [0,1), then this + /// specifies the fraction of documents. + /// + /// The minimum document term frequency + public double GetMinDF() => (double)_jvmObject.Invoke("getMinDF"); + + /// + /// Sets the minimum number of different documents a term must appear in to be included in + /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the + /// number of documents the term must appear in; if this is a double in [0,1), then this + /// specifies the fraction of documents. + /// + /// The minimum document term frequency + /// New with the min df value set + public CountVectorizer SetMinDF(double value) => + WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMinDF", value)); + + /// + /// Filter to ignore rare words in a document. For each document, terms with + /// frequency/count less than the given threshold are ignored. If this is an integer + /// greater than or equal to 1, then this specifies a count (of times the term must appear + /// in the document); if this is a double in [0,1), then this specifies a fraction (out of + /// the document's token count). + /// + /// Note that the parameter is only used in transform of CountVectorizerModel and does not + /// affect fitting. + /// + /// Minimum term frequency + public double GetMinTF() => (double)_jvmObject.Invoke("getMinTF"); + + /// + /// Filter to ignore rare words in a document. For each document, terms with + /// frequency/count less than the given threshold are ignored. If this is an integer + /// greater than or equal to 1, then this specifies a count (of times the term must appear + /// in the document); if this is a double in [0,1), then this specifies a fraction (out of + /// the document's token count). + /// + /// Note that the parameter is only used in transform of CountVectorizerModel and does not + /// affect fitting. + /// + /// Minimum term frequency + /// New with the min term frequency set + public CountVectorizer SetMinTF(double value) => + WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMinTF", value)); + + /// + /// Gets the max size of the vocabulary. CountVectorizer will build a vocabulary that only + /// considers the top vocabSize terms ordered by term frequency across the corpus. + /// + /// The max size of the vocabulary + public int GetVocabSize() => (int)_jvmObject.Invoke("getVocabSize"); + + /// + /// Sets the max size of the vocabulary. will build a + /// vocabulary that only considers the top vocabSize terms ordered by term frequency across + /// the corpus. 
+ /// + /// The max vocabulary size + /// with the max vocab value set + public CountVectorizer SetVocabSize(int value) => + WrapAsCountVectorizer(_jvmObject.Invoke("setVocabSize", value)); + } +} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs new file mode 100644 index 000000000..8a6e427df --- /dev/null +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs @@ -0,0 +1,170 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System.Collections.Generic; +using Microsoft.Spark.Interop; +using Microsoft.Spark.Interop.Ipc; + +namespace Microsoft.Spark.ML.Feature +{ + public class CountVectorizerModel : FeatureBase + , IJvmObjectReferenceProvider + { + private static readonly string s_countVectorizerModelClassName = + "org.apache.spark.ml.feature.CountVectorizerModel"; + + /// + /// Create a without any parameters + /// + /// The vocabulary to use + public CountVectorizerModel(List vocabulary) : + this(SparkEnvironment.JvmBridge.CallConstructor( + s_countVectorizerModelClassName, vocabulary)) + { + } + + /// + /// Create a with a UID that is used to give the + /// a unique ID + /// + /// An immutable unique ID for the object and its derivatives. + /// The vocabulary to use + public CountVectorizerModel(string uid, List vocabulary) : + this(SparkEnvironment.JvmBridge.CallConstructor( + s_countVectorizerModelClassName, uid, vocabulary)) + { + } + + internal CountVectorizerModel(JvmObjectReference jvmObject) : base(jvmObject) + { + } + + JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; + + /// + /// Loads the that was previously saved using Save + /// + /// + /// The path the previous was saved to + /// + /// New object + public static CountVectorizerModel Load(string path) => + WrapAsType((JvmObjectReference) + SparkEnvironment.JvmBridge.CallStaticJavaMethod( + s_countVectorizerModelClassName,"load", path)); + + /// + /// Gets the binary toggle to control the output vector values. If True, all nonzero counts + /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic + /// models that model binary events rather than integer counts. Default: false + /// + /// boolean + public bool GetBinary() => (bool)_jvmObject.Invoke("getBinary"); + + /// + /// Sets the binary toggle to control the output vector values. If True, all nonzero counts + /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic + /// models that model binary events rather than integer counts. Default: false + /// + /// Turn the binary toggle on or off + /// + /// with the new binary toggle value set + /// + public CountVectorizerModel SetBinary(bool value) => + WrapAsCountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("setBinary", value)); + + private static CountVectorizerModel WrapAsCountVectorizerModel(object obj) => + new CountVectorizerModel((JvmObjectReference)obj); + + /// + /// Gets the column that the should read from and + /// convert into buckets. This would have been set by SetInputCol + /// + /// string, the input column + public string GetInputCol() => _jvmObject.Invoke("getInputCol") as string; + + /// + /// Sets the column that the should read from. + /// + /// The name of the column to as the source. 
+ /// with the input column set + public CountVectorizerModel SetInputCol(string value) => + WrapAsCountVectorizerModel( + (JvmObjectReference)_jvmObject.Invoke("setInputCol", value)); + + /// + /// The will create a new column in the DataFrame, this + /// is the name of the new column. + /// + /// The name of the output column. + public string GetOutputCol() => _jvmObject.Invoke("getOutputCol") as string; + + /// + /// The will create a new column in the DataFrame, + /// this is the name of the new column. + /// + /// The name of the output column which will be created. + /// New with the output column set + public CountVectorizerModel SetOutputCol(string value) => + WrapAsCountVectorizerModel( + (JvmObjectReference)_jvmObject.Invoke("setOutputCol", value)); + + /// + /// Gets the maximum number of different documents a term could appear in to be included in + /// the vocabulary. A term that appears more than the threshold will be ignored. If this is + /// an integer greater than or equal to 1, this specifies the maximum number of documents + /// the term could appear in; if this is a double in [0,1), then this specifies the maximum + /// fraction of documents the term could appear in. + /// + /// The maximum document term frequency + public double GetMaxDF() => (double)_jvmObject.Invoke("getMaxDF"); + + /// + /// Gets the minimum number of different documents a term must appear in to be included in + /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the + /// number of documents the term must appear in; if this is a double in [0,1), then this + /// specifies the fraction of documents. + /// + /// The minimum document term frequency + public double GetMinDF() => (double)_jvmObject.Invoke("getMinDF"); + + /// + /// Filter to ignore rare words in a document. For each document, terms with + /// frequency/count less than the given threshold are ignored. If this is an integer + /// greater than or equal to 1, then this specifies a count (of times the term must appear + /// in the document); if this is a double in [0,1), then this specifies a fraction (out of + /// the document's token count). + /// + /// Note that the parameter is only used in transform of CountVectorizerModel and does not + /// affect fitting. + /// + /// Minimum term frequency + public double GetMinTF() => (double)_jvmObject.Invoke("getMinTF"); + + /// + /// Filter to ignore rare words in a document. For each document, terms with + /// frequency/count less than the given threshold are ignored. If this is an integer + /// greater than or equal to 1, then this specifies a count (of times the term must appear + /// in the document); if this is a double in [0,1), then this specifies a fraction (out of + /// the document's token count). + /// + /// Note that the parameter is only used in transform of CountVectorizerModel and does not + /// affect fitting. + /// + /// Minimum term frequency + /// + /// New with the min term frequency set + /// + public CountVectorizerModel SetMinTF(double value) => + WrapAsCountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("setMinTF", value)); + + /// + /// Gets the max size of the vocabulary. will build a + /// vocabulary that only considers the top vocabSize terms ordered by term frequency across + /// the corpus. 
+ /// + /// The max size of the vocabulary + public int GetVocabSize() => (int)_jvmObject.Invoke("getVocabSize"); + } +} From e2a566b1f4b29775be9b57616a258802e294f304 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Mon, 27 Jul 2020 21:24:35 +0100 Subject: [PATCH 08/66] moving private methods to bottom --- src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs | 6 +++--- .../Microsoft.Spark/ML/Feature/CountVectorizerModel.cs | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs index 41e0dbdd0..cf68f7c4a 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs @@ -71,9 +71,6 @@ public static CountVectorizer Load(string path) => public CountVectorizer SetBinary(bool value) => WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setBinary", value)); - private static CountVectorizer WrapAsCountVectorizer(object obj) => - new CountVectorizer((JvmObjectReference)obj); - /// /// Gets the column that the should read from and convert /// into buckets. This would have been set by SetInputCol @@ -191,5 +188,8 @@ public CountVectorizer SetMinTF(double value) => /// with the max vocab value set public CountVectorizer SetVocabSize(int value) => WrapAsCountVectorizer(_jvmObject.Invoke("setVocabSize", value)); + + private static CountVectorizer WrapAsCountVectorizer(object obj) => + new CountVectorizer((JvmObjectReference)obj); } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs index 8a6e427df..8e225a179 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs @@ -74,9 +74,6 @@ public static CountVectorizerModel Load(string path) => public CountVectorizerModel SetBinary(bool value) => WrapAsCountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("setBinary", value)); - private static CountVectorizerModel WrapAsCountVectorizerModel(object obj) => - new CountVectorizerModel((JvmObjectReference)obj); - /// /// Gets the column that the should read from and /// convert into buckets. 
This would have been set by SetInputCol @@ -166,5 +163,8 @@ public CountVectorizerModel SetMinTF(double value) => /// /// The max size of the vocabulary public int GetVocabSize() => (int)_jvmObject.Invoke("getVocabSize"); + + private static CountVectorizerModel WrapAsCountVectorizerModel(object obj) => + new CountVectorizerModel((JvmObjectReference)obj); } } From 5f682a601ec783f1609e6fd6e32c4d83ff1491d1 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Tue, 28 Jul 2020 20:47:31 +0100 Subject: [PATCH 09/66] changing wrap method --- src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs | 2 +- src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs index cf68f7c4a..b3fa0ef8a 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs @@ -49,7 +49,7 @@ public CountVectorizerModel Fit(DataFrame dataFrame) => /// /// New object public static CountVectorizer Load(string path) => - WrapAsType((JvmObjectReference) + WrapAsCountVectorizer((JvmObjectReference) SparkEnvironment.JvmBridge.CallStaticJavaMethod( s_countVectorizerClassName,"load", path)); diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs index 8e225a179..52bbd72c3 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs @@ -50,7 +50,7 @@ internal CountVectorizerModel(JvmObjectReference jvmObject) : base(jvmObject) /// /// New object public static CountVectorizerModel Load(string path) => - WrapAsType((JvmObjectReference) + WrapAsCountVectorizerModel((JvmObjectReference) SparkEnvironment.JvmBridge.CallStaticJavaMethod( s_countVectorizerModelClassName,"load", path)); From 31371db73b4faa653c07fdb8082e7aed02c0a031 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 31 Jul 2020 18:45:46 +0100 Subject: [PATCH 10/66] setting min version required --- .../IpcTests/ML/Feature/CountVectorizerTests.cs | 14 ++++++++++---- .../Microsoft.Spark/ML/Feature/CountVectorizer.cs | 2 ++ .../Microsoft.Spark/ML/Feature/FeatureBase.cs | 3 ++- src/csharp/Microsoft.Spark/Microsoft.Spark.csproj | 5 +---- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs index d54bfe376..95b9bc504 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs @@ -4,6 +4,7 @@ using System; using System.IO; +using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.ML.Feature; using Microsoft.Spark.Sql; using Microsoft.Spark.UnitTest.TestUtils; @@ -30,7 +31,6 @@ public void Test_CountVectorizer() const string inputColumn = "input"; const string outputColumn = "output"; const double minDf = 1; - const double maxDf = 100; const double minTf = 10; const int vocabSize = 10000; const bool binary = false; @@ -41,7 +41,6 @@ public void Test_CountVectorizer() .SetInputCol(inputColumn) .SetOutputCol(outputColumn) .SetMinDF(minDf) - .SetMaxDF(maxDf) .SetMinTF(minTf) .SetVocabSize(vocabSize); @@ -49,7 +48,6 @@ public void Test_CountVectorizer() Assert.Equal(inputColumn, 
countVectorizer.GetInputCol()); Assert.Equal(outputColumn, countVectorizer.GetOutputCol()); Assert.Equal(minDf, countVectorizer.GetMinDF()); - Assert.Equal(maxDf, countVectorizer.GetMaxDF()); Assert.Equal(minTf, countVectorizer.GetMinTF()); Assert.Equal(vocabSize, countVectorizer.GetVocabSize()); Assert.Equal(binary, countVectorizer.GetBinary()); @@ -65,6 +63,14 @@ public void Test_CountVectorizer() Assert.NotEmpty(countVectorizer.ExplainParams()); Assert.NotEmpty(countVectorizer.ToString()); - } + } + + [SkipIfSparkVersionIsLessThan(Versions.V2_4_0)] + public void CountVectorizer_MaxDF() + { + const double maxDf = 100; + CountVectorizer countVectorizer = new CountVectorizer().SetMaxDF(maxDf); + Assert.Equal(maxDf, countVectorizer.GetMaxDF()); + } } } diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs index b3fa0ef8a..5689e19fd 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs @@ -110,6 +110,7 @@ public CountVectorizer SetOutputCol(string value) => /// fraction of documents the term could appear in. /// /// The maximum document term frequency + [Since(Versions.V2_4_0)] public double GetMaxDF() => (double)_jvmObject.Invoke("getMaxDF"); /// @@ -121,6 +122,7 @@ public CountVectorizer SetOutputCol(string value) => /// /// The maximum document term frequency /// New with the max df value set + [Since(Versions.V2_4_0)] public CountVectorizer SetMaxDF(double value) => WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMaxDF", value)); diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs index fcc90b43d..0895dace1 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs @@ -98,7 +98,7 @@ public Param.Param GetParam(string paramName) => public T Set(Param.Param param, object value) => WrapAsType((JvmObjectReference)_jvmObject.Invoke("set", param, value)); - private static T WrapAsType(JvmObjectReference reference) + internal static T WrapAsType(JvmObjectReference reference) { ConstructorInfo constructor = typeof(T) .GetConstructors(BindingFlags.NonPublic | BindingFlags.Instance) @@ -111,5 +111,6 @@ private static T WrapAsType(JvmObjectReference reference) return (T)constructor.Invoke(new object[] {reference}); } + } } diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index 2cddc5627..f284de8c6 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -38,10 +38,7 @@ - + From 60eb82f40ac37c553ca00a3ab4d0e404e4447dca Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 31 Jul 2020 19:52:23 +0100 Subject: [PATCH 11/66] undoing csproj change --- .ionide/symbolCache.db | Bin 28672 -> 0 bytes .../Microsoft.Spark/Microsoft.Spark.csproj | 5 ++++- 2 files changed, 4 insertions(+), 1 deletion(-) delete mode 100644 .ionide/symbolCache.db diff --git a/.ionide/symbolCache.db b/.ionide/symbolCache.db deleted file mode 100644 index 43e567d6d682d85dd32b3baebb0fdf61f67c1643..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28672 zcmeHPYiuJ|6}A(f3n*{Ao>?UI&RXX3biC7$u_ zru*Uw_|aboAq4!aPz(G7Dp3^)2^9$msX|362qA<70{*l}EvWneB<{TIImtK)+7qdu zu{?L~_%Yvi&pr3fz2}^JGgp@L0vB1UR7NW^3^wa~*(5A|iH8H;*B 
z&*Jr7uND(C74gzvnY~{#(YNt3Bw$FukbofpLjr~b3<($#FeG3|z>t6;0j&hSxNf$G zdV0*S4h!s^BA3}J-Ki9L<b^F{6=TjCuvK9>U*;l97n^=RUn$l~UNgYJCMRsfNA1?9Bl`LCS%K#Ngzf z;{QO@KD+;){!jcL`9JXd#`A*vLwDKru4~cx6X&wy2aZ|$+xDBbpW9ZfKeZmWylYu) z{ax!q%P(3^n}21#VfwY{Wv%+^=L?ZG(qpDCr$c=8^*P6`^IVl5<5tIVd0~tSzgigM z?z5uk`LPT6Y_-By)&wRae!(ne*4gR?lUBdKT&?7)Y>8Rp+k4w%>fhA!qkaWU!g8j& z9avyP?TH38h17hd$}v3E&T>vpABO?_g&-RIX#2Phe6h%7MQ!I9p4+7FTpy5icQ=}> z549iWIkuWzms8^H1xPCcSS58?UH(Q%WgSo}pSi&1%ghFqw{V?jb6g|G_h<$0h{v%C z-gd4n!}2^=x>Jxh}w>8SGqHC9v|D(^V*GlIoc1w|iaru~I2##}Q_|T1nBeK3-|27DujYN}U*Qnog;yWG zZr^}f59*n?f&gs=trhybzAhapG&7CzWFvl0k6CDOn7FsU92`wI{g3@Pu)FM&(n0byiegJk8tp$;dZ(^ zv=Y$fNsAXqBh!KP@Nsv!7c*PDz?GP*+?q1NVCFOrK}H;XiU+ZH0EsJTZI6;T-JAq- zjuS+DU_#0}R28`M+eFlgwc9Ox&3NKZQZlZ_NMqF!>tue`EenJqeO= z-(k%EWxt9n4P*W<`j}){81sMGVPVYwoyPqCY2B>G{GWCz|I_n-YoKm$sAt63{IKZ@ zCjVW()Axq2+xxcnw&!C{$^DLd+4ZLDu=DedKRby1-}XDUcWqa#zqMYm{LFH`_1mrC zmLIi@o8L1(L;w7AFA|##XrEvtfaXaX7#xVzi>ihYNYj*Mww$X`*YV|QzC^=M?s7b{ zR2E&Ad_Jr7vJDN1UN$i$u~P>{b1*gdEEdg`lZyRFw(CoE4YZL{_BddI=VPNxf)tFtiS1c{kwLRP_Bs2KJFK{M=Zfgp z5MzaS^-RLY01>>A4J9(PJCPk;3|-Gg3h=}8Y*2oI=KR!=4YAJvd^~4-*c;Zwp=d)e zwB3Zp8E>CHASQ8d{J&ySn^K6#J;CrWR!`-^; zqM`|6+mM`(61?aElrp4!0${VlSjO{EDzvau3op=cAg;PpUaK$*T(-!H0bn9Ea6!6~ zfK+Z2jZ}ANN{^JVURgcM@|@U<%-5<_8pe2m6F=O3P0ZtfS{ltsMe8cM8#S4aNHRO7 zP>{8>qXSDzJ6)YVwwmL`gM=7R(3d8$>Y$^w!>jYuqOlTbI-J)^}tn~k4R`# z%gp&{VwO;uNmcVHV%BjK=nOZ4Rh#YB_F$tnC4XE!;#3Ygq;Hbn}0)EjsT`7(7Hm(^n4Shvcww9bHjiGUTkb|JUFBEjjap;AiR+{ zRfi)Sw-Q%wk3G;24hEwfM_e&LA1|CP7zp+@8d-fr1qb>{%Ti8k6mY>C>QgR<x&4FUUSXg<^HJJmg z+yu~gPaO(^pg4n>Feuf`;H>7b0t(b?!3eIh&iXMNlE!-ryH6Vpc*fwWGc*$D;%gTqk&q-Cl6 z&AhPQ$ki?Yc)V6Ocw}&wR=2ebVDZ#{FgVz|M=$;yfW}jQD)4qec<>o;`)ASWbD(2j zKwGjUe(ny9$7ZuaBe${y!12mP!8_>_%6uN&A8%dEKw#jIev4{6q2zTjkIn%KC!$yb zu1H5@5XzU1b*Mvx8ey0CdUjSW00%Hm3Vam6}SWN^JQ4~XCiyMMBOimJ2yEVw`3RFM``6MP+8rh)j(l45I9&m?x{pdOk1GyTW(8~A=7nOb-8>4TyfiALgBG{yP< zfN9y}f7L(ad(C&&`$zA~ooi^`*Tu02K?}xl$2k0 zbAGCma`TY_FI()QQ7l(SGLY<5OiOdr+o8!4+e1*~+h_)dnql67B-dwfHZZUiz zzabvn1HdDMHf^Ll5)zb@WM2U=c;2LBe==J4CfT-n1+TU#`8q*c)H!K{K&_voHgaIsS6#Dmy)4GV_@uWLz6 zCQj|6Ygy;mRq^1efCpaJLU|n1bS;|z2Cr*L98HWhx|R)qg4eYq#uF1dLT%c$kd6vq zg6CvS^dv?aO!Sley-p$ur@`DqgVL}*9Lx0e7KVs&&S^rGFGR!5--h~O1_78cOIj<(HBye+O7eATP3F@vFj0ZfVApl|kbfCFCq0p*~C8rNO| z5O|G}`oPsFL8V8~HcD~+AGF&{{x#oUeXn>w_CEI7Jzw?M-QRS_UEg(`cE0b-IsWD- z*gvut>CcRpApt`Ih6D@=7!vqDl0cTI`}N6#m<}|fmUX}y7k4Wa4`L?jmVGkJ6*QPm zYp1g@6xEwkmnZ@8xJSUAea_QpR|;ZeGEFy$x=)|HN*vD)pBhe0C^NlncK8B;yY%7BE>k_JC|$KEu;nbLo^u3l z$A=f=Oa&b*Noz^uG)L4O_hgO6Lz)T9gtSOo1n!-O7mYD7>XTNG)BbMWD7*yKTnn{p2kcErn^;5QO|^~@dfjb=g`(vO!rQ!^N%?io_SZ? 
zsF<0ra;!kxqpY{^imti@RPfs8^z`%@jfgc_8rKLgN6`#2wD@X-UA_4A%qCGc(UU6~ zuc#A&3DvB(O|fV<3^&xn+pKdkFx9uS8&=()>Y`TKtVBwi1a7T|SN5q>Q|Fbs-R8Wi r5i_V}JR8*1*wjg2b^;fqlb8T7-kvp6;i)FU08O@N<|Vu8nsWLNN`{i9 diff --git a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj index f284de8c6..2cddc5627 100644 --- a/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj +++ b/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj @@ -38,7 +38,10 @@ - + From ed36375561e3495a675f9ac14ab80f79f3fbb38d Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 31 Jul 2020 19:55:49 +0100 Subject: [PATCH 12/66] member doesnt need to be internal --- src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs index 0895dace1..8446b9f4e 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs @@ -98,7 +98,7 @@ public Param.Param GetParam(string paramName) => public T Set(Param.Param param, object value) => WrapAsType((JvmObjectReference)_jvmObject.Invoke("set", param, value)); - internal static T WrapAsType(JvmObjectReference reference) + private static T WrapAsType(JvmObjectReference reference) { ConstructorInfo constructor = typeof(T) .GetConstructors(BindingFlags.NonPublic | BindingFlags.Instance) From c7baf7231914b10300175e67158b604d646b97d4 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 31 Jul 2020 19:56:29 +0100 Subject: [PATCH 13/66] too many lines --- src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs index 8446b9f4e..9ccd64d5b 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs @@ -106,11 +106,10 @@ private static T WrapAsType(JvmObjectReference reference) { ParameterInfo[] parameters = c.GetParameters(); return (parameters.Length == 1) && - (parameters[0].ParameterType == typeof(JvmObjectReference)); + (parameters[0].ParameterType == typeof(JvmObjectReference)); }); return (T)constructor.Invoke(new object[] {reference}); } - } } From d13303ccaeb691691c4d294d96e0995f3597becb Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 31 Jul 2020 20:01:07 +0100 Subject: [PATCH 14/66] removing whitespace change --- src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs index 9ccd64d5b..326268a5e 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs @@ -105,7 +105,7 @@ private static T WrapAsType(JvmObjectReference reference) .Single(c => { ParameterInfo[] parameters = c.GetParameters(); - return (parameters.Length == 1) && + return (parameters.Length == 1) && (parameters[0].ParameterType == typeof(JvmObjectReference)); }); From f5b477c72158599b1c6552c7eb1af20edfab7779 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 31 Jul 2020 20:01:57 +0100 Subject: [PATCH 15/66] removing whitespace change --- src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs 
b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs index 326268a5e..9ccd64d5b 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs @@ -105,7 +105,7 @@ private static T WrapAsType(JvmObjectReference reference) .Single(c => { ParameterInfo[] parameters = c.GetParameters(); - return (parameters.Length == 1) && + return (parameters.Length == 1) && (parameters[0].ParameterType == typeof(JvmObjectReference)); }); From 73db52b400637585b2216f44aac616828800b9d2 Mon Sep 17 00:00:00 2001 From: GOEddieUK Date: Fri, 31 Jul 2020 20:06:12 +0100 Subject: [PATCH 16/66] ionide --- .ionide/symbolCache.db | Bin 0 -> 28672 bytes .../Microsoft.Spark/ML/Feature/FeatureBase.cs | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 .ionide/symbolCache.db diff --git a/.ionide/symbolCache.db b/.ionide/symbolCache.db new file mode 100644 index 0000000000000000000000000000000000000000..43e567d6d682d85dd32b3baebb0fdf61f67c1643 GIT binary patch literal 28672 zcmeHPYiuJ|6}A(f3n*{Ao>?UI&RXX3biC7$u_ zru*Uw_|aboAq4!aPz(G7Dp3^)2^9$msX|362qA<70{*l}EvWneB<{TIImtK)+7qdu zu{?L~_%Yvi&pr3fz2}^JGgp@L0vB1UR7NW^3^wa~*(5A|iH8H;*B z&*Jr7uND(C74gzvnY~{#(YNt3Bw$FukbofpLjr~b3<($#FeG3|z>t6;0j&hSxNf$G zdV0*S4h!s^BA3}J-Ki9L<b^F{6=TjCuvK9>U*;l97n^=RUn$l~UNgYJCMRsfNA1?9Bl`LCS%K#Ngzf z;{QO@KD+;){!jcL`9JXd#`A*vLwDKru4~cx6X&wy2aZ|$+xDBbpW9ZfKeZmWylYu) z{ax!q%P(3^n}21#VfwY{Wv%+^=L?ZG(qpDCr$c=8^*P6`^IVl5<5tIVd0~tSzgigM z?z5uk`LPT6Y_-By)&wRae!(ne*4gR?lUBdKT&?7)Y>8Rp+k4w%>fhA!qkaWU!g8j& z9avyP?TH38h17hd$}v3E&T>vpABO?_g&-RIX#2Phe6h%7MQ!I9p4+7FTpy5icQ=}> z549iWIkuWzms8^H1xPCcSS58?UH(Q%WgSo}pSi&1%ghFqw{V?jb6g|G_h<$0h{v%C z-gd4n!}2^=x>Jxh}w>8SGqHC9v|D(^V*GlIoc1w|iaru~I2##}Q_|T1nBeK3-|27DujYN}U*Qnog;yWG zZr^}f59*n?f&gs=trhybzAhapG&7CzWFvl0k6CDOn7FsU92`wI{g3@Pu)FM&(n0byiegJk8tp$;dZ(^ zv=Y$fNsAXqBh!KP@Nsv!7c*PDz?GP*+?q1NVCFOrK}H;XiU+ZH0EsJTZI6;T-JAq- zjuS+DU_#0}R28`M+eFlgwc9Ox&3NKZQZlZ_NMqF!>tue`EenJqeO= z-(k%EWxt9n4P*W<`j}){81sMGVPVYwoyPqCY2B>G{GWCz|I_n-YoKm$sAt63{IKZ@ zCjVW()Axq2+xxcnw&!C{$^DLd+4ZLDu=DedKRby1-}XDUcWqa#zqMYm{LFH`_1mrC zmLIi@o8L1(L;w7AFA|##XrEvtfaXaX7#xVzi>ihYNYj*Mww$X`*YV|QzC^=M?s7b{ zR2E&Ad_Jr7vJDN1UN$i$u~P>{b1*gdEEdg`lZyRFw(CoE4YZL{_BddI=VPNxf)tFtiS1c{kwLRP_Bs2KJFK{M=Zfgp z5MzaS^-RLY01>>A4J9(PJCPk;3|-Gg3h=}8Y*2oI=KR!=4YAJvd^~4-*c;Zwp=d)e zwB3Zp8E>CHASQ8d{J&ySn^K6#J;CrWR!`-^; zqM`|6+mM`(61?aElrp4!0${VlSjO{EDzvau3op=cAg;PpUaK$*T(-!H0bn9Ea6!6~ zfK+Z2jZ}ANN{^JVURgcM@|@U<%-5<_8pe2m6F=O3P0ZtfS{ltsMe8cM8#S4aNHRO7 zP>{8>qXSDzJ6)YVwwmL`gM=7R(3d8$>Y$^w!>jYuqOlTbI-J)^}tn~k4R`# z%gp&{VwO;uNmcVHV%BjK=nOZ4Rh#YB_F$tnC4XE!;#3Ygq;Hbn}0)EjsT`7(7Hm(^n4Shvcww9bHjiGUTkb|JUFBEjjap;AiR+{ zRfi)Sw-Q%wk3G;24hEwfM_e&LA1|CP7zp+@8d-fr1qb>{%Ti8k6mY>C>QgR<x&4FUUSXg<^HJJmg z+yu~gPaO(^pg4n>Feuf`;H>7b0t(b?!3eIh&iXMNlE!-ryH6Vpc*fwWGc*$D;%gTqk&q-Cl6 z&AhPQ$ki?Yc)V6Ocw}&wR=2ebVDZ#{FgVz|M=$;yfW}jQD)4qec<>o;`)ASWbD(2j zKwGjUe(ny9$7ZuaBe${y!12mP!8_>_%6uN&A8%dEKw#jIev4{6q2zTjkIn%KC!$yb zu1H5@5XzU1b*Mvx8ey0CdUjSW00%Hm3Vam6}SWN^JQ4~XCiyMMBOimJ2yEVw`3RFM``6MP+8rh)j(l45I9&m?x{pdOk1GyTW(8~A=7nOb-8>4TyfiALgBG{yP< zfN9y}f7L(ad(C&&`$zA~ooi^`*Tu02K?}xl$2k0 zbAGCma`TY_FI()QQ7l(SGLY<5OiOdr+o8!4+e1*~+h_)dnql67B-dwfHZZUiz zzabvn1HdDMHf^Ll5)zb@WM2U=c;2LBe==J4CfT-n1+TU#`8q*c)H!K{K&_voHgaIsS6#Dmy)4GV_@uWLz6 zCQj|6Ygy;mRq^1efCpaJLU|n1bS;|z2Cr*L98HWhx|R)qg4eYq#uF1dLT%c$kd6vq zg6CvS^dv?aO!Sley-p$ur@`DqgVL}*9Lx0e7KVs&&S^rGFGR!5--h~O1_78cOIj<(HBye+O7eATP3F@vFj0ZfVApl|kbfCFCq0p*~C8rNO| z5O|G}`oPsFL8V8~HcD~+AGF&{{x#oUeXn>w_CEI7Jzw?M-QRS_UEg(`cE0b-IsWD- 
z*gvut>CcRpApt`Ih6D@=7!vqDl0cTI`}N6#m<}|fmUX}y7k4Wa4`L?jmVGkJ6*QPm zYp1g@6xEwkmnZ@8xJSUAea_QpR|;ZeGEFy$x=)|HN*vD)pBhe0C^NlncK8B;yY%7BE>k_JC|$KEu;nbLo^u3l z$A=f=Oa&b*Noz^uG)L4O_hgO6Lz)T9gtSOo1n!-O7mYD7>XTNG)BbMWD7*yKTnn{p2kcErn^;5QO|^~@dfjb=g`(vO!rQ!^N%?io_SZ? zsF<0ra;!kxqpY{^imti@RPfs8^z`%@jfgc_8rKLgN6`#2wD@X-UA_4A%qCGc(UU6~ zuc#A&3DvB(O|fV<3^&xn+pKdkFx9uS8&=()>Y`TKtVBwi1a7T|SN5q>Q|Fbs-R8Wi r5i_V}JR8*1*wjg2b^;fqlb8T7-kvp6;i)FU08O@N<|Vu8nsWLNN`{i9 literal 0 HcmV?d00001 diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs index 9ccd64d5b..326268a5e 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs @@ -105,7 +105,7 @@ private static T WrapAsType(JvmObjectReference reference) .Single(c => { ParameterInfo[] parameters = c.GetParameters(); - return (parameters.Length == 1) && + return (parameters.Length == 1) && (parameters[0].ParameterType == typeof(JvmObjectReference)); }); From 9c33ce8523c5bc7348dfe9b953c4be62f6f166c1 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Wed, 12 Aug 2020 15:23:14 -0700 Subject: [PATCH 17/66] changes --- src/csharp/Microsoft.Spark/Broadcast.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index 86eac0738..f8fa019f3 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -5,6 +5,7 @@ using System.Net; using System.Runtime.Serialization; using System.Runtime.Serialization.Formatters.Binary; +using System.Security.Cryptography; using System.Threading; using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; @@ -187,7 +188,9 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( (int)pair[0].Invoke("intValue"), (string)pair[1].Invoke("toString")); ChunkedStream bdrcstChunked = new ChunkedStream(socket.OutputStream, 8192); - bdrcstChunked.Write(value); + byte[] values = new byte[] { 0x80, 0x02, (byte)'X', 0x05, 0x00, 0x00, 0x00, + (byte)'h', (byte)'e', (byte)'l', (byte)'l', (byte)'o', (byte)'q', 0x00, (byte)'.' }; + bdrcstChunked.Write(values); bdrcstChunked.Close(); //socket.OutputStream.Flush(); pythonBroadcast.Invoke("waitTillDataReceived"); From 8e1685cd270657c5e7a6769e732bf85d5ae6cb2e Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Thu, 13 Aug 2020 12:59:34 -0700 Subject: [PATCH 18/66] Revert "Merge branch 'master' into ml/countvectorizer" This reverts commit a766146f56014ccae4118b35495b84da588af94f, reversing changes made to 73db52b400637585b2216f44aac616828800b9d2. 
Reverting countvectorizer changes --- .gitignore | 3 --- .ionide/symbolCache.db | Bin 0 -> 28672 bytes .../Processor/BroadcastVariableProcessor.cs | 3 +-- 3 files changed, 1 insertion(+), 5 deletions(-) create mode 100644 .ionide/symbolCache.db diff --git a/.gitignore b/.gitignore index faada9c8a..251cfa7e2 100644 --- a/.gitignore +++ b/.gitignore @@ -367,6 +367,3 @@ hs_err_pid* # The target folder contains the output of building **/target/** - -# F# vs code -.ionide/ diff --git a/.ionide/symbolCache.db b/.ionide/symbolCache.db new file mode 100644 index 0000000000000000000000000000000000000000..43e567d6d682d85dd32b3baebb0fdf61f67c1643 GIT binary patch literal 28672 zcmeHPYiuJ|6}A(f3n*{Ao>?UI&RXX3biC7$u_ zru*Uw_|aboAq4!aPz(G7Dp3^)2^9$msX|362qA<70{*l}EvWneB<{TIImtK)+7qdu zu{?L~_%Yvi&pr3fz2}^JGgp@L0vB1UR7NW^3^wa~*(5A|iH8H;*B z&*Jr7uND(C74gzvnY~{#(YNt3Bw$FukbofpLjr~b3<($#FeG3|z>t6;0j&hSxNf$G zdV0*S4h!s^BA3}J-Ki9L<b^F{6=TjCuvK9>U*;l97n^=RUn$l~UNgYJCMRsfNA1?9Bl`LCS%K#Ngzf z;{QO@KD+;){!jcL`9JXd#`A*vLwDKru4~cx6X&wy2aZ|$+xDBbpW9ZfKeZmWylYu) z{ax!q%P(3^n}21#VfwY{Wv%+^=L?ZG(qpDCr$c=8^*P6`^IVl5<5tIVd0~tSzgigM z?z5uk`LPT6Y_-By)&wRae!(ne*4gR?lUBdKT&?7)Y>8Rp+k4w%>fhA!qkaWU!g8j& z9avyP?TH38h17hd$}v3E&T>vpABO?_g&-RIX#2Phe6h%7MQ!I9p4+7FTpy5icQ=}> z549iWIkuWzms8^H1xPCcSS58?UH(Q%WgSo}pSi&1%ghFqw{V?jb6g|G_h<$0h{v%C z-gd4n!}2^=x>Jxh}w>8SGqHC9v|D(^V*GlIoc1w|iaru~I2##}Q_|T1nBeK3-|27DujYN}U*Qnog;yWG zZr^}f59*n?f&gs=trhybzAhapG&7CzWFvl0k6CDOn7FsU92`wI{g3@Pu)FM&(n0byiegJk8tp$;dZ(^ zv=Y$fNsAXqBh!KP@Nsv!7c*PDz?GP*+?q1NVCFOrK}H;XiU+ZH0EsJTZI6;T-JAq- zjuS+DU_#0}R28`M+eFlgwc9Ox&3NKZQZlZ_NMqF!>tue`EenJqeO= z-(k%EWxt9n4P*W<`j}){81sMGVPVYwoyPqCY2B>G{GWCz|I_n-YoKm$sAt63{IKZ@ zCjVW()Axq2+xxcnw&!C{$^DLd+4ZLDu=DedKRby1-}XDUcWqa#zqMYm{LFH`_1mrC zmLIi@o8L1(L;w7AFA|##XrEvtfaXaX7#xVzi>ihYNYj*Mww$X`*YV|QzC^=M?s7b{ zR2E&Ad_Jr7vJDN1UN$i$u~P>{b1*gdEEdg`lZyRFw(CoE4YZL{_BddI=VPNxf)tFtiS1c{kwLRP_Bs2KJFK{M=Zfgp z5MzaS^-RLY01>>A4J9(PJCPk;3|-Gg3h=}8Y*2oI=KR!=4YAJvd^~4-*c;Zwp=d)e zwB3Zp8E>CHASQ8d{J&ySn^K6#J;CrWR!`-^; zqM`|6+mM`(61?aElrp4!0${VlSjO{EDzvau3op=cAg;PpUaK$*T(-!H0bn9Ea6!6~ zfK+Z2jZ}ANN{^JVURgcM@|@U<%-5<_8pe2m6F=O3P0ZtfS{ltsMe8cM8#S4aNHRO7 zP>{8>qXSDzJ6)YVwwmL`gM=7R(3d8$>Y$^w!>jYuqOlTbI-J)^}tn~k4R`# z%gp&{VwO;uNmcVHV%BjK=nOZ4Rh#YB_F$tnC4XE!;#3Ygq;Hbn}0)EjsT`7(7Hm(^n4Shvcww9bHjiGUTkb|JUFBEjjap;AiR+{ zRfi)Sw-Q%wk3G;24hEwfM_e&LA1|CP7zp+@8d-fr1qb>{%Ti8k6mY>C>QgR<x&4FUUSXg<^HJJmg z+yu~gPaO(^pg4n>Feuf`;H>7b0t(b?!3eIh&iXMNlE!-ryH6Vpc*fwWGc*$D;%gTqk&q-Cl6 z&AhPQ$ki?Yc)V6Ocw}&wR=2ebVDZ#{FgVz|M=$;yfW}jQD)4qec<>o;`)ASWbD(2j zKwGjUe(ny9$7ZuaBe${y!12mP!8_>_%6uN&A8%dEKw#jIev4{6q2zTjkIn%KC!$yb zu1H5@5XzU1b*Mvx8ey0CdUjSW00%Hm3Vam6}SWN^JQ4~XCiyMMBOimJ2yEVw`3RFM``6MP+8rh)j(l45I9&m?x{pdOk1GyTW(8~A=7nOb-8>4TyfiALgBG{yP< zfN9y}f7L(ad(C&&`$zA~ooi^`*Tu02K?}xl$2k0 zbAGCma`TY_FI()QQ7l(SGLY<5OiOdr+o8!4+e1*~+h_)dnql67B-dwfHZZUiz zzabvn1HdDMHf^Ll5)zb@WM2U=c;2LBe==J4CfT-n1+TU#`8q*c)H!K{K&_voHgaIsS6#Dmy)4GV_@uWLz6 zCQj|6Ygy;mRq^1efCpaJLU|n1bS;|z2Cr*L98HWhx|R)qg4eYq#uF1dLT%c$kd6vq zg6CvS^dv?aO!Sley-p$ur@`DqgVL}*9Lx0e7KVs&&S^rGFGR!5--h~O1_78cOIj<(HBye+O7eATP3F@vFj0ZfVApl|kbfCFCq0p*~C8rNO| z5O|G}`oPsFL8V8~HcD~+AGF&{{x#oUeXn>w_CEI7Jzw?M-QRS_UEg(`cE0b-IsWD- z*gvut>CcRpApt`Ih6D@=7!vqDl0cTI`}N6#m<}|fmUX}y7k4Wa4`L?jmVGkJ6*QPm zYp1g@6xEwkmnZ@8xJSUAea_QpR|;ZeGEFy$x=)|HN*vD)pBhe0C^NlncK8B;yY%7BE>k_JC|$KEu;nbLo^u3l z$A=f=Oa&b*Noz^uG)L4O_hgO6Lz)T9gtSOo1n!-O7mYD7>XTNG)BbMWD7*yKTnn{p2kcErn^;5QO|^~@dfjb=g`(vO!rQ!^N%?io_SZ? 
zsF<0ra;!kxqpY{^imti@RPfs8^z`%@jfgc_8rKLgN6`#2wD@X-UA_4A%qCGc(UU6~ zuc#A&3DvB(O|fV<3^&xn+pKdkFx9uS8&=()>Y`TKtVBwi1a7T|SN5q>Q|Fbs-R8Wi r5i_V}JR8*1*wjg2b^;fqlb8T7-kvp6;i)FU08O@N<|Vu8nsWLNN`{i9 literal 0 HcmV?d00001 diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs index bf8f48ed8..41c817d02 100644 --- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs +++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs @@ -54,8 +54,7 @@ internal BroadcastVariables Process(Stream stream) else { string path = SerDe.ReadString(stream); - using FileStream fStream = - File.Open(path, FileMode.Open, FileAccess.Read, FileShare.Read); + using FileStream fStream = File.Open(path, FileMode.Open, FileAccess.Read); object value = formatter.Deserialize(fStream); BroadcastRegistry.Add(bid, value); } From 255515eecbd6cb8e7919fbd2b857d99e335c66d2 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Thu, 13 Aug 2020 13:04:05 -0700 Subject: [PATCH 19/66] Revert "Merge branch 'ml/countvectorizer' of https://github.com/GoEddie/spark" This reverts commit ad6bcede69de012c22178825e76c6b175c770b8f, reversing changes made to 4c5d502a9f56e79ea071b12d2a49dced3873dea8. reverting countvectorizer changes -2 --- .../ML/Feature/CountVectorizerModelTests.cs | 73 ------- .../ML/Feature/CountVectorizerTests.cs | 76 ------- .../ML/Feature/CountVectorizer.cs | 197 ------------------ .../ML/Feature/CountVectorizerModel.cs | 170 --------------- .../Microsoft.Spark/ML/Feature/FeatureBase.cs | 4 +- 5 files changed, 2 insertions(+), 518 deletions(-) delete mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs delete mode 100644 src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs delete mode 100644 src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs delete mode 100644 src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs deleted file mode 100644 index 3c3132dd9..000000000 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerModelTests.cs +++ /dev/null @@ -1,73 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System; -using System.Collections.Generic; -using System.IO; -using Microsoft.Spark.ML.Feature; -using Microsoft.Spark.Sql; -using Microsoft.Spark.UnitTest.TestUtils; -using Xunit; - -namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature -{ - [Collection("Spark E2E Tests")] - public class CountVectorizerModelTests - { - private readonly SparkSession _spark; - - public CountVectorizerModelTests(SparkFixture fixture) - { - _spark = fixture.Spark; - } - - [Fact] - public void Test_CountVectorizerModel() - { - DataFrame input = _spark.Sql("SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " + - "'TOKENIZE') as input from range(100)"); - - const string inputColumn = "input"; - const string outputColumn = "output"; - const double minTf = 10.0; - const bool binary = false; - - List vocabulary = new List() - { - "hello", - "I", - "AM", - "TO", - "TOKENIZE" - }; - - var countVectorizerModel = new CountVectorizerModel(vocabulary); - - Assert.IsType(new CountVectorizerModel("my-uid", vocabulary)); - - countVectorizerModel = countVectorizerModel - .SetInputCol(inputColumn) - .SetOutputCol(outputColumn) - .SetMinTF(minTf) - .SetBinary(binary); - - Assert.Equal(inputColumn, countVectorizerModel.GetInputCol()); - Assert.Equal(outputColumn, countVectorizerModel.GetOutputCol()); - Assert.Equal(minTf, countVectorizerModel.GetMinTF()); - Assert.Equal(binary, countVectorizerModel.GetBinary()); - using (var tempDirectory = new TemporaryDirectory()) - { - string savePath = Path.Join(tempDirectory.Path, "countVectorizerModel"); - countVectorizerModel.Save(savePath); - - CountVectorizerModel loadedModel = CountVectorizerModel.Load(savePath); - Assert.Equal(countVectorizerModel.Uid(), loadedModel.Uid()); - } - - Assert.IsType(countVectorizerModel.GetVocabSize()); - Assert.NotEmpty(countVectorizerModel.ExplainParams()); - Assert.NotEmpty(countVectorizerModel.ToString()); - } - } -} diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs deleted file mode 100644 index 95b9bc504..000000000 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/ML/Feature/CountVectorizerTests.cs +++ /dev/null @@ -1,76 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. 
- -using System; -using System.IO; -using Microsoft.Spark.E2ETest.Utils; -using Microsoft.Spark.ML.Feature; -using Microsoft.Spark.Sql; -using Microsoft.Spark.UnitTest.TestUtils; -using Xunit; - -namespace Microsoft.Spark.E2ETest.IpcTests.ML.Feature -{ - [Collection("Spark E2E Tests")] - public class CountVectorizerTests - { - private readonly SparkSession _spark; - - public CountVectorizerTests(SparkFixture fixture) - { - _spark = fixture.Spark; - } - - [Fact] - public void Test_CountVectorizer() - { - DataFrame input = _spark.Sql("SELECT array('hello', 'I', 'AM', 'a', 'string', 'TO', " + - "'TOKENIZE') as input from range(100)"); - - const string inputColumn = "input"; - const string outputColumn = "output"; - const double minDf = 1; - const double minTf = 10; - const int vocabSize = 10000; - const bool binary = false; - - var countVectorizer = new CountVectorizer(); - - countVectorizer - .SetInputCol(inputColumn) - .SetOutputCol(outputColumn) - .SetMinDF(minDf) - .SetMinTF(minTf) - .SetVocabSize(vocabSize); - - Assert.IsType(countVectorizer.Fit(input)); - Assert.Equal(inputColumn, countVectorizer.GetInputCol()); - Assert.Equal(outputColumn, countVectorizer.GetOutputCol()); - Assert.Equal(minDf, countVectorizer.GetMinDF()); - Assert.Equal(minTf, countVectorizer.GetMinTF()); - Assert.Equal(vocabSize, countVectorizer.GetVocabSize()); - Assert.Equal(binary, countVectorizer.GetBinary()); - - using (var tempDirectory = new TemporaryDirectory()) - { - string savePath = Path.Join(tempDirectory.Path, "countVectorizer"); - countVectorizer.Save(savePath); - - CountVectorizer loadedVectorizer = CountVectorizer.Load(savePath); - Assert.Equal(countVectorizer.Uid(), loadedVectorizer.Uid()); - } - - Assert.NotEmpty(countVectorizer.ExplainParams()); - Assert.NotEmpty(countVectorizer.ToString()); - } - - [SkipIfSparkVersionIsLessThan(Versions.V2_4_0)] - public void CountVectorizer_MaxDF() - { - const double maxDf = 100; - CountVectorizer countVectorizer = new CountVectorizer().SetMaxDF(maxDf); - Assert.Equal(maxDf, countVectorizer.GetMaxDF()); - } - } -} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs deleted file mode 100644 index 5689e19fd..000000000 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizer.cs +++ /dev/null @@ -1,197 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using Microsoft.Spark.Interop; -using Microsoft.Spark.Interop.Ipc; -using Microsoft.Spark.Sql; - -namespace Microsoft.Spark.ML.Feature -{ - public class CountVectorizer : FeatureBase, IJvmObjectReferenceProvider - { - private static readonly string s_countVectorizerClassName = - "org.apache.spark.ml.feature.CountVectorizer"; - - /// - /// Create a without any parameters - /// - public CountVectorizer() : base(s_countVectorizerClassName) - { - } - - /// - /// Create a with a UID that is used to give the - /// a unique ID - /// - /// An immutable unique ID for the object and its derivatives. - public CountVectorizer(string uid) : base(s_countVectorizerClassName, uid) - { - } - - internal CountVectorizer(JvmObjectReference jvmObject) : base(jvmObject) - { - } - - JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - - /// Fits a model to the input data. - /// The to fit the model to. 
- /// - public CountVectorizerModel Fit(DataFrame dataFrame) => - new CountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("fit", dataFrame)); - - /// - /// Loads the that was previously saved using Save - /// - /// - /// The path the previous was saved to - /// - /// New object - public static CountVectorizer Load(string path) => - WrapAsCountVectorizer((JvmObjectReference) - SparkEnvironment.JvmBridge.CallStaticJavaMethod( - s_countVectorizerClassName,"load", path)); - - /// - /// Gets the binary toggle to control the output vector values. If True, all nonzero counts - /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic - /// models that model binary events rather than integer counts. Default: false - /// - /// boolean - public bool GetBinary() => (bool)_jvmObject.Invoke("getBinary"); - - /// - /// Sets the binary toggle to control the output vector values. If True, all nonzero counts - /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic - /// models that model binary events rather than integer counts. Default: false - /// - /// Turn the binary toggle on or off - /// with the new binary toggle value set - public CountVectorizer SetBinary(bool value) => - WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setBinary", value)); - - /// - /// Gets the column that the should read from and convert - /// into buckets. This would have been set by SetInputCol - /// - /// string, the input column - public string GetInputCol() => _jvmObject.Invoke("getInputCol") as string; - - /// - /// Sets the column that the should read from. - /// - /// The name of the column to as the source. - /// with the input column set - public CountVectorizer SetInputCol(string value) => - WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setInputCol", value)); - - /// - /// The will create a new column in the DataFrame, this is - /// the name of the new column. - /// - /// The name of the output column. - public string GetOutputCol() => _jvmObject.Invoke("getOutputCol") as string; - - /// - /// The will create a new column in the DataFrame, this - /// is the name of the new column. - /// - /// The name of the output column which will be created. - /// New with the output column set - public CountVectorizer SetOutputCol(string value) => - WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setOutputCol", value)); - - /// - /// Gets the maximum number of different documents a term could appear in to be included in - /// the vocabulary. A term that appears more than the threshold will be ignored. If this is - /// an integer greater than or equal to 1, this specifies the maximum number of documents - /// the term could appear in; if this is a double in [0,1), then this specifies the maximum - /// fraction of documents the term could appear in. - /// - /// The maximum document term frequency - [Since(Versions.V2_4_0)] - public double GetMaxDF() => (double)_jvmObject.Invoke("getMaxDF"); - - /// - /// Sets the maximum number of different documents a term could appear in to be included in - /// the vocabulary. A term that appears more than the threshold will be ignored. If this is - /// an integer greater than or equal to 1, this specifies the maximum number of documents - /// the term could appear in; if this is a double in [0,1), then this specifies the maximum - /// fraction of documents the term could appear in. 
- /// - /// The maximum document term frequency - /// New with the max df value set - [Since(Versions.V2_4_0)] - public CountVectorizer SetMaxDF(double value) => - WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMaxDF", value)); - - /// - /// Gets the minimum number of different documents a term must appear in to be included in - /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the - /// number of documents the term must appear in; if this is a double in [0,1), then this - /// specifies the fraction of documents. - /// - /// The minimum document term frequency - public double GetMinDF() => (double)_jvmObject.Invoke("getMinDF"); - - /// - /// Sets the minimum number of different documents a term must appear in to be included in - /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the - /// number of documents the term must appear in; if this is a double in [0,1), then this - /// specifies the fraction of documents. - /// - /// The minimum document term frequency - /// New with the min df value set - public CountVectorizer SetMinDF(double value) => - WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMinDF", value)); - - /// - /// Filter to ignore rare words in a document. For each document, terms with - /// frequency/count less than the given threshold are ignored. If this is an integer - /// greater than or equal to 1, then this specifies a count (of times the term must appear - /// in the document); if this is a double in [0,1), then this specifies a fraction (out of - /// the document's token count). - /// - /// Note that the parameter is only used in transform of CountVectorizerModel and does not - /// affect fitting. - /// - /// Minimum term frequency - public double GetMinTF() => (double)_jvmObject.Invoke("getMinTF"); - - /// - /// Filter to ignore rare words in a document. For each document, terms with - /// frequency/count less than the given threshold are ignored. If this is an integer - /// greater than or equal to 1, then this specifies a count (of times the term must appear - /// in the document); if this is a double in [0,1), then this specifies a fraction (out of - /// the document's token count). - /// - /// Note that the parameter is only used in transform of CountVectorizerModel and does not - /// affect fitting. - /// - /// Minimum term frequency - /// New with the min term frequency set - public CountVectorizer SetMinTF(double value) => - WrapAsCountVectorizer((JvmObjectReference)_jvmObject.Invoke("setMinTF", value)); - - /// - /// Gets the max size of the vocabulary. CountVectorizer will build a vocabulary that only - /// considers the top vocabSize terms ordered by term frequency across the corpus. - /// - /// The max size of the vocabulary - public int GetVocabSize() => (int)_jvmObject.Invoke("getVocabSize"); - - /// - /// Sets the max size of the vocabulary. will build a - /// vocabulary that only considers the top vocabSize terms ordered by term frequency across - /// the corpus. 
- /// - /// The max vocabulary size - /// with the max vocab value set - public CountVectorizer SetVocabSize(int value) => - WrapAsCountVectorizer(_jvmObject.Invoke("setVocabSize", value)); - - private static CountVectorizer WrapAsCountVectorizer(object obj) => - new CountVectorizer((JvmObjectReference)obj); - } -} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs b/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs deleted file mode 100644 index 52bbd72c3..000000000 --- a/src/csharp/Microsoft.Spark/ML/Feature/CountVectorizerModel.cs +++ /dev/null @@ -1,170 +0,0 @@ -// Licensed to the .NET Foundation under one or more agreements. -// The .NET Foundation licenses this file to you under the MIT license. -// See the LICENSE file in the project root for more information. - -using System.Collections.Generic; -using Microsoft.Spark.Interop; -using Microsoft.Spark.Interop.Ipc; - -namespace Microsoft.Spark.ML.Feature -{ - public class CountVectorizerModel : FeatureBase - , IJvmObjectReferenceProvider - { - private static readonly string s_countVectorizerModelClassName = - "org.apache.spark.ml.feature.CountVectorizerModel"; - - /// - /// Create a without any parameters - /// - /// The vocabulary to use - public CountVectorizerModel(List vocabulary) : - this(SparkEnvironment.JvmBridge.CallConstructor( - s_countVectorizerModelClassName, vocabulary)) - { - } - - /// - /// Create a with a UID that is used to give the - /// a unique ID - /// - /// An immutable unique ID for the object and its derivatives. - /// The vocabulary to use - public CountVectorizerModel(string uid, List vocabulary) : - this(SparkEnvironment.JvmBridge.CallConstructor( - s_countVectorizerModelClassName, uid, vocabulary)) - { - } - - internal CountVectorizerModel(JvmObjectReference jvmObject) : base(jvmObject) - { - } - - JvmObjectReference IJvmObjectReferenceProvider.Reference => _jvmObject; - - /// - /// Loads the that was previously saved using Save - /// - /// - /// The path the previous was saved to - /// - /// New object - public static CountVectorizerModel Load(string path) => - WrapAsCountVectorizerModel((JvmObjectReference) - SparkEnvironment.JvmBridge.CallStaticJavaMethod( - s_countVectorizerModelClassName,"load", path)); - - /// - /// Gets the binary toggle to control the output vector values. If True, all nonzero counts - /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic - /// models that model binary events rather than integer counts. Default: false - /// - /// boolean - public bool GetBinary() => (bool)_jvmObject.Invoke("getBinary"); - - /// - /// Sets the binary toggle to control the output vector values. If True, all nonzero counts - /// (after minTF filter applied) are set to 1. This is useful for discrete probabilistic - /// models that model binary events rather than integer counts. Default: false - /// - /// Turn the binary toggle on or off - /// - /// with the new binary toggle value set - /// - public CountVectorizerModel SetBinary(bool value) => - WrapAsCountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("setBinary", value)); - - /// - /// Gets the column that the should read from and - /// convert into buckets. This would have been set by SetInputCol - /// - /// string, the input column - public string GetInputCol() => _jvmObject.Invoke("getInputCol") as string; - - /// - /// Sets the column that the should read from. - /// - /// The name of the column to as the source. 
- /// with the input column set - public CountVectorizerModel SetInputCol(string value) => - WrapAsCountVectorizerModel( - (JvmObjectReference)_jvmObject.Invoke("setInputCol", value)); - - /// - /// The will create a new column in the DataFrame, this - /// is the name of the new column. - /// - /// The name of the output column. - public string GetOutputCol() => _jvmObject.Invoke("getOutputCol") as string; - - /// - /// The will create a new column in the DataFrame, - /// this is the name of the new column. - /// - /// The name of the output column which will be created. - /// New with the output column set - public CountVectorizerModel SetOutputCol(string value) => - WrapAsCountVectorizerModel( - (JvmObjectReference)_jvmObject.Invoke("setOutputCol", value)); - - /// - /// Gets the maximum number of different documents a term could appear in to be included in - /// the vocabulary. A term that appears more than the threshold will be ignored. If this is - /// an integer greater than or equal to 1, this specifies the maximum number of documents - /// the term could appear in; if this is a double in [0,1), then this specifies the maximum - /// fraction of documents the term could appear in. - /// - /// The maximum document term frequency - public double GetMaxDF() => (double)_jvmObject.Invoke("getMaxDF"); - - /// - /// Gets the minimum number of different documents a term must appear in to be included in - /// the vocabulary. If this is an integer greater than or equal to 1, this specifies the - /// number of documents the term must appear in; if this is a double in [0,1), then this - /// specifies the fraction of documents. - /// - /// The minimum document term frequency - public double GetMinDF() => (double)_jvmObject.Invoke("getMinDF"); - - /// - /// Filter to ignore rare words in a document. For each document, terms with - /// frequency/count less than the given threshold are ignored. If this is an integer - /// greater than or equal to 1, then this specifies a count (of times the term must appear - /// in the document); if this is a double in [0,1), then this specifies a fraction (out of - /// the document's token count). - /// - /// Note that the parameter is only used in transform of CountVectorizerModel and does not - /// affect fitting. - /// - /// Minimum term frequency - public double GetMinTF() => (double)_jvmObject.Invoke("getMinTF"); - - /// - /// Filter to ignore rare words in a document. For each document, terms with - /// frequency/count less than the given threshold are ignored. If this is an integer - /// greater than or equal to 1, then this specifies a count (of times the term must appear - /// in the document); if this is a double in [0,1), then this specifies a fraction (out of - /// the document's token count). - /// - /// Note that the parameter is only used in transform of CountVectorizerModel and does not - /// affect fitting. - /// - /// Minimum term frequency - /// - /// New with the min term frequency set - /// - public CountVectorizerModel SetMinTF(double value) => - WrapAsCountVectorizerModel((JvmObjectReference)_jvmObject.Invoke("setMinTF", value)); - - /// - /// Gets the max size of the vocabulary. will build a - /// vocabulary that only considers the top vocabSize terms ordered by term frequency across - /// the corpus. 
- /// - /// The max size of the vocabulary - public int GetVocabSize() => (int)_jvmObject.Invoke("getVocabSize"); - - private static CountVectorizerModel WrapAsCountVectorizerModel(object obj) => - new CountVectorizerModel((JvmObjectReference)obj); - } -} diff --git a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs index 326268a5e..fcc90b43d 100644 --- a/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs +++ b/src/csharp/Microsoft.Spark/ML/Feature/FeatureBase.cs @@ -105,8 +105,8 @@ private static T WrapAsType(JvmObjectReference reference) .Single(c => { ParameterInfo[] parameters = c.GetParameters(); - return (parameters.Length == 1) && - (parameters[0].ParameterType == typeof(JvmObjectReference)); + return (parameters.Length == 1) && + (parameters[0].ParameterType == typeof(JvmObjectReference)); }); return (T)constructor.Invoke(new object[] {reference}); From 3c2c936b007d7b5d761fda737625dc8f7d03728b Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Fri, 14 Aug 2020 13:32:54 -0700 Subject: [PATCH 20/66] fixing merge errors --- .gitignore | 3 +++ .../Processor/BroadcastVariableProcessor.cs | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 251cfa7e2..8e67b5699 100644 --- a/.gitignore +++ b/.gitignore @@ -367,3 +367,6 @@ hs_err_pid* # The target folder contains the output of building **/target/** + +# F# vs code +.ionide/ \ No newline at end of file diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs index 41c817d02..bf8f48ed8 100644 --- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs +++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs @@ -54,7 +54,8 @@ internal BroadcastVariables Process(Stream stream) else { string path = SerDe.ReadString(stream); - using FileStream fStream = File.Open(path, FileMode.Open, FileAccess.Read); + using FileStream fStream = + File.Open(path, FileMode.Open, FileAccess.Read, FileShare.Read); object value = formatter.Deserialize(fStream); BroadcastRegistry.Add(bid, value); } From 88e834d53b7be8931147a095a7b0df3c08cd9aa8 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Wed, 19 Aug 2020 19:24:14 -0700 Subject: [PATCH 21/66] removing ionid --- .gitignore | 2 +- .ionide/symbolCache.db | Bin 28672 -> 0 bytes 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 .ionide/symbolCache.db diff --git a/.gitignore b/.gitignore index 8e67b5699..faada9c8a 100644 --- a/.gitignore +++ b/.gitignore @@ -369,4 +369,4 @@ hs_err_pid* **/target/** # F# vs code -.ionide/ \ No newline at end of file +.ionide/ diff --git a/.ionide/symbolCache.db b/.ionide/symbolCache.db deleted file mode 100644 index 43e567d6d682d85dd32b3baebb0fdf61f67c1643..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28672 zcmeHPYiuJ|6}A(f3n*{Ao>?UI&RXX3biC7$u_ zru*Uw_|aboAq4!aPz(G7Dp3^)2^9$msX|362qA<70{*l}EvWneB<{TIImtK)+7qdu zu{?L~_%Yvi&pr3fz2}^JGgp@L0vB1UR7NW^3^wa~*(5A|iH8H;*B z&*Jr7uND(C74gzvnY~{#(YNt3Bw$FukbofpLjr~b3<($#FeG3|z>t6;0j&hSxNf$G zdV0*S4h!s^BA3}J-Ki9L<b^F{6=TjCuvK9>U*;l97n^=RUn$l~UNgYJCMRsfNA1?9Bl`LCS%K#Ngzf z;{QO@KD+;){!jcL`9JXd#`A*vLwDKru4~cx6X&wy2aZ|$+xDBbpW9ZfKeZmWylYu) z{ax!q%P(3^n}21#VfwY{Wv%+^=L?ZG(qpDCr$c=8^*P6`^IVl5<5tIVd0~tSzgigM z?z5uk`LPT6Y_-By)&wRae!(ne*4gR?lUBdKT&?7)Y>8Rp+k4w%>fhA!qkaWU!g8j& z9avyP?TH38h17hd$}v3E&T>vpABO?_g&-RIX#2Phe6h%7MQ!I9p4+7FTpy5icQ=}> 
z549iWIkuWzms8^H1xPCcSS58?UH(Q%WgSo}pSi&1%ghFqw{V?jb6g|G_h<$0h{v%C z-gd4n!}2^=x>Jxh}w>8SGqHC9v|D(^V*GlIoc1w|iaru~I2##}Q_|T1nBeK3-|27DujYN}U*Qnog;yWG zZr^}f59*n?f&gs=trhybzAhapG&7CzWFvl0k6CDOn7FsU92`wI{g3@Pu)FM&(n0byiegJk8tp$;dZ(^ zv=Y$fNsAXqBh!KP@Nsv!7c*PDz?GP*+?q1NVCFOrK}H;XiU+ZH0EsJTZI6;T-JAq- zjuS+DU_#0}R28`M+eFlgwc9Ox&3NKZQZlZ_NMqF!>tue`EenJqeO= z-(k%EWxt9n4P*W<`j}){81sMGVPVYwoyPqCY2B>G{GWCz|I_n-YoKm$sAt63{IKZ@ zCjVW()Axq2+xxcnw&!C{$^DLd+4ZLDu=DedKRby1-}XDUcWqa#zqMYm{LFH`_1mrC zmLIi@o8L1(L;w7AFA|##XrEvtfaXaX7#xVzi>ihYNYj*Mww$X`*YV|QzC^=M?s7b{ zR2E&Ad_Jr7vJDN1UN$i$u~P>{b1*gdEEdg`lZyRFw(CoE4YZL{_BddI=VPNxf)tFtiS1c{kwLRP_Bs2KJFK{M=Zfgp z5MzaS^-RLY01>>A4J9(PJCPk;3|-Gg3h=}8Y*2oI=KR!=4YAJvd^~4-*c;Zwp=d)e zwB3Zp8E>CHASQ8d{J&ySn^K6#J;CrWR!`-^; zqM`|6+mM`(61?aElrp4!0${VlSjO{EDzvau3op=cAg;PpUaK$*T(-!H0bn9Ea6!6~ zfK+Z2jZ}ANN{^JVURgcM@|@U<%-5<_8pe2m6F=O3P0ZtfS{ltsMe8cM8#S4aNHRO7 zP>{8>qXSDzJ6)YVwwmL`gM=7R(3d8$>Y$^w!>jYuqOlTbI-J)^}tn~k4R`# z%gp&{VwO;uNmcVHV%BjK=nOZ4Rh#YB_F$tnC4XE!;#3Ygq;Hbn}0)EjsT`7(7Hm(^n4Shvcww9bHjiGUTkb|JUFBEjjap;AiR+{ zRfi)Sw-Q%wk3G;24hEwfM_e&LA1|CP7zp+@8d-fr1qb>{%Ti8k6mY>C>QgR<x&4FUUSXg<^HJJmg z+yu~gPaO(^pg4n>Feuf`;H>7b0t(b?!3eIh&iXMNlE!-ryH6Vpc*fwWGc*$D;%gTqk&q-Cl6 z&AhPQ$ki?Yc)V6Ocw}&wR=2ebVDZ#{FgVz|M=$;yfW}jQD)4qec<>o;`)ASWbD(2j zKwGjUe(ny9$7ZuaBe${y!12mP!8_>_%6uN&A8%dEKw#jIev4{6q2zTjkIn%KC!$yb zu1H5@5XzU1b*Mvx8ey0CdUjSW00%Hm3Vam6}SWN^JQ4~XCiyMMBOimJ2yEVw`3RFM``6MP+8rh)j(l45I9&m?x{pdOk1GyTW(8~A=7nOb-8>4TyfiALgBG{yP< zfN9y}f7L(ad(C&&`$zA~ooi^`*Tu02K?}xl$2k0 zbAGCma`TY_FI()QQ7l(SGLY<5OiOdr+o8!4+e1*~+h_)dnql67B-dwfHZZUiz zzabvn1HdDMHf^Ll5)zb@WM2U=c;2LBe==J4CfT-n1+TU#`8q*c)H!K{K&_voHgaIsS6#Dmy)4GV_@uWLz6 zCQj|6Ygy;mRq^1efCpaJLU|n1bS;|z2Cr*L98HWhx|R)qg4eYq#uF1dLT%c$kd6vq zg6CvS^dv?aO!Sley-p$ur@`DqgVL}*9Lx0e7KVs&&S^rGFGR!5--h~O1_78cOIj<(HBye+O7eATP3F@vFj0ZfVApl|kbfCFCq0p*~C8rNO| z5O|G}`oPsFL8V8~HcD~+AGF&{{x#oUeXn>w_CEI7Jzw?M-QRS_UEg(`cE0b-IsWD- z*gvut>CcRpApt`Ih6D@=7!vqDl0cTI`}N6#m<}|fmUX}y7k4Wa4`L?jmVGkJ6*QPm zYp1g@6xEwkmnZ@8xJSUAea_QpR|;ZeGEFy$x=)|HN*vD)pBhe0C^NlncK8B;yY%7BE>k_JC|$KEu;nbLo^u3l z$A=f=Oa&b*Noz^uG)L4O_hgO6Lz)T9gtSOo1n!-O7mYD7>XTNG)BbMWD7*yKTnn{p2kcErn^;5QO|^~@dfjb=g`(vO!rQ!^N%?io_SZ? 
zsF<0ra;!kxqpY{^imti@RPfs8^z`%@jfgc_8rKLgN6`#2wD@X-UA_4A%qCGc(UU6~ zuc#A&3DvB(O|fV<3^&xn+pKdkFx9uS8&=()>Y`TKtVBwi1a7T|SN5q>Q|Fbs-R8Wi r5i_V}JR8*1*wjg2b^;fqlb8T7-kvp6;i)FU08O@N<|Vu8nsWLNN`{i9 From f8baee55e14c2aebba3e49ba2469e0da5f2431bb Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Thu, 20 Aug 2020 14:37:58 -0700 Subject: [PATCH 22/66] Working changes --- .../Processor/BroadcastVariableProcessor.cs | 1 + src/csharp/Microsoft.Spark/Broadcast.cs | 24 +++++++++---------- src/csharp/Microsoft.Spark/ChunkedStream.cs | 10 ++++---- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs index 5feb2f113..d84a844aa 100644 --- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs +++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs @@ -79,6 +79,7 @@ internal BroadcastVariables Process(Stream stream) BroadcastRegistry.Remove(bid); } } + socket.Dispose(); return broadcastVars; } } diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index f8fa019f3..067ac01ab 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -5,14 +5,12 @@ using System.Net; using System.Runtime.Serialization; using System.Runtime.Serialization.Formatters.Binary; -using System.Security.Cryptography; using System.Threading; using Microsoft.Spark.Interop; using Microsoft.Spark.Interop.Ipc; using Microsoft.Spark.Network; using Microsoft.Spark.Services; - namespace Microsoft.Spark { /// @@ -29,6 +27,10 @@ public sealed class Broadcast : IJvmObjectReferenceProvider private readonly string _path; [NonSerialized] private readonly JvmObjectReference _jvmObject; + [NonSerialized] + private readonly SparkContext _sc; + [NonSerialized] + private JvmObjectReference _pythonBroadcast; private readonly long _bid; @@ -36,6 +38,7 @@ internal Broadcast(SparkContext sc, T value) { _path = CreateTempFilePath(sc.GetConf()); _jvmObject = CreateBroadcast(sc, value); + _sc = sc; _bid = (long)_jvmObject.Invoke("id"); } @@ -173,34 +176,31 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( bool encryptionEnabled = bool.Parse( sc.GetConf().Get("spark.io.encryption.enabled", "false")); - var pythonBroadcast = (JvmObjectReference)javaSparkContext.Jvm.CallStaticJavaMethod( + _pythonBroadcast = (JvmObjectReference)javaSparkContext.Jvm.CallStaticJavaMethod( "org.apache.spark.api.python.PythonRDD", "setupBroadcast", _path); if (encryptionEnabled) { - var pair = (JvmObjectReference[])pythonBroadcast.Invoke("setupEncryptionServer"); + var pair = (JvmObjectReference[])_pythonBroadcast.Invoke("setupEncryptionServer"); using ISocketWrapper socket = SocketFactory.CreateSocket(); socket.Connect( IPAddress.Loopback, (int)pair[0].Invoke("intValue"), (string)pair[1].Invoke("toString")); - ChunkedStream bdrcstChunked = new ChunkedStream(socket.OutputStream, 8192); - byte[] values = new byte[] { 0x80, 0x02, (byte)'X', 0x05, 0x00, 0x00, 0x00, - (byte)'h', (byte)'e', (byte)'l', (byte)'l', (byte)'o', (byte)'q', 0x00, (byte)'.' 
}; - bdrcstChunked.Write(values); - bdrcstChunked.Close(); - //socket.OutputStream.Flush(); - pythonBroadcast.Invoke("waitTillDataReceived"); + ChunkedStream chunked = new ChunkedStream(socket.OutputStream, 8192); + chunked.Write(value); + chunked.Close(); + _pythonBroadcast.Invoke("waitTillDataReceived"); } else { WriteToFile(value); } - return (JvmObjectReference)javaSparkContext.Invoke("broadcast", pythonBroadcast); + return (JvmObjectReference)javaSparkContext.Invoke("broadcast", _pythonBroadcast); } /// diff --git a/src/csharp/Microsoft.Spark/ChunkedStream.cs b/src/csharp/Microsoft.Spark/ChunkedStream.cs index a5ec88f90..f756a4a6f 100644 --- a/src/csharp/Microsoft.Spark/ChunkedStream.cs +++ b/src/csharp/Microsoft.Spark/ChunkedStream.cs @@ -36,11 +36,9 @@ internal void WriteInt(int value, Stream stream) internal byte[] ConvertToByteArray(object value) { var formatter = new BinaryFormatter(); - using (var ms = new MemoryStream()) - { - formatter.Serialize(ms, value); - return ms.ToArray(); - } + using var ms = new MemoryStream(); + formatter.Serialize(ms, value); + return ms.ToArray(); } public void Write(object value) @@ -78,7 +76,7 @@ public void Close() if (_currentPos > 0) { WriteInt(_currentPos, _wrapped); - _wrapped.Write(_buffer, 0, _currentPos + 1); + _wrapped.Write(_buffer, 0, _currentPos); } // -1 length indicates to the receiving end that we're done. WriteInt(-1, _wrapped); From e77881ea7de4065f96e9fc5fd37e4b702b203ff9 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Fri, 21 Aug 2020 21:45:59 -0700 Subject: [PATCH 23/66] Fixing worker unit tests failing --- .../Processor/BroadcastVariableProcessor.cs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs index d84a844aa..553782e86 100644 --- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs +++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs @@ -40,11 +40,14 @@ internal BroadcastVariables Process(Stream stream) { broadcastVars.DecryptionServerPort = SerDe.ReadInt32(stream); broadcastVars.Secret = SerDe.ReadString(stream); - socket = SocketFactory.CreateSocket(); - socket.Connect( - IPAddress.Loopback, - broadcastVars.DecryptionServerPort, - broadcastVars.Secret); + if (broadcastVars.Count > 0) + { + socket = SocketFactory.CreateSocket(); + socket.Connect( + IPAddress.Loopback, + broadcastVars.DecryptionServerPort, + broadcastVars.Secret); + } } var formatter = new BinaryFormatter(); @@ -79,7 +82,6 @@ internal BroadcastVariables Process(Stream stream) BroadcastRegistry.Remove(bid); } } - socket.Dispose(); return broadcastVars; } } From e3ab1a758cb818523fffbfebedf132cfe267333f Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Fri, 21 Aug 2020 22:10:32 -0700 Subject: [PATCH 24/66] Adding comments and cleaning code --- src/csharp/Microsoft.Spark/ChunkedStream.cs | 54 +++++++++++++++------ 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/src/csharp/Microsoft.Spark/ChunkedStream.cs b/src/csharp/Microsoft.Spark/ChunkedStream.cs index f756a4a6f..2dbfe7164 100644 --- a/src/csharp/Microsoft.Spark/ChunkedStream.cs +++ b/src/csharp/Microsoft.Spark/ChunkedStream.cs @@ -1,30 +1,39 @@ using System; -using System.Collections.Generic; using System.IO; -using System.Linq; -using System.Net.Sockets; using System.Runtime.Serialization.Formatters.Binary; -using System.Security.Cryptography; -using 
System.Text; -using Microsoft.Spark.Interop.Ipc; namespace Microsoft.Spark { + /// + /// This is a stream-like object that takes a stream of data, of unknown length, and breaks it + /// into fixed length frames.The intended use case is serializing large data and sending it + /// immediately over a socket -- we do not want to buffer the entire data before sending it, + /// but the receiving end needs to know whether or not there is more data coming. + /// It works by buffering the incoming data in some fixed-size chunks. If the buffer is full, + /// it first sends the buffer size, then the data. This repeats as long as there is more data + /// to send. When this is closed, it sends the length of whatever data is in the buffer, then + /// that data, and finally a "length" of -1 to indicate the stream has completed. + /// public class ChunkedStream { private readonly int _bufferSize; - private byte[] _buffer; + private readonly byte[] _buffer; private int _currentPos; - private Stream _wrapped; + private readonly Stream _stream; - internal ChunkedStream(Stream wrapped, int bufferSize) + internal ChunkedStream(Stream stream, int bufferSize) { _bufferSize = bufferSize; _buffer = new byte[_bufferSize]; _currentPos = 0; - _wrapped = wrapped; + _stream = stream; } + /// + /// Writes the given integer value into the stream in Big Endian format. + /// + /// Int value to write to stream. + /// Stream to write value into. internal void WriteInt(int value, Stream stream) { byte[] bytes = BitConverter.GetBytes(value); @@ -33,6 +42,11 @@ internal void WriteInt(int value, Stream stream) stream.Write(bytes, 0, bytes.Length); } + /// + /// Converts the given object value into array of bytes. + /// + /// Value of type object to convert to byte array. + /// Array of bytes internal byte[] ConvertToByteArray(object value) { var formatter = new BinaryFormatter(); @@ -41,6 +55,10 @@ internal byte[] ConvertToByteArray(object value) return ms.ToArray(); } + /// + /// Writes the value into the stream of type in fixed chunks. + /// + /// Value of type object to write. public void Write(object value) { byte[] bytes = ConvertToByteArray(value); @@ -61,8 +79,8 @@ public void Write(object value) int spaceLeft = _bufferSize - _currentPos; int newBytePos = bytePos + spaceLeft; Array.Copy(bytes, bytePos, _buffer, _currentPos, spaceLeft); - WriteInt(_bufferSize, _wrapped); - _wrapped.Write(_buffer, 0, _bufferSize); + WriteInt(_bufferSize, _stream); + _stream.Write(_buffer, 0, _bufferSize); bytesRemaining -= spaceLeft; bytePos = newBytePos; _currentPos = 0; @@ -70,17 +88,21 @@ public void Write(object value) } } + /// + /// Writes the remaining bytes left in _buffer and finishes it by writing -1 to the _stream + /// and then closing it. + /// public void Close() { // If there is anything left in the buffer, write it out first. if (_currentPos > 0) { - WriteInt(_currentPos, _wrapped); - _wrapped.Write(_buffer, 0, _currentPos); + WriteInt(_currentPos, _stream); + _stream.Write(_buffer, 0, _currentPos); } // -1 length indicates to the receiving end that we're done. 
- WriteInt(-1, _wrapped); - _wrapped.Close(); + WriteInt(-1, _stream); + _stream.Close(); } } } From 0cb63ef3a0cd8e085e154e22e9665ee430cdf6f0 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Fri, 21 Aug 2020 23:52:11 -0700 Subject: [PATCH 25/66] fixing pipeline hung --- src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 41de9c6e3..0f718511d 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -1,5 +1,6 @@ using System; using System.Linq; +using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.Sql; using Xunit; using static Microsoft.Spark.Sql.Functions; @@ -35,7 +36,7 @@ public BroadcastTests(SparkFixture fixture) /// Test Broadcast support by using multiple broadcast variables in a UDF with /// encryption enabled. /// - [Fact] + [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] public void TestMultipleBroadcastWithEncryption() { _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); From b25d73c194816fd8ed7a85527a7b5bfca4e2efa2 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sun, 23 Aug 2020 17:48:21 -0700 Subject: [PATCH 26/66] PR review comments --- .../IpcTests/BroadcastTests.cs | 1 + src/csharp/Microsoft.Spark/Broadcast.cs | 9 +- .../Interop/Ipc/ChunkedStream.cs | 108 ++++++++++++++++++ 3 files changed, 111 insertions(+), 7 deletions(-) create mode 100644 src/csharp/Microsoft.Spark/Interop/Ipc/ChunkedStream.cs diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 0f718511d..a8d2f44a7 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -61,6 +61,7 @@ public void TestMultipleBroadcastWithEncryption() [Fact] public void TestMultipleBroadcastWithoutEncryption() { + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "false"); var obj1 = new TestBroadcastVariable(1, "first"); var obj2 = new TestBroadcastVariable(2, "second"); Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index 067ac01ab..e862172da 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -27,10 +27,6 @@ public sealed class Broadcast : IJvmObjectReferenceProvider private readonly string _path; [NonSerialized] private readonly JvmObjectReference _jvmObject; - [NonSerialized] - private readonly SparkContext _sc; - [NonSerialized] - private JvmObjectReference _pythonBroadcast; private readonly long _bid; @@ -38,7 +34,6 @@ internal Broadcast(SparkContext sc, T value) { _path = CreateTempFilePath(sc.GetConf()); _jvmObject = CreateBroadcast(sc, value); - _sc = sc; _bid = (long)_jvmObject.Invoke("id"); } @@ -176,7 +171,7 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( bool encryptionEnabled = bool.Parse( sc.GetConf().Get("spark.io.encryption.enabled", "false")); - _pythonBroadcast = (JvmObjectReference)javaSparkContext.Jvm.CallStaticJavaMethod( + var _pythonBroadcast = (JvmObjectReference)javaSparkContext.Jvm.CallStaticJavaMethod( "org.apache.spark.api.python.PythonRDD", "setupBroadcast", _path); @@ -190,7 +185,7 @@ private JvmObjectReference 
CreateBroadcast_V2_3_2_AndAbove( IPAddress.Loopback, (int)pair[0].Invoke("intValue"), (string)pair[1].Invoke("toString")); - ChunkedStream chunked = new ChunkedStream(socket.OutputStream, 8192); + var chunked = new ChunkedStream(socket.OutputStream, 8192); chunked.Write(value); chunked.Close(); _pythonBroadcast.Invoke("waitTillDataReceived"); diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/ChunkedStream.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/ChunkedStream.cs new file mode 100644 index 000000000..fa8a7647f --- /dev/null +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/ChunkedStream.cs @@ -0,0 +1,108 @@ +using System; +using System.IO; +using System.Runtime.Serialization.Formatters.Binary; + +namespace Microsoft.Spark.Interop.Ipc +{ + /// + /// This is a stream-like object that takes a stream of data, of unknown length, and breaks it + /// into fixed length frames. The intended use case is serializing large data and sending it + /// immediately over a socket -- we do not want to buffer the entire data before sending it, + /// but the receiving end needs to know whether or not there is more data coming. + /// It works by buffering the incoming data in some fixed-size chunks. If the buffer is full, + /// it first sends the buffer size, then the data. This repeats as long as there is more data + /// to send. When this is closed, it sends the length of whatever data is in the buffer, then + /// that data, and finally a "length" of -1 to indicate the stream has completed. + /// + public class ChunkedStream + { + private readonly int _bufferSize; + private readonly byte[] _buffer; + private int _currentPos; + private readonly Stream _stream; + + internal ChunkedStream(Stream stream, int bufferSize) + { + _bufferSize = bufferSize; + _buffer = new byte[_bufferSize]; + _currentPos = 0; + _stream = stream; + } + + /// + /// Writes the value into the stream of type in fixed chunks. + /// + /// Value of type object to write. + public void Write(object value) + { + byte[] bytes = ConvertToByteArray(value); + int bytePos = 0; + int bytesRemaining = bytes.Length; + while (bytesRemaining > 0) + { + int newPos = bytesRemaining + _currentPos; + if (newPos < _bufferSize) + { + Array.Copy(bytes, bytePos, _buffer, _currentPos, bytesRemaining); + _currentPos = newPos; + bytesRemaining = 0; + } + else + { + // Fill the buffer, send the length then the contents, and start filling again. + int spaceLeft = _bufferSize - _currentPos; + int newBytePos = bytePos + spaceLeft; + Array.Copy(bytes, bytePos, _buffer, _currentPos, spaceLeft); + WriteInt(_bufferSize, _stream); + SerDe.Write(_stream, _buffer); + bytesRemaining -= spaceLeft; + bytePos = newBytePos; + _currentPos = 0; + } + } + } + + /// + /// Writes the remaining bytes left in _buffer and finishes it by writing -1 to the _stream + /// and then closing it. + /// + public void Close() + { + // If there is anything left in the buffer, write it out first. + if (_currentPos > 0) + { + WriteInt(_currentPos, _stream); + SerDe.Write(_stream, _buffer, _currentPos); + } + // -1 length indicates to the receiving end that we're done. + WriteInt(-1, _stream); + _stream.Close(); + } + + /// + /// Writes the given integer value into the stream in Big Endian format. + /// + /// Int value to write to stream. + /// Stream to write value into. 
+ internal void WriteInt(int value, Stream stream) + { + byte[] bytes = BitConverter.GetBytes(value); + if (BitConverter.IsLittleEndian) + Array.Reverse(bytes); + SerDe.Write(stream, bytes); + } + + /// + /// Converts the given object value into array of bytes. + /// + /// Value of type object to convert to byte array. + /// Array of bytes + internal byte[] ConvertToByteArray(object value) + { + var formatter = new BinaryFormatter(); + using var ms = new MemoryStream(); + formatter.Serialize(ms, value); + return ms.ToArray(); + } + } +} From 7582c857ae43aaf1e3e99595ca67fbd8a58d4dae Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Mon, 24 Aug 2020 00:10:20 -0700 Subject: [PATCH 27/66] adding broadcast test to pipeline to filter out --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 8ba73e0c1..36ead1bc1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -17,6 +17,7 @@ variables: TestsToFilterOut: "(FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.DataFrameTests.TestDataFrameGroupedMapUdf)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.DataFrameTests.TestDataFrameVectorUdf)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestDestroy)&\ + (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithoutEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestUnpersist)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithReturnAsTimestampType)&\ From 7de60e3c7b536484916f593c5f4588131d04d166 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Wed, 26 Aug 2020 11:05:53 -0700 Subject: [PATCH 28/66] PR review changes --- .../Processor/BroadcastVariableProcessor.cs | 2 ++ src/csharp/Microsoft.Spark/ChunkedStream.cs | 20 ++++--------------- 2 files changed, 6 insertions(+), 16 deletions(-) diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs index 553782e86..d705f3efb 100644 --- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs +++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs @@ -82,6 +82,8 @@ internal BroadcastVariables Process(Stream stream) BroadcastRegistry.Remove(bid); } } + if (socket != null) + socket.Dispose(); return broadcastVars; } } diff --git a/src/csharp/Microsoft.Spark/ChunkedStream.cs b/src/csharp/Microsoft.Spark/ChunkedStream.cs index 2dbfe7164..df43b6993 100644 --- a/src/csharp/Microsoft.Spark/ChunkedStream.cs +++ b/src/csharp/Microsoft.Spark/ChunkedStream.cs @@ -1,6 +1,7 @@ using System; using System.IO; using System.Runtime.Serialization.Formatters.Binary; +using Microsoft.Spark.Interop.Ipc; namespace Microsoft.Spark { @@ -29,19 +30,6 @@ internal ChunkedStream(Stream stream, int bufferSize) _stream = stream; } - /// - /// Writes the given integer value into the stream in Big Endian format. - /// - /// Int value to write to stream. - /// Stream to write value into. - internal void WriteInt(int value, Stream stream) - { - byte[] bytes = BitConverter.GetBytes(value); - if (BitConverter.IsLittleEndian) - Array.Reverse(bytes); - stream.Write(bytes, 0, bytes.Length); - } - /// /// Converts the given object value into array of bytes. 
/// @@ -79,7 +67,7 @@ public void Write(object value) int spaceLeft = _bufferSize - _currentPos; int newBytePos = bytePos + spaceLeft; Array.Copy(bytes, bytePos, _buffer, _currentPos, spaceLeft); - WriteInt(_bufferSize, _stream); + SerDe.Write(_stream, _bufferSize); _stream.Write(_buffer, 0, _bufferSize); bytesRemaining -= spaceLeft; bytePos = newBytePos; @@ -97,11 +85,11 @@ public void Close() // If there is anything left in the buffer, write it out first. if (_currentPos > 0) { - WriteInt(_currentPos, _stream); + SerDe.Write(_stream, _currentPos); _stream.Write(_buffer, 0, _currentPos); } // -1 length indicates to the receiving end that we're done. - WriteInt(-1, _stream); + SerDe.Write(_stream, -1); _stream.Close(); } } From edd098e4f2d51a94eaa1bc41fc96ff6918f62db3 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Thu, 27 Aug 2020 14:12:07 -0700 Subject: [PATCH 29/66] removing extra copy of file --- src/csharp/Microsoft.Spark/ChunkedStream.cs | 96 --------------------- 1 file changed, 96 deletions(-) delete mode 100644 src/csharp/Microsoft.Spark/ChunkedStream.cs diff --git a/src/csharp/Microsoft.Spark/ChunkedStream.cs b/src/csharp/Microsoft.Spark/ChunkedStream.cs deleted file mode 100644 index df43b6993..000000000 --- a/src/csharp/Microsoft.Spark/ChunkedStream.cs +++ /dev/null @@ -1,96 +0,0 @@ -using System; -using System.IO; -using System.Runtime.Serialization.Formatters.Binary; -using Microsoft.Spark.Interop.Ipc; - -namespace Microsoft.Spark -{ - /// - /// This is a stream-like object that takes a stream of data, of unknown length, and breaks it - /// into fixed length frames.The intended use case is serializing large data and sending it - /// immediately over a socket -- we do not want to buffer the entire data before sending it, - /// but the receiving end needs to know whether or not there is more data coming. - /// It works by buffering the incoming data in some fixed-size chunks. If the buffer is full, - /// it first sends the buffer size, then the data. This repeats as long as there is more data - /// to send. When this is closed, it sends the length of whatever data is in the buffer, then - /// that data, and finally a "length" of -1 to indicate the stream has completed. - /// - public class ChunkedStream - { - private readonly int _bufferSize; - private readonly byte[] _buffer; - private int _currentPos; - private readonly Stream _stream; - - internal ChunkedStream(Stream stream, int bufferSize) - { - _bufferSize = bufferSize; - _buffer = new byte[_bufferSize]; - _currentPos = 0; - _stream = stream; - } - - /// - /// Converts the given object value into array of bytes. - /// - /// Value of type object to convert to byte array. - /// Array of bytes - internal byte[] ConvertToByteArray(object value) - { - var formatter = new BinaryFormatter(); - using var ms = new MemoryStream(); - formatter.Serialize(ms, value); - return ms.ToArray(); - } - - /// - /// Writes the value into the stream of type in fixed chunks. - /// - /// Value of type object to write. - public void Write(object value) - { - byte[] bytes = ConvertToByteArray(value); - int bytePos = 0; - int bytesRemaining = bytes.Length; - while (bytesRemaining > 0) - { - int newPos = bytesRemaining + _currentPos; - if (newPos < _bufferSize) - { - Array.Copy(bytes, bytePos, _buffer, _currentPos, bytesRemaining); - _currentPos = newPos; - bytesRemaining = 0; - } - else - { - // Fill the buffer, send the length then the contents, and start filling again. 
- int spaceLeft = _bufferSize - _currentPos; - int newBytePos = bytePos + spaceLeft; - Array.Copy(bytes, bytePos, _buffer, _currentPos, spaceLeft); - SerDe.Write(_stream, _bufferSize); - _stream.Write(_buffer, 0, _bufferSize); - bytesRemaining -= spaceLeft; - bytePos = newBytePos; - _currentPos = 0; - } - } - } - - /// - /// Writes the remaining bytes left in _buffer and finishes it by writing -1 to the _stream - /// and then closing it. - /// - public void Close() - { - // If there is anything left in the buffer, write it out first. - if (_currentPos > 0) - { - SerDe.Write(_stream, _currentPos); - _stream.Write(_buffer, 0, _currentPos); - } - // -1 length indicates to the receiving end that we're done. - SerDe.Write(_stream, -1); - _stream.Close(); - } - } -} From f8b9071f1bd46f06bf8ebac13ee897f6a060b0a6 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Fri, 28 Aug 2020 18:42:14 -0700 Subject: [PATCH 30/66] Removing ChunkedStream class --- src/csharp/Microsoft.Spark/Broadcast.cs | 41 ++++++- .../Interop/Ipc/ChunkedStream.cs | 108 ------------------ .../Microsoft.Spark/Interop/Ipc/SerDe.cs | 11 ++ 3 files changed, 49 insertions(+), 111 deletions(-) delete mode 100644 src/csharp/Microsoft.Spark/Interop/Ipc/ChunkedStream.cs diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index e862172da..b28aaf9f6 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -2,6 +2,7 @@ using System.Collections.Concurrent; using System.Collections.Generic; using System.IO; +using System.Linq; using System.Net; using System.Runtime.Serialization; using System.Runtime.Serialization.Formatters.Binary; @@ -185,9 +186,8 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( IPAddress.Loopback, (int)pair[0].Invoke("intValue"), (string)pair[1].Invoke("toString")); - var chunked = new ChunkedStream(socket.OutputStream, 8192); - chunked.Write(value); - chunked.Close(); + WriteInChunks(value, 8192, socket.OutputStream); + socket.OutputStream.Close(); _pythonBroadcast.Invoke("waitTillDataReceived"); } else @@ -198,6 +198,41 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( return (JvmObjectReference)javaSparkContext.Invoke("broadcast", _pythonBroadcast); } + /// TODO: This is not performant as it writes to stream only after serializing the whole + /// value, instead of serializing and writing in chunks like Python. + /// + /// Function to write the broadcast value into the stream in fixed size chunks. + /// + /// Broadcast value to be written to the stream + /// Size of chunk to write values in + /// Stream connecting to encryption server to write value to + private void WriteInChunks(object value, int chunkSize, Stream stream) + { + var formatter = new BinaryFormatter(); + using var ms = new MemoryStream(); + formatter.Serialize(ms, value); + byte[] valueBytes = ms.ToArray(); + int bytesRemaining = valueBytes.Length; + int bytePos = 0; + while (bytesRemaining > 0) + { + if (bytesRemaining < chunkSize) + { + SerDe.Write(stream, bytesRemaining); + SerDe.Write(stream, valueBytes, bytePos, bytesRemaining); + bytesRemaining = 0; + } + else + { + SerDe.Write(stream, chunkSize); + SerDe.Write(stream, valueBytes, bytePos, chunkSize); + bytePos += chunkSize; + bytesRemaining -= chunkSize; + } + } + SerDe.Write(stream, -1); + } + /// /// Function that creates a file in _path to store the broadcast value in the given path. 
/// diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/ChunkedStream.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/ChunkedStream.cs deleted file mode 100644 index fa8a7647f..000000000 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/ChunkedStream.cs +++ /dev/null @@ -1,108 +0,0 @@ -using System; -using System.IO; -using System.Runtime.Serialization.Formatters.Binary; - -namespace Microsoft.Spark.Interop.Ipc -{ - /// - /// This is a stream-like object that takes a stream of data, of unknown length, and breaks it - /// into fixed length frames. The intended use case is serializing large data and sending it - /// immediately over a socket -- we do not want to buffer the entire data before sending it, - /// but the receiving end needs to know whether or not there is more data coming. - /// It works by buffering the incoming data in some fixed-size chunks. If the buffer is full, - /// it first sends the buffer size, then the data. This repeats as long as there is more data - /// to send. When this is closed, it sends the length of whatever data is in the buffer, then - /// that data, and finally a "length" of -1 to indicate the stream has completed. - /// - public class ChunkedStream - { - private readonly int _bufferSize; - private readonly byte[] _buffer; - private int _currentPos; - private readonly Stream _stream; - - internal ChunkedStream(Stream stream, int bufferSize) - { - _bufferSize = bufferSize; - _buffer = new byte[_bufferSize]; - _currentPos = 0; - _stream = stream; - } - - /// - /// Writes the value into the stream of type in fixed chunks. - /// - /// Value of type object to write. - public void Write(object value) - { - byte[] bytes = ConvertToByteArray(value); - int bytePos = 0; - int bytesRemaining = bytes.Length; - while (bytesRemaining > 0) - { - int newPos = bytesRemaining + _currentPos; - if (newPos < _bufferSize) - { - Array.Copy(bytes, bytePos, _buffer, _currentPos, bytesRemaining); - _currentPos = newPos; - bytesRemaining = 0; - } - else - { - // Fill the buffer, send the length then the contents, and start filling again. - int spaceLeft = _bufferSize - _currentPos; - int newBytePos = bytePos + spaceLeft; - Array.Copy(bytes, bytePos, _buffer, _currentPos, spaceLeft); - WriteInt(_bufferSize, _stream); - SerDe.Write(_stream, _buffer); - bytesRemaining -= spaceLeft; - bytePos = newBytePos; - _currentPos = 0; - } - } - } - - /// - /// Writes the remaining bytes left in _buffer and finishes it by writing -1 to the _stream - /// and then closing it. - /// - public void Close() - { - // If there is anything left in the buffer, write it out first. - if (_currentPos > 0) - { - WriteInt(_currentPos, _stream); - SerDe.Write(_stream, _buffer, _currentPos); - } - // -1 length indicates to the receiving end that we're done. - WriteInt(-1, _stream); - _stream.Close(); - } - - /// - /// Writes the given integer value into the stream in Big Endian format. - /// - /// Int value to write to stream. - /// Stream to write value into. - internal void WriteInt(int value, Stream stream) - { - byte[] bytes = BitConverter.GetBytes(value); - if (BitConverter.IsLittleEndian) - Array.Reverse(bytes); - SerDe.Write(stream, bytes); - } - - /// - /// Converts the given object value into array of bytes. - /// - /// Value of type object to convert to byte array. 
- /// Array of bytes - internal byte[] ConvertToByteArray(object value) - { - var formatter = new BinaryFormatter(); - using var ms = new MemoryStream(); - formatter.Serialize(ms, value); - return ms.ToArray(); - } - } -} diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/SerDe.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/SerDe.cs index c2c742e87..937a0e007 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/SerDe.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/SerDe.cs @@ -282,6 +282,17 @@ public static void Write(Stream s, byte[] value) => public static void Write(Stream s, byte[] value, int count) => s.Write(value, 0, count); + /// + /// Writes a byte array to a stream + /// + /// The stream to write + /// The byte array to write + /// The zero-based byte offset in array at which to begin copying + /// bytes + /// The number of bytes in the array to write. + public static void Write(Stream s, byte[] value, int offset, int count) => + s.Write(value, offset, count); + /// /// Writes a boolean to a stream /// From f83ed475473a02c90405551ecea8dc70006decd3 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Fri, 28 Aug 2020 18:44:00 -0700 Subject: [PATCH 31/66] removing unused library --- src/csharp/Microsoft.Spark/Broadcast.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index b28aaf9f6..bf60b7985 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -2,7 +2,6 @@ using System.Collections.Concurrent; using System.Collections.Generic; using System.IO; -using System.Linq; using System.Net; using System.Runtime.Serialization; using System.Runtime.Serialization.Formatters.Binary; From 27c9af2e9bca8d4f6f6e24a4e408a0c78a48f702 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sat, 29 Aug 2020 13:35:58 -0700 Subject: [PATCH 32/66] PR review changes --- azure-pipelines.yml | 1 + .../IpcTests/BroadcastTests.cs | 25 +++++++++++++++++ .../IpcTests/Sql/DataFrameTests.cs | 2 ++ .../IpcTests/Sql/SparkSessionTests.cs | 1 + .../UdfTests/UdfSimpleTypesTests.cs | 2 ++ .../Processor/BroadcastVariableProcessor.cs | 6 +++-- src/csharp/Microsoft.Spark/Broadcast.cs | 27 +++++-------------- 7 files changed, 41 insertions(+), 23 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 36ead1bc1..b530adbc3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -20,6 +20,7 @@ variables: (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithoutEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestUnpersist)&\ + (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestLargeBroadcastValueWithEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithReturnAsTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.SparkSessionTests.TestCreateDataFrameWithTimestamp)" diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index a8d2f44a7..aba6da17f 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -35,6 +35,7 @@ public 
BroadcastTests(SparkFixture fixture) /// /// Test Broadcast support by using multiple broadcast variables in a UDF with /// encryption enabled. + /// This test is filtered out when backward compatibility tests are run. /// [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] public void TestMultipleBroadcastWithEncryption() @@ -57,6 +58,7 @@ public void TestMultipleBroadcastWithEncryption() /// /// Test Broadcast support by using multiple broadcast variables in a UDF. + /// This test is filtered out when backward compatibility tests are run. /// [Fact] public void TestMultipleBroadcastWithoutEncryption() @@ -76,9 +78,31 @@ public void TestMultipleBroadcastWithoutEncryption() Assert.Equal(expected, actual); } + /// + /// Test Broadcast with encryption support by broadcasting a large (>100MB) object. + /// This test is filtered out when backward compatibility tests are run. + /// + [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] + public void TestLargeBroadcastValueWithEncryption() + { + var broadcastValue = new byte[104858000]; + Broadcast bc1 = _spark.SparkContext.Broadcast(broadcastValue); + + Func udf = Udf( + str => $"{str}: length of broadcast array = {bc1.Value().Length}"); + + var expected = new string[] { + "hello: length of broadcast array = 104858000", + "world: length of broadcast array = 104858000" }; + + string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + Assert.Equal(expected, actual); + } + /// /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast /// variable and makes it inaccessible from workers. + /// This test is filtered out when backward compatibility tests are run. /// [Fact] public void TestDestroy() @@ -120,6 +144,7 @@ public void TestDestroy() /// /// Test Broadcast.Unpersist() deletes cached copies of the broadcast on the executors. If /// the broadcast is used after unpersist is called, it is re-sent to the executors. + /// This test is filtered out when backward compatibility tests are run. /// [Fact] public void TestUnpersist() diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs index c2dc897db..06b5e68d8 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs @@ -225,6 +225,7 @@ public void TestVectorUdf() } [Fact] + // This test is filtered out when backward compatibility tests are run. public void TestDataFrameVectorUdf() { Func udf1Func = @@ -370,6 +371,7 @@ private static RecordBatch ArrowBasedCountCharacters(RecordBatch records) [Fact] + // This test is filtered out when backward compatibility tests are run. public void TestDataFrameGroupedMapUdf() { DataFrame df = _spark diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs index 5a70a6698..14430206b 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs @@ -172,6 +172,7 @@ public void TestCreateDataFrame() /// /// Test CreateDataFrame API with Timestamp as data + /// This test is filtered out when backward compatibility tests are run. 
/// [Fact] public void TestCreateDataFrameWithTimestamp() diff --git a/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs b/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs index 92422c205..2a872dc54 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs @@ -105,6 +105,7 @@ public void TestUdfWithReturnAsDateType() /// /// UDF that takes in Timestamp type. + /// This test is filtered out when backward compatibility tests are run. /// [Fact] public void TestUdfWithTimestampType() @@ -125,6 +126,7 @@ public void TestUdfWithTimestampType() /// /// UDF that returns Timestamp type. + /// This test is filtered out when backward compatibility tests are run. /// [Fact] public void TestUdfWithReturnAsTimestampType() diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs index d705f3efb..0d18eaf34 100644 --- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs +++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs @@ -61,8 +61,8 @@ internal BroadcastVariables Process(Stream stream) var readBid = SerDe.ReadInt64(socket.InputStream); if (bid != readBid) { - throw new Exception($"Encrypted broadcast id {readBid} does not " + - $"match regular stream broadcast id {bid}"); + throw new Exception($"Broadcast id {readBid} from encrypted stream " + + $"does not match broadcast id {bid} from normal stream."); } object value = formatter.Deserialize(socket.InputStream); BroadcastRegistry.Add(bid, value); @@ -83,7 +83,9 @@ internal BroadcastVariables Process(Stream stream) } } if (socket != null) + { socket.Dispose(); + } return broadcastVars; } } diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index bf60b7985..3f76ca7df 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -185,7 +185,7 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( IPAddress.Loopback, (int)pair[0].Invoke("intValue"), (string)pair[1].Invoke("toString")); - WriteInChunks(value, 8192, socket.OutputStream); + WriteInChunks(value, socket.OutputStream); socket.OutputStream.Close(); _pythonBroadcast.Invoke("waitTillDataReceived"); } @@ -200,35 +200,20 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( /// TODO: This is not performant as it writes to stream only after serializing the whole /// value, instead of serializing and writing in chunks like Python. /// - /// Function to write the broadcast value into the stream in fixed size chunks. + /// Function to write the broadcast value into the encrypted stream.
/// /// Broadcast value to be written to the stream - /// Size of chunk to write values in /// Stream connecting to encryption server to write value to - private void WriteInChunks(object value, int chunkSize, Stream stream) + private void WriteInChunks(object value, Stream stream) { var formatter = new BinaryFormatter(); using var ms = new MemoryStream(); formatter.Serialize(ms, value); byte[] valueBytes = ms.ToArray(); int bytesRemaining = valueBytes.Length; - int bytePos = 0; - while (bytesRemaining > 0) - { - if (bytesRemaining < chunkSize) - { - SerDe.Write(stream, bytesRemaining); - SerDe.Write(stream, valueBytes, bytePos, bytesRemaining); - bytesRemaining = 0; - } - else - { - SerDe.Write(stream, chunkSize); - SerDe.Write(stream, valueBytes, bytePos, chunkSize); - bytePos += chunkSize; - bytesRemaining -= chunkSize; - } - } + SerDe.Write(stream, bytesRemaining); + SerDe.Write(stream, valueBytes, 0, bytesRemaining); + // -1 length indicates to the receiving end that we're done. SerDe.Write(stream, -1); } From 63a65acfd15bb2f297df9d9a13800d0af7b1c874 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sat, 29 Aug 2020 13:44:45 -0700 Subject: [PATCH 33/66] Variable name consistency --- src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index aba6da17f..7aaa10237 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -85,8 +85,8 @@ public void TestMultipleBroadcastWithoutEncryption() [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] public void TestLargeBroadcastValueWithEncryption() { - var broadcastValue = new byte[104858000]; - Broadcast bc1 = _spark.SparkContext.Broadcast(broadcastValue); + var obj1 = new byte[104858000]; + Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); Func udf = Udf( str => $"{str}: length of broadcast array = {bc1.Value().Length}"); From 7555b935d6ede92647a867c886ccd13d07b48d09 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Mon, 31 Aug 2020 13:03:46 -0700 Subject: [PATCH 34/66] nit --- src/csharp/Microsoft.Spark/Broadcast.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index 3f76ca7df..3e25d4475 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -185,7 +185,7 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( IPAddress.Loopback, (int)pair[0].Invoke("intValue"), (string)pair[1].Invoke("toString")); - WriteInChunks(value, socket.OutputStream); + WriteEncrypted(value, socket.OutputStream); socket.OutputStream.Close(); _pythonBroadcast.Invoke("waitTillDataReceived"); } @@ -204,7 +204,7 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( /// /// Broadcast value to be written to the stream /// Stream connecting to encryption server to write value to - private void WriteInChunks(object value, Stream stream) + private void WriteEncrypted(object value, Stream stream) { var formatter = new BinaryFormatter(); using var ms = new MemoryStream(); From 6c066e2c468d4582e58e6f65d679725e120962d1 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Mon, 31 Aug 2020 14:26:10 -0700 Subject: [PATCH 35/66] debugging pipeline hang spark 2.4.0 onwards - removed large broadcast value test --- 
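[Series note, not part of the patch: the patches above settle on a simple wire format for the broadcast payload sent to the JVM encryption server, one or more length-prefixed blocks followed by a -1 sentinel, with 4-byte big-endian lengths as written by the WriteInt/SerDe.Write helpers. The sketch below shows how a receiving end could de-chunk such a stream under those assumptions; DechunkedReader and its members are hypothetical names for illustration only and are not part of Microsoft.Spark.]

using System;
using System.IO;

internal static class DechunkedReader
{
    // Hypothetical helper: reads length-prefixed chunks until the -1
    // sentinel and concatenates them into the original serialized payload.
    internal static byte[] ReadAll(Stream stream)
    {
        using var result = new MemoryStream();
        int length;
        while ((length = ReadInt32BigEndian(stream)) != -1)
        {
            var chunk = new byte[length];
            int read = 0;
            while (read < length)
            {
                int n = stream.Read(chunk, read, length - read);
                if (n <= 0)
                {
                    throw new EndOfStreamException("Stream ended mid-chunk.");
                }
                read += n;
            }
            result.Write(chunk, 0, length);
        }
        return result.ToArray();
    }

    // Assumption: lengths are 4-byte big-endian signed integers, mirroring
    // the WriteInt helper in the patches above.
    private static int ReadInt32BigEndian(Stream stream)
    {
        var bytes = new byte[4];
        for (int i = 0; i < bytes.Length; ++i)
        {
            int b = stream.ReadByte();
            if (b < 0)
            {
                throw new EndOfStreamException("Stream ended mid-length.");
            }
            bytes[i] = (byte)b;
        }
        if (BitConverter.IsLittleEndian)
        {
            Array.Reverse(bytes);
        }
        return BitConverter.ToInt32(bytes, 0);
    }
}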
azure-pipelines.yml | 1 - .../IpcTests/BroadcastTests.cs | 32 +++++++++---------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index b530adbc3..36ead1bc1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -20,7 +20,6 @@ variables: (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithoutEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestUnpersist)&\ - (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestLargeBroadcastValueWithEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithReturnAsTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.SparkSessionTests.TestCreateDataFrameWithTimestamp)" diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 7aaa10237..4f57cf2bb 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -82,22 +82,22 @@ public void TestMultipleBroadcastWithoutEncryption() /// Test Broadcast with encryption support by broadcasting a large (>100MB) object. /// This test is filtered out when backward compatibility tests are run. /// - [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] - public void TestLargeBroadcastValueWithEncryption() - { - var obj1 = new byte[104858000]; - Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - - Func udf = Udf( - str => $"{str}: length of broadcast array = {bc1.Value().Length}"); - - var expected = new string[] { - "hello: length of broadcast array = 104858000", - "world: length of broadcast array = 104858000" }; - - string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - Assert.Equal(expected, actual); - } + //[SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] + //public void TestLargeBroadcastValueWithEncryption() + //{ + // var obj1 = new byte[104858000]; + // Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + + // Func udf = Udf( + // str => $"{str}: length of broadcast array = {bc1.Value().Length}"); + + // var expected = new string[] { + // "hello: length of broadcast array = 104858000", + // "world: length of broadcast array = 104858000" }; + + // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + // Assert.Equal(expected, actual); + //} /// /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast From b51ebeec20f367040d9186d91d7345021f56f9ab Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 1 Sep 2020 11:02:37 -0700 Subject: [PATCH 36/66] Debugging pipeline hang --- azure-pipelines.yml | 1 + .../IpcTests/BroadcastTests.cs | 34 ++++++++++--------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 36ead1bc1..b530adbc3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -20,6 +20,7 @@ variables: (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithoutEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestUnpersist)&\ + 
(FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestLargeBroadcastValueWithEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithReturnAsTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.SparkSessionTests.TestCreateDataFrameWithTimestamp)" diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 4f57cf2bb..6083ab699 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -82,22 +82,24 @@ public void TestMultipleBroadcastWithoutEncryption() /// Test Broadcast with encryption support by broadcasting a large (>100MB) object. /// This test is filtered out when backward compatibility tests are run. /// - //[SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] - //public void TestLargeBroadcastValueWithEncryption() - //{ - // var obj1 = new byte[104858000]; - // Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - - // Func udf = Udf( - // str => $"{str}: length of broadcast array = {bc1.Value().Length}"); - - // var expected = new string[] { - // "hello: length of broadcast array = 104858000", - // "world: length of broadcast array = 104858000" }; - - // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - // Assert.Equal(expected, actual); - //} + [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] + public void TestLargeBroadcastValueWithEncryption() + { + var obj1 = new byte[104858000]; + Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + + Func udf = Udf( + str => $"{str}: length of broadcast array = {bc1.Value().Length}"); + + var expected = new string[] { + "hello: length of broadcast array = 104858000", + "world: length of broadcast array = 104858000" }; + + string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + Assert.Equal(expected, actual); + // Destroying broadcast variable to free up memory + bc1.Destroy(); + } /// /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast From 87db025b9c64874d6ab0e7b89de165f6186eae73 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 1 Sep 2020 18:45:12 -0700 Subject: [PATCH 37/66] PR review changes --- azure-pipelines.yml | 1 + .../IpcTests/BroadcastTests.cs | 5 -- .../IpcTests/Sql/DataFrameTests.cs | 2 - .../IpcTests/Sql/SparkSessionTests.cs | 1 - .../UdfTests/UdfSimpleTypesTests.cs | 2 - src/csharp/Microsoft.Spark/Broadcast.cs | 60 ++++++++++++------- .../Microsoft.Spark/Interop/Ipc/SerDe.cs | 11 ---- 7 files changed, 39 insertions(+), 43 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index b530adbc3..9f265db79 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -14,6 +14,7 @@ variables: # forwardCompatibleRelease/backwardCompatibleRelease is the "oldest" releases that work with the current release forwardCompatibleRelease: '0.9.0' backwardCompatibleRelease: '0.9.0' + # TestsToFilterOut are tests filtered out when backward compatibility tests are run. 
TestsToFilterOut: "(FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.DataFrameTests.TestDataFrameGroupedMapUdf)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.DataFrameTests.TestDataFrameVectorUdf)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestDestroy)&\ diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 6083ab699..cdb2a0322 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -35,7 +35,6 @@ public BroadcastTests(SparkFixture fixture) /// /// Test Broadcast support by using multiple broadcast variables in a UDF with /// encryption enabled. - /// This test is filtered out when backward compatibility tests are run. /// [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] public void TestMultipleBroadcastWithEncryption() @@ -58,7 +57,6 @@ public void TestMultipleBroadcastWithEncryption() /// /// Test Broadcast support by using multiple broadcast variables in a UDF. - /// This test is filtered out when backward compatibility tests are run. /// [Fact] public void TestMultipleBroadcastWithoutEncryption() @@ -80,7 +78,6 @@ public void TestMultipleBroadcastWithoutEncryption() /// /// Test Broadcast with encryption support by broadcasting a large (>100MB) object. - /// This test is filtered out when backward compatibility tests are run. /// [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] public void TestLargeBroadcastValueWithEncryption() @@ -104,7 +101,6 @@ public void TestLargeBroadcastValueWithEncryption() /// /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast /// variable and makes it inaccessible from workers. - /// This test is filtered out when backward compatibility tests are run. /// [Fact] public void TestDestroy() @@ -146,7 +142,6 @@ public void TestDestroy() /// /// Test Broadcast.Unpersist() deletes cached copies of the broadcast on the executors. If /// the broadcast is used after unpersist is called, it is re-sent to the executors. - /// This test is filtered out when backward compatibility tests are run. /// [Fact] public void TestUnpersist() diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs index 17e890d27..4fa9904e3 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs @@ -225,7 +225,6 @@ public void TestVectorUdf() } [Fact] - // This test is filtered out when backward compatibility tests are run. public void TestDataFrameVectorUdf() { Func udf1Func = @@ -371,7 +370,6 @@ private static RecordBatch ArrowBasedCountCharacters(RecordBatch records) [Fact] - // This test is filtered out when backward compatibility tests are run. public void TestDataFrameGroupedMapUdf() { DataFrame df = _spark diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs index 14430206b..5a70a6698 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/SparkSessionTests.cs @@ -172,7 +172,6 @@ public void TestCreateDataFrame() /// /// Test CreateDataFrame API with Timestamp as data - /// This test is filtered out when backward compatibility tests are run. 
/// [Fact] public void TestCreateDataFrameWithTimestamp() diff --git a/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs b/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs index 2a872dc54..92422c205 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/UdfTests/UdfSimpleTypesTests.cs @@ -105,7 +105,6 @@ public void TestUdfWithReturnAsDateType() /// /// UDF that takes in Timestamp type. - /// This test is filtered out when backward compatibility tests are run. /// [Fact] public void TestUdfWithTimestampType() @@ -126,7 +125,6 @@ public void TestUdfWithTimestampType() /// /// UDF that returns Timestamp type. - /// This test is filtered out when backward compatibility tests are run. /// [Fact] public void TestUdfWithReturnAsTimestampType() diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index 3e25d4475..740cf161b 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -144,7 +144,7 @@ private JvmObjectReference CreateBroadcast_V2_3_1_AndBelow( JvmObjectReference javaSparkContext, object value) { - WriteToFile(value); + WriteToStream(value, null, false); return (JvmObjectReference)javaSparkContext.Jvm.CallStaticJavaMethod( "org.apache.spark.api.python.PythonRDD", "readBroadcastFromFile", @@ -180,18 +180,20 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( { var pair = (JvmObjectReference[])_pythonBroadcast.Invoke("setupEncryptionServer"); - using ISocketWrapper socket = SocketFactory.CreateSocket(); - socket.Connect( - IPAddress.Loopback, - (int)pair[0].Invoke("intValue"), - (string)pair[1].Invoke("toString")); - WriteEncrypted(value, socket.OutputStream); - socket.OutputStream.Close(); + using (ISocketWrapper socket = SocketFactory.CreateSocket()) + { + socket.Connect( + IPAddress.Loopback, + (int)pair[0].Invoke("intValue"), // port number + (string)pair[1].Invoke("toString")); // secret + WriteToStream(value, socket.OutputStream, true); + socket.OutputStream.Flush(); + } _pythonBroadcast.Invoke("waitTillDataReceived"); } else { - WriteToFile(value); + WriteToStream(value, null, false); } return (JvmObjectReference)javaSparkContext.Invoke("broadcast", _pythonBroadcast); @@ -204,17 +206,18 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( /// /// Broadcast value to be written to the stream /// Stream connecting to encryption server to write value to - private void WriteEncrypted(object value, Stream stream) + /// Boolean value to check if broadcast encrytion is set + private void WriteToStream(object value, Stream stream, bool isEncrypted) { - var formatter = new BinaryFormatter(); - using var ms = new MemoryStream(); - formatter.Serialize(ms, value); - byte[] valueBytes = ms.ToArray(); - int bytesRemaining = valueBytes.Length; - SerDe.Write(stream, bytesRemaining); - SerDe.Write(stream, valueBytes, 0, bytesRemaining); - // -1 length indicates to the receiving end that we're done. 
- SerDe.Write(stream, -1); + if (!isEncrypted) + { + using FileStream f = File.Create(_path); + Dump(value, f, isEncrypted); + } + else + { + Dump(value, stream, isEncrypted); + } } /// @@ -224,7 +227,7 @@ private void WriteEncrypted(object value, Stream stream) private void WriteToFile(object value) { using FileStream f = File.Create(_path); - Dump(value, f); + Dump(value, f, false); } /// @@ -232,10 +235,23 @@ private void WriteToFile(object value) /// /// Serializable object /// Stream to which the object is serialized - private void Dump(object value, Stream stream) + /// Stream to which the object is serialized + private void Dump(object value, Stream stream, bool isEncrypted) { var formatter = new BinaryFormatter(); - formatter.Serialize(stream, value); + if (isEncrypted) + { + using var ms = new MemoryStream(); + formatter.Serialize(ms, value); + SerDe.Write(stream, ms.Length); + ms.WriteTo(stream); + // -1 length indicates to the receiving end that we're done. + SerDe.Write(stream, -1); + } + else + { + formatter.Serialize(stream, value); + } } } diff --git a/src/csharp/Microsoft.Spark/Interop/Ipc/SerDe.cs b/src/csharp/Microsoft.Spark/Interop/Ipc/SerDe.cs index 937a0e007..c2c742e87 100644 --- a/src/csharp/Microsoft.Spark/Interop/Ipc/SerDe.cs +++ b/src/csharp/Microsoft.Spark/Interop/Ipc/SerDe.cs @@ -282,17 +282,6 @@ public static void Write(Stream s, byte[] value) => public static void Write(Stream s, byte[] value, int count) => s.Write(value, 0, count); - /// - /// Writes a byte array to a stream - /// - /// The stream to write - /// The byte array to write - /// The zero-based byte offset in array at which to begin copying - /// bytes - /// The number of bytes in the array to write. - public static void Write(Stream s, byte[] value, int offset, int count) => - s.Write(value, offset, count); - /// /// Writes a boolean to a stream /// From ac87c46e5f01093a8c0340928c0e70d923efbd9a Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 1 Sep 2020 19:10:16 -0700 Subject: [PATCH 38/66] PR review changes --- .../IpcTests/BroadcastTests.cs | 46 +++++++++++++++++++ src/csharp/Microsoft.Spark/Broadcast.cs | 29 ++++-------- 2 files changed, 56 insertions(+), 19 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index cdb2a0322..7c227f59a 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -82,6 +82,7 @@ public void TestMultipleBroadcastWithoutEncryption() [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] public void TestLargeBroadcastValueWithEncryption() { + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); var obj1 = new byte[104858000]; Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); @@ -105,6 +106,7 @@ public void TestLargeBroadcastValueWithEncryption() [Fact] public void TestDestroy() { + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "false"); var obj1 = new TestBroadcastVariable(5, "destroy"); Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); @@ -139,6 +141,50 @@ public void TestDestroy() } } + /// + /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast + /// variable and makes it inaccessible from workers, with Broadcast encryption on. 
+ /// + [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] + public void TestDestroyWithEncryption() + { + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); + var obj1 = new TestBroadcastVariable(6, "destroy encryption"); + Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + + Func udf = Udf( + str => $"{str} {bc1.Value().StringValue}, {bc1.Value().IntValue}"); + + var expected = new string[] { + "hello destroy encryption, 6", + "world destroy encryption, 6" }; + + string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + Assert.Equal(expected, actual); + + bc1.Destroy(); + + // Throws the following exception: + // ERROR Utils: Exception encountered + // org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed(destroy at NativeMethodAccessorImpl.java:0) + // at org.apache.spark.broadcast.Broadcast.assertValid(Broadcast.scala:144) + // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply$mcV$sp(TorrentBroadcast.scala:203) + // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) + // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) + // at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1326) + // at org.apache.spark.broadcast.TorrentBroadcast.writeObject(TorrentBroadcast.scala:202) + // at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + try + { + _df.Select(udf(_df["_1"])).Collect().ToArray(); + Assert.True(false); + } + catch (Exception e) + { + Assert.NotNull(e); + } + } + /// /// Test Broadcast.Unpersist() deletes cached copies of the broadcast on the executors. If /// the broadcast is used after unpersist is called, it is re-sent to the executors. diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index 740cf161b..9428a73ef 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -199,43 +199,34 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( return (JvmObjectReference)javaSparkContext.Invoke("broadcast", _pythonBroadcast); } - /// TODO: This is not performant as it writes to stream only after serializing the whole - /// value, instead of serializing and writing in chunks like Python. + /// TODO: This is not performant in the case of Broadcast encryption as it writes to stream + /// only after serializing the whole value, instead of serializing and writing in chunks + /// like Python. /// - /// Function to write the broadcast value into the encrypted stream. + /// Function to write the broadcast value into the stream. /// /// Broadcast value to be written to the stream - /// Stream connecting to encryption server to write value to + /// Stream to write value to /// Boolean value to check if broadcast encrytion is set private void WriteToStream(object value, Stream stream, bool isEncrypted) { - if (!isEncrypted) + if (isEncrypted) { - using FileStream f = File.Create(_path); - Dump(value, f, isEncrypted); + Dump(value, stream, isEncrypted); } else { - Dump(value, stream, isEncrypted); + using FileStream f = File.Create(_path); + Dump(value, f, isEncrypted); } } - /// - /// Function that creates a file in _path to store the broadcast value in the given path. 
- /// - /// Broadcast value to be written to the file - private void WriteToFile(object value) - { - using FileStream f = File.Create(_path); - Dump(value, f, false); - } - /// /// Function that serializes and stores the object passed to the given Stream. /// /// Serializable object /// Stream to which the object is serialized - /// Stream to which the object is serialized + /// Boolean value to check if broadcast encrytion is set private void Dump(object value, Stream stream, bool isEncrypted) { var formatter = new BinaryFormatter(); From 04e46e10e486e670d84fd4b492966839564adedc Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 1 Sep 2020 19:38:59 -0700 Subject: [PATCH 39/66] reverted changes --- src/csharp/Microsoft.Spark/Broadcast.cs | 51 +++++++++++-------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index 9428a73ef..53619a0db 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -144,7 +144,7 @@ private JvmObjectReference CreateBroadcast_V2_3_1_AndBelow( JvmObjectReference javaSparkContext, object value) { - WriteToStream(value, null, false); + WriteToFile(value); return (JvmObjectReference)javaSparkContext.Jvm.CallStaticJavaMethod( "org.apache.spark.api.python.PythonRDD", "readBroadcastFromFile", @@ -187,13 +187,12 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( (int)pair[0].Invoke("intValue"), // port number (string)pair[1].Invoke("toString")); // secret WriteToStream(value, socket.OutputStream, true); - socket.OutputStream.Flush(); } _pythonBroadcast.Invoke("waitTillDataReceived"); } else { - WriteToStream(value, null, false); + WriteToFile(value); } return (JvmObjectReference)javaSparkContext.Invoke("broadcast", _pythonBroadcast); @@ -207,18 +206,25 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( /// /// Broadcast value to be written to the stream /// Stream to write value to - /// Boolean value to check if broadcast encrytion is set - private void WriteToStream(object value, Stream stream, bool isEncrypted) + private void WriteToStream(object value, Stream stream) { - if (isEncrypted) - { - Dump(value, stream, isEncrypted); - } - else - { - using FileStream f = File.Create(_path); - Dump(value, f, isEncrypted); - } + var formatter = new BinaryFormatter(); + using var ms = new MemoryStream(); + formatter.Serialize(ms, value); + SerDe.Write(stream, ms.Length); + ms.WriteTo(stream); + // -1 length indicates to the receiving end that we're done. + SerDe.Write(stream, -1); + } + + /// + /// Function that creates a file in _path to store the broadcast value in the given path. + /// + /// Broadcast value to be written to the file + private void WriteToFile(object value) + { + using FileStream f = File.Create(_path); + Dump(value, f); } /// @@ -226,23 +232,10 @@ private void WriteToStream(object value, Stream stream, bool isEncrypted) /// /// Serializable object /// Stream to which the object is serialized - /// Boolean value to check if broadcast encrytion is set - private void Dump(object value, Stream stream, bool isEncrypted) + private void Dump(object value, Stream stream) { var formatter = new BinaryFormatter(); - if (isEncrypted) - { - using var ms = new MemoryStream(); - formatter.Serialize(ms, value); - SerDe.Write(stream, ms.Length); - ms.WriteTo(stream); - // -1 length indicates to the receiving end that we're done. 
- SerDe.Write(stream, -1); - } - else - { - formatter.Serialize(stream, value); - } + formatter.Serialize(stream, value); } } From 049f137f16a2baf9a3836cf081897e32eee1d886 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 1 Sep 2020 19:41:25 -0700 Subject: [PATCH 40/66] fixing error --- src/csharp/Microsoft.Spark/Broadcast.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index 53619a0db..601ff5a24 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -186,7 +186,7 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( IPAddress.Loopback, (int)pair[0].Invoke("intValue"), // port number (string)pair[1].Invoke("toString")); // secret - WriteToStream(value, socket.OutputStream, true); + WriteToStream(value, socket.OutputStream); } _pythonBroadcast.Invoke("waitTillDataReceived"); } From 996975e19e9e043ff72071a04d99a6917a4bad5f Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Wed, 2 Sep 2020 10:25:39 -0700 Subject: [PATCH 41/66] PR review changes --- azure-pipelines.yml | 1 + src/csharp/Microsoft.Spark/Broadcast.cs | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9f265db79..37240c84e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -22,6 +22,7 @@ variables: (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithoutEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestUnpersist)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestLargeBroadcastValueWithEncryption)&\ + (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestDestroyWithEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithReturnAsTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.SparkSessionTests.TestCreateDataFrameWithTimestamp)" diff --git a/src/csharp/Microsoft.Spark/Broadcast.cs b/src/csharp/Microsoft.Spark/Broadcast.cs index 601ff5a24..1136aaafe 100644 --- a/src/csharp/Microsoft.Spark/Broadcast.cs +++ b/src/csharp/Microsoft.Spark/Broadcast.cs @@ -208,9 +208,8 @@ private JvmObjectReference CreateBroadcast_V2_3_2_AndAbove( /// Stream to write value to private void WriteToStream(object value, Stream stream) { - var formatter = new BinaryFormatter(); using var ms = new MemoryStream(); - formatter.Serialize(ms, value); + Dump(value, ms); SerDe.Write(stream, ms.Length); ms.WriteTo(stream); // -1 length indicates to the receiving end that we're done. 
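The WriteToStream rework in the patch above settles the wire framing for the encrypted broadcast path: the value is serialized once into a MemoryStream, sent as a single length-prefixed chunk, and followed by a -1 length that tells the receiving end the transfer is complete. The stand-alone sketch below illustrates that framing with a matching writer/reader pair. It is an illustration only, not code from this patch series: it substitutes plain BinaryWriter/BinaryReader for the repo's SerDe helpers, and the 32-bit length prefix is an assumption made for the sketch rather than the width SerDe actually puts on the wire.

using System.IO;

// Hypothetical sketch of the length-prefixed, -1-terminated framing used by
// WriteToStream above. BinaryWriter/BinaryReader and the Int32 prefix width
// stand in for the SerDe helpers; neither type below exists in the repo.
internal static class BroadcastFramingSketch
{
    public static void WriteFramed(Stream stream, byte[] payload)
    {
        // No using block: the caller owns the underlying stream.
        var writer = new BinaryWriter(stream);
        writer.Write(payload.Length); // length prefix for the single chunk
        writer.Write(payload);        // chunk body
        writer.Write(-1);             // -1 length marks the end of the data
        writer.Flush();
    }

    public static byte[] ReadFramed(Stream stream)
    {
        var reader = new BinaryReader(stream);
        using var result = new MemoryStream();
        int length;
        // Read chunks until the writer signals completion with a -1 length.
        while ((length = reader.ReadInt32()) != -1)
        {
            result.Write(reader.ReadBytes(length), 0, length);
        }
        return result.ToArray();
    }
}

The chunked write that the TODO comment asks for would reuse the same framing: serialize incrementally and emit several length/body pairs before the final -1, so a large broadcast value never has to be buffered in memory in full.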
From fe5c6f4657a14c99357b94bbc4e818caaa14f38e Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Wed, 2 Sep 2020 15:41:12 -0700 Subject: [PATCH 42/66] re-triggering pipeline failing due to ActiveSession bug --- src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 7c227f59a..00f2fba84 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -143,7 +143,7 @@ public void TestDestroy() /// /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast - /// variable and makes it inaccessible from workers, with Broadcast encryption on. + /// variable and makes it inaccessible from workers, with Broadcast encryption set to true. /// [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] public void TestDestroyWithEncryption() From 83815b8d146fe2bb766fdbad57b48c4de25f56b4 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Wed, 2 Sep 2020 19:57:26 -0700 Subject: [PATCH 43/66] debugging pipeline hanging --- azure-pipelines.yml | 1 - .../IpcTests/BroadcastTests.cs | 78 +++++++++---------- 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 37240c84e..9f265db79 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -22,7 +22,6 @@ variables: (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithoutEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestUnpersist)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestLargeBroadcastValueWithEncryption)&\ - (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestDestroyWithEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithReturnAsTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.SparkSessionTests.TestCreateDataFrameWithTimestamp)" diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 00f2fba84..eafc43d5b 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -145,45 +145,45 @@ public void TestDestroy() /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast /// variable and makes it inaccessible from workers, with Broadcast encryption set to true. 
/// - [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] - public void TestDestroyWithEncryption() - { - _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); - var obj1 = new TestBroadcastVariable(6, "destroy encryption"); - Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - - Func udf = Udf( - str => $"{str} {bc1.Value().StringValue}, {bc1.Value().IntValue}"); - - var expected = new string[] { - "hello destroy encryption, 6", - "world destroy encryption, 6" }; - - string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - Assert.Equal(expected, actual); - - bc1.Destroy(); - - // Throws the following exception: - // ERROR Utils: Exception encountered - // org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed(destroy at NativeMethodAccessorImpl.java:0) - // at org.apache.spark.broadcast.Broadcast.assertValid(Broadcast.scala:144) - // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply$mcV$sp(TorrentBroadcast.scala:203) - // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) - // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) - // at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1326) - // at org.apache.spark.broadcast.TorrentBroadcast.writeObject(TorrentBroadcast.scala:202) - // at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) - try - { - _df.Select(udf(_df["_1"])).Collect().ToArray(); - Assert.True(false); - } - catch (Exception e) - { - Assert.NotNull(e); - } - } + //[SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] + //public void TestDestroyWithEncryption() + //{ + // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); + // var obj1 = new TestBroadcastVariable(6, "destroy encryption"); + // Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + + // Func udf = Udf( + // str => $"{str} {bc1.Value().StringValue}, {bc1.Value().IntValue}"); + + // var expected = new string[] { + // "hello destroy encryption, 6", + // "world destroy encryption, 6" }; + + // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + // Assert.Equal(expected, actual); + + // bc1.Destroy(); + + // // Throws the following exception: + // // ERROR Utils: Exception encountered + // // org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed(destroy at NativeMethodAccessorImpl.java:0) + // // at org.apache.spark.broadcast.Broadcast.assertValid(Broadcast.scala:144) + // // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply$mcV$sp(TorrentBroadcast.scala:203) + // // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) + // // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) + // // at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1326) + // // at org.apache.spark.broadcast.TorrentBroadcast.writeObject(TorrentBroadcast.scala:202) + // // at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + // try + // { + // _df.Select(udf(_df["_1"])).Collect().ToArray(); + // Assert.True(false); + // } + // catch (Exception e) + // { + // Assert.NotNull(e); + // } + //} /// /// Test Broadcast.Unpersist() deletes cached copies of the broadcast on the executors. 
If From 8017605142a12bf7cd7961df4d0191fcbc72a37f Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Fri, 4 Sep 2020 10:34:13 -0700 Subject: [PATCH 44/66] adding TestDestroyWithEncryption --- .../IpcTests/BroadcastTests.cs | 78 +++++++++---------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index eafc43d5b..00f2fba84 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -145,45 +145,45 @@ public void TestDestroy() /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast /// variable and makes it inaccessible from workers, with Broadcast encryption set to true. /// - //[SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] - //public void TestDestroyWithEncryption() - //{ - // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); - // var obj1 = new TestBroadcastVariable(6, "destroy encryption"); - // Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - - // Func udf = Udf( - // str => $"{str} {bc1.Value().StringValue}, {bc1.Value().IntValue}"); - - // var expected = new string[] { - // "hello destroy encryption, 6", - // "world destroy encryption, 6" }; - - // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - // Assert.Equal(expected, actual); - - // bc1.Destroy(); - - // // Throws the following exception: - // // ERROR Utils: Exception encountered - // // org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed(destroy at NativeMethodAccessorImpl.java:0) - // // at org.apache.spark.broadcast.Broadcast.assertValid(Broadcast.scala:144) - // // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply$mcV$sp(TorrentBroadcast.scala:203) - // // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) - // // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) - // // at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1326) - // // at org.apache.spark.broadcast.TorrentBroadcast.writeObject(TorrentBroadcast.scala:202) - // // at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) - // try - // { - // _df.Select(udf(_df["_1"])).Collect().ToArray(); - // Assert.True(false); - // } - // catch (Exception e) - // { - // Assert.NotNull(e); - // } - //} + [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] + public void TestDestroyWithEncryption() + { + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); + var obj1 = new TestBroadcastVariable(6, "destroy encryption"); + Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + + Func udf = Udf( + str => $"{str} {bc1.Value().StringValue}, {bc1.Value().IntValue}"); + + var expected = new string[] { + "hello destroy encryption, 6", + "world destroy encryption, 6" }; + + string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + Assert.Equal(expected, actual); + + bc1.Destroy(); + + // Throws the following exception: + // ERROR Utils: Exception encountered + // org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed(destroy at NativeMethodAccessorImpl.java:0) + // at org.apache.spark.broadcast.Broadcast.assertValid(Broadcast.scala:144) + // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply$mcV$sp(TorrentBroadcast.scala:203) + // 
at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) + // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) + // at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1326) + // at org.apache.spark.broadcast.TorrentBroadcast.writeObject(TorrentBroadcast.scala:202) + // at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + try + { + _df.Select(udf(_df["_1"])).Collect().ToArray(); + Assert.True(false); + } + catch (Exception e) + { + Assert.NotNull(e); + } + } /// /// Test Broadcast.Unpersist() deletes cached copies of the broadcast on the executors. If From 25737cb010725d0f41c4a86099659b0bb6c2915c Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Fri, 4 Sep 2020 11:23:05 -0700 Subject: [PATCH 45/66] filtering test --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9f265db79..37240c84e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -22,6 +22,7 @@ variables: (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithoutEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestUnpersist)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestLargeBroadcastValueWithEncryption)&\ + (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestDestroyWithEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithReturnAsTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.SparkSessionTests.TestCreateDataFrameWithTimestamp)" From 11b27c8935c905a7005525e990ddbbda8b64abaf Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 8 Sep 2020 14:59:47 -0700 Subject: [PATCH 46/66] PR review changes --- .../Processor/BroadcastVariableProcessor.cs | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs index 0d18eaf34..f17da97b5 100644 --- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs +++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. 
using System; +using System.Diagnostics; using System.IO; using System.Net; using System.Runtime.Serialization.Formatters.Binary; @@ -59,11 +60,7 @@ internal BroadcastVariables Process(Stream stream) if (broadcastVars.DecryptionServerNeeded) { var readBid = SerDe.ReadInt64(socket.InputStream); - if (bid != readBid) - { - throw new Exception($"Broadcast id {readBid} from encrypted stream" + - $" does not " + $"match broadcast id {bid} from normal stream."); - } + Debug.Assert(bid == readBid); object value = formatter.Deserialize(socket.InputStream); BroadcastRegistry.Add(bid, value); } @@ -82,10 +79,7 @@ internal BroadcastVariables Process(Stream stream) BroadcastRegistry.Remove(bid); } } - if (socket != null) - { - socket.Dispose(); - } + socket?.Dispose(); return broadcastVars; } } From 2c14f88c8d37374342c0ad153de2e6fa44125e2d Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 8 Sep 2020 19:35:53 -0700 Subject: [PATCH 47/66] pipeline hang --- azure-pipelines.yml | 1 - .../IpcTests/BroadcastTests.cs | 44 ------------------- 2 files changed, 45 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 56941138e..f26250ff2 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -22,7 +22,6 @@ variables: (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithoutEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestUnpersist)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestLargeBroadcastValueWithEncryption)&\ - (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestDestroyWithEncryption)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithReturnAsTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.SparkSessionTests.TestCreateDataFrameWithTimestamp)" diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 788684810..a78c491b7 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -141,50 +141,6 @@ public void TestDestroy() } } - /// - /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast - /// variable and makes it inaccessible from workers, with Broadcast encryption set to true. 
- /// - [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] - public void TestDestroyWithEncryption() - { - _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); - var obj1 = new TestBroadcastVariable(6, "destroy encryption"); - Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - - Func udf = Udf( - str => $"{str} {bc1.Value().StringValue}, {bc1.Value().IntValue}"); - - var expected = new string[] { - "hello destroy encryption, 6", - "world destroy encryption, 6" }; - - string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - Assert.Equal(expected, actual); - - bc1.Destroy(); - - // Throws the following exception: - // ERROR Utils: Exception encountered - // org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed(destroy at NativeMethodAccessorImpl.java:0) - // at org.apache.spark.broadcast.Broadcast.assertValid(Broadcast.scala:144) - // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply$mcV$sp(TorrentBroadcast.scala:203) - // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) - // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) - // at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1326) - // at org.apache.spark.broadcast.TorrentBroadcast.writeObject(TorrentBroadcast.scala:202) - // at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) - try - { - _df.Select(udf(_df["_1"])).Collect().ToArray(); - Assert.True(false); - } - catch (Exception e) - { - Assert.NotNull(e); - } - } - /// /// Test Broadcast.Unpersist() deletes cached copies of the broadcast on the executors. If /// the broadcast is used after unpersist is called, it is re-sent to the executors. From 09c394445a536d8313f3728664712cb75bc97873 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sat, 12 Sep 2020 17:40:37 -0700 Subject: [PATCH 48/66] PR review changes. 
--- src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index a1894d0f5..87d175ea4 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -1,5 +1,6 @@ using System; using System.Linq; +using Microsoft.Spark.E2ETest.Utils; using Microsoft.Spark.Sql; using Xunit; using static Microsoft.Spark.Sql.Functions; @@ -41,7 +42,6 @@ public void TestMultipleBroadcastWithEncryption() _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); var obj1 = new TestBroadcastVariable(1, "first"); var obj2 = new TestBroadcastVariable(2, "second"); - Broadcast bc = _spark.SparkContext.Broadcast(5); Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); Broadcast bc2 = _spark.SparkContext.Broadcast(obj2); From 43f50541b732881ffec5b51a50c1c1434476f83f Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Wed, 16 Sep 2020 18:00:08 -0700 Subject: [PATCH 49/66] PR review comments --- .../IpcTests/BroadcastTests.cs | 55 +++++++------------ 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 87d175ea4..77fd1f976 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -32,35 +32,15 @@ public BroadcastTests(SparkFixture fixture) _df = _spark.CreateDataFrame(new[] { "hello", "world" }); } - /// - /// Test Broadcast support by using multiple broadcast variables in a UDF with - /// encryption enabled. - /// - [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] - public void TestMultipleBroadcastWithEncryption() - { - _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); - var obj1 = new TestBroadcastVariable(1, "first"); - var obj2 = new TestBroadcastVariable(2, "second"); - Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - Broadcast bc2 = _spark.SparkContext.Broadcast(obj2); - - Func udf = Udf( - str => $"{str} {bc1.Value().StringValue} and {bc2.Value().StringValue}"); - - var expected = new string[] { "hello first and second", "world first and second" }; - - string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - Assert.Equal(expected, actual); - } - /// /// Test Broadcast support by using multiple broadcast variables in a UDF. /// - [Fact] - public void TestMultipleBroadcastWithoutEncryption() + [Theory] + [InlineData("true")] + [InlineData("false")] + public void TestMultipleBroadcast(string isEncryptionEnabled) { - _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "false"); + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); var obj1 = new TestBroadcastVariable(1, "first"); var obj2 = new TestBroadcastVariable(2, "second"); Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); @@ -76,12 +56,14 @@ public void TestMultipleBroadcastWithoutEncryption() } /// - /// Test Broadcast with encryption support by broadcasting a large (>100MB) object. + /// Test Broadcast support by broadcasting a large (>100MB) object. 
/// - [SkipIfSparkVersionIsLessThan(Versions.V2_3_2)] - public void TestLargeBroadcastValueWithEncryption() + [Theory] + [InlineData("true")] + [InlineData("false")] + public void TestLargeBroadcastValueWithEncryption(string isEncryptionEnabled) { - _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true"); + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); var obj1 = new byte[104858000]; Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); @@ -102,10 +84,12 @@ public void TestLargeBroadcastValueWithEncryption() /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast /// variable and makes it inaccessible from workers. /// - [Fact] - public void TestDestroy() + [Theory] + [InlineData("true")] + [InlineData("false")] + public void TestDestroy(string isEncryptionEnabled) { - _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "false"); + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); var obj1 = new TestBroadcastVariable(5, "destroy"); Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); @@ -144,9 +128,12 @@ public void TestDestroy() /// Test Broadcast.Unpersist() deletes cached copies of the broadcast on the executors. If /// the broadcast is used after unpersist is called, it is re-sent to the executors. /// - [Fact] - public void TestUnpersist() + [Theory] + [InlineData("true")] + [InlineData("false")] + public void TestUnpersist(string isEncryptionEnabled) { + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); var obj = new TestBroadcastVariable(1, "unpersist"); Broadcast bc = _spark.SparkContext.Broadcast(obj); From b381ab879d0ab11418ba9c6e85e802aef16dd3f8 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Wed, 16 Sep 2020 18:09:40 -0700 Subject: [PATCH 50/66] pipeline changes --- azure-pipelines.yml | 5 ++--- .../Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 255d29595..e63090df9 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -18,10 +18,9 @@ variables: TestsToFilterOut: "(FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.DataFrameTests.TestDataFrameGroupedMapUdf)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.DataFrameTests.TestDataFrameVectorUdf)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestDestroy)&\ - (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithEncryption)&\ - (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcastWithoutEncryption)&\ + (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcast)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestUnpersist)&\ - (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestLargeBroadcastValueWithEncryption)&\ + (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestLargeBroadcastValue)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithReturnAsTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfSimpleTypesTests.TestUdfWithTimestampType)&\ (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.SparkSessionTests.TestCreateDataFrameWithTimestamp)" diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs 
b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 77fd1f976..846d86a11 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -61,7 +61,7 @@ public void TestMultipleBroadcast(string isEncryptionEnabled) [Theory] [InlineData("true")] [InlineData("false")] - public void TestLargeBroadcastValueWithEncryption(string isEncryptionEnabled) + public void TestLargeBroadcastValue(string isEncryptionEnabled) { _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); var obj1 = new byte[104858000]; From 59d55fdf9e4add70f2e04b679a5c81bef050966f Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Wed, 16 Sep 2020 19:41:12 -0700 Subject: [PATCH 51/66] testing pipeline hanging reason --- src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 846d86a11..17da69485 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -85,7 +85,6 @@ public void TestLargeBroadcastValue(string isEncryptionEnabled) /// variable and makes it inaccessible from workers. /// [Theory] - [InlineData("true")] [InlineData("false")] public void TestDestroy(string isEncryptionEnabled) { From a72225a8a0f9febe7aca4ac0c239912e35950a7f Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Thu, 17 Sep 2020 09:00:14 -0700 Subject: [PATCH 52/66] removing encryption from destroy and unpersist. --- src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 17da69485..0d8210621 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -128,7 +128,6 @@ public void TestDestroy(string isEncryptionEnabled) /// the broadcast is used after unpersist is called, it is re-sent to the executors. 
/// [Theory] - [InlineData("true")] [InlineData("false")] public void TestUnpersist(string isEncryptionEnabled) { From b7aaa542b2a0cd2538935da9d5029390d795a8d1 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Thu, 17 Sep 2020 17:11:40 -0700 Subject: [PATCH 53/66] removing encryption false from TestLargeBroadcastValue --- src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 0d8210621..9a315501f 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -60,7 +60,6 @@ public void TestMultipleBroadcast(string isEncryptionEnabled) /// [Theory] [InlineData("true")] - [InlineData("false")] public void TestLargeBroadcastValue(string isEncryptionEnabled) { _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); From 0ab0527f4d489717eeadbf0ce090ffb2990ad74c Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Fri, 18 Sep 2020 14:48:57 -0700 Subject: [PATCH 54/66] disabling all broadcast tests --- .../IpcTests/BroadcastTests.cs | 248 +++++++++--------- 1 file changed, 124 insertions(+), 124 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 9a315501f..abd46e7ee 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -32,129 +32,129 @@ public BroadcastTests(SparkFixture fixture) _df = _spark.CreateDataFrame(new[] { "hello", "world" }); } - /// - /// Test Broadcast support by using multiple broadcast variables in a UDF. - /// - [Theory] - [InlineData("true")] - [InlineData("false")] - public void TestMultipleBroadcast(string isEncryptionEnabled) - { - _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); - var obj1 = new TestBroadcastVariable(1, "first"); - var obj2 = new TestBroadcastVariable(2, "second"); - Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - Broadcast bc2 = _spark.SparkContext.Broadcast(obj2); - - Func udf = Udf( - str => $"{str} {bc1.Value().StringValue} and {bc2.Value().StringValue}"); - - var expected = new string[] { "hello first and second", "world first and second" }; - - string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - Assert.Equal(expected, actual); - } - - /// - /// Test Broadcast support by broadcasting a large (>100MB) object.
- /// - [Theory] - [InlineData("false")] - public void TestDestroy(string isEncryptionEnabled) - { - _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); - var obj1 = new TestBroadcastVariable(5, "destroy"); - Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - - Func udf = Udf( - str => $"{str} {bc1.Value().StringValue}, {bc1.Value().IntValue}"); - - var expected = new string[] { "hello destroy, 5", "world destroy, 5" }; - - string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - Assert.Equal(expected, actual); - - bc1.Destroy(); - - // Throws the following exception: - // ERROR Utils: Exception encountered - // org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed(destroy at NativeMethodAccessorImpl.java:0) - // at org.apache.spark.broadcast.Broadcast.assertValid(Broadcast.scala:144) - // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply$mcV$sp(TorrentBroadcast.scala:203) - // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) - // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) - // at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1326) - // at org.apache.spark.broadcast.TorrentBroadcast.writeObject(TorrentBroadcast.scala:202) - // at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) - try - { - _df.Select(udf(_df["_1"])).Collect().ToArray(); - Assert.True(false); - } - catch (Exception e) - { - Assert.NotNull(e); - } - } - - /// - /// Test Broadcast.Unpersist() deletes cached copies of the broadcast on the executors. If - /// the broadcast is used after unpersist is called, it is re-sent to the executors. - /// - [Theory] - [InlineData("false")] - public void TestUnpersist(string isEncryptionEnabled) - { - _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); - var obj = new TestBroadcastVariable(1, "unpersist"); - Broadcast bc = _spark.SparkContext.Broadcast(obj); - - Func udf = Udf( - str => $"{str} {bc.Value().StringValue}, {bc.Value().IntValue}"); - - var expected = new string[] { "hello unpersist, 1", "world unpersist, 1" }; - - string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - - Assert.Equal(expected, actual); - - // This deletes the copies of the broadcast on the executors. We then use the Broadcast - // variable again in the UDF and validate that it is re-sent to all executors. - bc.Unpersist(); - - string[] actualUnpersisted = ToStringArray(_df.Select(udf(_df["_1"]))); - Assert.Equal(expected, actualUnpersisted); - } - - private string[] ToStringArray(DataFrame df) - { - Row[] rows = df.Collect().ToArray(); - return rows.Select(s => s[0].ToString()).ToArray(); - } + ///// + ///// Test Broadcast support by using multiple broadcast variables in a UDF. 
+ ///// + //[Theory] + //[InlineData("true")] + //[InlineData("false")] + //public void TestMultipleBroadcast(string isEncryptionEnabled) + //{ + // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); + // var obj1 = new TestBroadcastVariable(1, "first"); + // var obj2 = new TestBroadcastVariable(2, "second"); + // Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + // Broadcast bc2 = _spark.SparkContext.Broadcast(obj2); + + // Func udf = Udf( + // str => $"{str} {bc1.Value().StringValue} and {bc2.Value().StringValue}"); + + // var expected = new string[] { "hello first and second", "world first and second" }; + + // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + // Assert.Equal(expected, actual); + //} + + ///// + ///// Test Broadcast support by broadcasting a large (>100MB) object. + ///// + //[Theory] + //[InlineData("true")] + //public void TestLargeBroadcastValue(string isEncryptionEnabled) + //{ + // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); + // var obj1 = new byte[104858000]; + // Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + + // Func udf = Udf( + // str => $"{str}: length of broadcast array = {bc1.Value().Length}"); + + // var expected = new string[] { + // "hello: length of broadcast array = 104858000", + // "world: length of broadcast array = 104858000" }; + + // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + // Assert.Equal(expected, actual); + // // Destroying broadcast variable to free up memory + // bc1.Destroy(); + //} + + ///// + ///// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast + ///// variable and makes it inaccessible from workers. + ///// + //[Theory] + //[InlineData("false")] + //public void TestDestroy(string isEncryptionEnabled) + //{ + // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); + // var obj1 = new TestBroadcastVariable(5, "destroy"); + // Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + + // Func udf = Udf( + // str => $"{str} {bc1.Value().StringValue}, {bc1.Value().IntValue}"); + + // var expected = new string[] { "hello destroy, 5", "world destroy, 5" }; + + // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + // Assert.Equal(expected, actual); + + // bc1.Destroy(); + + // // Throws the following exception: + // // ERROR Utils: Exception encountered + // // org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed(destroy at NativeMethodAccessorImpl.java:0) + // // at org.apache.spark.broadcast.Broadcast.assertValid(Broadcast.scala:144) + // // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply$mcV$sp(TorrentBroadcast.scala:203) + // // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) + // // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) + // // at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1326) + // // at org.apache.spark.broadcast.TorrentBroadcast.writeObject(TorrentBroadcast.scala:202) + // // at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + // try + // { + // _df.Select(udf(_df["_1"])).Collect().ToArray(); + // Assert.True(false); + // } + // catch (Exception e) + // { + // Assert.NotNull(e); + // } + //} + + ///// + ///// Test Broadcast.Unpersist() deletes cached copies of the broadcast on the executors. 
If + ///// the broadcast is used after unpersist is called, it is re-sent to the executors. + ///// + //[Theory] + //[InlineData("false")] + //public void TestUnpersist(string isEncryptionEnabled) + //{ + // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); + // var obj = new TestBroadcastVariable(1, "unpersist"); + // Broadcast bc = _spark.SparkContext.Broadcast(obj); + + // Func udf = Udf( + // str => $"{str} {bc.Value().StringValue}, {bc.Value().IntValue}"); + + // var expected = new string[] { "hello unpersist, 1", "world unpersist, 1" }; + + // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + + // Assert.Equal(expected, actual); + + // // This deletes the copies of the broadcast on the executors. We then use the Broadcast + // // variable again in the UDF and validate that it is re-sent to all executors. + // bc.Unpersist(); + + // string[] actualUnpersisted = ToStringArray(_df.Select(udf(_df["_1"]))); + // Assert.Equal(expected, actualUnpersisted); + //} + + //private string[] ToStringArray(DataFrame df) + //{ + // Row[] rows = df.Collect().ToArray(); + // return rows.Select(s => s[0].ToString()).ToArray(); + //} } } From fd8ed11900c13326bd6ce7ba705272699d6c8ca0 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Fri, 18 Sep 2020 16:07:11 -0700 Subject: [PATCH 55/66] Enabling multiple tests with encryption on and off --- .../IpcTests/BroadcastTests.cs | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index abd46e7ee..8c31083a5 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -32,28 +32,28 @@ public BroadcastTests(SparkFixture fixture) _df = _spark.CreateDataFrame(new[] { "hello", "world" }); } - ///// - ///// Test Broadcast support by using multiple broadcast variables in a UDF. - ///// - //[Theory] - //[InlineData("true")] - //[InlineData("false")] - //public void TestMultipleBroadcast(string isEncryptionEnabled) - //{ - // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); - // var obj1 = new TestBroadcastVariable(1, "first"); - // var obj2 = new TestBroadcastVariable(2, "second"); - // Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - // Broadcast bc2 = _spark.SparkContext.Broadcast(obj2); + /// + /// Test Broadcast support by using multiple broadcast variables in a UDF. 
+ /// + [Theory] + [InlineData("true")] + [InlineData("false")] + public void TestMultipleBroadcast(string isEncryptionEnabled) + { + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); + var obj1 = new TestBroadcastVariable(1, "first"); + var obj2 = new TestBroadcastVariable(2, "second"); + Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + Broadcast bc2 = _spark.SparkContext.Broadcast(obj2); - // Func udf = Udf( - // str => $"{str} {bc1.Value().StringValue} and {bc2.Value().StringValue}"); + Func udf = Udf( + str => $"{str} {bc1.Value().StringValue} and {bc2.Value().StringValue}"); - // var expected = new string[] { "hello first and second", "world first and second" }; + var expected = new string[] { "hello first and second", "world first and second" }; - // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - // Assert.Equal(expected, actual); - //} + string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + Assert.Equal(expected, actual); + } ///// ///// Test Broadcast support by broadcasting a large (>100MB) object. @@ -151,10 +151,10 @@ public BroadcastTests(SparkFixture fixture) // Assert.Equal(expected, actualUnpersisted); //} - //private string[] ToStringArray(DataFrame df) - //{ - // Row[] rows = df.Collect().ToArray(); - // return rows.Select(s => s[0].ToString()).ToArray(); - //} + private string[] ToStringArray(DataFrame df) + { + Row[] rows = df.Collect().ToArray(); + return rows.Select(s => s[0].ToString()).ToArray(); + } } } From 83d0acfef5038ebff614c92452bde36948fd7888 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Fri, 18 Sep 2020 18:12:14 -0700 Subject: [PATCH 56/66] disabling tests again --- .../IpcTests/BroadcastTests.cs | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 8c31083a5..9a626485c 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -32,28 +32,28 @@ public BroadcastTests(SparkFixture fixture) _df = _spark.CreateDataFrame(new[] { "hello", "world" }); } - /// - /// Test Broadcast support by using multiple broadcast variables in a UDF. - /// - [Theory] - [InlineData("true")] - [InlineData("false")] - public void TestMultipleBroadcast(string isEncryptionEnabled) - { - _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); - var obj1 = new TestBroadcastVariable(1, "first"); - var obj2 = new TestBroadcastVariable(2, "second"); - Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - Broadcast bc2 = _spark.SparkContext.Broadcast(obj2); + ///// + ///// Test Broadcast support by using multiple broadcast variables in a UDF. 
+ ///// + //[Theory] + //[InlineData("true")] + //[InlineData("false")] + //public void TestMultipleBroadcast(string isEncryptionEnabled) + //{ + // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); + // var obj1 = new TestBroadcastVariable(1, "first"); + // var obj2 = new TestBroadcastVariable(2, "second"); + // Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + // Broadcast bc2 = _spark.SparkContext.Broadcast(obj2); - Func udf = Udf( - str => $"{str} {bc1.Value().StringValue} and {bc2.Value().StringValue}"); + // Func udf = Udf( + // str => $"{str} {bc1.Value().StringValue} and {bc2.Value().StringValue}"); - var expected = new string[] { "hello first and second", "world first and second" }; + // var expected = new string[] { "hello first and second", "world first and second" }; - string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - Assert.Equal(expected, actual); - } + // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + // Assert.Equal(expected, actual); + //} From f1402971c3ab5e13b702e0a3cee9e09878259f76 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sat, 19 Sep 2020 00:19:00 -0700 Subject: [PATCH 57/66] enabling all encryption false tests --- .../IpcTests/BroadcastTests.cs | 165 +++++++++--------- 1 file changed, 82 insertions(+), 83 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 9a626485c..1a1fcdfa8 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -32,28 +32,27 @@ public BroadcastTests(SparkFixture fixture) _df = _spark.CreateDataFrame(new[] { "hello", "world" }); } - ///// - ///// Test Broadcast support by using multiple broadcast variables in a UDF. - ///// - //[Theory] - //[InlineData("true")] - //[InlineData("false")] - //public void TestMultipleBroadcast(string isEncryptionEnabled) - //{ - // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); - // var obj1 = new TestBroadcastVariable(1, "first"); - // var obj2 = new TestBroadcastVariable(2, "second"); - // Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - // Broadcast bc2 = _spark.SparkContext.Broadcast(obj2); + /// + /// Test Broadcast support by using multiple broadcast variables in a UDF. + /// + [Theory] + [InlineData("false")] + public void TestMultipleBroadcast(string isEncryptionEnabled) + { + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); + var obj1 = new TestBroadcastVariable(1, "first"); + var obj2 = new TestBroadcastVariable(2, "second"); + Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + Broadcast bc2 = _spark.SparkContext.Broadcast(obj2); - // Func udf = Udf( - // str => $"{str} {bc1.Value().StringValue} and {bc2.Value().StringValue}"); + Func udf = Udf( + str => $"{str} {bc1.Value().StringValue} and {bc2.Value().StringValue}"); - // var expected = new string[] { "hello first and second", "world first and second" }; + var expected = new string[] { "hello first and second", "world first and second" }; - // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - // Assert.Equal(expected, actual); - //} + string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + Assert.Equal(expected, actual); + } ///// ///// Test Broadcast support by broadcasting a large (>100MB) object.
@@ -79,77 +78,77 @@ public BroadcastTests(SparkFixture fixture) // bc1.Destroy(); //} - ///// - ///// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast - ///// variable and makes it inaccessible from workers. - ///// - //[Theory] - //[InlineData("false")] - //public void TestDestroy(string isEncryptionEnabled) - //{ - // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); - // var obj1 = new TestBroadcastVariable(5, "destroy"); - // Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - - // Func udf = Udf( - // str => $"{str} {bc1.Value().StringValue}, {bc1.Value().IntValue}"); - - // var expected = new string[] { "hello destroy, 5", "world destroy, 5" }; - - // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - // Assert.Equal(expected, actual); - - // bc1.Destroy(); - - // // Throws the following exception: - // // ERROR Utils: Exception encountered - // // org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed(destroy at NativeMethodAccessorImpl.java:0) - // // at org.apache.spark.broadcast.Broadcast.assertValid(Broadcast.scala:144) - // // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply$mcV$sp(TorrentBroadcast.scala:203) - // // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) - // // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) - // // at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1326) - // // at org.apache.spark.broadcast.TorrentBroadcast.writeObject(TorrentBroadcast.scala:202) - // // at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) - // try - // { - // _df.Select(udf(_df["_1"])).Collect().ToArray(); - // Assert.True(false); - // } - // catch (Exception e) - // { - // Assert.NotNull(e); - // } - //} + /// + /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast + /// variable and makes it inaccessible from workers. 
+ /// + [Theory] + [InlineData("false")] + public void TestDestroy(string isEncryptionEnabled) + { + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); + var obj1 = new TestBroadcastVariable(5, "destroy"); + Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); + + Func udf = Udf( + str => $"{str} {bc1.Value().StringValue}, {bc1.Value().IntValue}"); + + var expected = new string[] { "hello destroy, 5", "world destroy, 5" }; + + string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + Assert.Equal(expected, actual); + + bc1.Destroy(); + + // Throws the following exception: + // ERROR Utils: Exception encountered + // org.apache.spark.SparkException: Attempted to use Broadcast(0) after it was destroyed(destroy at NativeMethodAccessorImpl.java:0) + // at org.apache.spark.broadcast.Broadcast.assertValid(Broadcast.scala:144) + // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply$mcV$sp(TorrentBroadcast.scala:203) + // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) + // at org.apache.spark.broadcast.TorrentBroadcast$$anonfun$writeObject$1.apply(TorrentBroadcast.scala:202) + // at org.apache.spark.util.Utils$.tryOrIOException(Utils.scala:1326) + // at org.apache.spark.broadcast.TorrentBroadcast.writeObject(TorrentBroadcast.scala:202) + // at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + try + { + _df.Select(udf(_df["_1"])).Collect().ToArray(); + Assert.True(false); + } + catch (Exception e) + { + Assert.NotNull(e); + } + } - ///// - ///// Test Broadcast.Unpersist() deletes cached copies of the broadcast on the executors. If - ///// the broadcast is used after unpersist is called, it is re-sent to the executors. - ///// - //[Theory] - //[InlineData("false")] - //public void TestUnpersist(string isEncryptionEnabled) - //{ - // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); - // var obj = new TestBroadcastVariable(1, "unpersist"); - // Broadcast bc = _spark.SparkContext.Broadcast(obj); + /// + /// Test Broadcast.Unpersist() deletes cached copies of the broadcast on the executors. If + /// the broadcast is used after unpersist is called, it is re-sent to the executors. + /// + [Theory] + [InlineData("false")] + public void TestUnpersist(string isEncryptionEnabled) + { + _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); + var obj = new TestBroadcastVariable(1, "unpersist"); + Broadcast bc = _spark.SparkContext.Broadcast(obj); - // Func udf = Udf( - // str => $"{str} {bc.Value().StringValue}, {bc.Value().IntValue}"); + Func udf = Udf( + str => $"{str} {bc.Value().StringValue}, {bc.Value().IntValue}"); - // var expected = new string[] { "hello unpersist, 1", "world unpersist, 1" }; + var expected = new string[] { "hello unpersist, 1", "world unpersist, 1" }; - // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); + string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - // Assert.Equal(expected, actual); + Assert.Equal(expected, actual); - // // This deletes the copies of the broadcast on the executors. We then use the Broadcast - // // variable again in the UDF and validate that it is re-sent to all executors. - // bc.Unpersist(); + // This deletes the copies of the broadcast on the executors. We then use the Broadcast + // variable again in the UDF and validate that it is re-sent to all executors. 
+ bc.Unpersist(); - string[] actualUnpersisted = ToStringArray(_df.Select(udf(_df["_1"]))); - Assert.Equal(expected, actualUnpersisted); - //} + string[] actualUnpersisted = ToStringArray(_df.Select(udf(_df["_1"]))); + Assert.Equal(expected, actualUnpersisted); + } private string[] ToStringArray(DataFrame df) { From fd8ed11900c13326bd6ce7ba705272699d6c8ca0 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sat, 19 Sep 2020 01:31:38 -0700 Subject: [PATCH 58/66] enabling multiple broadcast encryption on (destroying broadcast variables) --- src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 1a1fcdfa8..835520ef6 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -37,6 +37,7 @@ public BroadcastTests(SparkFixture fixture) /// [Theory] [InlineData("false")] + [InlineData("true")] public void TestMultipleBroadcast(string isEncryptionEnabled) { _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); @@ -52,6 +53,8 @@ public void TestMultipleBroadcast(string isEncryptionEnabled) string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); Assert.Equal(expected, actual); + bc1.Destroy(); + bc2.Destroy(); } ///// From e29587510c6a20ba6d9ad231e93da95fc794f988 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sat, 19 Sep 2020 03:23:09 -0700 Subject: [PATCH 59/66] Enabling encryption on for TestDestroy --- src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index 835520ef6..eec446a72 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -62,6 +62,7 @@ public void TestMultipleBroadcast(string isEncryptionEnabled) ///// //[Theory] //[InlineData("true")] + //[InlineData("false")] //public void TestLargeBroadcastValue(string isEncryptionEnabled) //{ // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); @@ -87,6 +88,7 @@ public void TestMultipleBroadcast(string isEncryptionEnabled) /// [Theory] [InlineData("false")] + [InlineData("true")] public void TestDestroy(string isEncryptionEnabled) { _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); From 59d55fdf9e4add70f2e04b679a5c81bef050966f Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sat, 19 Sep 2020 16:21:17 -0700 Subject: [PATCH 60/66] disabling encryption for TestDestroy and enabling it for TestUnpersist --- src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index eec446a72..a53bd71e1 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -88,7 +88,6 @@ public void TestMultipleBroadcast(string isEncryptionEnabled) /// [Theory] [InlineData("false")] - [InlineData("true")] public void TestDestroy(string isEncryptionEnabled) { _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled",
isEncryptionEnabled); @@ -132,6 +131,7 @@ public void TestDestroy(string isEncryptionEnabled) /// [Theory] [InlineData("false")] + [InlineData("true")] public void TestUnpersist(string isEncryptionEnabled) { _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); From 3d0c7e29129d9873a44679ba2e0f9e1541fc8cc1 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Sat, 19 Sep 2020 18:47:38 -0700 Subject: [PATCH 61/66] enabling TestLargeBroadcastValue with encryption on --- .../IpcTests/BroadcastTests.cs | 47 +++++++++---------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs index a53bd71e1..b5848ae9d 100644 --- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs +++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs @@ -57,30 +57,29 @@ public void TestMultipleBroadcast(string isEncryptionEnabled) bc2.Destroy(); } - ///// - ///// Test Broadcast support by broadcasting a large (>100MB) object. - ///// - //[Theory] - //[InlineData("true")] - //[InlineData("false")] - //public void TestLargeBroadcastValue(string isEncryptionEnabled) - //{ - // _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled); - // var obj1 = new byte[104858000]; - // Broadcast bc1 = _spark.SparkContext.Broadcast(obj1); - - // Func udf = Udf( - // str => $"{str}: length of broadcast array = {bc1.Value().Length}"); - - // var expected = new string[] { - // "hello: length of broadcast array = 104858000", - // "world: length of broadcast array = 104858000" }; - - // string[] actual = ToStringArray(_df.Select(udf(_df["_1"]))); - // Assert.Equal(expected, actual); - // // Destroying broadcast variable to free up memory - // bc1.Destroy(); - //} + /// + /// Test Broadcast support by broadcasting a large (>100MB) object. 
+        /// </summary>
+        [Theory]
+        [InlineData("true")]
+        public void TestLargeBroadcastValue(string isEncryptionEnabled)
+        {
+            _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled);
+            var obj1 = new byte[104858000];
+            Broadcast<byte[]> bc1 = _spark.SparkContext.Broadcast(obj1);
+
+            Func<Column, Column> udf = Udf<string, string>(
+                str => $"{str}: length of broadcast array = {bc1.Value().Length}");
+
+            var expected = new string[] {
+                "hello: length of broadcast array = 104858000",
+                "world: length of broadcast array = 104858000" };
+
+            string[] actual = ToStringArray(_df.Select(udf(_df["_1"])));
+            Assert.Equal(expected, actual);
+            // Destroying broadcast variable to free up memory
+            bc1.Destroy();
+        }

         /// <summary>
         /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast

From e682bf5ae7df25287248e84114dfec878b7f573d Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Sat, 19 Sep 2020 19:55:39 -0700
Subject: [PATCH 62/66] Enabling TestLargeBroadcastValue with encryption off

---
 src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs
index b5848ae9d..0ff79f6b0 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs
@@ -62,6 +62,7 @@ public void TestMultipleBroadcast(string isEncryptionEnabled)
         /// </summary>
         [Theory]
         [InlineData("true")]
+        [InlineData("false")]
         public void TestLargeBroadcastValue(string isEncryptionEnabled)
         {
             _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled);

From a57973251b9aa777b5486146055a9e7e8013e7e1 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Sat, 19 Sep 2020 22:34:54 -0700
Subject: [PATCH 63/66] Disabling TestLargeBroadcastValue with encryption off

---
 src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs
index 0ff79f6b0..b5848ae9d 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs
@@ -62,7 +62,6 @@ public void TestMultipleBroadcast(string isEncryptionEnabled)
         /// </summary>
         [Theory]
         [InlineData("true")]
-        [InlineData("false")]
         public void TestLargeBroadcastValue(string isEncryptionEnabled)
         {
             _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled);

From 228f2f324a8ed9017840ffc89b0461db7fbc6deb Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Sun, 27 Sep 2020 21:58:38 -0700
Subject: [PATCH 64/66] Addressing PR comments

---
 azure-pipelines.yml                           |  1 -
 .../IpcTests/BroadcastTests.cs                | 25 -------------------
 .../Processor/BroadcastVariableProcessor.cs   |  9 +++++--
 3 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index f6eeefd89..5c7bec3d2 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -19,7 +19,6 @@ variables:
     (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.DataFrameTests.TestDataFrameVectorUdf)&\
     (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestDestroy)&\
    (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestMultipleBroadcast)&\
-    (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestLargeBroadcastValue)&\
     (FullyQualifiedName!=Microsoft.Spark.E2ETest.IpcTests.BroadcastTests.TestUnpersist)&\
     (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfComplexTypesTests.TestUdfWithArrayType)&\
     (FullyQualifiedName!=Microsoft.Spark.E2ETest.UdfTests.UdfComplexTypesTests.TestUdfWithArrayOfArrayType)&\

diff --git a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs
index d2341fe9a..e0443f04c 100644
--- a/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs
+++ b/src/csharp/Microsoft.Spark.E2ETest/IpcTests/BroadcastTests.cs
@@ -56,31 +56,6 @@ public void TestMultipleBroadcast(string isEncryptionEnabled)
             bc2.Destroy();
         }

-        /// <summary>
-        /// Test Broadcast support by broadcasting a large (>100MB) object.
-        /// </summary>
-        [Theory]
-        [InlineData("true")]
-        [InlineData("false")]
-        public void TestLargeBroadcastValue(string isEncryptionEnabled)
-        {
-            _spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", isEncryptionEnabled);
-            var obj1 = new byte[104858000];
-            Broadcast<byte[]> bc1 = _spark.SparkContext.Broadcast(obj1);
-
-            Func<Column, Column> udf = Udf<string, string>(
-                str => $"{str}: length of broadcast array = {bc1.Value().Length}");
-
-            var expected = new string[] {
-                "hello: length of broadcast array = 104858000",
-                "world: length of broadcast array = 104858000" };
-
-            string[] actual = ToStringArray(_df.Select(udf(_df["_1"])));
-            Assert.Equal(expected, actual);
-            // Destroying broadcast variable to free up memory
-            bc1.Destroy();
-        }

         /// <summary>
         /// Test Broadcast.Destroy() that destroys all data and metadata related to the broadcast
         /// variable and makes it inaccessible from workers.

diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
index f17da97b5..e080dfeea 100644
--- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
+++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
@@ -59,8 +59,13 @@ internal BroadcastVariables Process(Stream stream)
             {
                 if (broadcastVars.DecryptionServerNeeded)
                 {
-                    var readBid = SerDe.ReadInt64(socket.InputStream);
-                    Debug.Assert(bid == readBid);
+                    long readBid = SerDe.ReadInt64(socket.InputStream);
+                    if (bid != readBid)
+                    {
+                        throw new Exception($"The Broadcast Id received from the encryption" +
+                            $" server {readBid} is different from the Broadcast Id received" +
+                            $" from the payload {bid}.");
+                    }
                     object value = formatter.Deserialize(socket.InputStream);
                     BroadcastRegistry.Add(bid, value);
                 }

From efa424c3674eda52b02832278bf6c02f4790e0c9 Mon Sep 17 00:00:00 2001
From: Niharika Dutta
Date: Mon, 28 Sep 2020 11:43:01 -0700
Subject: [PATCH 65/66] Update
 src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs

Co-authored-by: Steve Suh
---
 .../Processor/BroadcastVariableProcessor.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
index e080dfeea..9a54cb409 100644
--- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
+++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
@@ -62,7 +62,7 @@ internal BroadcastVariables Process(Stream stream)
                     long readBid = SerDe.ReadInt64(socket.InputStream);
                     if (bid != readBid)
                     {
-                        throw new Exception($"The Broadcast Id received from the encryption" +
+                        throw new Exception("The Broadcast Id received from the encryption" +
                             $" server {readBid} is different from the Broadcast Id received" +
                             $" from the payload {bid}.");
                     }

From d047ac23445387a824e16b2539f74df9e1506d9b Mon Sep 17 00:00:00 2001
From: Terry Kim
Date: Mon, 28 Sep 2020 15:23:06 -0700
Subject: [PATCH 66/66] Update
 src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs

---
 .../Processor/BroadcastVariableProcessor.cs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
index 9a54cb409..e3bc16df6 100644
--- a/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
+++ b/src/csharp/Microsoft.Spark.Worker/Processor/BroadcastVariableProcessor.cs
@@ -62,9 +62,9 @@ internal BroadcastVariables Process(Stream stream)
                     long readBid = SerDe.ReadInt64(socket.InputStream);
                     if (bid != readBid)
                     {
-                        throw new Exception("The Broadcast Id received from the encryption" +
-                            $" server {readBid} is different from the Broadcast Id received" +
-                            $" from the payload {bid}.");
+                        throw new Exception("The Broadcast Id received from the encryption " +
+                            $"server {readBid} is different from the Broadcast Id received " +
+                            $"from the payload {bid}.");
                     }
                     object value = formatter.Deserialize(socket.InputStream);
                     BroadcastRegistry.Add(bid, value);
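With the full series applied, the flow is: the driver serializes the broadcast value into the JVM's encryption server socket instead of a dump file, and the worker connects back to the decryption server, reads the broadcast id, validates it against the id in the payload, and only then deserializes the value. For illustration only, a minimal usage sketch of the finished feature, mirroring the test pattern above; the app name and sample values here are assumptions, not code from these patches:

```csharp
// Minimal sketch of using an encrypted broadcast variable from user code.
// Assumes a working spark-submit setup for the Microsoft.Spark worker;
// "EncryptedBroadcastExample" and the sample strings are illustrative.
using System;
using Microsoft.Spark;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;

class Program
{
    static void Main()
    {
        SparkSession spark = SparkSession
            .Builder()
            .AppName("EncryptedBroadcastExample")
            .GetOrCreate();

        // Same pattern as the tests above: turn on I/O encryption so the
        // broadcast value is sent to the JVM through the encryption server
        // rather than written to a file on disk.
        spark.SparkContext.GetConf().Set("spark.io.encryption.enabled", "true");

        Broadcast<string> bc = spark.SparkContext.Broadcast("broadcast me");

        // The UDF captures the broadcast variable; workers read its value
        // back through the decryption server when encryption is enabled.
        Func<Column, Column> udf = Udf<string, string>(
            str => $"{str}: {bc.Value()}");

        DataFrame df = spark.CreateDataFrame(new[] { "hello", "world" });
        df.Select(udf(df["_1"])).Show();

        // Free driver- and executor-side copies once the value is unneeded.
        bc.Destroy();
    }
}
```

Note that, as in the tests, `spark.io.encryption.enabled` is read from the SparkConf at the time the broadcast is created, so it must be set before calling `SparkContext.Broadcast`.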