Skip to content

Commit 1ab206d

Browse files
Expose new SparkSession, DataFrame, and DataFrameStatFunctions APIs introduced in Spark 3.0 (#647)
1 parent 46b0d77 commit 1ab206d

File tree

5 files changed

+146
-10
lines changed

5 files changed

+146
-10
lines changed

src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameFunctionsTests.cs

+17
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
// See the LICENSE file in the project root for more information.
44

55
using System.Collections.Generic;
6+
using Microsoft.Spark.E2ETest.Utils;
67
using Microsoft.Spark.Sql;
8+
using static Microsoft.Spark.Sql.Functions;
79
using Xunit;
810

911
namespace Microsoft.Spark.E2ETest.IpcTests
@@ -91,5 +93,20 @@ public void TestDataFrameStatFunctionSignatures()
9193

9294
df = stat.SampleBy("age", new Dictionary<int, double> { { 1, 0.5 } }, 100);
9395
}
96+
97+
/// <summary>
/// Test signatures for APIs introduced in Spark 3.0.*.
/// </summary>
[SkipIfSparkVersionIsLessThan(Versions.V3_0_0)]
public void TestSignaturesV3_0_X()
{
    DataFrameStatFunctions statFunctions = _df.Stat();
    Column ageColumn = Column("age");

    var fractions = new Dictionary<int, double> { { 1, 0.5 } };
    Assert.IsType<DataFrame>(statFunctions.SampleBy(ageColumn, fractions, 100));
}
94111
}
95112
}

src/csharp/Microsoft.Spark.E2ETest/IpcTests/Sql/DataFrameTests.cs

+12
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,18 @@ public void TestSignaturesV3_X_X()
712712
IEnumerable<Row> actual = df.ToLocalIterator(true).ToArray();
713713
IEnumerable<Row> expected = data.Select(r => new Row(r.Values, schema));
714714
Assert.Equal(expected, actual);
715+
716+
Assert.IsType<DataFrame>(df.Observe("metrics", Count("Name").As("CountNames")));
717+
718+
Assert.IsType<Row[]>(_df.Tail(1).ToArray());
719+
720+
_df.PrintSchema(1);
721+
722+
_df.Explain("simple");
723+
_df.Explain("extended");
724+
_df.Explain("codegen");
725+
_df.Explain("cost");
726+
_df.Explain("formatted");
715727
}
716728
}
717729
}

src/csharp/Microsoft.Spark/Sql/DataFrame.cs

+77-10
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,16 @@ public void PrintSchema() =>
7070
Console.WriteLine(
7171
(string)((JvmObjectReference)_jvmObject.Invoke("schema")).Invoke("treeString"));
7272

73+
/// <summary>
/// Prints the schema up to the given level to the console in a nice tree format.
/// </summary>
/// <param name="level">Depth up to which the schema tree is printed</param>
[Since(Versions.V3_0_0)]
public void PrintSchema(int level)
{
    // Ask the JVM-side Dataset for its schema and render it as a depth-limited tree.
    var schema = (JvmObjectReference)_jvmObject.Invoke("schema");
    Console.WriteLine((string)schema.Invoke("treeString", level));
}
82+
7383
/// <summary>
7484
/// Prints the plans (logical and physical) to the console for debugging purposes.
7585
/// </summary>
@@ -80,6 +90,30 @@ public void Explain(bool extended = false)
8090
Console.WriteLine((string)execution.Invoke(extended ? "toString" : "simpleString"));
8191
}
8292

93+
/// <summary>
/// Prints the plans (logical and physical) with a format specified by a given explain
/// mode.
/// </summary>
/// <param name="mode">Specifies the expected output format of plans.
/// 1. `simple`: Print only a physical plan.
/// 2. `extended`: Print both logical and physical plans.
/// 3. `codegen`: Print a physical plan and generated codes if they are available.
/// 4. `cost`: Print a logical plan and statistics if they are available.
/// 5. `formatted`: Split explain output into two sections: a physical plan outline and
/// node details.
/// </param>
[Since(Versions.V3_0_0)]
public void Explain(string mode)
{
    // Resolve the mode string to a JVM-side ExplainMode, then ask the query
    // execution for the matching explain output.
    var jvmExplainMode = (JvmObjectReference)_jvmObject.Jvm.CallStaticJavaMethod(
        "org.apache.spark.sql.execution.ExplainMode",
        "fromString",
        mode);
    var queryExecution = (JvmObjectReference)_jvmObject.Invoke("queryExecution");
    Console.WriteLine((string)queryExecution.Invoke("explainString", jvmExplainMode));
}
116+
83117
/// <summary>
84118
/// Returns all column names and their data types as an IEnumerable of Tuples.
85119
/// </summary>
@@ -480,6 +514,27 @@ public RelationalGroupedDataset Cube(string column, params string[] columns) =>
480514
public DataFrame Agg(Column expr, params Column[] exprs) =>
481515
WrapAsDataFrame(_jvmObject.Invoke("agg", expr, exprs));
482516

517+
/// <summary>
/// Define (named) metrics to observe on the Dataset. This method returns an 'observed'
/// DataFrame that returns the same result as the input, with the following guarantees:
///
/// 1. It will compute the defined aggregates (metrics) on all the data that is flowing
/// through the Dataset at that point.
/// 2. It will report the value of the defined aggregate columns as soon as we reach a
/// completion point. A completion point is either the end of a query (batch mode) or
/// the end of a streaming epoch. The value of the aggregates only reflects the data
/// processed since the previous completion point.
///
/// Please note that continuous execution is currently not supported.
/// </summary>
/// <param name="name">Name of the observation to register the metrics under</param>
/// <param name="expr">Defined aggregate to observe</param>
/// <param name="exprs">Defined aggregates to observe</param>
/// <returns>DataFrame object</returns>
[Since(Versions.V3_0_0)]
public DataFrame Observe(string name, Column expr, params Column[] exprs) =>
    WrapAsDataFrame(_jvmObject.Invoke("observe", name, expr, exprs));
537+
483538
/// <summary>
484539
/// Returns a new `DataFrame` by taking the first `number` rows.
485540
/// </summary>
@@ -702,6 +757,17 @@ public DataFrame Summary(params string[] statistics) =>
702757
/// <returns>First `n` rows</returns>
703758
public IEnumerable<Row> Take(int n) => Head(n);
704759

760+
/// <summary>
/// Returns the last `n` rows in the `DataFrame`.
/// </summary>
/// <param name="n">Number of rows</param>
/// <returns>Last `n` rows</returns>
[Since(Versions.V3_0_0)]
public IEnumerable<Row> Tail(int n) =>
    // Expression-bodied for consistency with the neighboring Take(int n) => Head(n);
    // "tailToPython" streams the rows back over the socket-based collection path.
    GetRows("tailToPython", n);
770+
705771
/// <summary>
706772
/// Returns an array that contains all rows in this `DataFrame`.
707773
/// </summary>
@@ -929,16 +995,15 @@ public DataStreamWriter WriteStream() =>
929995
new DataStreamWriter((JvmObjectReference)_jvmObject.Invoke("writeStream"), this);
930996

931997
/// <summary>
932-
/// Returns row objects based on the function (either "toPythonIterator" or
933-
/// "collectToPython").
998+
/// Returns row objects based on the function (either "toPythonIterator",
999+
/// "collectToPython", or "tailToPython").
9341000
/// </summary>
935-
/// <param name="funcName">
936-
/// The name of the function to call, either "toPythonIterator" or "collectToPython".
937-
/// </param>
938-
/// <returns><see cref="Row"/> objects</returns>
939-
private IEnumerable<Row> GetRows(string funcName)
1001+
/// <param name="funcName">String name of function to call</param>
1002+
/// <param name="args">Arguments to the function</param>
1003+
/// <returns>IEnumerable of Rows from Spark</returns>
1004+
private IEnumerable<Row> GetRows(string funcName, params object[] args)
9401005
{
941-
(int port, string secret, _) = GetConnectionInfo(funcName);
1006+
(int port, string secret, _) = GetConnectionInfo(funcName, args);
9421007
using ISocketWrapper socket = SocketFactory.CreateSocket();
9431008
socket.Connect(IPAddress.Loopback, port, secret);
9441009
foreach (Row row in new RowCollector().Collect(socket))
@@ -952,9 +1017,11 @@ private IEnumerable<Row> GetRows(string funcName)
9521017
/// used for connecting with Spark to receive rows for this `DataFrame`.
9531018
/// </summary>
9541019
/// <returns>A tuple of port number, secret string, and JVM socket auth server.</returns>
955-
private (int, string, JvmObjectReference) GetConnectionInfo(string funcName)
1020+
private (int, string, JvmObjectReference) GetConnectionInfo(
1021+
string funcName,
1022+
params object[] args)
9561023
{
957-
object result = _jvmObject.Invoke(funcName);
1024+
object result = _jvmObject.Invoke(funcName, args);
9581025
Version version = SparkEnvironment.SparkVersion;
9591026
return (version.Major, version.Minor, version.Build) switch
9601027
{

src/csharp/Microsoft.Spark/Sql/DataFrameStatFunctions.cs

+16
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,22 @@ public DataFrame SampleBy<T>(
121121
long seed) =>
122122
WrapAsDataFrame(_jvmObject.Invoke("sampleBy", columnName, fractions, seed));
123123

124+
/// <summary>
/// Returns a stratified sample without replacement based on the fraction given
/// on each stratum.
/// </summary>
/// <typeparam name="T">Stratum type</typeparam>
/// <param name="column">Column that defines strata</param>
/// <param name="fractions">
/// Sampling fraction for each stratum. If a stratum is not specified, we treat
/// its fraction as zero.
/// </param>
/// <param name="seed">Random seed</param>
/// <returns>DataFrame object</returns>
[Since(Versions.V3_0_0)]
public DataFrame SampleBy<T>(Column column, IDictionary<T, double> fractions, long seed)
{
    object sampled = _jvmObject.Invoke("sampleBy", column, fractions, seed);
    return WrapAsDataFrame(sampled);
}
139+
124140
private DataFrame WrapAsDataFrame(object obj) => new DataFrame((JvmObjectReference)obj);
125141
}
126142
}

src/csharp/Microsoft.Spark/Sql/SparkSession.cs

+24
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,30 @@ public DataFrame CreateDataFrame(IEnumerable<Timestamp> data) =>
255255
public DataFrame Sql(string sqlText) =>
256256
new DataFrame((JvmObjectReference)_jvmObject.Invoke("sql", sqlText));
257257

258+
/// <summary>
/// Execute an arbitrary string command inside an external execution engine rather than
/// Spark. This could be useful when user wants to execute some commands out of Spark. For
/// example, executing custom DDL/DML command for JDBC, creating index for ElasticSearch,
/// creating cores for Solr and so on.
/// The command will be eagerly executed after this method is called and the returned
/// DataFrame will contain the output of the command (if any).
/// </summary>
/// <param name="runner">The class name of the runner that implements
/// `ExternalCommandRunner`</param>
/// <param name="command">The target command to be executed</param>
/// <param name="options">The options for the runner</param>
/// <returns>DataFrame object</returns>
[Since(Versions.V3_0_0)]
public DataFrame ExecuteCommand(
    string runner,
    string command,
    Dictionary<string, string> options) =>
    new DataFrame((JvmObjectReference)_jvmObject.Invoke(
        "executeCommand",
        runner,
        command,
        options));
281+
258282
/// <summary>
259283
/// Returns a DataFrameReader that can be used to read non-streaming data in
260284
/// as a DataFrame.

0 commit comments

Comments
 (0)