databricks · lopez- · Jun 30, 2020 · Jul 20, 2020 · Jul 20, 2020 · ueshin
diff --git a/databricks/koalas/missing/series.py b/databricks/koalas/missing/series.py
@@ -42,7 +42,6 @@ class MissingPandasLikeSeries(object):
     autocorr = _unsupported_function("autocorr")
     between_time = _unsupported_function("between_time")
     combine = _unsupported_function("combine")
-    cov = _unsupported_function("cov")
     droplevel = _unsupported_function("droplevel")
     ewm = _unsupported_function("ewm")
     factorize = _unsupported_function("factorize")

diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py
@@ -4858,6 +4858,51 @@ def mad(self):
 
         return mad
 
+    def cov(self, other: "Series", min_periods: Optional[int] = None) -> float:
+        """
+        Compute covariance with Series, excluding missing values.
+
+        Parameters
+        ----------
+        other : Series
+            Series with which to compute the covariance.
+        min_periods : int, optional
+            Minimum number of observations needed to have a valid result.
+
+        Returns
+        -------
+        float
+            Covariance normalized by N-1 (unbiased estimator); NaN when there
+            are fewer than two rows or fewer than `min_periods` rows.
+
+        Raises
+        ------
+        ValueError
+            If `other` is not a Series or its length differs from this Series.
+
+        Examples
+        --------
+        >>> import databricks.koalas as ks
+        >>> ks.set_option("compute.ops_on_diff_frames", True)
+        >>> s1 = ks.Series([0.90010907, 0.13484424, 0.62036035])
+        >>> s2 = ks.Series([0.12528585, 0.26962463, 0.51111198])
+        >>> s1.cov(s2)
+        -0.01685762652715874
+        >>> ks.reset_option("compute.ops_on_diff_frames")
+        """
+        if not isinstance(other, Series):
+            raise ValueError("'other' must be a Series")
+        size = len(self.index)  # evaluated once: each len() may trigger a Spark job
+        if size != len(other.index):
+            raise ValueError("series are not aligned")
+        if size <= 1 or (min_periods is not None and size < min_periods):
+            return np.nan
+        if same_anchor(self, other):
+            # Same anchor: Spark native `cov`. NOTE(review): assumes name == Spark column name.
+            return self._internal.spark_frame.cov(self.name, other.name)
+        # Different anchors: compute the sample covariance manually.
+        return (self - self.mean()).dot(other - other.mean()) / (size - 1)
+
     def unstack(self, level=-1):
         """
         Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame.

diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py
@@ -948,6 +948,32 @@ def test_series_repeat(self):
         else:
             self.assert_eq(kser1.repeat(kser2).sort_index(), pser1.repeat(pser2).sort_index())
 
+    def test_cov(self):
+        """Compare Series.cov with pandas for Series created from separate frames."""
+        pser = pd.Series([90, 91, 85])
+        kser = ks.from_pandas(pser)
+        kser_other = ks.Series([90, 91, 85])
+        pser_other = kser_other.to_pandas()
+        self.assert_eq(kser.cov(kser_other), pser.cov(pser_other), almost=True)
+
+        # Single observation: compare NaN-ness, since NaN never equals NaN.
+        kser = ks.Series([90])
+        pser = kser.to_pandas()
+        kser_other = ks.Series([85])
+        pser_other = kser_other.to_pandas()
+        k_isnan = np.isnan(kser.cov(kser_other))
+        p_isnan = np.isnan(pser.cov(pser_other))
+        self.assert_eq(k_isnan, p_isnan)
+
+        # `min_periods` (4) is larger than the three available observations.
+        kser = ks.Series([90, 91, 85])
+        pser = kser.to_pandas()
+        kser_other = ks.Series([90, 91, 85])
+        pser_other = kser_other.to_pandas()
+        k_isnan = np.isnan(kser.cov(kser_other, 4))
+        p_isnan = np.isnan(pser.cov(pser_other, 4))
+        self.assert_eq(k_isnan, p_isnan)
+
 
 class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils):
     @classmethod

diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py
@@ -1787,3 +1787,25 @@ def test_ffill(self):
         kser.ffill(inplace=True)
         pser.ffill(inplace=True)
         self.assert_eq(repr(kser), repr(pser))
+
+    def test_cov(self):
+        """Check Series.cov on two columns of one DataFrame against pandas."""
+        columns = ["A", "B"]
+        full = {"A": [90, 91, 85], "B": [90, 91, 85]}
+
+        # Regular case: three observations per column.
+        kdf = ks.DataFrame(full, columns=columns)
+        pdf = kdf.to_pandas()
+        self.assert_eq(kdf.A.cov(kdf.B), pdf.A.cov(pdf.B), almost=True)
+
+        # One observation: compare NaN-ness, since NaN never equals NaN.
+        kdf = ks.DataFrame({"A": [90], "B": [90]}, columns=columns)
+        pdf = kdf.to_pandas()
+        koalas_nan, pandas_nan = np.isnan(kdf.A.cov(kdf.B)), np.isnan(pdf.A.cov(pdf.B))
+        self.assert_eq(koalas_nan, pandas_nan)
+
+        # `min_periods` (4) exceeds the three available observations.
+        kdf = ks.DataFrame(full, columns=columns)
+        pdf = kdf.to_pandas()
+        koalas_nan, pandas_nan = np.isnan(kdf.A.cov(kdf.B, 4)), np.isnan(pdf.A.cov(pdf.B, 4))
+        self.assert_eq(koalas_nan, pandas_nan)