Skip to content

Commit 2a5945a

Browse files
PhilipDeegan and MaryanMorel
authored and committed
Parallelize longitudinal preprocessors
1 parent 3e7d36e commit 2a5945a

10 files changed

+256
-175
lines changed

lib/cpp/preprocessing/longitudinal_features_lagger.cpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,11 @@
99

1010
LongitudinalFeaturesLagger::LongitudinalFeaturesLagger(
1111
ulong n_intervals,
12-
SArrayULongPtr n_lags)
12+
SArrayULongPtr _n_lags)
1313
: n_intervals(n_intervals),
14-
n_lags(n_lags),
15-
n_features(n_lags->size()),
16-
n_lagged_features(n_features + n_lags->sum()),
17-
col_offset(nullptr){
14+
n_lags(_n_lags),
15+
n_features(_n_lags->size()),
16+
n_lagged_features(_n_lags->size() + _n_lags->sum()){
1817
if (n_lags != nullptr) compute_col_offset(n_lags);
1918
};
2019

@@ -33,8 +32,8 @@ void LongitudinalFeaturesLagger::compute_col_offset(const SArrayULongPtr n_lags)
3332
void LongitudinalFeaturesLagger::dense_lag_preprocessor(ArrayDouble2d &features,
3433
ArrayDouble2d &out,
3534
ulong censoring) const {
36-
if (n_features != features.n_rows()) {
37-
TICK_ERROR("Features matrix rows count should match n_lags length.");
35+
if (n_intervals != features.n_rows()) {
36+
TICK_ERROR("Features matrix rows count should match n_intervals.");
3837
}
3938
if (n_features != features.n_cols()) {
4039
TICK_ERROR("Features matrix column count should match n_lags length.");

lib/cpp/preprocessing/sparse_longitudinal_features_product.cpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,6 @@
77
#include "tick/preprocessing/sparse_longitudinal_features_product.h"
88
#include <map>
99

10-
SparseLongitudinalFeaturesProduct::SparseLongitudinalFeaturesProduct(
11-
const SBaseArrayDouble2dPtrList1D &features)
12-
: n_features(features[0]->n_cols()) {}
13-
1410
ulong SparseLongitudinalFeaturesProduct::get_feature_product_col(
1511
ulong col1, ulong col2, ulong n_cols) const {
1612
if (col1 > col2) { // ensure we have the right order as the following formula

lib/include/tick/preprocessing/longitudinal_features_lagger.h

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,11 @@ class LongitudinalFeaturesLagger {
1717
SArrayULongPtr n_lags;
1818
ulong n_features;
1919
ulong n_lagged_features;
20-
SArrayULongPtr col_offset;
20+
SArrayULongPtr col_offset = nullptr;
2121

2222
public:
2323
// This exists solely for cereal/swig
24-
LongitudinalFeaturesLagger(): LongitudinalFeaturesLagger(0, nullptr) {};
24+
LongitudinalFeaturesLagger() = default;
2525

2626
LongitudinalFeaturesLagger(ulong n_intervals,
2727
SArrayULongPtr n_lags);
@@ -45,11 +45,9 @@ class LongitudinalFeaturesLagger {
4545

4646
Array<ulong> temp_n_lags, temp_col_offset;
4747
ar(cereal::make_nvp("n_lags", temp_n_lags));
48-
// ar(cereal::make_nvp("col_offset", temp_col_offset));
4948

5049
n_lags = temp_n_lags.as_sarray_ptr();
51-
if (n_lags != nullptr) compute_col_offset(n_lags);
52-
// col_offset = temp_col_offset.as_sarray_ptr();
50+
col_offset = temp_col_offset.as_sarray_ptr();
5351
}
5452

5553

@@ -59,7 +57,7 @@ class LongitudinalFeaturesLagger {
5957
ar(CEREAL_NVP(n_features));
6058
ar(CEREAL_NVP(n_lagged_features));
6159
ar(cereal::make_nvp("n_lags", *n_lags));
62-
// ar(cereal::make_nvp("col_offset", *col_offset));
60+
ar(cereal::make_nvp("col_offset", *col_offset));
6361
}
6462
};
6563

lib/include/tick/preprocessing/sparse_longitudinal_features_product.h

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,11 @@ class SparseLongitudinalFeaturesProduct {
1616
ulong n_features;
1717

1818
public:
19+
// This exists solely for cereal/swig
20+
explicit SparseLongitudinalFeaturesProduct() = default;
21+
1922
explicit SparseLongitudinalFeaturesProduct(
20-
const SBaseArrayDouble2dPtrList1D &features);
23+
const ulong n_features): n_features(n_features) {};
2124

2225
inline ulong get_feature_product_col(ulong col1, ulong col2,
2326
ulong n_cols) const;
@@ -28,7 +31,12 @@ class SparseLongitudinalFeaturesProduct {
2831
ArrayDouble &out_data) const;
2932

3033
template <class Archive>
31-
void serialize(Archive &ar) {
34+
void load(Archive &ar) {
35+
ar(CEREAL_NVP(n_features));
36+
}
37+
38+
template <class Archive>
39+
void save(Archive &ar) const {
3240
ar(CEREAL_NVP(n_features));
3341
}
3442
};

lib/swig/preprocessing/longitudinal_features_lagger.i

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,8 @@
99
class LongitudinalFeaturesLagger {
1010

1111
public:
12-
// This exists solely for cereal/swig
13-
LongitudinalFeaturesLagger(): LongitudinalFeaturesLagger(0, nullptr) {};
12+
// This exists solely for cereal/swig
13+
LongitudinalFeaturesLagger();
1414

1515
LongitudinalFeaturesLagger(ulong n_intervals,
1616
SArrayULongPtr n_lags);

lib/swig/preprocessing/sparse_longitudinal_features_product.i

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@
99
class SparseLongitudinalFeaturesProduct {
1010

1111
public:
12-
SparseLongitudinalFeaturesProduct(const SBaseArrayDouble2dPtrList1D &features);
12+
// This exists solely for cereal/swig
13+
SparseLongitudinalFeaturesProduct();
14+
15+
SparseLongitudinalFeaturesProduct(const ulong n_features);
1316

1417
void sparse_features_product(ArrayULong &row,
1518
ArrayULong &col,

tick/preprocessing/longitudinal_features_lagger.py

Lines changed: 47 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
import numpy as np
44
import scipy.sparse as sps
55
from tick.preprocessing.base import LongitudinalPreprocessor
6-
from .build.preprocessing import LongitudinalFeaturesLagger\
6+
from tick.preprocessing.build.preprocessing import LongitudinalFeaturesLagger\
77
as _LongitudinalFeaturesLagger
8-
from .utils import check_longitudinal_features_consistency,\
8+
from tick.preprocessing.utils import check_longitudinal_features_consistency,\
99
check_censoring_consistency
1010
from multiprocessing.pool import Pool
11+
from copy import deepcopy
12+
from functools import partial, partialmethod
1113

1214

1315
class LongitudinalFeaturesLagger(LongitudinalPreprocessor):
@@ -76,15 +78,12 @@ class LongitudinalFeaturesLagger(LongitudinalPreprocessor):
7678
"_n_intervals": {
7779
"writable": False
7880
},
79-
"_cpp_preprocessor": {
80-
"writable": False
81-
},
8281
"_fitted": {
8382
"writable": False
8483
}
8584
}
8685

87-
def __init__(self, n_lags, n_jobs=1):
86+
def __init__(self, n_lags, n_jobs=-1):
8887
LongitudinalPreprocessor.__init__(self, n_jobs=n_jobs)
8988
if not isinstance(n_lags, np.ndarray) or n_lags.dtype != 'uint64':
9089
raise ValueError(
@@ -93,15 +92,13 @@ def __init__(self, n_lags, n_jobs=1):
9392
self._n_init_features = None
9493
self._n_output_features = None
9594
self._n_intervals = None
96-
self._cpp_preprocessor = None
9795
self._fitted = False
9896

9997
def _reset(self):
10098
"""Resets the object its initial construction state."""
10199
self._set("_n_init_features", None)
102100
self._set("_n_output_features", None)
103101
self._set("_n_intervals", None)
104-
self._set("_cpp_preprocessor", None)
105102
self._set("_fitted", False)
106103

107104
def fit(self, features, labels=None, censoring=None):
@@ -138,10 +135,7 @@ def fit(self, features, labels=None, censoring=None):
138135
self._set("_n_init_features", n_init_features)
139136
self._set("_n_intervals", n_intervals)
140137
self._set("_n_output_features", int((self.n_lags + 1).sum()))
141-
self._set("_cpp_preprocessor",
142-
_LongitudinalFeaturesLagger(features, self.n_lags))
143138
self._set("_fitted", True)
144-
145139
return self
146140

147141
def transform(self, features, labels=None, censoring=None):
@@ -175,49 +169,58 @@ def transform(self, features, labels=None, censoring=None):
175169
base_shape = (self._n_intervals, self._n_init_features)
176170
features = check_longitudinal_features_consistency(
177171
features, base_shape, "float64")
178-
if sps.issparse(features[0]):
179-
if self.n_jobs > 1:
180-
with Pool(self.n_jobs) as pool:
181-
X_with_lags = pool.starmap(self._sparse_lagger, zip(features, censoring))
182-
pool.start()
183-
pool.join()
184-
else:
185-
X_with_lags = [
186-
self._sparse_lagger(x, int(censoring[i]))
187-
for i, x in enumerate(features)
188-
]
189-
# TODO: Don't get why int() is required here as censoring_i is uint64
190-
else:
191-
if self.n_jobs > 1:
192-
with Pool(self.n_jobs) as pool:
193-
X_with_lags = pool.starmap(self._dense_lagger, zip(features, censoring))
194-
pool.start()
195-
pool.join()
196-
else:
197-
X_with_lags = [
198-
self._dense_lagger(x, int(censoring[i]))
199-
for i, x in enumerate(features)
200-
]
172+
173+
initializer = partial(self._inject_cpp_object,
174+
n_intervals=self._n_intervals, n_lags=self.n_lags)
175+
callback = self._sparse_lagger if sps.issparse(features[0]) \
176+
else self._dense_lagger
177+
callback = partial(callback, n_intervals=self._n_intervals,
178+
n_output_features=self._n_output_features,
179+
n_lags=self.n_lags)
180+
181+
with Pool(self.n_jobs, initializer=initializer) as pool:
182+
X_with_lags = pool.starmap(callback, zip(features, censoring))
201183

202184
return X_with_lags, labels, censoring
203185

204-
def _dense_lagger(self, feature_matrix, censoring_i):
205-
output = np.zeros((self._n_intervals, self._n_output_features),
186+
@staticmethod
187+
def _inject_cpp_object(n_intervals, n_lags):
188+
"""Creates a global instance of the CPP preprocessor object.
189+
190+
WARNING: to be used only as a multiprocessing.Pool initializer.
191+
In multiprocessing context, each process has its own namespace, so using
192+
global is not as bad as it seems. Still, it requires to proceed with
193+
caution.
194+
"""
195+
global _cpp_preprocessor
196+
_cpp_preprocessor = _LongitudinalFeaturesLagger(n_intervals, n_lags)
197+
198+
@staticmethod
199+
def _dense_lagger(feature_matrix, censoring_i, n_intervals,
200+
n_output_features, n_lags):
201+
"""Creates a lagged version of a dense matrix representing longitudinal
202+
features."""
203+
global _cpp_preprocessor
204+
output = np.zeros((n_intervals, n_output_features),
206205
dtype="float64")
207-
self._cpp_preprocessor.dense_lag_preprocessor(feature_matrix, output,
208-
censoring_i)
206+
_cpp_preprocessor.dense_lag_preprocessor(feature_matrix, output,
207+
int(censoring_i))
209208
return output
210209

211-
def _sparse_lagger(self, feature_matrix, censoring_i):
212-
pp = self._cpp_preprocessor
210+
@staticmethod
211+
def _sparse_lagger(feature_matrix, censoring_i, n_intervals,
212+
n_output_features, n_lags):
213+
"""Creates a lagged version of a sparse matrix representing longitudinal
214+
features."""
215+
global _cpp_preprocessor
213216
coo = feature_matrix.tocoo()
214-
estimated_nnz = coo.nnz * int((self.n_lags + 1).sum())
217+
estimated_nnz = coo.nnz * int((n_lags + 1).sum())
215218
out_row = np.zeros((estimated_nnz,), dtype="uint64")
216219
out_col = np.zeros((estimated_nnz,), dtype="uint64")
217220
out_data = np.zeros((estimated_nnz,), dtype="float64")
218-
pp.sparse_lag_preprocessor(
221+
_cpp_preprocessor.sparse_lag_preprocessor(
219222
coo.row.astype("uint64"), coo.col.astype("uint64"), coo.data,
220223
out_row, out_col, out_data, int(censoring_i))
221224
return sps.csr_matrix((out_data, (out_row, out_col)),
222-
shape=(self._n_intervals,
223-
self._n_output_features))
225+
shape=(n_intervals,
226+
n_output_features))

0 commit comments

Comments
 (0)