Commit 3e7d36e

shitty pickling
1 parent 9d7aaa4 commit 3e7d36e

8 files changed, +116 -56 lines changed

lib/cpp/preprocessing/longitudinal_features_lagger.cpp

Lines changed: 27 additions & 20 deletions
@@ -6,34 +6,39 @@
 
 #include "tick/preprocessing/longitudinal_features_lagger.h"
 
+
 LongitudinalFeaturesLagger::LongitudinalFeaturesLagger(
-    const SBaseArrayDouble2dPtrList1D &features,
-    const SArrayULongPtr n_lags)
-    : n_intervals(features[0]->n_rows()),
+    ulong n_intervals,
+    SArrayULongPtr n_lags)
+    : n_intervals(n_intervals),
       n_lags(n_lags),
-      n_samples(features.size()),
-      n_observations(n_samples * n_intervals),
-      n_features(features[0]->n_cols()),
-      n_lagged_features(n_lags->sum() + n_lags->size()) {
-  col_offset = ArrayULong(n_lags->size());
-  col_offset.init_to_zero();
-  if (n_features != n_lags->size()) {
-    TICK_ERROR("Features matrix column number should match n_lags length.");
-  }
-  if ((*n_lags)[0] >= n_intervals) {
-    TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
-  }
+      n_features(n_lags->size()),
+      n_lagged_features(n_features + n_lags->sum()),
+      col_offset(nullptr){
+  if (n_lags != nullptr) compute_col_offset(n_lags);
+};
+
+void LongitudinalFeaturesLagger::compute_col_offset(const SArrayULongPtr n_lags) {
+  ArrayULong col_offset_temp = ArrayULong(n_lags->size());
+  col_offset_temp.init_to_zero();
   for (ulong i(1); i < n_lags->size(); i++) {
     if ((*n_lags)[i] >= n_intervals) {
       TICK_ERROR("n_lags elements must be between 0 and (n_intervals - 1).");
     }
-    col_offset[i] = col_offset[i - 1] + (*n_lags)[i-1] + 1;
+    col_offset_temp[i] = col_offset_temp[i - 1] + (*n_lags)[i-1] + 1;
   }
-}
+  col_offset = col_offset_temp.as_sarray_ptr();
+};
 
 void LongitudinalFeaturesLagger::dense_lag_preprocessor(ArrayDouble2d &features,
                                                         ArrayDouble2d &out,
                                                         ulong censoring) const {
+  if (n_features != features.n_rows()) {
+    TICK_ERROR("Features matrix rows count should match n_lags length.");
+  }
+  if (n_features != features.n_cols()) {
+    TICK_ERROR("Features matrix column count should match n_lags length.");
+  }
   if (out.n_cols() != n_lagged_features) {
     TICK_ERROR(
         "n_columns of &out should be equal to n_features + sum(n_lags).");
@@ -47,8 +52,9 @@ void LongitudinalFeaturesLagger::dense_lag_preprocessor(ArrayDouble2d &features,
     n_cols_feature = (*n_lags)[feature] + 1;
     for (ulong j = 0; j < n_intervals; j++) {
       row = j;
-      col = col_offset[feature];
-      value = features(row, feature);
+      col = (*col_offset)[feature];
+      // use view_row instead of (row, feature) to be const
+      value = view_row(features, row)[feature];
       max_col = col + n_cols_feature;
       if (value != 0) {
         while (row < censoring && col < max_col) {
@@ -68,14 +74,15 @@ void LongitudinalFeaturesLagger::sparse_lag_preprocessor(ArrayULong &row,
                                                          ArrayULong &out_col,
                                                          ArrayDouble &out_data,
                                                          ulong censoring) const {
+  // TODO: add checks here ? Or do them in Python ?
   ulong j(0), r, c, offset, new_col, max_col;
   double value;
 
   for (ulong i = 0; i < data.size(); i++) {
     value = data[i];
     r = row[i];
     c = col[i];
-    offset = col_offset[c];
+    offset = (*col_offset)[c];
     max_col = offset + (*n_lags)[c] + 1;
     new_col = offset;
 
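For reference, compute_col_offset builds the starting column of each feature's lag block in the lagged output matrix: feature i occupies columns [offset_i, offset_i + n_lags[i]], so each offset is the running sum of (n_lags[j] + 1) over the preceding features, and the total width is n_features + sum(n_lags). A minimal numpy sketch of the same computation (the function name and values are illustrative, not part of the library):

import numpy as np

def compute_col_offset(n_lags):
    # block width of feature j is n_lags[j] + 1 (the raw value plus its lags)
    widths = n_lags.astype("uint64") + 1
    # offset of feature i is the sum of the widths of features 0..i-1
    offsets = np.zeros_like(widths)
    offsets[1:] = np.cumsum(widths)[:-1]
    return offsets

# n_lags = [1, 2, 0] -> offsets [0, 2, 5]; lagged width 6 = len(n_lags) + n_lags.sum()
print(compute_col_offset(np.array([1, 2, 0], dtype="uint64")))
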
lib/include/tick/preprocessing/longitudinal_features_lagger.h

Lines changed: 29 additions & 11 deletions
@@ -15,17 +15,21 @@ class LongitudinalFeaturesLagger {
 protected:
  ulong n_intervals;
  SArrayULongPtr n_lags;
- ArrayULong col_offset;
- ulong n_samples;
- ulong n_observations;
  ulong n_features;
  ulong n_lagged_features;
+ SArrayULongPtr col_offset;
 
 public:
-  LongitudinalFeaturesLagger(const SBaseArrayDouble2dPtrList1D &features,
-                             const SArrayULongPtr n_lags);
+  // This exists soley for cereal/swig
+  LongitudinalFeaturesLagger(): LongitudinalFeaturesLagger(0, nullptr) {};
 
-  void dense_lag_preprocessor(ArrayDouble2d &features, ArrayDouble2d &out,
+  LongitudinalFeaturesLagger(ulong n_intervals,
+                             SArrayULongPtr n_lags);
+
+  void compute_col_offset(SArrayULongPtr n_lags);
+
+  void dense_lag_preprocessor(ArrayDouble2d &features,
+                              ArrayDouble2d &out,
                               ulong censoring) const;
 
   void sparse_lag_preprocessor(ArrayULong &row, ArrayULong &col,
@@ -34,14 +38,28 @@ class LongitudinalFeaturesLagger {
                                ulong censoring) const;
 
   template <class Archive>
-  void serialize(Archive &ar) {
+  void load(Archive &ar) {
+    ar(CEREAL_NVP(n_intervals));
+    ar(CEREAL_NVP(n_features));
+    ar(CEREAL_NVP(n_lagged_features));
+
+    Array<ulong> temp_n_lags, temp_col_offset;
+    ar(cereal::make_nvp("n_lags", temp_n_lags));
+    // ar(cereal::make_nvp("col_offset", temp_col_offset));
+
+    n_lags = temp_n_lags.as_sarray_ptr();
+    if (n_lags != nullptr) compute_col_offset(n_lags);
+    // col_offset = temp_col_offset.as_sarray_ptr();
+  }
+
+
+  template <class Archive>
+  void save(Archive &ar) const {
     ar(CEREAL_NVP(n_intervals));
-    ar(CEREAL_NVP(n_lags));
-    ar(CEREAL_NVP(col_offset));
-    ar(CEREAL_NVP(n_samples));
-    ar(CEREAL_NVP(n_observations));
     ar(CEREAL_NVP(n_features));
     ar(CEREAL_NVP(n_lagged_features));
+    ar(cereal::make_nvp("n_lags", *n_lags));
+    // ar(cereal::make_nvp("col_offset", *col_offset));
   }
 };
 
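Splitting serialize() into save()/load() follows the usual cereal pattern for objects with derived state: only the independent fields (n_intervals, n_features, n_lagged_features and the n_lags array) are archived, and col_offset is rebuilt from n_lags on load rather than stored (the commented-out lines keep the alternative of archiving it). A rough Python analogy of this recompute-on-load idea, not tick code:

import numpy as np

class LaggerSketch:
    """Toy object with one derived field, mirroring the C++ save/load split."""

    def __init__(self, n_intervals, n_lags):
        self.n_intervals = n_intervals
        self.n_lags = n_lags
        self.col_offset = self._compute_col_offset(n_lags)  # derived state

    @staticmethod
    def _compute_col_offset(n_lags):
        widths = n_lags + 1
        offsets = np.zeros_like(widths)
        offsets[1:] = np.cumsum(widths)[:-1]
        return offsets

    def __getstate__(self):
        # save(): persist only the independent fields
        return {"n_intervals": self.n_intervals, "n_lags": self.n_lags}

    def __setstate__(self, state):
        # load(): restore the fields, then recompute the derived offsets
        self.n_intervals = state["n_intervals"]
        self.n_lags = state["n_lags"]
        self.col_offset = self._compute_col_offset(self.n_lags)
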
lib/swig/preprocessing/longitudinal_features_lagger.i

Lines changed: 10 additions & 9 deletions
@@ -4,24 +4,25 @@
 #include "tick/preprocessing/longitudinal_features_lagger.h"
 %}
 
+%include serialization.i
+
 class LongitudinalFeaturesLagger {
 
     public:
-      LongitudinalFeaturesLagger(const SBaseArrayDouble2dPtrList1D &features,
-                                 const SArrayULongPtr n_lags);
+      // This exists soley for cereal/swig
+      LongitudinalFeaturesLagger(): LongitudinalFeaturesLagger(0, nullptr) {};
+
+      LongitudinalFeaturesLagger(ulong n_intervals,
+                                 SArrayULongPtr n_lags);
 
       void dense_lag_preprocessor(ArrayDouble2d &features,
                                   ArrayDouble2d &out,
                                   ulong censoring) const;
 
-      void sparse_lag_preprocessor(ArrayULong &row,
-                                   ArrayULong &col,
-                                   ArrayDouble &data,
-                                   ArrayULong &out_row,
-                                   ArrayULong &out_col,
-                                   ArrayDouble &out_data,
+      void sparse_lag_preprocessor(ArrayULong &row, ArrayULong &col,
+                                   ArrayDouble &data, ArrayULong &out_row,
+                                   ArrayULong &out_col, ArrayDouble &out_data,
                                    ulong censoring) const;
-
 };
 
 TICK_MAKE_PICKLABLE(LongitudinalFeaturesLagger);

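With %include serialization.i and TICK_MAKE_PICKLABLE, the SWIG-wrapped C++ lagger gets __getstate__/__setstate__ backed by the cereal save/load above, which is what lets the preprocessor cross process boundaries and motivates the multiprocessing changes below. A hedged round-trip sketch at the Python level, assuming the usual tick import path and that the wrapper's embedded C++ object is what gets serialized:

import pickle
import numpy as np
from tick.preprocessing import LongitudinalFeaturesLagger

n_lags = np.array([1, 2], dtype="uint64")
lagger = LongitudinalFeaturesLagger(n_lags=n_lags)

# round-trip through pickle; once the lagger has been fitted, this also has to
# carry the embedded C++ preprocessor, which is what the cereal save/load enables
restored = pickle.loads(pickle.dumps(lagger))
print(restored.n_lags)
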
lib/swig/preprocessing/sparse_longitudinal_features_product.i

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,8 @@
 #include "tick/preprocessing/sparse_longitudinal_features_product.h"
 %}
 
+%include serialization.i
+
 class SparseLongitudinalFeaturesProduct {
 
     public:

tick/preprocessing/base/longitudinal_preprocessor.py

Lines changed: 8 additions & 2 deletions
@@ -2,6 +2,7 @@
 
 from abc import ABC, abstractmethod
 from tick.base import Base
+from multiprocessing import cpu_count
 
 
 class LongitudinalPreprocessor(ABC, Base):
@@ -14,9 +15,14 @@ class LongitudinalPreprocessor(ABC, Base):
         set to the number of cores.
     """
 
-    def __init__(self, n_jobs=-1):
+    _attrinfos = {'n_jobs': {'writable': True}}
+
+    def __init__(self, n_jobs=1):
         Base.__init__(self)
-        self.n_jobs = n_jobs
+        if n_jobs == -1:
+            self.n_jobs = cpu_count()
+        else:
+            self.n_jobs = n_jobs
 
     @abstractmethod
     def fit(self, features, labels, censoring) -> None:

tick/preprocessing/longitudinal_features_lagger.py

Lines changed: 26 additions & 13 deletions
@@ -7,6 +7,7 @@
     as _LongitudinalFeaturesLagger
 from .utils import check_longitudinal_features_consistency,\
     check_censoring_consistency
+from multiprocessing.pool import Pool
 
 
 class LongitudinalFeaturesLagger(LongitudinalPreprocessor):
@@ -83,7 +84,7 @@ class LongitudinalFeaturesLagger(LongitudinalPreprocessor):
         }
     }
 
-    def __init__(self, n_lags, n_jobs=-1):
+    def __init__(self, n_lags, n_jobs=1):
         LongitudinalPreprocessor.__init__(self, n_jobs=n_jobs)
         if not isinstance(n_lags, np.ndarray) or n_lags.dtype != 'uint64':
             raise ValueError(
@@ -166,7 +167,6 @@ def transform(self, features, labels=None, censoring=None):
         output : `[numpy.ndarrays]` or `[csr_matrices]`, shape=(n_intervals, n_features)
             The list of features matrices with added lagged features.
         """
-
         n_samples = len(features)
         if censoring is None:
             censoring = np.full((n_samples,), self._n_intervals,
@@ -176,16 +176,28 @@
         features = check_longitudinal_features_consistency(
             features, base_shape, "float64")
         if sps.issparse(features[0]):
-            X_with_lags = [
-                self._sparse_lagger(x, int(censoring[i]))
-                for i, x in enumerate(features)
-            ]
-            # TODO: Don't get why int() is required here as censoring_i is uint64
+            if self.n_jobs > 1:
+                with Pool(self.n_jobs) as pool:
+                    X_with_lags = pool.starmap(self._sparse_lagger, zip(features, censoring))
+                    pool.start()
+                    pool.join()
+            else:
+                X_with_lags = [
+                    self._sparse_lagger(x, int(censoring[i]))
+                    for i, x in enumerate(features)
+                ]
+                # TODO: Don't get why int() is required here as censoring_i is uint64
         else:
-            X_with_lags = [
-                self._dense_lagger(x, int(censoring[i]))
-                for i, x in enumerate(features)
-            ]
+            if self.n_jobs > 1:
+                with Pool(self.n_jobs) as pool:
+                    X_with_lags = pool.starmap(self._dense_lagger, zip(features, censoring))
+                    pool.start()
+                    pool.join()
+            else:
+                X_with_lags = [
+                    self._dense_lagger(x, int(censoring[i]))
+                    for i, x in enumerate(features)
+                ]
 
         return X_with_lags, labels, censoring
 
@@ -197,14 +209,15 @@ def _dense_lagger(self, feature_matrix, censoring_i):
         return output
 
     def _sparse_lagger(self, feature_matrix, censoring_i):
+        pp = self._cpp_preprocessor
         coo = feature_matrix.tocoo()
        estimated_nnz = coo.nnz * int((self.n_lags + 1).sum())
         out_row = np.zeros((estimated_nnz,), dtype="uint64")
         out_col = np.zeros((estimated_nnz,), dtype="uint64")
         out_data = np.zeros((estimated_nnz,), dtype="float64")
-        self._cpp_preprocessor.sparse_lag_preprocessor(
+        pp.sparse_lag_preprocessor(
             coo.row.astype("uint64"), coo.col.astype("uint64"), coo.data,
-            out_row, out_col, out_data, censoring_i)
+            out_row, out_col, out_data, int(censoring_i))
         return sps.csr_matrix((out_data, (out_row, out_col)),
                               shape=(self._n_intervals,
                                      self._n_output_features))
tick/preprocessing/tests/longitudinal_features_lagger_test.py

Lines changed: 12 additions & 0 deletions
@@ -36,6 +36,18 @@ def test_sparse_pre_convolution(self):
         feat_prod = [f.todense() for f in feat_prod]
         np.testing.assert_equal(feat_prod, self.expected_output)
 
+    # def test_parallelization(self):
+    #     feat_prod, _, _ = LongitudinalFeaturesLagger(n_lags=self.n_lags, n_jobs=1) \
+    #         .fit_transform(self.sparse_features, censoring=self.censoring)
+    #     p_feat_prod, _, _ = LongitudinalFeaturesLagger(n_lags=self.n_lags, n_jobs=3)\
+    #         .fit_transform(self.sparse_features, censoring=self.censoring)
+    #
+    # def test_parallelization(self):
+    #     feat_prod, _, _ = LongitudinalFeaturesLagger(n_lags=self.n_lags, n_jobs=1) \
+    #         .fit_transform(self.features, censoring=self.censoring)
+    #     p_feat_prod, _, _ = LongitudinalFeaturesLagger(n_lags=self.n_lags, n_jobs=1)\
+    #         .fit_transform(self.features, censoring=self.censoring)
+
 
 if __name__ == "__main__":
     unittest.main()

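The commented-out tests above build a serial and a parallel result but never compare them, and the second block reuses the same method name with n_jobs=1 on both sides. A sketch of the assertion such a test could make once the parallel path works; it reuses the existing fixture names (n_lags, sparse_features, censoring) and would live inside the same TestCase:

    def test_parallelization(self):
        serial, _, _ = LongitudinalFeaturesLagger(n_lags=self.n_lags, n_jobs=1) \
            .fit_transform(self.sparse_features, censoring=self.censoring)
        parallel, _, _ = LongitudinalFeaturesLagger(n_lags=self.n_lags, n_jobs=3) \
            .fit_transform(self.sparse_features, censoring=self.censoring)
        np.testing.assert_equal([f.todense() for f in serial],
                                [f.todense() for f in parallel])
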
tick/survival/convolutional_sccs.py

Lines changed: 2 additions & 1 deletion
@@ -355,7 +355,8 @@ def fit_kfold_cv(self, features, labels, censoring, C_tv_range: tuple = (),
                 features, labels, censoring)
         # split the data with stratified KFold
         kf = StratifiedKFold(n_folds, shuffle, self.random_state)
-        labels_interval = np.nonzero(p_labels)[1]
+        # labels_interval = np.nonzero(p_labels)[1]
+        labels_interval = [np.nonzero(arr)[0][0] for arr in p_labels]
 
         # Training loop
         model_global_parameters = {

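The labels_interval change accounts for p_labels being a list of per-sample label arrays rather than one stacked 2D array: the list comprehension yields exactly one value per sample (the index of its first nonzero interval), whereas np.nonzero(p_labels)[1] yields one entry per nonzero element of a 2D array. A small self-contained illustration with made-up labels:

import numpy as np

p_labels = [np.array([0, 0, 1, 0]), np.array([0, 1, 0, 0])]

# one class label per sample: the first interval where the label is nonzero
labels_interval = [np.nonzero(arr)[0][0] for arr in p_labels]
print(labels_interval)  # [2, 1]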