Skip to content

Commit ce40a4e

Browse files
committed
[NeoML] DnnDistributed -- remove code copy-paste
Signed-off-by: Kirill Golikov <kirill.golikov@abbyy.com>
1 parent 504a6a7 commit ce40a4e

4 files changed

Lines changed: 159 additions & 197 deletions

File tree

NeoML/Python/src/PyDnnDistributed.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
/* Copyright © 2017-2023 ABBYY
1+
/* Copyright © 2017-2024 ABBYY
2+
23
Licensed under the Apache License, Version 2.0 (the "License");
34
you may not use this file except in compliance with the License.
45
You may obtain a copy of the License at

NeoML/Python/src/PyDnnDistributed.h

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
/* Copyright © 2017-2023 ABBYY
1+
/* Copyright © 2017-2024 ABBYY
2+
23
Licensed under the Apache License, Version 2.0 (the "License");
34
you may not use this file except in compliance with the License.
45
You may obtain a copy of the License at
@@ -20,7 +21,7 @@ limitations under the License.
2021

2122
class CPyDistributedDataset : public IDistributedDataset {
2223
public:
23-
CPyDistributedDataset( const py::object& data ) : getData( data ) {};
24+
CPyDistributedDataset( const py::object& data ) : getData( data ) {}
2425
int SetInputBatch( CDnn& dnn, int thread ) override;
2526
private:
2627
py::object getData;
@@ -29,13 +30,14 @@ class CPyDistributedDataset : public IDistributedDataset {
2930
class CPyDistributedTraining : public CDistributedTraining {
3031
public:
3132
CPyDistributedTraining( CDnn& dnn, int count, TDistributedInitializer initializer, int seed )
32-
: CDistributedTraining( dnn, count, initializer, seed ) {};
33+
: CDistributedTraining( dnn, count, initializer, seed ) {}
3334
CPyDistributedTraining( CArchive& archive, int count, TDistributedInitializer initializer, int seed )
34-
: CDistributedTraining( archive, count, initializer, seed ) {};
35+
: CDistributedTraining( archive, count, initializer, seed ) {}
3536
CPyDistributedTraining( CDnn& dnn, const CArray<int>& cudaDevs, TDistributedInitializer initializer, int seed )
36-
: CDistributedTraining( dnn, cudaDevs, initializer, seed ) {};
37+
: CDistributedTraining( dnn, cudaDevs, initializer, seed ) {}
3738
CPyDistributedTraining( CArchive& archive, const CArray<int>& cudaDevs, TDistributedInitializer initializer, int seed )
38-
: CDistributedTraining( archive, cudaDevs, initializer, seed ) {};
39+
: CDistributedTraining( archive, cudaDevs, initializer, seed ) {}
40+
3941
void Run( const py::object& data );
4042
void RunAndBackward( const py::object& data );
4143
void Learn( const py::object& data );
@@ -46,4 +48,4 @@ class CPyDistributedTraining : public CDistributedTraining {
4648
void Save( const std::string& path );
4749
};
4850

49-
void InitializeDistributedTraining(py::module& m);
51+
void InitializeDistributedTraining( py::module& m );

NeoML/include/NeoML/Dnn/DnnDistributed.h

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ class NEOML_API CDistributedTraining {
6363
virtual ~CDistributedTraining();
6464

6565
// Gets the number of models in distributed training
66-
int GetModelCount() const { return cnns.Size(); }
66+
int GetModelCount() const { return threadPool->Size(); }
6767
// Sets the solver for all of the models
6868
void SetSolver( CArchive& archive );
6969
// Sets the learning rate for all of the models
@@ -101,27 +101,26 @@ class NEOML_API CDistributedTraining {
101101

102102
private:
103103
struct CThreadParams;
104+
// Types of passes a neural network run may perform
105+
enum class TRunType { Invalid, RunOnce, RunBackwardOnce, Train };
104106

105-
// Either multi-threads on a CPU or multi-devices GPU
106-
const bool isCpu;
107107
// If running multi-threaded on a CPU, this is the pool that manages the worker threads
108-
IThreadPool* const threadPool;
108+
CPtrOwner<IThreadPool> threadPool;
109+
// Params to transfer to all threads function
110+
CPtrOwner<CThreadParams> threadParams;
109111
// Separate mathEngine for each thread or device both for CPU and GPU training
110112
// Cannot use CPointerArray, as CreateDistributedCpuMathEngines requires a raw array to initialize engines
111113
CArray<IMathEngine*> mathEngines;
112114
// Separate random generator for each dnn in a thread
113115
CPointerArray<CRandom> rands;
114116
// Separate dnn for each thread
115117
CPointerArray<CDnn> cnns;
116-
// Separate `batchSize` for each dnn (may be empty) in a thread
117-
CArray<int> batchSize;
118-
// `Train()` cannot be called if it `isFirstRun`
119-
// `batchSize` may not be equal 0, if it `isFirstRun` for `RunOnce`, `RunAndBackwardOnce` or `RunAndLearnOnce`.
120-
bool isFirstRun = true;
121-
// Containers for errors if it happened
122-
CArray<CString> errorMessages;
123-
124-
void initialize( CArchive& archive, int count, TDistributedInitializer initializer, int seed );
118+
119+
void initialize( CArchive& archive, int count,
120+
TDistributedInitializer initializer, int seed, size_t memoryLimit, const int* cudaDevs = nullptr );
121+
void serializeDnn( CDnn& dnn, int count,
122+
TDistributedInitializer initializer, int seed, size_t memoryLimit, const int* cudaDevs = nullptr );
123+
void run( IDistributedDataset*, TRunType );
125124

126125
friend class CLoraSerializer;
127126
};

0 commit comments

Comments
 (0)