+
+ from .core import CreateModel, ApplyModel
+
## Information
- __version__ = """0.0.4"""
+ __version__ = """0.0.5"""
__info__ = """
- Author : Wooil Jeong

- Github : https://github.com/WooilJeong/
- Blog : https://wooiljeong.github.io
"""
- __all__ = ["__version__", "__info__"]
-
- import os
- import glob
- import numpy as np
-
- from pathlib import Path
- from collections import Counter
-
- import tensorflow as tf
- from tensorflow import keras
- from tensorflow.keras import layers
-
- class CTCLayer(layers.Layer):
-     def __init__(self, name=None):
-         super().__init__(name=name)
-         self.loss_fn = keras.backend.ctc_batch_cost
-
-     def call(self, y_true, y_pred):
-         # Compute the training-time loss value and add it
-         # to the layer using `self.add_loss()`.
-         batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
-         input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
-         label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")
-
-         input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
-         label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
-
-         loss = self.loss_fn(y_true, y_pred, input_length, label_length)
-         self.add_loss(loss)
-
-         # At test time, just return the computed predictions
-         return y_pred
-
-
-
- class CreateModel:
-
-     def __init__(self, train_img_path, img_width=200, img_height=50):
-         # Image size
-         self.img_width = img_width
-         self.img_height = img_height
-         # List of training image file paths
-         self.images = sorted(train_img_path)
-         # Labels parsed from the training image file names
-         self.labels = [img.split(os.path.sep)[-1].split(".png")[0] for img in self.images]
-         # Set of characters appearing in the labels
-         self.characters = set(char for label in self.labels for char in label)
-         # Maximum label length
-         self.max_length = max([len(label) for label in self.labels])
-
-         # Mapping characters to integers
-         self.char_to_num = layers.experimental.preprocessing.StringLookup(
-             vocabulary=sorted(self.characters), num_oov_indices=0, mask_token=None
-         )
-         # Mapping integers back to original characters
-         self.num_to_char = layers.experimental.preprocessing.StringLookup(
-             vocabulary=self.char_to_num.get_vocabulary(), mask_token=None, invert=True
-         )
-
-     def train_model(self, epochs=100):
-         # Batch size for training and validation
-         batch_size = 16
-         # Downsampling factor (Conv: 2, Pooling: 2)
-         downsample_factor = 4
-
-         # Splitting data into training and validation sets
-         x_train, x_valid, y_train, y_valid = self.split_data(np.array(self.images), np.array(self.labels))
-
-         train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
-         train_dataset = (
-             train_dataset.map(
-                 self.encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
-             )
-             .batch(batch_size)
-             .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
-         )
-
-         validation_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
-         validation_dataset = (
-             validation_dataset.map(
-                 self.encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE
-             )
-             .batch(batch_size)
-             .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
-         )
-
-         # Get the model
-         model = self.build_model()
-
-
-         early_stopping_patience = 10
-         # Add early stopping
-         early_stopping = keras.callbacks.EarlyStopping(
-             monitor="val_loss", patience=early_stopping_patience, restore_best_weights=True
-         )
-
-         # Train the model
-         history = model.fit(
-             train_dataset,
-             validation_data=validation_dataset,
-             epochs=epochs,
-             callbacks=[early_stopping],
-         )
-
-         return model
-
-
-     def encode_single_sample(self, img_path, label):
-         # 1. Read image
-         img = tf.io.read_file(img_path)
-         # 2. Decode and convert to grayscale
-         img = tf.io.decode_png(img, channels=1)
-         # 3. Convert to float32 in [0, 1] range
-         img = tf.image.convert_image_dtype(img, tf.float32)
-         # 4. Resize to the desired size
-         img = tf.image.resize(img, [self.img_height, self.img_width])
-         # 5. Transpose the image because we want the time
-         # dimension to correspond to the width of the image.
-         img = tf.transpose(img, perm=[1, 0, 2])
-         # 6. Map the characters in label to numbers
-         label = self.char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
-         # 7. Return a dict as our model is expecting two inputs
-         return {"image": img, "label": label}
-
-     def build_model(self):
-         # Inputs to the model
-         input_img = layers.Input(
-             shape=(self.img_width, self.img_height, 1), name="image", dtype="float32"
-         )
-         labels = layers.Input(name="label", shape=(None,), dtype="float32")
-
-         # First conv block
-         x = layers.Conv2D(
-             32,
-             (3, 3),
-             activation="relu",
-             kernel_initializer="he_normal",
-             padding="same",
-             name="Conv1",
-         )(input_img)
-         x = layers.MaxPooling2D((2, 2), name="pool1")(x)
-
-         # Second conv block
-         x = layers.Conv2D(
-             64,
-             (3, 3),
-             activation="relu",
-             kernel_initializer="he_normal",
-             padding="same",
-             name="Conv2",
-         )(x)
-         x = layers.MaxPooling2D((2, 2), name="pool2")(x)
-
-         # We have used two max pool with pool size and strides 2.
-         # Hence, downsampled feature maps are 4x smaller. The number of
-         # filters in the last layer is 64. Reshape accordingly before
-         # passing the output to the RNN part of the model
-         new_shape = ((self.img_width // 4), (self.img_height // 4) * 64)
-         x = layers.Reshape(target_shape=new_shape, name="reshape")(x)
-         x = layers.Dense(64, activation="relu", name="dense1")(x)
-         x = layers.Dropout(0.2)(x)
-
-         # RNNs
-         x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x)
-         x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x)
-
-         # Output layer
-         x = layers.Dense(len(self.characters) + 1, activation="softmax", name="dense2")(x)
-
-         # Add CTC layer for calculating CTC loss at each step
-         output = CTCLayer(name="ctc_loss")(labels, x)
-
-         # Define the model
-         model = keras.models.Model(
-             inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
-         )
-         # Optimizer
-         opt = keras.optimizers.Adam()
-         # Compile the model and return
-         model.compile(optimizer=opt)
-         return model
-
-     def split_data(self, images, labels, train_size=0.9, shuffle=True):
-         # 1. Get the total size of the dataset
-         size = len(images)
-         # 2. Make an indices array and shuffle it, if required
-         indices = np.arange(size)
-         if shuffle:
-             np.random.shuffle(indices)
-         # 3. Get the size of training samples
-         train_samples = int(size * train_size)
-         # 4. Split data into training and validation sets
-         x_train, y_train = images[indices[:train_samples]], labels[indices[:train_samples]]
-         x_valid, y_valid = images[indices[train_samples:]], labels[indices[train_samples:]]
-         return x_train, x_valid, y_train, y_valid
-
-
- class ApplyModel:
-
-     def __init__(self,
-                  weights_path,
-                  img_width=200,
-                  img_height=50,
-                  max_length=6,
-                  characters={'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}):
-
-         self.img_width = img_width
-         self.img_height = img_height
-         self.max_length = max_length
-         self.characters = characters
-
-         # Mapping characters to integers
-         self.char_to_num = layers.experimental.preprocessing.StringLookup(
-             vocabulary=sorted(self.characters), num_oov_indices=0, mask_token=None
-         )
-         # Mapping integers back to original characters
-         self.num_to_char = layers.experimental.preprocessing.StringLookup(
-             vocabulary=self.char_to_num.get_vocabulary(), mask_token=None, invert=True
-         )
-         # Model
-         self.model = self.build_model()
-         self.model.load_weights(weights_path)
-         self.prediction_model = keras.models.Model(
-             self.model.get_layer(name="image").input, self.model.get_layer(name="dense2").output
-         )
-
-     def predict(self, target_img_path):
-         target_img = self.encode_single_sample(target_img_path)['image']
-         target_img = tf.reshape(target_img, shape=[1, self.img_width, self.img_height, 1])
-         pred_val = self.prediction_model.predict(target_img)
-         pred = self.decode_batch_predictions(pred_val)[0]
-         return pred
-
-     def encode_single_sample(self, img_path):
-         # 1. Read image
-         img = tf.io.read_file(img_path)
-         # 2. Decode and convert to grayscale
-         img = tf.io.decode_png(img, channels=1)
-         # 3. Convert to float32 in [0, 1] range
-         img = tf.image.convert_image_dtype(img, tf.float32)
-         # 4. Resize to the desired size
-         img = tf.image.resize(img, [self.img_height, self.img_width])
-         # 5. Transpose the image because we want the time
-         # dimension to correspond to the width of the image.
-         img = tf.transpose(img, perm=[1, 0, 2])
-         # 6. Return a dict; unlike the training variant, there is no label here
-         return {"image": img}
-
-     def build_model(self):
-         # Inputs to the model
-         input_img = layers.Input(
-             shape=(self.img_width, self.img_height, 1), name="image", dtype="float32"
-         )
-         labels = layers.Input(name="label", shape=(None,), dtype="float32")
-
-         # First conv block
-         x = layers.Conv2D(
-             32,
-             (3, 3),
-             activation="relu",
-             kernel_initializer="he_normal",
-             padding="same",
-             name="Conv1",
-         )(input_img)
-         x = layers.MaxPooling2D((2, 2), name="pool1")(x)
-
-         # Second conv block
-         x = layers.Conv2D(
-             64,
-             (3, 3),
-             activation="relu",
-             kernel_initializer="he_normal",
-             padding="same",
-             name="Conv2",
-         )(x)
-         x = layers.MaxPooling2D((2, 2), name="pool2")(x)
-
-         # We have used two max pool with pool size and strides 2.
-         # Hence, downsampled feature maps are 4x smaller. The number of
-         # filters in the last layer is 64. Reshape accordingly before
-         # passing the output to the RNN part of the model
-         new_shape = ((self.img_width // 4), (self.img_height // 4) * 64)
-         x = layers.Reshape(target_shape=new_shape, name="reshape")(x)
-         x = layers.Dense(64, activation="relu", name="dense1")(x)
-         x = layers.Dropout(0.2)(x)
-
-         # RNNs
-         x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x)
-         x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x)
-
-         # Output layer
-         x = layers.Dense(len(self.characters) + 1, activation="softmax", name="dense2")(x)
-
-         # Add CTC layer for calculating CTC loss at each step
-         output = CTCLayer(name="ctc_loss")(labels, x)
-
-         # Define the model
-         model = keras.models.Model(
-             inputs=[input_img, labels], outputs=output, name="ocr_model_v1"
-         )
-         # Optimizer
-         opt = keras.optimizers.Adam()
-         # Compile the model and return
-         model.compile(optimizer=opt)
-         return model
-
-     def split_data(self, images, labels, train_size=0.9, shuffle=True):
-         # 1. Get the total size of the dataset
-         size = len(images)
-         # 2. Make an indices array and shuffle it, if required
-         indices = np.arange(size)
-         if shuffle:
-             np.random.shuffle(indices)
-         # 3. Get the size of training samples
-         train_samples = int(size * train_size)
-         # 4. Split data into training and validation sets
-         x_train, y_train = images[indices[:train_samples]], labels[indices[:train_samples]]
-         x_valid, y_valid = images[indices[train_samples:]], labels[indices[train_samples:]]
-         return x_train, x_valid, y_train, y_valid
-
-
-     # A utility function to decode the output of the network
-     def decode_batch_predictions(self, pred):
-         input_len = np.ones(pred.shape[0]) * pred.shape[1]
-         # Use greedy search. For complex tasks, you can use beam search
-         results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][
-             :, :self.max_length
-         ]
-         # Iterate over the results and get back the text
-         output_text = []
-         for res in results:
-             res = tf.strings.reduce_join(self.num_to_char(res + 1)).numpy().decode("utf-8")
-             output_text.append(res)
-         return output_text
+ __all__ = ["__version__", "__info__", "CreateModel", "ApplyModel"]
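
A minimal usage sketch of the API re-exported by this commit follows. The package import name "CaptchaCracker", the image paths, and the weights file name are assumptions for illustration; only the CreateModel/ApplyModel constructor parameters and the train_model/predict methods come from the code above.

# Usage sketch (assumptions noted above; not the package's documented API)
import glob
from CaptchaCracker import CreateModel, ApplyModel

# Train: labels are parsed from file names, e.g. "023062.png" -> "023062"
train_img_paths = glob.glob("./train_imgs/*.png")
cm = CreateModel(train_img_paths, img_width=200, img_height=50)
model = cm.train_model(epochs=100)
model.save_weights("weights.h5")  # standard Keras weight saving

# Predict: ApplyModel rebuilds the network, loads the weights, decodes one image
am = ApplyModel(
    "weights.h5",
    img_width=200,
    img_height=50,
    max_length=6,
    characters={"0", "1", "2", "3", "4", "5", "6", "7", "8", "9"},
)
print(am.predict("./target.png"))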