4
4
import os
5
5
import time
6
6
import threading
7
- from common import common
7
+ from common import common , checkpoint
8
8
9
9
class Upload :
10
10
11
11
def s3_upload_file (s3_client ,bucket ,file_name ,object_name ):
12
+ """It will upload given `file_name` to given s3 `bucket`
13
+ and `object_name` s3 path.
14
+ """
15
+
12
16
try :
13
17
response = s3_client .upload_file (file_name ,bucket ,object_name )
14
18
logging .info (f"upload successful at s3://{ bucket } /{ object_name } " )
@@ -19,6 +23,12 @@ def s3_upload_file(s3_client,bucket,file_name,object_name):
19
23
logging .error (f"{ file_path } upload failed error { e } " )
20
24
21
25
def s3_upload (bucket ,dir ,topic_name ,retry_upload_seconds ,thread_count ):
26
+ """It will initialize s3 client and
27
+ based on checkpoint file for each partition.
28
+ It will call `s3_upload_file` function to upload.
29
+
30
+ It will run after every `retry_upload_seconds`"""
31
+
22
32
s3_client = boto3 .client ('s3' )
23
33
while True :
24
34
_topic_dir = os .path .join (dir , topic_name )
@@ -43,27 +53,9 @@ def s3_upload(bucket,dir,topic_name,retry_upload_seconds,thread_count):
43
53
44
54
class Download :
45
55
46
def s3_read_checkpoint_partition(dir, topic, partition):
    """Read the checkpoint file for one partition.

    The checkpoint file lives at ``<dir>/<topic>/<partition>/checkpoint`` and
    holds a single line: ``"<last_file> <total_files>"``.

    Parameters
    ----------
    dir : str
        Local base directory (NOTE: shadows the builtin ``dir``; kept for
        backward compatibility with existing callers).
    topic : str
    partition : str

    Returns
    -------
    dict | None
        ``{"checkpoint": <last_file>, "total_files": <int>}`` on success,
        or ``None`` when no checkpoint file exists yet.
    """
    try:
        logging.debug(os.path.join(dir, topic, partition, "checkpoint"))
        with open(os.path.join(dir, topic, partition, "checkpoint")) as c:
            line = c.readline().strip()
        # Split once instead of twice; the second field is a count and the
        # caller increments it (`_ck['total_files'] += 1`), so it must be an
        # int — returning the raw string would raise TypeError there.
        _ck_file, _total_files = line.split()[0], int(line.split()[1])
        return {"checkpoint": _ck_file, "total_files": _total_files}
    except FileNotFoundError as e:
        # First run for this partition: no checkpoint yet is a normal case.
        logging.debug(e)
        return None
57
-
58
def s3_write_checkpoint_partition(dir, topic, partition, msg):
    """Overwrite the partition's checkpoint file with *msg*.

    The file is ``<dir>/<topic>/<partition>/checkpoint``. A non-string
    *msg* is logged and swallowed rather than raised, so a bad checkpoint
    value never kills the sync loop.
    """
    checkpoint_path = os.path.join(dir, topic, partition, "checkpoint")
    try:
        with open(checkpoint_path, "w") as handle:
            handle.write(msg)
    except TypeError as err:
        logging.error(err)
        logging.error(f"writing checkpoint {msg} for {topic},{partition} is failed")
65
-
66
56
def s3_count_partitions (s3_client ,bucket ,topic ):
57
+ """It will return number of objects in a given s3 bucket and s3 bucket path."""
58
+
67
59
try :
68
60
return s3_client .list_objects_v2 (
69
61
Bucket = bucket ,
@@ -75,6 +67,8 @@ def s3_count_partitions(s3_client,bucket,topic):
75
67
exit (1 )
76
68
77
69
def s3_list_files (s3_client ,bucket ,path ):
70
+ """It will list all files for given s3 bucket and s3 bucket path."""
71
+
78
72
_list = []
79
73
paginator = s3_client .get_paginator ('list_objects_v2' )
80
74
operation_parameters = {'Bucket' : bucket ,'Prefix' : path }
@@ -88,6 +82,21 @@ def s3_list_files(s3_client,bucket,path):
88
82
return sorted (_list )
89
83
90
84
def s3_download_file (s3_client ,bucket ,object_path ,file_path ):
85
+ """It will download two files .tar.gz and .tar.gz.sha256 .
86
+
87
+ Parameters
88
+ ----------
89
+ s3_client : boto3.client('s3')
90
+
91
+ bucket: str
92
+
93
+ object_path: str
94
+ Description: path in s3 bucket
95
+
96
+ file_path: str
97
+ Description: path from local filesystem
98
+ """
99
+
91
100
try :
92
101
# download .tar.gz
93
102
s3_client .download_file (bucket , object_path , file_path )
@@ -98,6 +107,13 @@ def s3_download_file(s3_client,bucket,object_path,file_path):
98
107
logging .error (f"{ file_path } failed with error { e } " )
99
108
100
109
def s3_download (bucket ,topic ,tmp_dir ,retry_download_seconds = 60 ):
110
+ """It will initialize s3 client and
111
+ based on checkpoint file for each partition.
112
+ It will call `s3_download_file` function to download backup
113
+ and backup sha file.
114
+
115
+ It will run after every `retry_download_seconds`"""
116
+
101
117
s3_client = boto3 .client ('s3' )
102
118
while True :
103
119
_pc = Download .s3_count_partitions (s3_client ,bucket ,topic )
@@ -106,7 +122,8 @@ def s3_download(bucket,topic,tmp_dir,retry_download_seconds=60):
106
122
os .makedirs (os .path .join (tmp_dir ,topic ,str (p )),exist_ok = True )
107
123
108
124
for _pt in range (_pc ):
109
- _ck = Download .s3_read_checkpoint_partition (tmp_dir ,topic ,str (_pt ))
125
+
126
+ _ck = checkpoint .read_checkpoint_partition (tmp_dir ,topic ,str (_pt )
110
127
_partition_path = os .path .join (topic ,str (_pt ))
111
128
_s3_partition_files = Download .s3_list_files (s3_client ,bucket ,_partition_path )
112
129
if _ck is not None :
@@ -134,7 +151,7 @@ def s3_download(bucket,topic,tmp_dir,retry_download_seconds=60):
134
151
Download .s3_download_file (s3_client ,bucket ,file ,os .path .join (tmp_dir ,file ))
135
152
if file .endswith (".tar.gz" ):
136
153
_ck ['total_files' ] += 1
137
- Download . s3_write_checkpoint_partition (tmp_dir ,topic ,str (_pt ),file + " " + str (_ck ['total_files' ]))
154
+ checkpoint . write_checkpoint_partition (tmp_dir ,topic ,str (_pt ),file + " " + str (_ck ['total_files' ]))
138
155
139
156
if _pc == 0 :
140
157
logging .error (f"No Partitions found in given S3 path s3://{ bucket } /{ topic } retry seconds { retry_download_seconds } s" )
0 commit comments