1
+ import hashlib
1
2
import os
2
3
from uuid import uuid4
3
4
4
5
import requests
6
+ from botocore .exceptions import ClientError
5
7
from common .repository import IRepository
6
8
from common .s3_service import S3Service
7
9
from structlog import get_logger
@@ -21,6 +23,11 @@ def update_filename_extension(filename, type):
21
23
return f"{ filename } { extension } "
22
24
23
25
26
def get_file_checksum(data):
    """Return the hexadecimal MD5 digest of ``data``.

    ``data`` is handed straight to ``hashlib.md5``, so it has to be a
    bytes-like object.
    """
    md5_hash = hashlib.md5(data)
    return md5_hash.hexdigest()
29
+
30
+
24
31
class Scoap3Repository (IRepository ):
25
32
def __init__ (self ):
26
33
super ().__init__ ()
@@ -30,6 +37,31 @@ def __init__(self):
30
37
self .s3 = S3Service (self .bucket )
31
38
self .client = self .s3 .meta .client
32
39
40
def file_exists_with_same_checksum(self, bucket, key, data=None):
    """Tell whether ``bucket``/``key`` already stores content matching ``data``.

    The MD5 hex digest of ``data`` is compared with the ``ETag`` reported by
    ``head_object`` for the destination key (surrounding double quotes
    stripped).

    Returns ``True`` only when both values are equal. Returns ``False`` when
    ``data`` is empty or ``None``, when ``head_object`` fails with error code
    ``"404"``, or when any other exception occurs (that exception is logged
    and swallowed, so this check never raises).

    NOTE(review): this treats the ETag as a plain MD5 digest; S3 does not
    guarantee that for every object (e.g. multipart uploads) -- presumably
    the comparison then just fails and the caller uploads again. Confirm.
    """
    try:
        # Nothing to compare against: treat as "not the same file".
        if not data:
            return False

        expected_digest = get_file_checksum(data)

        try:
            head = self.client.head_object(Bucket=bucket, Key=key)
        except ClientError as client_err:
            if client_err.response["Error"]["Code"] == "404":
                # Destination object does not exist yet.
                return False
            # Any other client error is handled by the outer handler below.
            raise
        else:
            stored_etag = head.get("ETag", "").strip('"')
            return expected_digest == stored_etag
    except Exception as exc:
        # Best-effort check: log the failure and report "no match".
        logger.error("Error checking file existence", error=str(exc))
        return False
64
+
33
65
def copy_file (self , source_bucket , source_key , prefix = None , type = None ):
34
66
if not self .upload_enabled :
35
67
return ""
@@ -129,9 +161,18 @@ def download_and_upload_to_s3(self, url, prefix=None, headers=None, type=None):
129
161
logger .error ("Failed to download file" , error = str (e ), url = url )
130
162
return
131
163
164
+ if self .file_exists_with_same_checksum (
165
+ self .bucket , destination_key , data = response .content
166
+ ):
167
+ logger .info (
168
+ "File already exists with the same checksum, skipping upload" ,
169
+ url = url ,
170
+ destination = f"{ self .bucket } /{ destination_key } " ,
171
+ )
172
+ return f"{ self .bucket } /{ destination_key } "
173
+
132
174
try :
133
- # Upload the file to S3
134
- self .client .put_object (
175
+ response = self .client .put_object (
135
176
Body = response .content ,
136
177
Bucket = self .bucket ,
137
178
Key = destination_key ,
0 commit comments