Skip to content

Commit e138928

Browse files
feat(S3): Conditional s3 upload
Only upload to S3 when file checksum is different. Signed-off-by: [email protected]
1 parent 54d5c7d commit e138928

File tree

7 files changed

+35882
-8
lines changed

7 files changed

+35882
-8
lines changed

dags/common/scoap3_s3.py

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
import hashlib
12
import os
23
from uuid import uuid4
34

45
import requests
6+
from botocore.exceptions import ClientError
57
from common.repository import IRepository
68
from common.s3_service import S3Service
79
from structlog import get_logger
@@ -21,6 +23,11 @@ def update_filename_extension(filename, type):
2123
return f"{filename}{extension}"
2224

2325

26+
def get_file_checksum(data):
    """Return the hexadecimal MD5 digest of ``data``.

    The digest is used only to detect content changes, never for security
    purposes.

    Args:
        data: Bytes-like object holding the file content.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.

    Raises:
        TypeError: If ``data`` is not bytes-like (for example a ``str``).
    """
    try:
        # Flag the digest as non-cryptographic so FIPS-restricted Python
        # builds, which otherwise refuse MD5, still allow it (Python 3.9+).
        digest = hashlib.md5(data, usedforsecurity=False)
    except TypeError:
        # Older interpreters reject the keyword. Non-bytes input also lands
        # here and raises the same TypeError again on the call below.
        digest = hashlib.md5(data)
    return digest.hexdigest()
29+
30+
2431
class Scoap3Repository(IRepository):
2532
def __init__(self):
2633
super().__init__()
@@ -30,6 +37,31 @@ def __init__(self):
3037
self.s3 = S3Service(self.bucket)
3138
self.client = self.s3.meta.client
3239

40+
def file_exists_with_same_checksum(self, bucket, key, data=None):
    """Check whether ``key`` already exists in ``bucket`` with the same content.

    The MD5 digest of ``data`` is compared with the ETag reported by
    ``head_object``. The check is best-effort: any failure is logged and
    reported as "not the same", so the caller falls back to uploading.

    NOTE(review): an S3 ETag equals the MD5 of the content only for objects
    stored in a single part without KMS encryption. Multipart ETags have the
    form ``<hash>-<parts>`` and therefore never match - confirm this is
    acceptable for the objects stored here.

    Args:
        bucket: Name of the destination bucket.
        key: Object key inside ``bucket``.
        data: Bytes about to be uploaded. ``None`` means there is nothing to
            compare; empty bytes are a valid (zero-length) payload.

    Returns:
        True if the object exists and its ETag equals the MD5 of ``data``;
        False otherwise, including when the lookup fails.
    """
    # Only a missing payload short-circuits; b"" must still be compared.
    if data is None:
        return False

    try:
        data_checksum = get_file_checksum(data)
        head = self.client.head_object(Bucket=bucket, Key=key)
        # ETag values are wrapped in double quotes by the service.
        remote_etag = (head.get("ETag") or "").strip('"')
        return data_checksum == remote_etag
    except ClientError as e:
        error_code = e.response.get("Error", {}).get("Code")
        if error_code in ("404", "NoSuchKey", "NotFound"):
            # The object does not exist yet: an expected outcome, not an error.
            return False
        logger.error("Error checking file existence", error=str(e))
        return False
    except Exception as e:
        # Deliberate best-effort: never let the check block the upload.
        logger.error("Error checking file existence", error=str(e))
        return False
64+
3365
def copy_file(self, source_bucket, source_key, prefix=None, type=None):
3466
if not self.upload_enabled:
3567
return ""
@@ -129,9 +161,18 @@ def download_and_upload_to_s3(self, url, prefix=None, headers=None, type=None):
129161
logger.error("Failed to download file", error=str(e), url=url)
130162
return
131163

164+
if self.file_exists_with_same_checksum(
165+
self.bucket, destination_key, data=response.content
166+
):
167+
logger.info(
168+
"File already exists with the same checksum, skipping upload",
169+
url=url,
170+
destination=f"{self.bucket}/{destination_key}",
171+
)
172+
return f"{self.bucket}/{destination_key}"
173+
132174
try:
133-
# Upload the file to S3
134-
self.client.put_object(
175+
response = self.client.put_object(
135176
Body=response.content,
136177
Bucket=self.bucket,
137178
Key=destination_key,
@@ -140,7 +181,10 @@ def download_and_upload_to_s3(self, url, prefix=None, headers=None, type=None):
140181
},
141182
ACL="public-read",
142183
)
143-
return f"{self.bucket}/{destination_key}"
184+
185+
version_id = response.get("VersionId")
186+
s3_url = f"{self.bucket}/{destination_key}"
187+
return {"url": s3_url, "version_id": version_id}
144188
except Exception as e:
145189
logger.error(
146190
"Failed to upload file",

tests/integration/common/cassettes/download_pdf.yaml

Lines changed: 17826 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
interactions:
2+
- request:
3+
body: null
4+
headers:
5+
Accept:
6+
- "*/*"
7+
Accept-Encoding:
8+
- gzip, deflate
9+
Connection:
10+
- keep-alive
11+
User-Agent:
12+
- python-requests/2.31.0
13+
method: HEAD
14+
uri: https://scoap3-prod-backend.s3.cern.ch/media/harvested_files/10.1155/2024/3681297/3681297.pdf
15+
response:
16+
body:
17+
string: ""
18+
headers:
19+
Accept-Ranges:
20+
- bytes
21+
Bucket:
22+
- scoap3-prod-backend
23+
Content-Length:
24+
- "1013660"
25+
Content-Type:
26+
- binary/octet-stream
27+
Date:
28+
- Wed, 19 Mar 2025 14:45:45 GMT
29+
Etag:
30+
- '"e737b90022b130cf83e58f253ad47a5f"'
31+
Last-Modified:
32+
- Tue, 25 Feb 2025 00:15:39 GMT
33+
X-Amz-Meta-Source-Url:
34+
- https://s3.amazonaws.com/downloads.hindawi.com/journals/ahep/2024/3681297.pdf
35+
X-Amz-Request-Id:
36+
- tx00000176051ff8bcdcbfb-0067dad899-3b453ea1-default
37+
X-Rgw-Object-Type:
38+
- Normal
39+
status:
40+
code: 200
41+
message: OK
42+
version: 1

0 commit comments

Comments
 (0)