Skip to content

Commit 3aaa413

Browse files
committed
Revert "fix: LEAP-1404: Support multiple import storages per provider (HumanSignal#6216)"
This reverts commit 52507c2.
1 parent 9ff95b0 commit 3aaa413

File tree

10 files changed

+41
-153
lines changed

10 files changed

+41
-153
lines changed

label_studio/io_storages/azure_blob/models.py

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,6 @@
44
import logging
55
import re
66
from datetime import datetime, timedelta
7-
from typing import Union
87
from urllib.parse import urlparse
98

109
from azure.core.exceptions import ResourceNotFoundError
@@ -23,7 +22,6 @@
2322
ImportStorageLink,
2423
ProjectStorageMixin,
2524
)
26-
from io_storages.utils import storage_can_resolve_bucket_url
2725
from tasks.models import Annotation
2826

2927
from label_studio.io_storages.azure_blob.utils import AZURE
@@ -158,9 +156,6 @@ def generate_http_url(self, url):
158156
'https://' + self.get_account_name() + '.blob.core.windows.net/' + container + '/' + blob + '?' + sas_token
159157
)
160158

161-
def can_resolve_url(self, url: Union[str, None]) -> bool:
162-
return storage_can_resolve_bucket_url(self, url)
163-
164159
def get_blob_metadata(self, key):
165160
return AZURE.get_blob_metadata(
166161
key, self.container, account_name=self.account_name, account_key=self.account_key

label_studio/io_storages/base_models.py

Lines changed: 5 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
import logging
66
import traceback as tb
77
from datetime import datetime
8-
from typing import Union
98
from urllib.parse import urljoin
109

1110
import django_rq
@@ -232,13 +231,9 @@ def get_data(self, key):
232231
def generate_http_url(self, url):
233232
raise NotImplementedError
234233

235-
def can_resolve_url(self, url: Union[str, None]) -> bool:
236-
return self.can_resolve_scheme(url)
237-
238-
def can_resolve_scheme(self, url: Union[str, None]) -> bool:
239-
if not url:
240-
return False
241-
# TODO: Search for occurrences inside string, e.g. for cases like "gs://bucket/file.pdf" or "<embed src='gs://bucket/file.pdf'/>"
234+
def can_resolve_url(self, url):
235+
# TODO: later check to the full prefix like "url.startswith(self.path_full)"
236+
# Search of occurrences inside string, e.g. for cases like "gs://bucket/file.pdf" or "<embed src='gs://bucket/file.pdf'/>"
242237
_, prefix = get_uri_via_regex(url, prefixes=(self.url_scheme,))
243238
if prefix == self.url_scheme:
244239
return True
@@ -266,8 +261,8 @@ def resolve_uri(self, uri, task=None):
266261
elif isinstance(uri, str):
267262
try:
268263
# extract uri first from task data
269-
extracted_uri, _ = get_uri_via_regex(uri, prefixes=(self.url_scheme,))
270-
if not self.can_resolve_url(extracted_uri):
264+
extracted_uri, extracted_storage = get_uri_via_regex(uri, prefixes=(self.url_scheme,))
265+
if not extracted_storage:
271266
logger.debug(f'No storage info found for URI={uri}')
272267
return
273268

label_studio/io_storages/gcs/models.py

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22
"""
33
import json
44
import logging
5-
from typing import Union
65

76
from core.redis import start_job_async_or_sync
87
from django.conf import settings
@@ -18,7 +17,6 @@
1817
ProjectStorageMixin,
1918
)
2019
from io_storages.gcs.utils import GCS
21-
from io_storages.utils import storage_can_resolve_bucket_url
2220
from tasks.models import Annotation
2321

2422
logger = logging.getLogger(__name__)
@@ -95,9 +93,6 @@ def generate_http_url(self, url):
9593
presign_ttl=self.presign_ttl,
9694
)
9795

98-
def can_resolve_url(self, url: Union[str, None]) -> bool:
99-
return storage_can_resolve_bucket_url(self, url)
100-
10196
def scan_and_create_links(self):
10297
return self._scan_and_create_links(GCSImportStorageLink)
10398

label_studio/io_storages/s3/models.py

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33
import json
44
import logging
55
import re
6-
from typing import Union
76

87
import boto3
98
from core.feature_flags import flag_set
@@ -21,7 +20,6 @@
2120
ProjectStorageMixin,
2221
)
2322
from io_storages.s3.utils import get_client_and_resource, resolve_s3_url
24-
from io_storages.utils import storage_can_resolve_bucket_url
2523
from tasks.models import Annotation
2624
from tasks.validation import ValidationError as TaskValidationError
2725

@@ -177,9 +175,6 @@ def get_data(self, key):
177175
def generate_http_url(self, url):
178176
return resolve_s3_url(url, self.get_client(), self.presign, expires_in=self.presign_ttl * 60)
179177

180-
def can_resolve_url(self, url: Union[str, None]) -> bool:
181-
return storage_can_resolve_bucket_url(self, url)
182-
183178
def get_blob_metadata(self, key):
184179
return AWS.get_blob_metadata(
185180
key,

label_studio/io_storages/utils.py

Lines changed: 1 addition & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -2,23 +2,14 @@
22
"""
33
import logging
44
import re
5-
from dataclasses import dataclass
6-
from typing import Union
75

86
logger = logging.getLogger(__name__)
97

108
# Put storage prefixes here
119
uri_regex = r"([\"'])(?P<uri>(?P<storage>{})://[^\1=]*)\1"
1210

1311

14-
@dataclass
15-
class BucketURI:
16-
bucket: str
17-
path: str
18-
scheme: str
19-
20-
21-
def get_uri_via_regex(data, prefixes=('s3', 'gs')) -> tuple[Union[str, None], Union[str, None]]:
12+
def get_uri_via_regex(data, prefixes=('s3', 'gs')):
2213
data = str(data).strip()
2314
middle_check = False
2415

@@ -47,38 +38,3 @@ def get_uri_via_regex(data, prefixes=('s3', 'gs')) -> tuple[Union[str, None], Un
4738
logger.warning("Can't parse task.data to match URI. Reason: Match is not found.")
4839
return None, None
4940
return r_match.group('uri'), r_match.group('storage')
50-
51-
52-
def parse_bucket_uri(value: object, storage) -> Union[BucketURI, None]:
53-
if not value:
54-
return None
55-
56-
uri, _ = get_uri_via_regex(value, prefixes=(storage.url_scheme,))
57-
if not uri:
58-
return None
59-
60-
try:
61-
scheme, rest = uri.split('://', 1)
62-
bucket, path = rest.split('/', 1)
63-
except ValueError:
64-
return None
65-
66-
return BucketURI(bucket=bucket, path=path, scheme=scheme)
67-
68-
69-
def storage_can_resolve_bucket_url(storage, url) -> bool:
70-
if not storage.can_resolve_scheme(url):
71-
return False
72-
73-
uri = parse_bucket_uri(url, storage)
74-
if not uri:
75-
return False
76-
77-
storage_bucket: str | None = getattr(storage, 'bucket', None) or getattr(storage, 'container', None)
78-
if storage_bucket != uri.bucket:
79-
return False
80-
81-
if storage.prefix and not uri.path.startswith(storage.prefix):
82-
return False
83-
84-
return True

label_studio/tests/conftest.py

Lines changed: 2 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -145,9 +145,7 @@ def s3_with_hypertext_s3_links(s3):
145145
s3.put_object(
146146
Bucket=bucket_name,
147147
Key='test.json',
148-
Body=json.dumps(
149-
{'text': '<a href="s3://pytest-s3-jsons-hypertext/file with /spaces and\' / \' / quotes.jpg"/>'}
150-
),
148+
Body=json.dumps({'text': '<a href="s3://hypertext-bucket/file with /spaces and\' / \' / quotes.jpg"/>'}),
151149
)
152150
yield s3
153151

@@ -159,11 +157,7 @@ def s3_with_partially_encoded_s3_links(s3):
159157
s3.put_object(
160158
Bucket=bucket_name,
161159
Key='test.json',
162-
Body=json.dumps(
163-
{
164-
'text': '<a href="s3://pytest-s3-json-partially-encoded/file with /spaces and\' / \' / %2Bquotes%3D.jpg"/>'
165-
}
166-
),
160+
Body=json.dumps({'text': '<a href="s3://hypertext-bucket/file with /spaces and\' / \' / %2Bquotes%3D.jpg"/>'}),
167161
)
168162
yield s3
169163

label_studio/tests/io_storages.tavern.yml

Lines changed: 13 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -249,36 +249,6 @@ stages:
249249
response:
250250
status_code: 400
251251

252-
- id: import_task_invalid_url
253-
name: Import task from wrong bucket
254-
request:
255-
url: "{django_live_url}/api/projects/{project_pk}/tasks"
256-
json:
257-
data:
258-
image_url: "s3:/"
259-
method: POST
260-
headers:
261-
content-type: application/json
262-
response:
263-
status_code: 201
264-
save:
265-
json:
266-
task_pk: id
267-
268-
# check, that image_url is not resolved and api is not broken
269-
- id: get_task
270-
name: Get task and check, that image_url is not resolved
271-
request:
272-
url: "{django_live_url}/api/tasks/{task_pk}"
273-
method: GET
274-
headers:
275-
content-type: application/json
276-
response:
277-
status_code: 200
278-
json:
279-
data:
280-
image_url: "s3:/"
281-
282252
---
283253
test_name: test_import_from_s3_storage_recursive_scan
284254
strict: false
@@ -605,19 +575,16 @@ stages:
605575
content-type: application/json
606576
json:
607577
data:
608-
image: gs://test-gs-bucket_JSON/manual.link.jpg
578+
image: gs://whatever-bucket-with/manual.link.jpg
609579
dict:
610-
key1: gs://test-gs-bucket_JSON/manual.link.jpg
580+
key1: gs://whatever-bucket-with/manual.link.jpg
611581
array:
612-
- gs://test-gs-bucket_JSON/manual.link.jpg
613-
- gs://test-gs-bucket_JSON/manual.link.jpg
582+
- gs://whatever-bucket-with/manual.link.jpg
583+
- gs://whatever-bucket-with/manual.link.jpg
614584
array:
615-
- item1: gs://test-gs-bucket_JSON/manual.link.jpg
585+
- item1: gs://whatever-bucket-with/manual.link.jpg
616586
some: 'some text'
617-
- item2: gs://test-gs-bucket_JSON/manual.link.jpg
618-
some: 'some text'
619-
# This link should not be resolved - no matching bucket
620-
- item3: gs://bad-bucket/manual.link.jpg
587+
- item2: gs://whatever-bucket-with/manual.link.jpg
621588
some: 'some text'
622589
method: POST
623590
url: '{django_live_url}/api/projects/{project_pk}/import'
@@ -650,19 +617,16 @@ stages:
650617
response:
651618
json:
652619
data:
653-
image: !re_match "https://storage.googleapis.com/test-gs-bucket_JSON/manual.link.+"
620+
image: !re_match "https://storage.googleapis.com/whatever-bucket-with/manual.link.+"
654621
dict:
655-
key1: !re_match "https://storage.googleapis.com/test-gs-bucket_JSON/manual.link.+"
622+
key1: !re_match "https://storage.googleapis.com/whatever-bucket-with/manual.link.+"
656623
array:
657-
- !re_match "https://storage.googleapis.com/test-gs-bucket_JSON/manual.link.+"
658-
- !re_match "https://storage.googleapis.com/test-gs-bucket_JSON/manual.link.+"
624+
- !re_match "https://storage.googleapis.com/whatever-bucket-with/manual.link.+"
625+
- !re_match "https://storage.googleapis.com/whatever-bucket-with/manual.link.+"
659626
array:
660-
- item1: !re_match "https://storage.googleapis.com/test-gs-bucket_JSON/manual.link.+"
661-
some: 'some text'
662-
- item2: !re_match "https://storage.googleapis.com/test-gs-bucket_JSON/manual.link.+"
627+
- item1: !re_match "https://storage.googleapis.com/whatever-bucket-with/manual.link.+"
663628
some: 'some text'
664-
# This link should remain unresolved - no matching bucket
665-
- item3: !re_match "gs://bad-bucket/manual.link.+"
629+
- item2: !re_match "https://storage.googleapis.com/whatever-bucket-with/manual.link.+"
666630
some: 'some text'
667631
status_code: 200
668632

@@ -1605,7 +1569,7 @@ stages:
16051569
response:
16061570
json:
16071571
data:
1608-
text: !re_match "<a href=\"https://pytest-s3-jsons-hypertext.s3.amazonaws.com/file%20with%20/spaces%20and%27%20/%20%27%20/%20quotes.jpg.+X-Amz-Security-Token=testing"
1572+
text: !re_match "<a href=\"https://hypertext-bucket.s3.amazonaws.com/file%20with%20/spaces%20and%27%20/%20%27%20/%20quotes.jpg.+X-Amz-Security-Token=testing"
16091573
status_code: 200
16101574

16111575
---

label_studio/tests/io_storages_presign_endpoints.tavern.yml

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -101,11 +101,11 @@ stages:
101101

102102
- name: get_presigned_url
103103
request:
104-
url: "{django_live_url}/projects/{project_pk}/presign?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldC9tYW51YWwubGluay5qcGc="
104+
url: "{django_live_url}/projects/{project_pk}/presign?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
105105
response:
106106
status_code: 303
107107
headers:
108-
location: "https://storage.googleapis.com/test-gs-bucket/manual.link.jpg"
108+
location: "https://storage.googleapis.com/whatever-bucket-with/manual.link.jpg"
109109

110110

111111
---
@@ -139,8 +139,8 @@ stages:
139139

140140
- name: get_presigned_url
141141
request:
142-
url: "{django_live_url}/tasks/{task_pk}/presign?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldC9tYW51YWwubGluay5qcGc="
142+
url: "{django_live_url}/tasks/{task_pk}/presign?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
143143
response:
144144
status_code: 303
145145
headers:
146-
location: "https://storage.googleapis.com/test-gs-bucket/manual.link.jpg"
146+
location: "https://storage.googleapis.com/whatever-bucket-with/manual.link.jpg"

label_studio/tests/io_storages_presign_proxy.tavern.yml

Lines changed: 14 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -431,19 +431,16 @@ stages:
431431
content-type: application/json
432432
json:
433433
data:
434-
image: gs://test-gs-bucket_JSON/manual.link.jpg
434+
image: gs://whatever-bucket-with/manual.link.jpg
435435
dict:
436-
key1: gs://test-gs-bucket_JSON/manual.link.jpg
436+
key1: gs://whatever-bucket-with/manual.link.jpg
437437
array:
438-
- gs://test-gs-bucket_JSON/manual.link.jpg
439-
- gs://test-gs-bucket_JSON/manual.link.jpg
438+
- gs://whatever-bucket-with/manual.link.jpg
439+
- gs://whatever-bucket-with/manual.link.jpg
440440
array:
441-
- item1: gs://test-gs-bucket_JSON/manual.link.jpg
441+
- item1: gs://whatever-bucket-with/manual.link.jpg
442442
some: "some text"
443-
- item2: gs://test-gs-bucket_JSON/manual.link.jpg
444-
some: "some text"
445-
# This link should not be resolved - no matching bucket
446-
- item3: gs://bad-bucket/manual.link.jpg
443+
- item2: gs://whatever-bucket-with/manual.link.jpg
447444
some: "some text"
448445
method: POST
449446
url: "{django_live_url}/api/projects/{project_pk}/import"
@@ -476,19 +473,16 @@ stages:
476473
response:
477474
json:
478475
data:
479-
image: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldF9KU09OL21hbnVhbC5saW5rLmpwZw=="
476+
image: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
480477
dict:
481-
key1: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldF9KU09OL21hbnVhbC5saW5rLmpwZw=="
478+
key1: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
482479
array:
483-
- !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldF9KU09OL21hbnVhbC5saW5rLmpwZw=="
484-
- !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldF9KU09OL21hbnVhbC5saW5rLmpwZw=="
480+
- !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
481+
- !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
485482
array:
486-
- item1: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldF9KU09OL21hbnVhbC5saW5rLmpwZw=="
487-
some: "some text"
488-
- item2: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly90ZXN0LWdzLWJ1Y2tldF9KU09OL21hbnVhbC5saW5rLmpwZw=="
483+
- item1: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
489484
some: "some text"
490-
# This link should remain unresolved - no matching bucket
491-
- item3: !re_match "gs://bad-bucket/manual.link.+"
485+
- item2: !re_match "/tasks/\\d+/presign/\\?fileuri=Z3M6Ly93aGF0ZXZlci1idWNrZXQtd2l0aC9tYW51YWwubGluay5qcGc="
492486
some: "some text"
493487
status_code: 200
494488
---
@@ -1158,7 +1152,7 @@ stages:
11581152
response:
11591153
json:
11601154
data:
1161-
text: !re_match "<a href=\"/tasks/\\d+/presign/\\?fileuri=czM6Ly9weXRlc3QtczMtanNvbnMtaHlwZXJ0ZXh0L2ZpbGUgd2l0aCAvc3BhY2VzIGFuZCcgLyAnIC8gcXVvdGVzLmpwZw=="
1155+
text: !re_match "<a href=\"/tasks/\\d+/presign/\\?fileuri=czM6Ly9oeXBlcnRleHQtYnVja2V0L2ZpbGUgd2l0aCAvc3BhY2VzIGFuZCcgLyAnIC8gcXVvdGVzLmpwZw=="
11621156
status_code: 200
11631157
---
11641158
# - Check that json blobs containing partially encoded contents resolve correctly from the bucket,
@@ -1222,7 +1216,7 @@ stages:
12221216
response:
12231217
json:
12241218
data:
1225-
text: !re_match "<a href=\"/tasks/\\d+/presign/\\?fileuri=czM6Ly9weXRlc3QtczMtanNvbi1wYXJ0aWFsbHktZW5jb2RlZC9maWxlIHdpdGggL3NwYWNlcyBhbmQnIC8gJyAvICUyQnF1b3RlcyUzRC5qcGc="
1219+
text: !re_match "<a href=\"/tasks/\\d+/presign/\\?fileuri=czM6Ly9oeXBlcnRleHQtYnVja2V0L2ZpbGUgd2l0aCAvc3BhY2VzIGFuZCcgLyAnIC8gJTJCcXVvdGVzJTNELmpwZw=="
12261220
status_code: 200
12271221
---
12281222
# we don't fail when unexisted s3:// links occur in the list

0 commit comments

Comments
 (0)