Skip to content

Commit 939a694

Browse files
bmartel makseq
authored and committed
fix: OPTIC-1287: Project config validation should ignore xml comments (HumanSignal#6613)
Co-authored-by: Max Tkachenko <[email protected]>
1 parent 75f164d commit 939a694

File tree

2 files changed

+62
-18
lines changed

2 files changed

+62
-18
lines changed

label_studio/core/label_config.py

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import logging
55
import re
66
from collections import OrderedDict, defaultdict
7+
from typing import Tuple, Union
78
from urllib.parse import urlencode
89

910
import defusedxml.ElementTree as etree
@@ -76,22 +77,37 @@ def _fix_choices(config):
7677
return config
7778

7879

79-
def parse_config_to_json(config_string):
80+
def parse_config_to_xml(config_string: Union[str, None], raise_on_empty: bool = False) -> Union[OrderedDict, None]:
81+
if config_string is None:
82+
if raise_on_empty:
83+
raise TypeError('config_string is None')
84+
return None
85+
86+
xml = etree.fromstring(config_string, forbid_dtd=True)
87+
88+
# Remove comments
89+
for comment in xml.findall('.//comment'):
90+
comment.getparent().remove(comment)
91+
92+
return xml
93+
94+
95+
def parse_config_to_json(config_string: Union[str, None]) -> Tuple[Union[OrderedDict, None], Union[str, None]]:
8096
try:
81-
xml = etree.fromstring(config_string, forbid_dtd=False)
97+
xml = parse_config_to_xml(config_string, raise_on_empty=True)
8298
except TypeError:
8399
raise etree.ParseError('can only parse strings')
84100
if xml is None:
85101
raise etree.ParseError('xml is empty or incorrect')
86102
config = xmljson.badgerfish.data(xml)
87103
config = _fix_choices(config)
88-
return config
104+
return config, etree.tostring(xml, encoding='unicode')
89105

90106

91-
def validate_label_config(config_string):
107+
def validate_label_config(config_string: Union[str, None]) -> None:
92108
# xml and schema
93109
try:
94-
config = parse_config_to_json(config_string)
110+
config, cleaned_config_string = parse_config_to_json(config_string)
95111
jsonschema.validate(config, _LABEL_CONFIG_SCHEMA_DATA)
96112
except (etree.ParseError, ValueError) as exc:
97113
raise LabelStudioValidationErrorSentryIgnored(str(exc))
@@ -106,13 +122,13 @@ def validate_label_config(config_string):
106122
raise LabelStudioValidationErrorSentryIgnored(error_message)
107123

108124
# unique names in config # FIXME: 'name =' (with spaces) won't work
109-
all_names = re.findall(r'name="([^"]*)"', config_string)
125+
all_names = re.findall(r'name="([^"]*)"', cleaned_config_string)
110126
if len(set(all_names)) != len(all_names):
111127
raise LabelStudioValidationErrorSentryIgnored('Label config contains non-unique names')
112128

113129
# toName points to existent name
114130
names = set(all_names)
115-
toNames = re.findall(r'toName="([^"]*)"', config_string)
131+
toNames = re.findall(r'toName="([^"]*)"', cleaned_config_string)
116132
for toName_ in toNames:
117133
for toName in toName_.split(','):
118134
if toName not in names:
@@ -121,7 +137,7 @@ def validate_label_config(config_string):
121137

122138
def extract_data_types(label_config):
123139
# load config
124-
xml = etree.fromstring(label_config, forbid_dtd=False)
140+
xml = parse_config_to_xml(label_config)
125141
if xml is None:
126142
raise etree.ParseError('Project config is empty or incorrect')
127143

@@ -185,16 +201,11 @@ def get_all_object_tag_names(label_config):
185201

186202

187203
def config_line_stipped(c):
188-
tree = etree.fromstring(c, forbid_dtd=False)
189-
comments = tree.xpath('//comment()')
190-
191-
for c in comments:
192-
p = c.getparent()
193-
if p is not None:
194-
p.remove(c)
195-
c = etree.tostring(tree, method='html').decode('utf-8')
204+
xml = parse_config_to_xml(c)
205+
if xml is None:
206+
return None
196207

197-
return c.replace('\n', '').replace('\r', '')
208+
return etree.tostring(xml, encoding='unicode').replace('\n', '').replace('\r', '')
198209

199210

200211
def get_task_from_labeling_config(config):
@@ -243,7 +254,7 @@ def data_examples(mode):
243254
def generate_sample_task_without_check(label_config, mode='upload', secure_mode=False):
244255
"""Generate sample task only"""
245256
# load config
246-
xml = etree.fromstring(label_config, forbid_dtd=False)
257+
xml = parse_config_to_xml(label_config)
247258
if xml is None:
248259
raise etree.ParseError('Project config is empty or incorrect')
249260

label_studio/tests/config_validation.tavern.yml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1403,3 +1403,36 @@ stages:
14031403
url: '{django_live_url}/api/projects/{pk}/import'
14041404
response:
14051405
status_code: 201
1406+
1407+
---
1408+
test_name: Validation should ignore xml comments
1409+
strict: false
1410+
marks:
1411+
- usefixtures:
1412+
- django_live_url
1413+
1414+
stages:
1415+
- # Signup to the system
1416+
id: signup
1417+
type: ref
1418+
- name: create project
1419+
request:
1420+
method: POST
1421+
url: '{django_live_url}/api/projects'
1422+
json:
1423+
label_config: <View> <Text name="text" value="$text"/><Taxonomy name="taxonomy" toName="text"><Choice value="A"><Choice value="A_1"/><Choice value="A_2"/></Choice><Choice value="B"/></Taxonomy></View>
1424+
response:
1425+
save:
1426+
json:
1427+
pk: id
1428+
status_code: 201
1429+
- name: validate config will not throw config contains non-unique names error
1430+
request:
1431+
headers:
1432+
content-type: application/json
1433+
json:
1434+
label_config: '<View> <Text name="text" value="$text"/><Taxonomy name="taxonomy" toName="text"><Choice value="C"><Choice value="A_1"/><Choice value="A_2"/></Choice><Choice value="B"/></Taxonomy></View><!-- Custom script draft --><!-- const taxonomy = document.querySelector(`[name="taxonomy"]`);-->'
1435+
method: POST
1436+
url: '{django_live_url}/api/projects/{pk}/validate'
1437+
response:
1438+
status_code: 200

0 commit comments

Comments
 (0)