Skip to content

fix: OPTIC-1287: Project config validation should ignore xml comments #6613

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Nov 12, 2024
47 changes: 29 additions & 18 deletions label_studio/core/label_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import re
from collections import OrderedDict, defaultdict
from typing import Tuple, Union
from urllib.parse import urlencode

import defusedxml.ElementTree as etree
Expand Down Expand Up @@ -76,22 +77,37 @@
return config


def parse_config_to_json(config_string):
def parse_config_to_xml(config_string: Union[str, None], raise_on_empty: bool = False) -> Union[OrderedDict, None]:
if config_string is None:
if raise_on_empty:
raise TypeError('config_string is None')
return None

Check warning on line 84 in label_studio/core/label_config.py

View check run for this annotation

Codecov / codecov/patch

label_studio/core/label_config.py#L84

Added line #L84 was not covered by tests

xml = etree.fromstring(config_string, forbid_dtd=True)

# Remove comments
for comment in xml.findall('.//comment'):
comment.getparent().remove(comment)

Check warning on line 90 in label_studio/core/label_config.py

View check run for this annotation

Codecov / codecov/patch

label_studio/core/label_config.py#L90

Added line #L90 was not covered by tests

return xml


def parse_config_to_json(config_string: Union[str, None]) -> Tuple[Union[OrderedDict, None], Union[str, None]]:
try:
xml = etree.fromstring(config_string, forbid_dtd=False)
xml = parse_config_to_xml(config_string, raise_on_empty=True)
except TypeError:
raise etree.ParseError('can only parse strings')
if xml is None:
raise etree.ParseError('xml is empty or incorrect')
config = xmljson.badgerfish.data(xml)
config = _fix_choices(config)
return config
return config, etree.tostring(xml, encoding='unicode')


def validate_label_config(config_string):
def validate_label_config(config_string: Union[str, None]) -> None:
# xml and schema
try:
config = parse_config_to_json(config_string)
config, cleaned_config_string = parse_config_to_json(config_string)
jsonschema.validate(config, _LABEL_CONFIG_SCHEMA_DATA)
except (etree.ParseError, ValueError) as exc:
raise LabelStudioValidationErrorSentryIgnored(str(exc))
Expand All @@ -106,13 +122,13 @@
raise LabelStudioValidationErrorSentryIgnored(error_message)

# unique names in config # FIXME: 'name =' (with spaces) won't work
all_names = re.findall(r'name="([^"]*)"', config_string)
all_names = re.findall(r'name="([^"]*)"', cleaned_config_string)
if len(set(all_names)) != len(all_names):
raise LabelStudioValidationErrorSentryIgnored('Label config contains non-unique names')

# toName points to existent name
names = set(all_names)
toNames = re.findall(r'toName="([^"]*)"', config_string)
toNames = re.findall(r'toName="([^"]*)"', cleaned_config_string)
for toName_ in toNames:
for toName in toName_.split(','):
if toName not in names:
Expand All @@ -121,7 +137,7 @@

def extract_data_types(label_config):
# load config
xml = etree.fromstring(label_config, forbid_dtd=False)
xml = parse_config_to_xml(label_config)
if xml is None:
raise etree.ParseError('Project config is empty or incorrect')

Expand Down Expand Up @@ -185,16 +201,11 @@


def config_line_stipped(c):
tree = etree.fromstring(c, forbid_dtd=False)
comments = tree.xpath('//comment()')

for c in comments:
p = c.getparent()
if p is not None:
p.remove(c)
c = etree.tostring(tree, method='html').decode('utf-8')
xml = parse_config_to_xml(c)
if xml is None:
return None

Check warning on line 206 in label_studio/core/label_config.py

View check run for this annotation

Codecov / codecov/patch

label_studio/core/label_config.py#L204-L206

Added lines #L204 - L206 were not covered by tests

return c.replace('\n', '').replace('\r', '')
return etree.tostring(xml, encoding='unicode').replace('\n', '').replace('\r', '')

Check warning on line 208 in label_studio/core/label_config.py

View check run for this annotation

Codecov / codecov/patch

label_studio/core/label_config.py#L208

Added line #L208 was not covered by tests


def get_task_from_labeling_config(config):
Expand Down Expand Up @@ -243,7 +254,7 @@
def generate_sample_task_without_check(label_config, mode='upload', secure_mode=False):
"""Generate sample task only"""
# load config
xml = etree.fromstring(label_config, forbid_dtd=False)
xml = parse_config_to_xml(label_config)
if xml is None:
raise etree.ParseError('Project config is empty or incorrect')

Expand Down
33 changes: 33 additions & 0 deletions label_studio/tests/config_validation.tavern.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1403,3 +1403,36 @@ stages:
url: '{django_live_url}/api/projects/{pk}/import'
response:
status_code: 201

---
test_name: Validation should ignore xml comments
strict: false
marks:
- usefixtures:
- django_live_url

stages:
- # Signup to the system
id: signup
type: ref
- name: create project
request:
method: POST
url: '{django_live_url}/api/projects'
json:
label_config: <View> <Text name="text" value="$text"/><Taxonomy name="taxonomy" toName="text"><Choice value="A"><Choice value="A_1"/><Choice value="A_2"/></Choice><Choice value="B"/></Taxonomy></View>
response:
save:
json:
pk: id
status_code: 201
- name: validate config will not throw config contains non-unique names error
request:
headers:
content-type: application/json
json:
label_config: '<View> <Text name="text" value="$text"/><Taxonomy name="taxonomy" toName="text"><Choice value="C"><Choice value="A_1"/><Choice value="A_2"/></Choice><Choice value="B"/></Taxonomy></View><!-- Custom script draft --><!-- const taxonomy = document.querySelector(`[name="taxonomy"]`);-->'
method: POST
url: '{django_live_url}/api/projects/{pk}/validate'
response:
status_code: 200
Loading