Skip to content

fix: OPTIC-1287: Project config validation should ignore xml comments #6613

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Nov 12, 2024
47 changes: 29 additions & 18 deletions label_studio/core/label_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import re
from collections import OrderedDict, defaultdict
from typing import Tuple, Union
from urllib.parse import urlencode

import defusedxml.ElementTree as etree
Expand Down Expand Up @@ -76,22 +77,37 @@ def _fix_choices(config):
return config


def parse_config_to_json(config_string):
def parse_config_to_xml(config_string: Union[str, None], raise_on_empty: bool = False) -> Union[OrderedDict, None]:
if config_string is None:
if raise_on_empty:
raise TypeError('config_string is None')
return None

xml = etree.fromstring(config_string, forbid_dtd=True)

# Remove comments
for comment in xml.findall('.//comment'):
comment.getparent().remove(comment)

return xml


def parse_config_to_json(config_string: Union[str, None]) -> Tuple[Union[OrderedDict, None], Union[str, None]]:
try:
xml = etree.fromstring(config_string, forbid_dtd=False)
xml = parse_config_to_xml(config_string, raise_on_empty=True)
except TypeError:
raise etree.ParseError('can only parse strings')
if xml is None:
raise etree.ParseError('xml is empty or incorrect')
config = xmljson.badgerfish.data(xml)
config = _fix_choices(config)
return config
return config, etree.tostring(xml)


def validate_label_config(config_string):
def validate_label_config(config_string: Union[str, None]) -> None:
# xml and schema
try:
config = parse_config_to_json(config_string)
config, cleaned_config_string = parse_config_to_json(config_string)
jsonschema.validate(config, _LABEL_CONFIG_SCHEMA_DATA)
except (etree.ParseError, ValueError) as exc:
raise LabelStudioValidationErrorSentryIgnored(str(exc))
Expand All @@ -106,13 +122,13 @@ def validate_label_config(config_string):
raise LabelStudioValidationErrorSentryIgnored(error_message)

# unique names in config # FIXME: 'name =' (with spaces) won't work
all_names = re.findall(r'name="([^"]*)"', config_string)
all_names = re.findall(r'name="([^"]*)"', cleaned_config_string)
if len(set(all_names)) != len(all_names):
raise LabelStudioValidationErrorSentryIgnored('Label config contains non-unique names')

# toName points to existent name
names = set(all_names)
toNames = re.findall(r'toName="([^"]*)"', config_string)
toNames = re.findall(r'toName="([^"]*)"', cleaned_config_string)
for toName_ in toNames:
for toName in toName_.split(','):
if toName not in names:
Expand All @@ -121,7 +137,7 @@ def validate_label_config(config_string):

def extract_data_types(label_config):
# load config
xml = etree.fromstring(label_config, forbid_dtd=False)
xml = parse_config_to_xml(label_config)
if xml is None:
raise etree.ParseError('Project config is empty or incorrect')

Expand Down Expand Up @@ -185,16 +201,11 @@ def get_all_object_tag_names(label_config):


def config_line_stipped(c):
tree = etree.fromstring(c, forbid_dtd=False)
comments = tree.xpath('//comment()')

for c in comments:
p = c.getparent()
if p is not None:
p.remove(c)
c = etree.tostring(tree, method='html').decode('utf-8')
xml = parse_config_to_xml(c)
if xml is None:
return None

return c.replace('\n', '').replace('\r', '')
return etree.tostring(xml, encoding='unicode').replace('\n', '').replace('\r', '')


def get_task_from_labeling_config(config):
Expand Down Expand Up @@ -243,7 +254,7 @@ def data_examples(mode):
def generate_sample_task_without_check(label_config, mode='upload', secure_mode=False):
"""Generate sample task only"""
# load config
xml = etree.fromstring(label_config, forbid_dtd=False)
xml = parse_config_to_xml(label_config)
if xml is None:
raise etree.ParseError('Project config is empty or incorrect')

Expand Down
33 changes: 33 additions & 0 deletions label_studio/tests/config_validation.tavern.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1403,3 +1403,36 @@ stages:
url: '{django_live_url}/api/projects/{pk}/import'
response:
status_code: 201

---
test_name: Validation should ignore xml comments
strict: false
marks:
- usefixtures:
- django_live_url

stages:
- # Signup to the system
id: signup
type: ref
- name: create project
request:
method: POST
url: '{django_live_url}/api/projects'
json:
label_config: <View> <Text name="text" value="$text"/><Taxonomy name="taxonomy" toName="text"><Choice value="A"><Choice value="A_1"/><Choice value="A_2"/></Choice><Choice value="B"/></Taxonomy></View>
response:
save:
json:
pk: id
status_code: 201
- name: validate config will not throw config contains non-unique names error
request:
headers:
content-type: application/json
json:
label_config: '<View> <Text name="text" value="$text"/><Taxonomy name="taxonomy" toName="text"><Choice value="C"><Choice value="A_1"/><Choice value="A_2"/></Choice><Choice value="B"/></Taxonomy></View><!-- Custom script draft --><!-- const taxonomy = document.querySelector(`[name="taxonomy"]`);-->'
method: POST
url: '{django_live_url}/api/projects/{pk}/validate'
response:
status_code: 200
Loading