Skip to content

fix: OPTIC-1341: Filtering on data fields is not working when $undefined$ and other columns are present in task data #6641

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions label_studio/data_manager/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ def preprocess_filter(_filter, *_):
return _filter


def preprocess_field_name(raw_field_name, only_undefined_field=False) -> Tuple[str, bool]:
def preprocess_field_name(raw_field_name, project) -> Tuple[str, bool]:
"""Transform a field name (as specified in the datamanager views endpoint) to
a django ORM field name. Also handle dotted accesses to task.data.

Expand Down Expand Up @@ -389,7 +389,25 @@ def preprocess_field_name(raw_field_name, only_undefined_field=False) -> Tuple[s
field_name = field_name[1:]

if field_name.startswith('data.'):
if only_undefined_field:
# process as $undefined$ only if real_name is from labeling config, not from task.data
real_name = field_name.replace('data.', '')
common_data_columns = project.summary.common_data_columns
real_name_suitable = (
# there is only one object tag in labeling config
# and requested filter name == value from object tag
len(project.data_types.keys()) == 1
and real_name in project.data_types.keys()
# file was uploaded before labeling config is set, `data.data` is system predefined name
or len(project.data_types.keys()) == 0
and real_name == 'data'
)
if (
real_name_suitable
# common data columns are not None
and common_data_columns
# $undefined$ is in common data columns, in all tasks
and settings.DATA_UNDEFINED_NAME in common_data_columns
):
field_name = f'data__{settings.DATA_UNDEFINED_NAME}'
else:
field_name = field_name.replace('data.', 'data__')
Expand Down
6 changes: 2 additions & 4 deletions label_studio/data_manager/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,7 @@ def apply_ordering(queryset, ordering, project, request, view_data=None):
and view_data['columnsDisplayType'][unsigned_field_name] == 'Number'
):
numeric_ordering = True
field_name, ascending = preprocess_field_name(
raw_field_name, only_undefined_field=project.only_undefined_field
)
field_name, ascending = preprocess_field_name(raw_field_name, project=project)

if field_name.startswith('data__'):
# annotate task with data field for float/int/bool ordering support
Expand Down Expand Up @@ -275,7 +273,7 @@ def apply_filters(queryset, filters, project, request):

# django orm loop expression attached to column name
preprocess_field_name = load_func(settings.PREPROCESS_FIELD_NAME)
field_name, _ = preprocess_field_name(_filter.filter, project.only_undefined_field)
field_name, _ = preprocess_field_name(_filter.filter, project)

# filter pre-processing, value type conversion, etc..
preprocess_filter = load_func(settings.DATA_MANAGER_PREPROCESS_FILTER)
Expand Down
10 changes: 9 additions & 1 deletion label_studio/projects/functions/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,22 @@ def make_queryset_from_iterable(tasks_list):
def recalculate_created_annotations_and_labels_from_scratch(
project: 'Project', summary: 'ProjectSummary', organization_id: int
) -> None:
"""Recalculate created_labels, created_annotations and created_labels_drafts from scratch
"""Recalculate from scratch:
task columns
created_labels
created_annotations
created_labels_drafts

:param project: Project
:param summary: ProjectSummary
:param organization_id: Organization.id, it is required for django-rq displaying on admin page
"""
logger.info(f'Reset cache started for project {project.id} and organization {organization_id}')

summary.all_data_columns = {}
summary.common_data_columns = []
summary.update_data_columns(project.tasks.only('data'))

summary.created_labels, summary.created_annotations = {}, {}
summary.update_created_annotations_and_labels(project.annotations.all())

Expand Down
8 changes: 0 additions & 8 deletions label_studio/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,14 +350,6 @@ def secure_mode(self):
def one_object_in_label_config(self):
return len(self.data_types) <= 1

@property
def only_undefined_field(self):
return (
self.one_object_in_label_config
and self.summary.common_data_columns
and self.summary.common_data_columns[0] == settings.DATA_UNDEFINED_NAME
)

@property
def get_labeled_count(self):
return self.tasks.filter(is_labeled=True).count()
Expand Down
129 changes: 129 additions & 0 deletions label_studio/tests/data_manager/test_undefined.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"""This file and its contents are licensed under the Apache License 2.0. Please see the included NOTICE for copyright information and LICENSE for a copy of the license.
"""
import json

import pytest
from django.conf import settings
from projects.functions.utils import recalculate_created_annotations_and_labels_from_scratch
from projects.models import Project, ProjectSummary

from ..utils import make_task, project_id # noqa


def get_filtered_task_ids(business_client, view_id):
    """Return the ids of the tasks listed by `/api/tasks/` for the given view.

    :param business_client: test client exposing `get(url)`; the response must provide `json()`
    :param view_id: id of the data manager view whose filters should be applied
    :return: list of task ids, in the order they appear in the response
    """
    payload = business_client.get(f'/api/tasks/?view={view_id}').json()
    # Fail with the whole payload so an error response is visible in the test output
    assert 'tasks' in payload, payload
    return [entry['id'] for entry in payload['tasks']]


def apply_filter_and_get_view_id(business_client, project_id, filters):
    """Create a data manager view carrying `filters` and return the new view id.

    :param business_client: test client exposing `post(url, data=..., content_type=...)`
    :param project_id: id of the project the view belongs to
    :param filters: filters dict stored under the view's `data.filters`
    :return: value of `id` in the JSON body of the creation response
    """
    body = json.dumps({'project': project_id, 'data': {'filters': filters}})
    created = business_client.post('/api/dm/views/', data=body, content_type='application/json')
    # The view must have been created; show the raw response body otherwise
    assert created.status_code == 201, created.content
    return created.json()['id']


@pytest.mark.django_db
def test_views_filters_with_undefined(business_client, project_id):
    """Check data manager filters on projects whose task data uses `$undefined$`.

    Scenario:
      * Task 1 is imported as {"$undefined$": "photo1.jpg"} while no labeling
        config is set; filtering `data.data` by "photo" returns task 1.
      * The labeling config <View> <Image value="$image" name="img"/> </View>
        is set; filtering `data.image` by "photo" returns task 1.
      * Task 2 is added as {"$undefined$": "photo2.jpg", "extra": "123"};
        filtering `data.extra` by "123" returns only task 2, and filtering
        `data.image` by "photo" returns tasks 1 and 2.
      * Task 1 gets "extra": "456" and the project summary is recalculated;
        common_data_columns must be {"$undefined$", "extra"} and filtering
        `data.image` by "photo" must still return tasks 1 and 2.
    """
    project = Project.objects.get(pk=project_id)
    undefined_key = settings.DATA_UNDEFINED_NAME  # "$undefined$"

    def assert_filter_matches(column, needle, expected_ids):
        # Build a one-item "contains" filter, store it as a view and compare
        # the task ids returned for that view with the expected ones.
        query = {
            'conjunction': 'and',
            'items': [
                {
                    'filter': column,
                    'operator': 'contains',
                    'type': 'String',
                    'value': needle,
                }
            ],
        }
        created_view = apply_filter_and_get_view_id(business_client, project_id, query)
        found = get_filtered_task_ids(business_client, created_view)
        assert set(found) == set(expected_ids), f'Expected {expected_ids}, got {found}'

    # Step 1: import task 1 with only the $undefined$ key
    task_1 = make_task({'data': {undefined_key: 'photo1.jpg'}}, project)
    task_id_1 = task_1.id

    # Steps 2-3: `data.data` is the default column name when no label config
    # is set yet and a file is uploaded directly
    assert_filter_matches('filter:tasks:data.data', 'photo', [task_id_1])

    # Step 4: set a labeling config with a single object tag
    project.label_config = '<View> <Image value="$image" name="img"/> </View>'
    project.save()

    # Steps 5-6: the object tag value now addresses the $undefined$ data
    assert_filter_matches('filter:tasks:data.image', 'photo', [task_id_1])

    # Step 7: add task 2 with $undefined$ plus a regular column
    task_2 = make_task({'data': {undefined_key: 'photo2.jpg', 'extra': '123'}}, project)
    task_id_2 = task_2.id

    # Steps 8-9: the regular column is filtered as plain task data
    assert_filter_matches('filter:tasks:data.extra', '123', [task_id_2])

    # Steps 10-11: both tasks carry $undefined$, so both match
    assert_filter_matches('filter:tasks:data.image', 'photo', [task_id_1, task_id_2])

    # Step 12: give task 1 the extra column as well
    task_1.data['extra'] = '456'
    task_1.save()

    # The cache needs a full reset because summary.update_data_columns()
    # can't work incrementally
    recalculate_created_annotations_and_labels_from_scratch(project, project.summary, 1)

    # Step 13: both columns are now common to all tasks
    project.refresh_from_db()
    columns = ProjectSummary.objects.get(project=project).common_data_columns
    wanted = [undefined_key, 'extra']
    assert set(columns) == set(wanted), f"Expected {wanted}, got {columns}"

    # Step 14: the same `data.image` / "photo" filter still returns both tasks
    assert_filter_matches('filter:tasks:data.image', 'photo', [task_id_1, task_id_2])
Loading