Skip to content

[BUG] Hybrid Search not working with Document Level Security #1303

Open
@PaulRCampbell

Description

@PaulRCampbell

Hybrid Search queries fail when authenticating with an internal user that has Document Level Security enabled.

Steps to reproduce the behaviour:

This is a rather contrived example that I have used for testing purposes.

Create a role with a DLS filter:

{"term": { "divisionId": "448385216"}}

Authenticate with an internal user assigned to that role, and then run a hybrid search query:

GET my_test_index/_search?search_pipeline=hybrid-search-pipeline
{
  "query": {
    "hybrid": {
      "queries": [
        {
          "bool": {
            "filter": [
              {
                "term": {
                  "organisationId": 1
                }
              }
            ]
          }
        },
        {
          "neural": {
            "label_embedding.paraphrase-multilingual-MiniLM-L12-v2": {
              "query_text": "document",
              "model_id": "Cf9VfZYBPcPsA6UT332K",
              "k": "5",
              "filter": {
                "term": {
                  "organisationId": 1
                }
              }
            }
          }
        }
      ]
    }
  },
  "_source": ["label", "organisationId", "esType", "status", "divisionId"]
}

This returns the following error:

{
  "error": {
    "root_cause": [
      {
        "type": "class_cast_exception",
        "reason": "class org.apache.lucene.search.ConstantScoreQuery cannot be cast to class org.opensearch.neuralsearch.query.HybridQuery (org.apache.lucene.search.ConstantScoreQuery is in unnamed module of loader 'app'; org.opensearch.neuralsearch.query.HybridQuery is in unnamed module of loader java.net.FactoryURLClassLoader @57435801)"
      },
      {
        "type": "illegal_argument_exception",
        "reason": "hybrid query must be a top level query and cannot be wrapped into other queries"
      }
    ],
    "type": "search_phase_execution_exception",
    "reason": "all shards failed",
    "phase": "query",
    "grouped": true,
    "failed_shards": [
      {
        "shard": 0,
        "index": "my_test_index",
        "node": "j0xoYCCsTh2Jjoe_SNqmdQ",
        "reason": {
          "type": "class_cast_exception",
          "reason": "class org.apache.lucene.search.ConstantScoreQuery cannot be cast to class org.opensearch.neuralsearch.query.HybridQuery (org.apache.lucene.search.ConstantScoreQuery is in unnamed module of loader 'app'; org.opensearch.neuralsearch.query.HybridQuery is in unnamed module of loader java.net.FactoryURLClassLoader @57435801)"
        }
      },
      {
        "shard": 4,
        "index": "my_test_index",
        "node": "j0xoYCCsTh2Jjoe_SNqmdQ",
        "reason": {
          "type": "illegal_argument_exception",
          "reason": "hybrid query must be a top level query and cannot be wrapped into other queries"
        }
      }
    ],
    "caused_by": {
      "type": "class_cast_exception",
      "reason": "class org.apache.lucene.search.ConstantScoreQuery cannot be cast to class org.opensearch.neuralsearch.query.HybridQuery (org.apache.lucene.search.ConstantScoreQuery is in unnamed module of loader 'app'; org.opensearch.neuralsearch.query.HybridQuery is in unnamed module of loader java.net.FactoryURLClassLoader @57435801)",
      "caused_by": {
        "type": "class_cast_exception",
        "reason": "class org.apache.lucene.search.ConstantScoreQuery cannot be cast to class org.opensearch.neuralsearch.query.HybridQuery (org.apache.lucene.search.ConstantScoreQuery is in unnamed module of loader 'app'; org.opensearch.neuralsearch.query.HybridQuery is in unnamed module of loader java.net.FactoryURLClassLoader @57435801)"
      }
    }
  },
  "status": 500
}

Curiously, if I add a suggest block to the query, I get the desired results (and the DLS filter is also applied).

GET my_test_index/_search?search_pipeline=hybrid-search-pipeline
{
  "query": {
    "hybrid": {
      "queries": [
        {
          "bool": {
            "filter": [
              {
                "term": {
                  "organisationId": 1
                }
              }
            ]
          }
        },
        {
          "neural": {
            "label_embedding.paraphrase-multilingual-MiniLM-L12-v2": {
              "query_text": "document",
              "model_id": "Cf9VfZYBPcPsA6UT332K",
              "k": "5",
              "filter": {
                "term": {
                  "organisationId": 1
                }
              }
            }
          }
        }
      ]
    }
  },
    "suggest": {
    "text": "document",
    "label-suggest": {
      "term": {
        "field": "label"
      }
    }
  },
  "_source": ["label", "organisationId", "esType", "status", "divisionId"]
}

This returns results:

{
  "took": 1176,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 1,
      "relation": "eq"
    },
    "max_score": 0.2,
    "hits": [
      {
        "_index": "my_test_index",
        "_id": "1_325861482",
        "_score": 0.2,
        "_routing": "1",
        "_source": {
          "organisationId": 1,
          "divisionId": 448385216,
          "label": "English Document",
          "status": "active"
        }
      }
    ]
  },
  "suggest": {
    "label-suggest": [
      {
        "text": "document",
        "offset": 0,
        "length": 8,
        "options": []
      }
    ]
  }
}

I have tested this on our AWS-managed OpenSearch service, which is on version 2.17.1, and also on 2.19.1 running locally with the following Docker configuration. (Note: I have a separate script to register and deploy the embedding model, create the ingest and search pipelines, etc.)

version: "3.7"
services:
  opensearch:
    build: .
    container_name: opensearch
    environment:
      - discovery.type=single-node
      - plugins.security.disabled=false
      - plugins.security.ssl.http.enabled=false
      - plugins.security.ssl.transport.enforce_hostname_verification=false
      - OPENSEARCH_INITIAL_ADMIN_PASSWORD=somestrongpassword
      - OPENSEARCH_JAVA_OPTS=-Xms6g -Xmx6g
    ports:
      - "9200:9200" # OpenSearch HTTP
      - "9600:9600" # Performance Analyzer
    ulimits:
      memlock:
        soft: -1
        hard: -1
    mem_limit: 8g
    healthcheck:
      test: ["CMD-SHELL", "curl -s http://localhost:9200 || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 5

  dashboards:
    image: opensearchproject/opensearch-dashboards:2.19.1
    container_name: dashboards
    environment:
      - OPENSEARCH_HOSTS=["http://opensearch:9200"]
      - OPENSEARCH_USERNAME=admin
      - OPENSEARCH_PASSWORD=somestrongpassword
      - SERVER_SSL_ENABLED=false
    ports:
      - "5601:5601"
    depends_on:
      opensearch:
        condition: service_healthy

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions