Skip to content

Commit 949b760

Browse files
mike-tr-adamson authored and michaelsembwever committed
Add support for a vector search index in SAI
- Adds jbellis/jvector (1.0.2) library for DiskANN-based indexes on floating point vectors
- Adds ORDER BY ANN OF capability to do ANN search and order the results by score

patch by Mike Adamson; reviewed by Andrés de la Peña, Jonathon Ellis for CASSANDRA-18715

Co-authored-by: Jonathon Ellis [email protected]
Co-authored-by: Zhao Yang [email protected]
1 parent b59b832 commit 949b760

File tree

163 files changed

+13029
-417
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

163 files changed

+13029
-417
lines changed

.build/cassandra-deps-template.xml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,5 +372,9 @@
372372
<groupId>org.apache.lucene</groupId>
373373
<artifactId>lucene-analysis-common</artifactId>
374374
</dependency>
375+
<dependency>
376+
<groupId>io.github.jbellis</groupId>
377+
<artifactId>jvector</artifactId>
378+
</dependency>
375379
</dependencies>
376380
</project>

.build/parent-pom-template.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1201,6 +1201,11 @@
12011201
<artifactId>lucene-analysis-common</artifactId>
12021202
<version>9.7.0</version>
12031203
</dependency>
1204+
<dependency>
1205+
<groupId>io.github.jbellis</groupId>
1206+
<artifactId>jvector</artifactId>
1207+
<version>1.0.2</version>
1208+
</dependency>
12041209
<dependency>
12051210
<groupId>com.carrotsearch.randomizedtesting</groupId>
12061211
<artifactId>randomizedtesting-runner</artifactId>

CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
5.0-alpha2
2+
* Add support for vector search in SAI (CASSANDRA-18715)
23
* Remove crc_check_chance from CompressionParams (CASSANDRA-18872)
34
* Fix schema loading of UDTs inside vectors inside UDTs (CASSANDRA-18964)
45
* Add cqlsh autocompletion for the vector data type (CASSANDRA-18946)

NEWS.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ New features
8686
src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStrategy.md
8787
- New `VectorType` (cql `vector<element_type, dimension>`) which adds new fixed-length element arrays. See CASSANDRA-18504
8888
- Added new vector similarity functions `similarity_cosine`, `similarity_euclidean` and `similarity_dot_product`.
89+
- Added ANN vector similarity search via ORDER BY ANN OF syntax on SAI indexes (using jvector library).
8990
- Removed UDT type migration logic for 3.6+ clusters upgrading to 4.0. If migration has been disabled, it must be
9091
enabled before upgrading to 5.0 if the cluster used UDTs. See CASSANDRA-18504
9192
- Extended max expiration time from 2038-01-19T03:14:06+00:00 to 2106-02-07T06:28:13+00:00

src/antlr/Lexer.g

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,8 @@ K_MASKED: M A S K E D;
224224
K_UNMASK: U N M A S K;
225225
K_SELECT_MASKED: S E L E C T '_' M A S K E D;
226226

227-
K_VECTOR: V E C T O R;
227+
K_VECTOR: V E C T O R;
228+
K_ANN: A N N;
228229

229230
// Case-insensitive alpha characters
230231
fragment A: ('a'|'A');

src/antlr/Parser.g

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ selectStatement returns [SelectStatement.RawStatement expr]
268268
@init {
269269
Term.Raw limit = null;
270270
Term.Raw perPartitionLimit = null;
271-
Map<ColumnIdentifier, Boolean> orderings = new LinkedHashMap<>();
271+
List<Ordering.Raw> orderings = new ArrayList<>();
272272
List<Selectable.Raw> groups = new ArrayList<>();
273273
boolean allowFiltering = false;
274274
boolean isJson = false;
@@ -459,11 +459,17 @@ customIndexExpression [WhereClause.Builder clause]
459459
: 'expr(' idxName[name] ',' t=term ')' { clause.add(new CustomIndexExpression(name, t));}
460460
;
461461

462-
orderByClause[Map<ColumnIdentifier, Boolean> orderings]
462+
orderByClause[List<Ordering.Raw> orderings]
463463
@init{
464-
boolean reversed = false;
464+
Ordering.Direction direction = Ordering.Direction.ASC;
465+
}
466+
: c=cident (K_ANN K_OF t=term)? (K_ASC | K_DESC { direction = Ordering.Direction.DESC; })?
467+
{
468+
Ordering.Raw.Expression expr = (t == null)
469+
? new Ordering.Raw.SingleColumn(c)
470+
: new Ordering.Raw.Ann(c, t);
471+
orderings.add(new Ordering.Raw(expr, direction));
465472
}
466-
: c=cident (K_ASC | K_DESC { reversed = true; })? { orderings.put(c, reversed); }
467473
;
468474

469475
groupByClause[List<Selectable.Raw> groups]
@@ -2014,5 +2020,6 @@ basic_unreserved_keyword returns [String str]
20142020
| K_UNMASK
20152021
| K_SELECT_MASKED
20162022
| K_VECTOR
2023+
| K_ANN
20172024
) { $str = $k.text; }
20182025
;

src/java/org/apache/cassandra/config/CassandraRelevantProperties.java

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -429,8 +429,9 @@ public enum CassandraRelevantProperties
429429
SAI_INTERSECTION_CLAUSE_LIMIT("cassandra.sai.intersection_clause_limit", "2"),
430430
/** Latest version to be used for SAI index writing */
431431
SAI_LATEST_VERSION("cassandra.sai.latest_version", "aa"),
432-
SAI_MAX_FROZEN_TERM_SIZE("cassandra.sai.max_frozen_term_size_kb", "5"),
433-
SAI_MAX_STRING_TERM_SIZE("cassandra.sai.max_string_term_size_kb", "1"),
432+
SAI_MAX_FROZEN_TERM_SIZE("cassandra.sai.max_frozen_term_size", "5KiB"),
433+
SAI_MAX_STRING_TERM_SIZE("cassandra.sai.max_string_term_size", "1KiB"),
434+
SAI_MAX_VECTOR_TERM_SIZE("cassandra.sai.max_vector_term_size", "32KiB"),
434435

435436
/** Minimum number of reachable leaves for a given node to be eligible for an auxiliary posting list */
436437
SAI_MINIMUM_POSTINGS_LEAVES("cassandra.sai.minimum_postings_leaves", "64"),
@@ -455,6 +456,18 @@ public enum CassandraRelevantProperties
455456
SAI_TEST_BALANCED_TREE_DEBUG_ENABLED("cassandra.sai.test.balanced_tree_debug_enabled", "false"),
456457
SAI_TEST_DISABLE_TIMEOUT("cassandra.sai.test.disable.timeout", "false"),
457458

459+
/** Whether to allow the user to specify custom options to the vector index */
460+
SAI_VECTOR_ALLOW_CUSTOM_PARAMETERS("cassandra.sai.vector.allow_custom_parameters", "false"),
461+
462+
/** Controls the maximum top-k limit for vector search */
463+
SAI_VECTOR_SEARCH_MAX_TOP_K("cassandra.sai.vector_search.max_top_k", "1000"),
464+
465+
/**
466+
* Controls the maximum number of PrimaryKeys that will be read into memory at one time when ordering/limiting
467+
* the results of an ANN query constrained by non-ANN predicates.
468+
*/
469+
SAI_VECTOR_SEARCH_ORDER_CHUNK_SIZE("cassandra.sai.vector_search.order_chunk_size", "100000"),
470+
458471
SCHEMA_PULL_INTERVAL_MS("cassandra.schema_pull_interval_ms", "60000"),
459472
SCHEMA_UPDATE_HANDLER_FACTORY_CLASS("cassandra.schema.update_handler_factory.class"),
460473
SEARCH_CONCURRENCY_FACTOR("cassandra.search_concurrency_factor", "1"),

src/java/org/apache/cassandra/cql3/Operator.java

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,23 @@ public boolean isSatisfiedBy(AbstractType<?> type, ByteBuffer leftOperand, ByteB
258258
{
259259
throw new UnsupportedOperationException();
260260
}
261+
},
262+
ANN(15)
263+
{
264+
@Override
265+
public String toString()
266+
{
267+
return "ANN";
268+
}
269+
270+
@Override
271+
public boolean isSatisfiedBy(AbstractType<?> type, ByteBuffer leftOperand, ByteBuffer rightOperand)
272+
{
273+
// The ANN operator is only supported by the vector index so, normally, should never be called directly.
274+
// In networked queries (non-local) the coordinator will end up calling the row filter directly. So, this
275+
// needs to return true so that the returned values are allowed through to the VectorTopKProcessor
276+
return true;
277+
}
261278
};
262279

263280
/**
src/java/org/apache/cassandra/cql3/Ordering.java

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.cassandra.cql3;
20+
21+
import org.apache.cassandra.cql3.restrictions.SingleColumnRestriction;
22+
import org.apache.cassandra.cql3.restrictions.SingleRestriction;
23+
import org.apache.cassandra.schema.ColumnMetadata;
24+
import org.apache.cassandra.schema.TableMetadata;
25+
26+
/**
27+
* A single element of an ORDER BY clause.
28+
* <code>ORDER BY ordering1 [, ordering2 [, ...]] </code>
29+
* <p>
30+
* An ordering comprises an expression that produces the values to compare against each other
31+
* and a sorting direction (ASC, DESC).
32+
*/
33+
public class Ordering
34+
{
35+
public final Expression expression;
36+
public final Direction direction;
37+
38+
public Ordering(Expression expression, Direction direction)
39+
{
40+
this.expression = expression;
41+
this.direction = direction;
42+
}
43+
44+
public static abstract class Expression
45+
{
46+
protected final ColumnMetadata columnMetadata;
47+
48+
public Expression(ColumnMetadata columnMetadata)
49+
{
50+
this.columnMetadata = columnMetadata;
51+
}
52+
53+
public boolean hasNonClusteredOrdering()
54+
{
55+
return false;
56+
}
57+
58+
public SingleRestriction toRestriction()
59+
{
60+
throw new UnsupportedOperationException();
61+
}
62+
63+
public ColumnMetadata getColumn()
64+
{
65+
return columnMetadata;
66+
}
67+
}
68+
69+
/**
70+
* Represents a single column in <code>ORDER BY column</code>
71+
*/
72+
public static class SingleColumn extends Expression
73+
{
74+
public SingleColumn(ColumnMetadata columnMetadata)
75+
{
76+
super(columnMetadata);
77+
}
78+
}
79+
80+
/**
81+
* An expression used in Approximate Nearest Neighbor ordering. <code>ORDER BY column ANN OF value</code>
82+
*/
83+
public static class Ann extends Expression
84+
{
85+
final Term vectorValue;
86+
87+
public Ann(ColumnMetadata columnMetadata, Term vectorValue)
88+
{
89+
super(columnMetadata);
90+
this.vectorValue = vectorValue;
91+
}
92+
93+
@Override
94+
public boolean hasNonClusteredOrdering()
95+
{
96+
return true;
97+
}
98+
99+
@Override
100+
public SingleRestriction toRestriction()
101+
{
102+
return new SingleColumnRestriction.AnnRestriction(columnMetadata, vectorValue);
103+
}
104+
}
105+
106+
public enum Direction
107+
{ASC, DESC}
108+
109+
110+
/**
111+
* Represents ANTLR's abstract syntax tree of a single element in the {@code ORDER BY} clause.
112+
* This comes directly out of CQL parser.
113+
*/
114+
public static class Raw
115+
{
116+
117+
final Expression expression;
118+
final Direction direction;
119+
120+
public Raw(Expression expression, Direction direction)
121+
{
122+
this.expression = expression;
123+
this.direction = direction;
124+
}
125+
126+
/**
127+
* Resolves column identifiers against the table schema.
128+
* Binds markers (?) to columns.
129+
*/
130+
public Ordering bind(TableMetadata table, VariableSpecifications boundNames)
131+
{
132+
return new Ordering(expression.bind(table, boundNames), direction);
133+
}
134+
135+
public interface Expression
136+
{
137+
Ordering.Expression bind(TableMetadata table, VariableSpecifications boundNames);
138+
}
139+
140+
public static class SingleColumn implements Expression
141+
{
142+
final ColumnIdentifier column;
143+
144+
SingleColumn(ColumnIdentifier column)
145+
{
146+
this.column = column;
147+
}
148+
149+
@Override
150+
public Ordering.Expression bind(TableMetadata table, VariableSpecifications boundNames)
151+
{
152+
return new Ordering.SingleColumn(table.getExistingColumn(column));
153+
}
154+
}
155+
156+
public static class Ann implements Expression
157+
{
158+
final ColumnIdentifier columnId;
159+
final Term.Raw vectorValue;
160+
161+
Ann(ColumnIdentifier column, Term.Raw vectorValue)
162+
{
163+
this.columnId = column;
164+
this.vectorValue = vectorValue;
165+
}
166+
167+
@Override
168+
public Ordering.Expression bind(TableMetadata table, VariableSpecifications boundNames)
169+
{
170+
ColumnMetadata column = table.getExistingColumn(columnId);
171+
Term value = vectorValue.prepare(table.keyspace, column);
172+
value.collectMarkerSpecification(boundNames);
173+
return new Ordering.Ann(column, value);
174+
}
175+
}
176+
}
177+
}
178+
179+
180+

src/java/org/apache/cassandra/cql3/QueryOptions.java

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,16 @@ public static QueryOptions addColumnSpecifications(QueryOptions options, List<Co
114114
return new OptionsWithColumnSpecifications(options, columnSpecs);
115115
}
116116

117+
public static QueryOptions withConsistencyLevel(QueryOptions options, ConsistencyLevel consistencyLevel)
118+
{
119+
return new OptionsWithConsistencyLevel(options, consistencyLevel);
120+
}
121+
122+
public static QueryOptions withPageSize(QueryOptions options, int pageSize)
123+
{
124+
return new OptionsWithPageSize(options, pageSize);
125+
}
126+
117127
public abstract ConsistencyLevel getConsistency();
118128
public abstract List<ByteBuffer> getValues();
119129
public abstract boolean skipMetadata();
@@ -424,6 +434,40 @@ public QueryOptions prepare(List<ColumnSpecification> specs)
424434
}
425435
}
426436

437+
static class OptionsWithConsistencyLevel extends QueryOptionsWrapper
438+
{
439+
private final ConsistencyLevel consistencyLevel;
440+
441+
OptionsWithConsistencyLevel(QueryOptions wrapped, ConsistencyLevel consistencyLevel)
442+
{
443+
super(wrapped);
444+
this.consistencyLevel = consistencyLevel;
445+
}
446+
447+
@Override
448+
public ConsistencyLevel getConsistency()
449+
{
450+
return consistencyLevel;
451+
}
452+
}
453+
454+
static class OptionsWithPageSize extends QueryOptionsWrapper
455+
{
456+
private final int pageSize;
457+
458+
OptionsWithPageSize(QueryOptions wrapped, int pageSize)
459+
{
460+
super(wrapped);
461+
this.pageSize = pageSize;
462+
}
463+
464+
@Override
465+
public int getPageSize()
466+
{
467+
return pageSize;
468+
}
469+
}
470+
427471
/**
428472
* <code>QueryOptions</code> decorator that provides access to the column specifications.
429473
*/

src/java/org/apache/cassandra/cql3/Relation.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,8 @@ public final Restriction toRestriction(TableMetadata table, VariableSpecificatio
155155
case LIKE_MATCHES:
156156
case LIKE:
157157
return newLikeRestriction(table, boundNames, relationType);
158+
case ANN:
159+
throw invalidRequest("ANN is only supported in ORDER BY");
158160
default: throw invalidRequest("Unsupported \"!=\" relation: %s", this);
159161
}
160162
}

0 commit comments

Comments
 (0)