mirror of https://github.com/redis/redis.git
VSIM EPSILON fixes (#14223)
Hi, this PR implements the following changes: 1. The EPSILON option of VSIM is now documented. 2. The EPSILON behavior was fixed: the threshold was effectively halved, because it was compared against the 0-2 interval of the distance provided by the underlying cosine similarity, instead of the 0-1 interval of the similarity score. So an EPSILON of 0.2 only returned elements with a similarity between 1 and 0.9 instead of between 1 and 0.8. This is a *breaking change*, but the option was not documented so far, and it is a fix: the user sees the similarity score, so the old behavior was a total mismatch. I believe this fix should definitely be backported as soon as possible. 3. There are now tests. Thanks for checking, Salvatore
This commit is contained in:
parent
9bd3d246b8
commit
13861df68d
|
@ -66,7 +66,7 @@ performed in the background, while the command is executed in the main thread.
|
|||
|
||||
**VSIM: return elements by vector similarity**
|
||||
|
||||
VSIM key [ELE|FP32|VALUES] <vector or element> [WITHSCORES] [WITHATTRIBS] [COUNT num] [EF search-exploration-factor] [FILTER expression] [FILTER-EF max-filtering-effort] [TRUTH] [NOTHREAD]
|
||||
VSIM key [ELE|FP32|VALUES] <vector or element> [WITHSCORES] [WITHATTRIBS] [COUNT num] [EPSILON delta] [EF search-exploration-factor] [FILTER expression] [FILTER-EF max-filtering-effort] [TRUTH] [NOTHREAD]
|
||||
|
||||
The command returns similar vectors. For simplicity (and to avoid verbosity), in the following example, instead of providing a vector using FP32 or VALUES (like in `VADD`), we will ask for elements having a vector similar to a given element already in the sorted set:
|
||||
|
||||
|
@ -92,6 +92,8 @@ It is possible to specify a `COUNT` and also to get the similarity score (from 1
|
|||
5) "pear"
|
||||
6) "0.8226882219314575"
|
||||
|
||||
It is also possible to specify an `EPSILON`, that is, a floating point number between 0 and 1, in order to only return elements whose distance from the query vector is no greater than the specified value. In vector sets, the returned elements have a similarity score (when compared to the query vector) that is between 1 and 0, where 1 means identical and 0 means opposite vectors. If, for instance, the `EPSILON` option is specified with an argument of 0.2, we will get only elements that have a similarity of 0.8 or better (a distance <= 0.2). This is useful when a large `COUNT` is specified, yet we don't want elements that are too far away from our query vector.
|
||||
|
||||
The `EF` argument is the exploration factor: the higher it is, the slower the command becomes, but the better the index is explored to find nodes that are near to our query. Sensible values are from 50 to 1000.
|
||||
|
||||
The `TRUTH` option forces the command to perform a linear scan of all the entries inside the set, without using the graph search inside the HNSW, so it returns the best matching elements (the perfect result set) that can be used in order to easily calculate the recall. Of course the linear scan is `O(N)`, so it is much slower than the `log(N)` (considering a small `COUNT`) provided by the HNSW index.
|
||||
|
|
|
@ -0,0 +1,77 @@
|
|||
from test import TestCase
|
||||
|
||||
class EpsilonOption(TestCase):
    """Exercise the EPSILON option of VSIM against a tiny 2D vector set.

    Five elements are added with VADD, then the set is queried with the
    vector (1, 1) using several EPSILON values, checking which elements
    come back and that their scores respect the requested threshold.

    Relies on self.redis and self.test_key, which are not defined in this
    class (presumably provided by the TestCase base class).
    """

    def getname(self):
        """Return the human readable name of this test."""
        return "VSIM EPSILON option filtering"

    def estimated_runtime(self):
        """Return the estimated runtime of this test (presumably seconds)."""
        return 0.1

    def _vsim(self, *options):
        """Run VSIM ... WITHSCORES for the query vector (1, 1).

        Extra command arguments (e.g. 'EPSILON', '0.2') are appended right
        after WITHSCORES. The reply is a flat list
        [elem1, score1, elem2, score2, ...]; it is returned split into
        (elements, scores), with elements decoded to str and scores
        converted to float.
        """
        reply = self.redis.execute_command(
            'VSIM', self.test_key, 'VALUES', '2', '1', '1', 'WITHSCORES',
            *options)
        elements = [item.decode() for item in reply[0::2]]
        scores = [float(item) for item in reply[1::2]]
        return elements, scores

    def test(self):
        """Populate the set, then verify the VSIM output for each EPSILON."""
        # (name, x, y) of every vector added to the set.
        vectors = [
            ('a', '1', '1'),    # normalized to (0.707, 0.707)
            ('b', '0', '1'),    # normalized to (0, 1)
            ('c', '0', '0'),    # zero vector, might be handled specially
            ('d', '0', '-1'),   # normalized to (0, -1)
            ('e', '-1', '-1'),  # normalized to (-0.707, -0.707)
        ]
        for name, x, y in vectors:
            added = self.redis.execute_command(
                'VADD', self.test_key, 'VALUES', '2', x, y, name)
            assert added == 1, f"VADD should return 1 for item '{name}'"

        # Without EPSILON every item must be returned, best match first.
        elements_all, scores_all = self._vsim()
        assert len(elements_all) == 5, f"Should return 5 elements without EPSILON, got {len(elements_all)}"
        assert elements_all[0] == 'a', "First element should be 'a' (most similar)"
        assert scores_all[0] == 1.0, "Score for 'a' should be 1.0 (identical)"

        # EPSILON 0.5: only elements with similarity >= 0.5 (distance <= 0.5).
        # The bound is inclusive: 'c' is expected in the result set.
        elements_epsilon_0_5, scores_epsilon_0_5 = self._vsim('EPSILON', '0.5')
        assert len(elements_epsilon_0_5) == 3, f"With EPSILON 0.5, should return 3 elements, got {len(elements_epsilon_0_5)}"
        assert set(elements_epsilon_0_5) == {'a', 'b', 'c'}, f"With EPSILON 0.5, should get a, b, c, got {elements_epsilon_0_5}"
        for element, score in zip(elements_epsilon_0_5, scores_epsilon_0_5):
            assert score >= 0.5, f"Element {element} has score {score} which is < 0.5"

        # EPSILON 0.2: only elements with similarity >= 0.8 (distance <= 0.2).
        elements_epsilon_0_2, scores_epsilon_0_2 = self._vsim('EPSILON', '0.2')
        assert len(elements_epsilon_0_2) == 2, f"With EPSILON 0.2, should return 2 elements, got {len(elements_epsilon_0_2)}"
        assert set(elements_epsilon_0_2) == {'a', 'b'}, f"With EPSILON 0.2, should get a, b, got {elements_epsilon_0_2}"
        for element, score in zip(elements_epsilon_0_2, scores_epsilon_0_2):
            assert score >= 0.8, f"Element {element} has score {score} which is < 0.8"

        # A very small EPSILON must return only the exact match.
        elements_epsilon_small, _ = self._vsim('EPSILON', '0.001')
        assert len(elements_epsilon_small) == 1, f"With EPSILON 0.001, should return only 1 element, got {len(elements_epsilon_small)}"
        assert elements_epsilon_small[0] == 'a', "With very small EPSILON, should only get 'a'"

        # EPSILON 1.0 must return everything, since all the similarities
        # are between 0 and 1.
        elements_epsilon_1, _ = self._vsim('EPSILON', '1.0')
        assert len(elements_epsilon_1) == 5, f"With EPSILON 1.0, should return all 5 elements, got {len(elements_epsilon_1)}"
|
|
@ -852,7 +852,7 @@ void VSIM_execute(RedisModuleCtx *ctx, struct vsetObject *vset,
|
|||
|
||||
long long arraylen = 0;
|
||||
for (unsigned int i = 0; i < found && i < count; i++) {
|
||||
if (distances[i] > epsilon) break;
|
||||
if (distances[i]/2 > epsilon) break;
|
||||
struct vsetNodeVal *nv = neighbors[i]->value;
|
||||
RedisModule_ReplyWithString(ctx, nv->item);
|
||||
arraylen++;
|
||||
|
|
Loading…
Reference in New Issue