chore: 添加虚拟环境到仓库
- 添加 backend_service/venv 虚拟环境 - 包含所有Python依赖包 - 注意:虚拟环境约393MB,包含12655个文件
This commit is contained in:
@@ -0,0 +1,3 @@
|
||||
# This folder holds basic sanity checks for the distributed version of chromadb
|
||||
# while it is in development. In the future, it may hold more extensive tests
|
||||
# in tandem with the main test suite, targeted at the distributed version.
|
||||
@@ -0,0 +1,54 @@
|
||||
# Add up to 200k records until the log-is-full message is seen.
|
||||
|
||||
import grpc
|
||||
import math
|
||||
import random
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
from chromadb.api import ClientAPI
|
||||
from chromadb.proto.logservice_pb2 import SealLogRequest, MigrateLogRequest
|
||||
from chromadb.proto.logservice_pb2_grpc import LogServiceStub
|
||||
from chromadb.test.conftest import (
|
||||
reset,
|
||||
skip_if_not_cluster,
|
||||
)
|
||||
from chromadb.test.property import invariants
|
||||
from chromadb.test.utils.wait_for_version_increase import wait_for_version_increase
|
||||
|
||||
RECORDS = 2000000
|
||||
BATCH_SIZE = 100
|
||||
|
||||
@skip_if_not_cluster()
def test_log_backpressure(
    client: ClientAPI,
) -> None:
    """Write batches until the log service pushes back with a backoff error."""
    seed = time.time()
    random.seed(seed)
    print("Generating data with seed ", seed)
    reset(client)
    collection = client.create_collection(
        name="test",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )

    time.sleep(1)

    print("backpressuring for", collection.id)

    saw_backpressure = False
    # Keep adding batches of random 3-dimensional embeddings (components in
    # [0, 1)) until the "Backoff and retry" backpressure error surfaces, or
    # until RECORDS records have been written.
    for batch_start in range(0, RECORDS, BATCH_SIZE):
        batch_ids = [str(x) for x in range(batch_start, batch_start + BATCH_SIZE)]
        batch_embeddings = [
            np.random.rand(1, 3)[0] for _ in range(batch_start, batch_start + BATCH_SIZE)
        ]
        try:
            collection.add(ids=batch_ids, embeddings=batch_embeddings)
        except Exception as x:
            print(f"Caught exception:\n{x}")
            if "Backoff and retry" in str(x):
                saw_backpressure = True
                break
    assert saw_backpressure, "Expected an exception to be thrown."
|
||||
@@ -0,0 +1,73 @@
|
||||
# Add some records, wait for compaction, then roll back the log offset.
|
||||
# Poll the log for up to 30s to see if the offset gets repaired.
|
||||
|
||||
import grpc
|
||||
import random
|
||||
import time
|
||||
from typing import cast, List, Any, Dict
|
||||
|
||||
import numpy as np
|
||||
|
||||
from chromadb.api import ClientAPI
|
||||
from chromadb.proto.logservice_pb2 import InspectLogStateRequest, UpdateCollectionLogOffsetRequest
|
||||
from chromadb.proto.logservice_pb2_grpc import LogServiceStub
|
||||
from chromadb.test.conftest import (
|
||||
reset,
|
||||
skip_if_not_cluster,
|
||||
)
|
||||
from chromadb.test.utils.wait_for_version_increase import wait_for_version_increase
|
||||
|
||||
RECORDS = 1000
|
||||
BATCH_SIZE = 100
|
||||
|
||||
@skip_if_not_cluster()
def test_repair_collection_log_offset(
    client: ClientAPI,
) -> None:
    """Verify the log service repairs a rolled-back collection log offset.

    Adds records, waits for compaction to advance the log position to 1001,
    rolls the collection's log offset back to 1 via the log service, then
    polls for up to 240s to confirm the offset is repaired back to 1001.
    """
    seed = time.time()
    random.seed(seed)
    print("Generating data with seed ", seed)
    reset(client)

    # Talk to the log service directly over gRPC (port-forwarded in CI).
    channel = grpc.insecure_channel('localhost:50054')
    log_service_stub = LogServiceStub(channel)

    collection = client.create_collection(
        name="test_repair_collection_log_offset",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    print("collection_id =", collection.id)

    initial_version = cast(int, collection.get_model()["version"])

    # Add RECORDS records, where each embedding has 3 dimensions randomly
    # generated between 0 and 1.
    for i in range(0, RECORDS, BATCH_SIZE):
        ids = [str(x) for x in range(i, i + BATCH_SIZE)]
        embeddings = [np.random.rand(1, 3)[0] for _ in range(i, i + BATCH_SIZE)]
        collection.add(ids=ids, embeddings=embeddings)

    wait_for_version_increase(client, collection.name, initial_version)

    # Wait (up to 240s) for compaction to move the log position to 1001.
    found = False
    now = time.time()
    while time.time() - now < 240:
        request = InspectLogStateRequest(collection_id=str(collection.id))
        response = log_service_stub.InspectLogState(request, timeout=60)
        if 'LogPosition { offset: 1001 }' in response.debug:
            found = True
            break
        # Fix: sleep between polls; the original loop busy-spun here while
        # the second polling loop below correctly slept 1s per iteration.
        time.sleep(1)
    assert found

    # Roll the offset back to 1; the log service is expected to repair it.
    request = UpdateCollectionLogOffsetRequest(collection_id=str(collection.id), log_offset=1)
    response = log_service_stub.RollbackCollectionLogOffset(request, timeout=60)

    # Poll (up to 240s) until the offset is observed repaired to 1001.
    now = time.time()
    while time.time() - now < 240:
        request = InspectLogStateRequest(collection_id=str(collection.id))
        response = log_service_stub.InspectLogState(request, timeout=60)
        if 'LogPosition { offset: 1001 }' in response.debug:
            return
        time.sleep(1)
    raise RuntimeError("Test timed out without repair")
|
||||
@@ -0,0 +1,74 @@
|
||||
from typing import Sequence
|
||||
from chromadb.test.conftest import (
|
||||
reset,
|
||||
skip_if_not_cluster,
|
||||
)
|
||||
from chromadb.api import ClientAPI
|
||||
from kubernetes import client as k8s_client, config
|
||||
import time
|
||||
|
||||
|
||||
@skip_if_not_cluster()
def test_reroute(
    client: ClientAPI,
) -> None:
    """Delete every query-service pod and verify queries still work afterwards.

    Killing the pods forces the frontend to reroute requests to the
    replacement pods; the final query confirms the reroute succeeded.
    """
    reset(client)
    collection = client.create_collection(
        name="test",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )

    ids = [str(i) for i in range(10)]
    embeddings: list[Sequence[float]] = [
        [float(i), float(i), float(i)] for i in range(10)
    ]
    collection.add(ids=ids, embeddings=embeddings)
    collection.query(query_embeddings=[embeddings[0]])

    # Restart the query service using the k8s api, in order to trigger a
    # reroute of the query service.
    config.load_kube_config()
    v1 = k8s_client.CoreV1Api()
    # Find all pods with the label "app=query-service".
    res = v1.list_namespaced_pod("chroma", label_selector="app=query-service")
    assert len(res.items) > 0
    items = res.items
    seen_ids = set()

    # Restart all the pods by deleting them.
    for item in items:
        seen_ids.add(item.metadata.uid)
        v1.delete_namespaced_pod(item.metadata.name, item.metadata.namespace)

    # Wait until we have len(seen_ids) pods running with brand-new UIDs.
    timeout_secs = 10
    start_time = time.time()
    while True:
        res = v1.list_namespaced_pod("chroma", label_selector="app=query-service")
        items = res.items
        new_ids = {item.metadata.uid for item in items}
        if len(new_ids) == len(seen_ids) and not new_ids.intersection(seen_ids):
            break
        if time.time() - start_time > timeout_secs:
            # Raise instead of `assert False` so the check survives `-O`.
            raise AssertionError("Timed out waiting for new pods to start")
        time.sleep(1)

    # Wait for the query service to be ready, or time out.
    # Bug fix: restart the clock for this phase. The original reused
    # start_time from the previous wait loop, so this loop's 10s budget was
    # already (mostly) spent and it could time out almost immediately.
    start_time = time.time()
    while True:
        res = v1.list_namespaced_pod("chroma", label_selector="app=query-service")
        items = res.items
        if all(item.status.phase == "Running" for item in items):
            break
        if time.time() - start_time > timeout_secs:
            raise AssertionError("Timed out waiting for new pods to be ready")
        time.sleep(1)

    # Small grace period, then prove queries route to the new pods.
    time.sleep(1)
    collection.query(query_embeddings=[embeddings[0]])
|
||||
@@ -0,0 +1,102 @@
|
||||
# This tests a very minimal version of test_add in test_add.py as an
# example-based test instead of a property-based test. We can use the delta
# to get the property test working and then enable it.
|
||||
import random
|
||||
import time
|
||||
from chromadb.api import ClientAPI
|
||||
from chromadb.test.conftest import (
|
||||
reset,
|
||||
skip_if_not_cluster,
|
||||
)
|
||||
from chromadb.test.property import invariants
|
||||
from chromadb.test.utils.wait_for_version_increase import (
|
||||
wait_for_version_increase,
|
||||
get_collection_version,
|
||||
)
|
||||
import numpy as np
|
||||
|
||||
|
||||
@skip_if_not_cluster()
def test_add(
    client: ClientAPI,
) -> None:
    """Insert 1000 records one at a time, then check ANN accuracy."""
    seed = time.time()
    random.seed(seed)
    print("Generating data with seed ", seed)
    reset(client)
    collection = client.create_collection(
        name="test",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )

    # 1000 single-record add() calls; each embedding is a random
    # 3-dimensional vector with components in [0, 1).
    ids = []
    embeddings = []
    for record_index in range(1000):
        record_id = str(record_index)
        embedding = np.random.rand(1, 3)[0]
        ids.append(record_id)
        embeddings.append(embedding)
        collection.add(
            ids=[record_id],
            embeddings=[embedding],
        )

    random_query = np.random.rand(1, 3)[0]
    print("Generated data with seed ", seed)

    # Compare ANN results for a random query against brute-force ground truth.
    invariants.ann_accuracy(
        collection,
        {
            "ids": ids,
            "embeddings": embeddings,
            "metadatas": None,
            "documents": None,
        },
        10,
        query_embeddings=[random_query],
    )
|
||||
|
||||
|
||||
@skip_if_not_cluster()
def test_add_include_all_with_compaction_delay(client: ClientAPI) -> None:
    """Insert records with documents, wait for compaction, then check ANN
    accuracy with documents included in the ground truth."""
    seed = time.time()
    random.seed(seed)
    print("Generating data with seed ", seed)
    reset(client)
    collection = client.create_collection(
        name="test_add_include_all_with_compaction_delay",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    initial_version = get_collection_version(client, collection.name)

    # 1000 single-record add() calls; each record carries a random 3-d
    # embedding and a synthetic document.
    ids = []
    embeddings = []
    documents = []
    for record_index in range(1000):
        record_id = str(record_index)
        embedding = np.random.rand(1, 3)[0]
        document = f"document_{record_index}"
        ids.append(record_id)
        embeddings.append(embedding)
        documents.append(document)
        collection.add(
            ids=[record_id],
            embeddings=[embedding],
            documents=[document],
        )

    # Block until the collection version bumps (compaction ran), max 120s.
    wait_for_version_increase(client, collection.name, initial_version, 120)

    random_query_1 = np.random.rand(1, 3)[0]
    random_query_2 = np.random.rand(1, 3)[0]
    print("Generated data with seed ", seed)

    # Query the collection with two random queries.
    invariants.ann_accuracy(
        collection,
        {
            "ids": ids,
            "embeddings": embeddings,
            "metadatas": None,
            "documents": documents,
        },
        10,
        query_embeddings=[random_query_1, random_query_2],
    )
|
||||
@@ -0,0 +1,205 @@
|
||||
"""
|
||||
Integration test for Chroma's Task API
|
||||
|
||||
Tests the task creation, execution, and removal functionality
|
||||
for automatically processing collections.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from chromadb.api.client import Client as ClientCreator
|
||||
from chromadb.config import System
|
||||
from chromadb.errors import ChromaError, NotFoundError
|
||||
|
||||
|
||||
def test_function_attach_and_detach(basic_http_client: System) -> None:
    """Test creating and removing a function with the record_counter operator"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()

    # Collection the attached function will observe.
    collection = client.get_or_create_collection(
        name="my_document",
        metadata={"description": "Sample documents for task processing"},
    )

    # Seed it with a few documents.
    collection.add(
        ids=["doc1", "doc2", "doc3"],
        documents=[
            "The quick brown fox jumps over the lazy dog",
            "Machine learning is a subset of artificial intelligence",
            "Python is a popular programming language",
        ],
        metadatas=[{"source": "proverb"}, {"source": "tech"}, {"source": "tech"}],
    )
    assert collection.count() == 3

    # Attach the built-in record_counter operator to the collection.
    attached_fn = collection.attach_function(
        name="count_my_docs",
        function_id="record_counter",
        output_collection="my_documents_counts",
        params=None,
    )
    assert attached_fn is not None

    # Writes after attachment should still work normally.
    collection.add(
        ids=["doc4", "doc5"],
        documents=[
            "Chroma is a vector database",
            "Tasks automate data processing",
        ],
    )
    assert collection.count() == 5

    # Detach the function, deleting its output collection as well.
    success = attached_fn.detach(
        delete_output_collection=True,
    )
    assert success is True
|
||||
|
||||
|
||||
def test_task_with_invalid_function(basic_http_client: System) -> None:
    """Test that creating a task with an invalid function raises an error"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()

    collection = client.get_or_create_collection(name="test_invalid_function")
    collection.add(ids=["id1"], documents=["test document"])

    # An unknown function id must be rejected by the server with ChromaError.
    with pytest.raises(ChromaError, match="function not found"):
        collection.attach_function(
            name="invalid_task",
            function_id="nonexistent_function",
            output_collection="output_collection",
            params=None,
        )
|
||||
|
||||
|
||||
def test_function_multiple_collections(basic_http_client: System) -> None:
    """Test attaching functions on multiple collections"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()

    # For each spec: create a collection, add two docs, attach a counter.
    specs = [
        ("collection_1", ["id1", "id2"], ["doc1", "doc2"], "task_1", "output_1"),
        ("collection_2", ["id3", "id4"], ["doc3", "doc4"], "task_2", "output_2"),
    ]
    attached = []
    for coll_name, doc_ids, docs, task_name, output_name in specs:
        coll = client.create_collection(name=coll_name)
        coll.add(ids=doc_ids, documents=docs)
        fn = coll.attach_function(
            name=task_name,
            function_id="record_counter",
            output_collection=output_name,
            params=None,
        )
        assert fn is not None
        attached.append(fn)

    # Each attachment gets its own id.
    assert attached[0].id != attached[1].id

    # Clean up
    for fn in attached:
        assert fn.detach(delete_output_collection=True) is True
|
||||
|
||||
|
||||
def test_functions_multiple_attached_functions(basic_http_client: System) -> None:
    """Test attaching multiple functions on the same collection"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()

    # A single collection that will host several attached functions.
    collection = client.create_collection(name="multi_task_collection")
    collection.add(ids=["id1", "id2", "id3"], documents=["doc1", "doc2", "doc3"])

    # Attach three record_counter functions, each under a distinct name.
    attached = []
    for task_name, output_name in [
        ("task_1", "output_1"),
        ("task_2", "output_2"),
        ("task_3", "output_3"),
    ]:
        fn = collection.attach_function(
            name=task_name,
            function_id="record_counter",
            output_collection=output_name,
            params=None,
        )
        assert fn is not None
        attached.append(fn)

    # All three attachments get distinct ids even on the same collection.
    assert len({fn.id for fn in attached}) == 3

    # A duplicate function name on the same collection must be rejected.
    with pytest.raises(ChromaError, match="already exists"):
        collection.attach_function(
            name="task_1",  # Duplicate name
            function_id="record_counter",
            output_collection="output_duplicate",
            params=None,
        )

    # Clean up - remove each attachment individually.
    for fn in attached:
        assert fn.detach(delete_output_collection=True) is True
|
||||
|
||||
|
||||
def test_function_remove_nonexistent(basic_http_client: System) -> None:
    """Test removing a task that doesn't exist raises NotFoundError"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()

    collection = client.create_collection(name="test_collection")
    collection.add(ids=["id1"], documents=["test"])
    attached_fn = collection.attach_function(
        name="test_function",
        function_id="record_counter",
        output_collection="output_collection",
        params=None,
    )

    # First detach succeeds and removes the attachment server-side.
    attached_fn.detach(delete_output_collection=True)

    # A second detach now targets a missing attachment and must fail.
    with pytest.raises(NotFoundError, match="does not exist"):
        attached_fn.detach(delete_output_collection=True)
|
||||
Reference in New Issue
Block a user