chore: 添加虚拟环境到仓库

- 添加 backend_service/venv 虚拟环境
- 包含所有Python依赖包
- 注意:虚拟环境约393MB,包含12655个文件
This commit is contained in:
2025-12-03 10:19:25 +08:00
parent a6c2027caa
commit c4f851d387
12655 changed files with 3009376 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
# This folder holds basic sanity checks for the distributed version of chromadb
# while it is in development. In the future, it may hold more extensive tests
# in tandem with the main test suite, targeted at the distributed version.

View File

@@ -0,0 +1,54 @@
# Add up to 200k records until the log-is-full message is seen.
import grpc
import math
import random
import time
import numpy as np
from chromadb.api import ClientAPI
from chromadb.proto.logservice_pb2 import SealLogRequest, MigrateLogRequest
from chromadb.proto.logservice_pb2_grpc import LogServiceStub
from chromadb.test.conftest import (
reset,
skip_if_not_cluster,
)
from chromadb.test.property import invariants
from chromadb.test.utils.wait_for_version_increase import wait_for_version_increase
RECORDS = 2000000
BATCH_SIZE = 100
@skip_if_not_cluster()
def test_log_backpressure(
    client: ClientAPI,
) -> None:
    """Insert batches until the log service pushes back with a backoff error.

    Keeps adding up to RECORDS records in BATCH_SIZE chunks and asserts that a
    'Backoff and retry' exception is eventually raised by collection.add.
    """
    seed = time.time()
    random.seed(seed)
    print("Generating data with seed ", seed)
    reset(client)
    collection = client.create_collection(
        name="test",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    time.sleep(1)
    print('backpressuring for', collection.id)
    excepted = False
    # Each embedding is 3 floats drawn uniformly from [0, 1).
    batch_start = 0
    while batch_start < RECORDS and not excepted:
        batch_ids = [str(idx) for idx in range(batch_start, batch_start + BATCH_SIZE)]
        batch_embeddings = [
            np.random.rand(1, 3)[0] for _ in range(batch_start, batch_start + BATCH_SIZE)
        ]
        try:
            collection.add(ids=batch_ids, embeddings=batch_embeddings)
        except Exception as err:
            print(f"Caught exception:\n{err}")
            if 'Backoff and retry' in str(err):
                excepted = True
        batch_start += BATCH_SIZE
    assert excepted, "Expected an exception to be thrown."

View File

@@ -0,0 +1,73 @@
# Add some records, wait for compaction, then roll back the log offset.
# Poll the log for up to 30s to see if the offset gets repaired.
import grpc
import random
import time
from typing import cast, List, Any, Dict
import numpy as np
from chromadb.api import ClientAPI
from chromadb.proto.logservice_pb2 import InspectLogStateRequest, UpdateCollectionLogOffsetRequest
from chromadb.proto.logservice_pb2_grpc import LogServiceStub
from chromadb.test.conftest import (
reset,
skip_if_not_cluster,
)
from chromadb.test.utils.wait_for_version_increase import wait_for_version_increase
RECORDS = 1000
BATCH_SIZE = 100
@skip_if_not_cluster()
def test_repair_collection_log_offset(
    client: ClientAPI,
) -> None:
    """Verify the log service repairs a rolled-back collection log offset.

    Adds RECORDS records, waits for compaction to advance the log offset to
    1001, rolls the offset back to 1 via the log service, then polls until the
    offset is restored to 1001 (or times out after 240s).
    """
    seed = time.time()
    random.seed(seed)
    print("Generating data with seed ", seed)
    reset(client)
    channel = grpc.insecure_channel('localhost:50054')
    log_service_stub = LogServiceStub(channel)
    collection = client.create_collection(
        name="test_repair_collection_log_offset",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    print("collection_id =", collection.id)
    initial_version = cast(int, collection.get_model()["version"])
    # Add RECORDS records, where each embedding has 3 dimensions randomly generated between 0 and 1
    for i in range(0, RECORDS, BATCH_SIZE):
        ids = [str(x) for x in range(i, i + BATCH_SIZE)]
        embeddings = [np.random.rand(1, 3)[0] for x in range(i, i + BATCH_SIZE)]
        collection.add(ids=ids, embeddings=embeddings)
    wait_for_version_increase(client, collection.name, initial_version)
    # Poll until compaction has advanced the offset past the inserted records.
    found = False
    now = time.time()
    while time.time() - now < 240:
        request = InspectLogStateRequest(collection_id=str(collection.id))
        response = log_service_stub.InspectLogState(request, timeout=60)
        if '''LogPosition { offset: 1001 }''' in response.debug:
            found = True
            break
        # BUG FIX: previously this loop had no sleep and hammered the log
        # service with back-to-back InspectLogState RPCs; the second poll loop
        # below already sleeps between attempts.
        time.sleep(1)
    assert found, "Compaction never advanced the log offset to 1001"
    # Roll the offset back and wait for the log service to repair it.
    request = UpdateCollectionLogOffsetRequest(collection_id=str(collection.id), log_offset=1)
    response = log_service_stub.RollbackCollectionLogOffset(request, timeout=60)
    now = time.time()
    while time.time() - now < 240:
        request = InspectLogStateRequest(collection_id=str(collection.id))
        response = log_service_stub.InspectLogState(request, timeout=60)
        if '''LogPosition { offset: 1001 }''' in response.debug:
            return
        time.sleep(1)
    raise RuntimeError("Test timed out without repair")

View File

@@ -0,0 +1,74 @@
from typing import Sequence
from chromadb.test.conftest import (
reset,
skip_if_not_cluster,
)
from chromadb.api import ClientAPI
from kubernetes import client as k8s_client, config
import time
@skip_if_not_cluster()
def test_reroute(
    client: ClientAPI,
) -> None:
    """Restart every query-service pod and verify queries still succeed.

    Seeds a collection, deletes all pods labeled app=query-service in the
    'chroma' namespace to force a reroute, waits for replacement pods (new
    UIDs) to come up and reach Running, then queries again.
    """
    reset(client)
    collection = client.create_collection(
        name="test",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    ids = [str(i) for i in range(10)]
    embeddings: list[Sequence[float]] = [
        [float(i), float(i), float(i)] for i in range(10)
    ]
    collection.add(ids=ids, embeddings=embeddings)
    collection.query(query_embeddings=[embeddings[0]])
    # Restart the query service using k8s api, in order to trigger a reroute
    # of the query service
    config.load_kube_config()
    v1 = k8s_client.CoreV1Api()
    # Find all pods with the label "app=query-service"
    res = v1.list_namespaced_pod("chroma", label_selector="app=query-service")
    assert len(res.items) > 0
    seen_ids = set()
    # Restart all the pods by deleting them; remember their UIDs so we can
    # tell replacements apart from the originals.
    for item in res.items:
        seen_ids.add(item.metadata.uid)
        v1.delete_namespaced_pod(item.metadata.name, item.metadata.namespace)
    # Wait until we have len(seen_ids) pods running with new UIDs
    timeout_secs = 10
    start_time = time.time()
    while True:
        res = v1.list_namespaced_pod("chroma", label_selector="app=query-service")
        new_ids = {item.metadata.uid for item in res.items}
        if len(new_ids) == len(seen_ids) and not new_ids.intersection(seen_ids):
            break
        if time.time() - start_time > timeout_secs:
            assert False, "Timed out waiting for new pods to start"
        time.sleep(1)
    # Wait for the query service to be ready, or timeout.
    # BUG FIX: give this wait its own timeout budget; previously it reused
    # start_time from the loop above, so its budget could already be exhausted
    # and the test would time out spuriously.
    start_time = time.time()
    while True:
        res = v1.list_namespaced_pod("chroma", label_selector="app=query-service")
        if all(item.status.phase == "Running" for item in res.items):
            break
        if time.time() - start_time > timeout_secs:
            assert False, "Timed out waiting for new pods to be ready"
        time.sleep(1)
    time.sleep(1)
    collection.query(query_embeddings=[embeddings[0]])

View File

@@ -0,0 +1,102 @@
# This tests a very minimal of test_add in test_add.py as a example based test
# instead of a property based test. We can use the delta to get the property
# test working and then enable
import random
import time
from chromadb.api import ClientAPI
from chromadb.test.conftest import (
reset,
skip_if_not_cluster,
)
from chromadb.test.property import invariants
from chromadb.test.utils.wait_for_version_increase import (
wait_for_version_increase,
get_collection_version,
)
import numpy as np
@skip_if_not_cluster()
def test_add(
    client: ClientAPI,
) -> None:
    """Minimal example-based counterpart of the property-based test_add.

    Inserts 1000 single-record batches of 3-dim random embeddings and checks
    ANN accuracy for one random query.
    """
    seed = time.time()
    random.seed(seed)
    print("Generating data with seed ", seed)
    reset(client)
    collection = client.create_collection(
        name="test",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    # Insert 1000 records one at a time; each embedding is 3 floats in [0, 1).
    ids = []
    embeddings = []
    for record_index in range(1000):
        record_id = str(record_index)
        embedding = np.random.rand(1, 3)[0]
        ids.append(record_id)
        embeddings.append(embedding)
        collection.add(
            ids=[record_id],
            embeddings=[embedding],
        )
    random_query = np.random.rand(1, 3)[0]
    print("Generated data with seed ", seed)
    invariants.ann_accuracy(
        collection,
        {
            "ids": ids,
            "embeddings": embeddings,
            "metadatas": None,
            "documents": None,
        },
        10,
        query_embeddings=[random_query],
    )
@skip_if_not_cluster()
def test_add_include_all_with_compaction_delay(client: ClientAPI) -> None:
    """Insert records with documents, wait for compaction, then check ANN
    accuracy for two random queries."""
    seed = time.time()
    random.seed(seed)
    print("Generating data with seed ", seed)
    reset(client)
    collection = client.create_collection(
        name="test_add_include_all_with_compaction_delay",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    initial_version = get_collection_version(client, collection.name)
    # Insert 1000 records one at a time, each with a 3-dim embedding and a doc.
    ids = []
    embeddings = []
    documents = []
    for record_index in range(1000):
        record_id = str(record_index)
        embedding = np.random.rand(1, 3)[0]
        document = f"document_{record_index}"
        ids.append(record_id)
        embeddings.append(embedding)
        documents.append(document)
        collection.add(
            ids=[record_id],
            embeddings=[embedding],
            documents=[document],
        )
    # Block until compaction has produced a new collection version.
    wait_for_version_increase(client, collection.name, initial_version, 120)
    random_query_1 = np.random.rand(1, 3)[0]
    random_query_2 = np.random.rand(1, 3)[0]
    print("Generated data with seed ", seed)
    # Query the collection with a random query
    invariants.ann_accuracy(
        collection,
        {
            "ids": ids,
            "embeddings": embeddings,
            "metadatas": None,
            "documents": documents,
        },
        10,
        query_embeddings=[random_query_1, random_query_2],
    )

View File

@@ -0,0 +1,205 @@
"""
Integration test for Chroma's Task API
Tests the task creation, execution, and removal functionality
for automatically processing collections.
"""
import pytest
from chromadb.api.client import Client as ClientCreator
from chromadb.config import System
from chromadb.errors import ChromaError, NotFoundError
def test_function_attach_and_detach(basic_http_client: System) -> None:
    """Test creating and removing a function with the record_counter operator"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()
    # Seed a collection with three sample documents.
    collection = client.get_or_create_collection(
        name="my_document",
        metadata={"description": "Sample documents for task processing"},
    )
    sample_docs = [
        "The quick brown fox jumps over the lazy dog",
        "Machine learning is a subset of artificial intelligence",
        "Python is a popular programming language",
    ]
    collection.add(
        ids=["doc1", "doc2", "doc3"],
        documents=sample_docs,
        metadatas=[{"source": "proverb"}, {"source": "tech"}, {"source": "tech"}],
    )
    assert collection.count() == 3
    # Attach the built-in record-counting operator to the collection.
    attached_fn = collection.attach_function(
        name="count_my_docs",
        function_id="record_counter",  # Built-in operator that counts records
        output_collection="my_documents_counts",
        params=None,
    )
    assert attached_fn is not None
    # Add two more documents while the function is attached.
    collection.add(
        ids=["doc4", "doc5"],
        documents=[
            "Chroma is a vector database",
            "Tasks automate data processing",
        ],
    )
    assert collection.count() == 5
    # Detach the function, dropping its output collection as well.
    success = attached_fn.detach(
        delete_output_collection=True,
    )
    assert success is True
def test_task_with_invalid_function(basic_http_client: System) -> None:
    """Test that creating a task with an invalid function raises an error"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()
    collection = client.get_or_create_collection(name="test_invalid_function")
    collection.add(ids=["id1"], documents=["test document"])
    # Referencing a function id that does not exist must raise ChromaError.
    with pytest.raises(ChromaError, match="function not found"):
        collection.attach_function(
            name="invalid_task",
            function_id="nonexistent_function",
            output_collection="output_collection",
            params=None,
        )
def test_function_multiple_collections(basic_http_client: System) -> None:
    """Test attaching functions on multiple collections"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()
    # (collection name, record ids, documents, task name, output collection)
    specs = [
        ("collection_1", ["id1", "id2"], ["doc1", "doc2"], "task_1", "output_1"),
        ("collection_2", ["id3", "id4"], ["doc3", "doc4"], "task_2", "output_2"),
    ]
    attached = []
    for coll_name, record_ids, docs, task_name, output_name in specs:
        coll = client.create_collection(name=coll_name)
        coll.add(ids=record_ids, documents=docs)
        fn = coll.attach_function(
            name=task_name,
            function_id="record_counter",
            output_collection=output_name,
            params=None,
        )
        assert fn is not None
        attached.append(fn)
    # Each attachment gets its own distinct id.
    assert attached[0].id != attached[1].id
    # Clean up both attachments.
    for fn in attached:
        assert fn.detach(delete_output_collection=True) is True
def test_functions_multiple_attached_functions(basic_http_client: System) -> None:
    """Test attaching multiple functions on the same collection"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()
    # One collection hosting several attached functions.
    collection = client.create_collection(name="multi_task_collection")
    collection.add(ids=["id1", "id2", "id3"], documents=["doc1", "doc2", "doc3"])
    # Attach three functions with distinct names and output collections.
    attached = []
    for task_name, output_name in (
        ("task_1", "output_1"),
        ("task_2", "output_2"),
        ("task_3", "output_3"),
    ):
        fn = collection.attach_function(
            name=task_name,
            function_id="record_counter",
            output_collection=output_name,
            params=None,
        )
        assert fn is not None
        attached.append(fn)
    # Ids are pairwise distinct even though all live on the same collection.
    assert attached[0].id != attached[1].id
    assert attached[2].id != attached[0].id
    assert attached[2].id != attached[1].id
    # Reusing an existing name on the same collection must fail.
    with pytest.raises(ChromaError, match="already exists"):
        collection.attach_function(
            name="task_1",  # Duplicate name
            function_id="record_counter",
            output_collection="output_duplicate",
            params=None,
        )
    # Clean up - remove each attached function individually.
    for fn in attached:
        assert fn.detach(delete_output_collection=True) is True
def test_function_remove_nonexistent(basic_http_client: System) -> None:
    """Test removing a task that doesn't exist raises NotFoundError"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()
    collection = client.create_collection(name="test_collection")
    collection.add(ids=["id1"], documents=["test"])
    attached_fn = collection.attach_function(
        name="test_function",
        function_id="record_counter",
        output_collection="output_collection",
        params=None,
    )
    # First detach succeeds and removes the attachment.
    attached_fn.detach(delete_output_collection=True)
    # A second detach of the same handle must raise NotFoundError.
    with pytest.raises(NotFoundError, match="does not exist"):
        attached_fn.detach(delete_output_collection=True)