chore: 添加虚拟环境到仓库

- 添加 backend_service/venv 虚拟环境
- 包含所有Python依赖包
- 注意:虚拟环境约393MB,包含12655个文件
This commit is contained in:
2025-12-03 10:19:25 +08:00
parent a6c2027caa
commit c4f851d387
12655 changed files with 3009376 additions and 0 deletions

View File

@@ -0,0 +1,73 @@
from concurrent.futures import ThreadPoolExecutor
import uuid
from chromadb.api import ClientAPI
from chromadb.errors import ChromaError, UniqueConstraintError
def test_duplicate_collection_create(
    client: ClientAPI,
) -> None:
    """Creating a collection under a name that is already taken must fail.

    The test calls ``client.reset()``, creates a collection named ``"test"``
    and then tries to create it a second time with the same HNSW metadata.
    The second call has to raise either a ``UniqueConstraintError`` or an
    exception whose text contains ``"already exists"``.
    """
    client.reset()
    client.create_collection(
        name="test",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    try:
        client.create_collection(
            name="test",
            metadata={
                "hnsw:construction_ef": 128,
                "hnsw:search_ef": 128,
                "hnsw:M": 128,
            },
        )
    except Exception as e:
        print("Collection creation failed as expected with error ", e)
        # Check the type first and fall back to the rendered message, so an
        # exception with empty or non-string ``args`` cannot break the check.
        assert isinstance(e, UniqueConstraintError) or "already exists" in str(e)
    else:
        # Raised outside the ``try`` so the broad ``except`` above cannot
        # swallow it and hide the real reason for the failure.
        raise AssertionError("Expected exception")
def test_not_existing_collection_delete(
    client: ClientAPI,
) -> None:
    """Deleting a collection name that was never created must fail.

    The raised exception is expected to mention ``"does not exist"``.
    """
    try:
        client.delete_collection(
            name="test101",
        )
    except Exception as e:
        print("Collection deletion failed as expected with error ", e)
        # ``str(e)`` instead of ``e.args[0]``: works for exceptions with
        # empty or non-string ``args`` as well.
        assert "does not exist" in str(e)
    else:
        # Raised outside the ``try`` so the broad ``except`` above cannot
        # swallow it and hide the real reason for the failure.
        raise AssertionError("Expected exception")
def test_multithreaded_get_or_create(client: ClientAPI) -> None:
    """Hammer get_or_create/delete on one collection name from many threads.

    Every worker calls ``get_or_create_collection`` on the same random name;
    workers with an even index additionally try to delete it. ``ChromaError``s
    whose message mentions "concurrent" (on create) or "does not exist" (on
    delete) are tolerated, anything else fails the test.
    """
    N_THREADS = 50
    new_name = str(uuid.uuid4())

    def create_maybe_delete_collection(i: int) -> None:
        try:
            created = client.get_or_create_collection(new_name)
            assert created.name == new_name
        except ChromaError as create_err:
            if "concurrent" not in create_err.message():
                raise create_err

        # Odd-numbered workers stop here; only even ones attempt the delete.
        if i % 2 != 0:
            return
        try:
            client.delete_collection(new_name)
        except ChromaError as delete_err:
            if "does not exist" not in delete_err.message():
                raise delete_err

    # Stress to trigger a potential race condition
    with ThreadPoolExecutor(max_workers=N_THREADS) as pool:
        pending = [
            pool.submit(create_maybe_delete_collection, worker_index)
            for worker_index in range(N_THREADS)
        ]
        for task in pending:
            try:
                task.result()
            except Exception as e:
                assert False, f"Thread raised an exception: {e}"

View File

@@ -0,0 +1,82 @@
import pytest
from chromadb.api.client import AdminClient, Client
from chromadb.config import System
from chromadb.db.impl.sqlite import SqliteDB
from chromadb.errors import NotFoundError
from chromadb.test.conftest import ClientFactories
def test_deletes_database(client_factories: ClientFactories) -> None:
    """After a database is deleted, it and its collections are unreachable.

    Looking up the database, fetching its collection, and writing through a
    collection handle obtained before the deletion must all raise
    ``NotFoundError``.
    """
    bootstrap_client = client_factories.create_client()
    bootstrap_client.reset()

    admin = client_factories.create_admin_client_from_system()
    admin.create_database("test_delete_database")

    scoped_client = client_factories.create_client(database="test_delete_database")
    stale_collection = scoped_client.create_collection("foo")

    admin.delete_database("test_delete_database")

    with pytest.raises(NotFoundError):
        admin.get_database("test_delete_database")

    with pytest.raises(NotFoundError):
        scoped_client.get_collection("foo")

    # The handle created before the deletion must fail as well.
    with pytest.raises(NotFoundError):
        stale_collection.upsert(["foo"], [0.0, 0.0, 0.0])
def test_does_not_affect_other_databases(client_factories: ClientFactories) -> None:
    """Deleting one database leaves a same-named collection elsewhere intact.

    Two databases each get a collection called ``"test"``; after the first
    database is deleted, the second one's collection must still resolve to
    the same id while the first one's lookup raises ``NotFoundError``.
    """
    bootstrap_client = client_factories.create_client()
    bootstrap_client.reset()

    admin = client_factories.create_admin_client_from_system()
    for database_name in ("first", "second"):
        admin.create_database(database_name)

    first_client = client_factories.create_client(database="first")
    first_client.create_collection("test")

    second_client = client_factories.create_client(database="second")
    second_collection = second_client.create_collection("test")

    admin.delete_database("first")

    surviving = second_client.get_collection("test")
    assert surviving.id == second_collection.id

    with pytest.raises(NotFoundError):
        first_client.get_collection("test")
def test_collection_was_removed(sqlite_persistent: System) -> None:
    """Deleting a database also removes its rows from the collections table.

    Besides checking that the collection lookup raises ``NotFoundError``, the
    test queries the ``collections`` table through the ``SqliteDB`` instance
    and expects a row count of zero.
    """
    db = sqlite_persistent.instance(SqliteDB)

    admin = AdminClient.from_system(sqlite_persistent)
    admin.create_database("test_delete_database")

    scoped_client = Client.from_system(
        sqlite_persistent, database="test_delete_database"
    )
    scoped_client.create_collection("foo")

    admin.delete_database("test_delete_database")

    with pytest.raises(NotFoundError):
        scoped_client.get_collection("foo")

    # Check table
    with db.tx() as cursor:
        count_row = cursor.execute("SELECT COUNT(*) from collections").fetchone()
        remaining = count_row[0]
        assert remaining == 0
def test_errors_when_database_does_not_exist(client_factories: ClientFactories) -> None:
    """Deleting a database that was never created raises ``NotFoundError``."""
    bootstrap_client = client_factories.create_client()
    bootstrap_client.reset()

    admin = client_factories.create_admin_client_from_system()

    with pytest.raises(NotFoundError):
        admin.delete_database("foo")

View File

@@ -0,0 +1,8 @@
import pytest
from chromadb.errors import NotFoundError
from chromadb.test.conftest import ClientFactories
def test_get_database_not_found(client_factories: ClientFactories) -> None:
    """Creating a client for an unknown database raises ``NotFoundError``."""
    missing_database = "does_not_exist"
    with pytest.raises(NotFoundError):
        client_factories.create_client(database=missing_database)

View File

@@ -0,0 +1,17 @@
import numpy as np
from chromadb.api import ClientAPI
def test_invalid_update(client: ClientAPI) -> None:
    """An update for an unknown id must not affect a later add of that id.

    The update is issued before the record exists; after the subsequent add,
    reading the record back has to yield the embedding given to ``add``.
    """
    client.reset()
    coll = client.create_collection("test")

    # Update is invalid because ID does not exist
    coll.update(ids=["foo"], embeddings=[[0.0, 0.0, 0.0]])
    coll.add(ids=["foo"], embeddings=[[1.0, 1.0, 1.0]])

    fetched = coll.get(ids=["foo"], include=["embeddings"])

    # Embeddings should be the same as what was provided to .add()
    assert fetched["embeddings"] is not None
    expected = np.array([1.0, 1.0, 1.0])
    assert np.allclose(fetched["embeddings"][0], expected)

View File

@@ -0,0 +1,66 @@
import logging
import chromadb.test.property.strategies as strategies
import hypothesis.strategies as st
from chromadb.api import ClientAPI
from chromadb.test.conftest import NOT_CLUSTER_ONLY, reset
from chromadb.test.property import invariants
from chromadb.test.utils.wait_for_version_increase import wait_for_version_increase
from hypothesis import HealthCheck, given, settings
# Collection strategy wrapped in st.shared under the key "coll". It is passed
# both to recordset_st below and to the test's `collection` argument;
# presumably the sharing makes both see the same drawn collection within one
# example — confirm against the Hypothesis `shared` documentation.
collection_st = st.shared(
strategies.collections(add_filterable_data=True, with_hnsw_params=True),
key="coll",
)
# Record sets built from collection_st, capped with max_size=1000 and shared
# under the key "recordset".
recordset_st = st.shared(
strategies.recordsets(collection_st, max_size=1000), key="recordset"
)
@settings(
    deadline=90000,
    suppress_health_check=[
        HealthCheck.function_scoped_fixture,
        HealthCheck.large_base_example,
        HealthCheck.filter_too_much,
    ],
)  # type: ignore
@given(
    collection=collection_st,
    record_set=recordset_st,
    limit=st.integers(min_value=1, max_value=10),
    offset=st.integers(min_value=0, max_value=10),
    should_compact=st.booleans(),
)
def test_get_limit_offset(
    caplog,
    client: ClientAPI,
    collection: strategies.Collection,
    record_set: dict,
    limit: int,
    offset: int,
    should_compact: bool,
) -> None:
    """``get(offset, limit)`` must equal the matching slice of a full ``get()``.

    A generated record set is added to a fresh collection. When
    ``NOT_CLUSTER_ONLY`` is falsy, compaction was requested and more than ten
    ids were added, the test first waits for the collection version to
    increase. It then compares the paged ids against
    ``all_ids[offset : offset + limit]``.
    """
    caplog.set_level(logging.ERROR)
    reset(client)

    coll = client.create_collection(
        name=collection.name,
        metadata=collection.metadata,  # type: ignore
        embedding_function=collection.embedding_function,
    )
    initial_version = coll.get_model()["version"]
    coll.add(**record_set)

    # Only wait for compaction if the size of the collection is some minimal
    # size. The conditions short-circuit in the same order as the nested ifs.
    if (
        not NOT_CLUSTER_ONLY
        and should_compact
        and len(invariants.wrap(record_set["ids"])) > 10
    ):
        # Wait for the model to be updated
        wait_for_version_increase(client, collection.name, initial_version)

    paged_ids = coll.get(offset=offset, limit=limit)["ids"]
    every_id = coll.get()["ids"]
    window = slice(offset, offset + limit)
    assert paged_ids == every_id[window]

View File

@@ -0,0 +1,89 @@
from typing import Dict, List
from hypothesis import given
from chromadb.test.conftest import ClientFactories
import hypothesis.strategies as st
def test_list_databases(client_factories: ClientFactories) -> None:
    """``list_databases`` returns every created database plus the default one."""
    bootstrap_client = client_factories.create_client()
    bootstrap_client.reset()

    admin = client_factories.create_admin_client_from_system()
    created_names = [f"test_list_databases_{i}" for i in range(10)]
    for database_name in created_names:
        admin.create_database(database_name)

    databases = admin.list_databases()
    assert len(databases) == 11  # add 1 for the default_database

    listed_names = [entry["name"] for entry in databases]
    for database_name in created_names:
        assert database_name in listed_names
    assert "default_database" in listed_names
@st.composite
def tenants_and_databases_st(
    draw: st.DrawFn, max_tenants: int, max_databases: int
) -> Dict[str, List[str]]:
    """Generate a random assignment of databases to tenants.

    Draws between 1 and ``max_tenants`` tenants and between 0 and
    ``max_databases`` databases, then assigns each database to one randomly
    drawn tenant. Returns a dictionary mapping tenant name (``tenant_<i>``) to
    the list of its database names (``database_<i>``), in database-index
    order. Tenants that received no database do not appear in the result.
    """
    num_tenants = draw(st.integers(min_value=1, max_value=max_tenants))
    num_databases = draw(st.integers(min_value=0, max_value=max_databases))

    # One tenant index per database; the list position is the database index.
    tenant_index_st = st.integers(min_value=0, max_value=num_tenants - 1)
    assignments = draw(
        st.lists(tenant_index_st, min_size=num_databases, max_size=num_databases)
    )

    result: Dict[str, List[str]] = {}
    for database_i, tenant_i in enumerate(assignments):
        result.setdefault(f"tenant_{tenant_i}", []).append(f"database_{database_i}")
    return result
@given(
    limit=st.integers(min_value=1, max_value=10),
    offset=st.integers(min_value=0, max_value=10),
    tenants_and_databases=tenants_and_databases_st(max_tenants=10, max_databases=10),
)
def test_list_databases_with_limit_offset(
    limit: int,
    offset: int,
    tenants_and_databases: Dict[str, List[str]],
    client_factories: ClientFactories,
) -> None:
    """``list_databases(limit, offset, tenant)`` pages per tenant in creation order.

    For every generated tenant, the listed names must equal
    ``all_databases[offset : offset + limit]`` and the listed count must match
    the size of that window.
    """
    bootstrap_client = client_factories.create_client()
    bootstrap_client.reset()
    admin = client_factories.create_admin_client_from_system()

    for tenant, tenant_databases in tenants_and_databases.items():
        admin.create_tenant(tenant)
        for database_name in tenant_databases:
            admin.create_database(database_name, tenant)

    for tenant, all_databases in tenants_and_databases.items():
        listed = admin.list_databases(limit=limit, offset=offset, tenant=tenant)
        expected_names = all_databases[offset : offset + limit]

        total = len(all_databases)
        # A window running past the end is truncated (possibly to nothing);
        # otherwise a full page of `limit` entries is expected.
        expected_count = max(total - offset, 0) if limit + offset > total else limit

        assert len(listed) == expected_count
        assert [entry["name"] for entry in listed] == expected_names

View File

@@ -0,0 +1,62 @@
# Tests that various combinations of numpy and python lists work as expected as inputs
# to add/query/update/upsert operations
from typing import Any, Dict, List
import numpy as np
from chromadb.api import ClientAPI
from chromadb.api.models.Collection import Collection
from chromadb.test.conftest import reset
def add_and_validate(
    collection: Collection,
    ids: List[str],
    embeddings: Any,
    metadatas: List[Dict[str, Any]],
    documents: List[str],
) -> None:
    """Add the given records to ``collection`` and assert they read back equal.

    Ids, metadatas and documents are compared with ``==``; embeddings are
    compared with ``np.array_equal`` so that list and ndarray inputs can both
    be checked against the returned value.
    """
    collection.add(  # type: ignore
        ids=ids,
        embeddings=embeddings,
        metadatas=metadatas,
        documents=documents,
    )
    stored = collection.get(include=["metadatas", "documents", "embeddings"])  # type: ignore

    assert stored["ids"] == ids
    assert stored["metadatas"] == metadatas
    assert stored["documents"] == documents
    # Using integers instead of floats to avoid floating point comparison issues
    assert np.array_equal(stored["embeddings"], embeddings)  # type: ignore
def test_py_list_of_numpy(client: ClientAPI) -> None:
    """Embeddings given as a Python list of numpy arrays round-trip through add/get."""
    reset(client)
    coll = client.create_collection("test")

    # List of numpy arrays
    embeddings = [np.array([1, 2, 3]) for _ in range(3)]
    add_and_validate(
        coll,
        ["1", "2", "3"],
        embeddings,
        [{"a": 1}, {"a": 2}, {"a": 3}],
        ["a", "b", "c"],
    )
def test_py_list_of_py(client: ClientAPI) -> None:
    """Embeddings given as a Python list of Python lists round-trip through add/get."""
    reset(client)
    coll = client.create_collection("test")

    # List of python lists
    embeddings = [[1, 2, 3] for _ in range(3)]
    add_and_validate(
        coll,
        ["4", "5", "6"],
        embeddings,
        [{"a": 4}, {"a": 5}, {"a": 6}],
        ["d", "e", "f"],
    )
def test_numpy(client: ClientAPI) -> None:
    """Embeddings given as a single 2-D numpy array round-trip through add/get."""
    reset(client)
    coll = client.create_collection("test")

    # Numpy array
    embeddings = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
    metadatas = [{"a": 7}, {"a": 8}, {"a": 9}]
    add_and_validate(
        coll,
        ["7", "8", "9"],
        embeddings,
        metadatas,
        ["g", "h", "i"],
    )

View File

@@ -0,0 +1,105 @@
import pytest
from typing import List, cast, Dict, Any
from chromadb.api.types import Documents, Image, Document, Embeddings
from chromadb.utils.embedding_functions import (
EmbeddingFunction,
register_embedding_function,
)
import numpy as np
def random_embeddings() -> Embeddings:
    """Return ten random embeddings, each a length-10 row of a 10x10 array."""
    matrix = np.random.random(size=(10, 10))
    # Iterating a 2-D array yields its rows, so this is a list of 1-D arrays.
    return cast(Embeddings, list(matrix))
def random_image() -> Image:
    """Return a random 10x10x3 int64 array to serve as a fake image.

    Values are drawn from ``[0, 255)``: the upper bound passed to
    ``np.random.randint`` is exclusive, so 255 itself is never produced.
    """
    shape = (10, 10, 3)
    return np.random.randint(low=0, high=255, size=shape, dtype=np.int64)
def random_documents() -> List[Document]:
    """Return ten documents, each the string form of a fresh random image."""
    images = (random_image() for _ in range(10))
    return [str(image) for image in images]
def test_embedding_function_results_format_when_response_is_valid() -> None:
    """A registered embedding function passes valid embeddings through unchanged.

    The locally defined function always returns the same pre-generated
    embeddings; calling it must yield values equal to them, element by element.
    """
    valid_embeddings = random_embeddings()

    @register_embedding_function
    class TestEmbeddingFunction(EmbeddingFunction[Documents]):
        def __init__(self) -> None:
            pass

        def __call__(self, input: Documents) -> Embeddings:
            return valid_embeddings

        @staticmethod
        def name() -> str:
            return "test"

        @staticmethod
        def build_from_config(config: Dict[str, Any]) -> "EmbeddingFunction[Documents]":
            return TestEmbeddingFunction()

        def get_config(self) -> Dict[str, Any]:
            return {}

        @staticmethod
        def validate_config(config: Dict[str, Any]) -> None:
            pass

        def validate_config_update(
            self, old_config: Dict[str, Any], new_config: Dict[str, Any]
        ) -> None:
            pass

    embedding_fn = TestEmbeddingFunction()
    produced = embedding_fn(random_documents())
    for index, embedding in enumerate(produced):
        assert np.array_equal(embedding, valid_embeddings[index])
def test_embedding_function_results_format_when_response_is_invalid() -> None:
    """An embedding function returning a non-embedding value yields ``ValueError``.

    The locally defined function returns a dict instead of embeddings. The
    test expects a ``ValueError`` either from calling the function or from
    ``normalize_embeddings`` applied to its result.
    """
    invalid_embedding = {"error": "test"}

    @register_embedding_function
    class TestEmbeddingFunction(EmbeddingFunction[Documents]):
        def __init__(self) -> None:
            pass

        def __call__(self, input: Documents) -> Embeddings:
            # Return something that's not a valid Embeddings type
            return cast(Embeddings, invalid_embedding)

        @staticmethod
        def name() -> str:
            return "test"

        @staticmethod
        def build_from_config(config: Dict[str, Any]) -> "EmbeddingFunction[Documents]":
            return TestEmbeddingFunction()

        def get_config(self) -> Dict[str, Any]:
            return {}

        @staticmethod
        def validate_config(config: Dict[str, Any]) -> None:
            pass

        def validate_config_update(
            self, old_config: Dict[str, Any], new_config: Dict[str, Any]
        ) -> None:
            pass

    embedding_fn = TestEmbeddingFunction()

    # The EmbeddingFunction protocol should validate the return value
    # but we need to bypass the protocol's __call__ wrapper for this test
    # NOTE(review): if ``__call__`` is itself wrapped with validation, the
    # ValueError is raised on the first statement below and the remaining
    # lines never run — confirm against EmbeddingFunction.
    with pytest.raises(ValueError):
        # This should raise a ValueError during normalization/validation
        raw_result = embedding_fn.__call__(random_documents())
        # The normalize_embeddings function will raise a ValueError when given an invalid embedding
        from chromadb.api.types import normalize_embeddings

        normalize_embeddings(raw_result)