chore: 添加虚拟环境到仓库

- 添加 backend_service/venv 虚拟环境
- 包含所有Python依赖包
- 注意:虚拟环境约393MB,包含12655个文件
This commit is contained in:
2025-12-03 10:19:25 +08:00
parent a6c2027caa
commit c4f851d387
12655 changed files with 3009376 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
# This folder holds basic sanity checks for the distributed version of chromadb
# while it is in development. In the future, it may hold more extensive tests
# in tandem with the main test suite, targeted at the distributed version.

View File

@@ -0,0 +1,54 @@
# Add up to 200k records until the log-is-full message is seen.
import grpc
import math
import random
import time
import numpy as np
from chromadb.api import ClientAPI
from chromadb.proto.logservice_pb2 import SealLogRequest, MigrateLogRequest
from chromadb.proto.logservice_pb2_grpc import LogServiceStub
from chromadb.test.conftest import (
reset,
skip_if_not_cluster,
)
from chromadb.test.property import invariants
from chromadb.test.utils.wait_for_version_increase import wait_for_version_increase
RECORDS = 2000000
BATCH_SIZE = 100
@skip_if_not_cluster()
def test_log_backpressure(
    client: ClientAPI,
) -> None:
    """Insert batches until the log service pushes back with a backoff error.

    Keeps adding up to RECORDS records in BATCH_SIZE chunks and asserts that a
    'Backoff and retry' exception is eventually raised by collection.add.
    """
    seed = time.time()
    random.seed(seed)
    print("Generating data with seed ", seed)
    reset(client)
    collection = client.create_collection(
        name="test",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    time.sleep(1)
    print('backpressuring for', collection.id)
    excepted = False
    # Each embedding is 3 floats drawn uniformly from [0, 1).
    batch_start = 0
    while batch_start < RECORDS and not excepted:
        batch_ids = [str(idx) for idx in range(batch_start, batch_start + BATCH_SIZE)]
        batch_embeddings = [
            np.random.rand(1, 3)[0] for _ in range(batch_start, batch_start + BATCH_SIZE)
        ]
        try:
            collection.add(ids=batch_ids, embeddings=batch_embeddings)
        except Exception as err:
            print(f"Caught exception:\n{err}")
            if 'Backoff and retry' in str(err):
                excepted = True
        batch_start += BATCH_SIZE
    assert excepted, "Expected an exception to be thrown."

View File

@@ -0,0 +1,73 @@
# Add some records, wait for compaction, then roll back the log offset.
# Poll the log for up to 30s to see if the offset gets repaired.
import grpc
import random
import time
from typing import cast, List, Any, Dict
import numpy as np
from chromadb.api import ClientAPI
from chromadb.proto.logservice_pb2 import InspectLogStateRequest, UpdateCollectionLogOffsetRequest
from chromadb.proto.logservice_pb2_grpc import LogServiceStub
from chromadb.test.conftest import (
reset,
skip_if_not_cluster,
)
from chromadb.test.utils.wait_for_version_increase import wait_for_version_increase
RECORDS = 1000
BATCH_SIZE = 100
@skip_if_not_cluster()
def test_repair_collection_log_offset(
    client: ClientAPI,
) -> None:
    """Verify the log service repairs a rolled-back collection log offset.

    Adds RECORDS records, waits for compaction to advance the log offset to
    1001, rolls the offset back to 1 via the log service, then polls until the
    offset is restored to 1001 (or times out after 240s).
    """
    seed = time.time()
    random.seed(seed)
    print("Generating data with seed ", seed)
    reset(client)
    channel = grpc.insecure_channel('localhost:50054')
    log_service_stub = LogServiceStub(channel)
    collection = client.create_collection(
        name="test_repair_collection_log_offset",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    print("collection_id =", collection.id)
    initial_version = cast(int, collection.get_model()["version"])
    # Add RECORDS records, where each embedding has 3 dimensions randomly generated between 0 and 1
    for i in range(0, RECORDS, BATCH_SIZE):
        ids = [str(x) for x in range(i, i + BATCH_SIZE)]
        embeddings = [np.random.rand(1, 3)[0] for x in range(i, i + BATCH_SIZE)]
        collection.add(ids=ids, embeddings=embeddings)
    wait_for_version_increase(client, collection.name, initial_version)
    # Poll until compaction has advanced the offset past the inserted records.
    found = False
    now = time.time()
    while time.time() - now < 240:
        request = InspectLogStateRequest(collection_id=str(collection.id))
        response = log_service_stub.InspectLogState(request, timeout=60)
        if '''LogPosition { offset: 1001 }''' in response.debug:
            found = True
            break
        # BUG FIX: previously this loop had no sleep and hammered the log
        # service with back-to-back InspectLogState RPCs; the second poll loop
        # below already sleeps between attempts.
        time.sleep(1)
    assert found, "Compaction never advanced the log offset to 1001"
    # Roll the offset back and wait for the log service to repair it.
    request = UpdateCollectionLogOffsetRequest(collection_id=str(collection.id), log_offset=1)
    response = log_service_stub.RollbackCollectionLogOffset(request, timeout=60)
    now = time.time()
    while time.time() - now < 240:
        request = InspectLogStateRequest(collection_id=str(collection.id))
        response = log_service_stub.InspectLogState(request, timeout=60)
        if '''LogPosition { offset: 1001 }''' in response.debug:
            return
        time.sleep(1)
    raise RuntimeError("Test timed out without repair")

View File

@@ -0,0 +1,74 @@
from typing import Sequence
from chromadb.test.conftest import (
reset,
skip_if_not_cluster,
)
from chromadb.api import ClientAPI
from kubernetes import client as k8s_client, config
import time
@skip_if_not_cluster()
def test_reroute(
    client: ClientAPI,
) -> None:
    """Restart every query-service pod and verify queries still succeed.

    Seeds a collection, deletes all pods labeled app=query-service in the
    'chroma' namespace to force a reroute, waits for replacement pods (new
    UIDs) to come up and reach Running, then queries again.
    """
    reset(client)
    collection = client.create_collection(
        name="test",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    ids = [str(i) for i in range(10)]
    embeddings: list[Sequence[float]] = [
        [float(i), float(i), float(i)] for i in range(10)
    ]
    collection.add(ids=ids, embeddings=embeddings)
    collection.query(query_embeddings=[embeddings[0]])
    # Restart the query service using k8s api, in order to trigger a reroute
    # of the query service
    config.load_kube_config()
    v1 = k8s_client.CoreV1Api()
    # Find all pods with the label "app=query-service"
    res = v1.list_namespaced_pod("chroma", label_selector="app=query-service")
    assert len(res.items) > 0
    seen_ids = set()
    # Restart all the pods by deleting them; remember their UIDs so we can
    # tell replacements apart from the originals.
    for item in res.items:
        seen_ids.add(item.metadata.uid)
        v1.delete_namespaced_pod(item.metadata.name, item.metadata.namespace)
    # Wait until we have len(seen_ids) pods running with new UIDs
    timeout_secs = 10
    start_time = time.time()
    while True:
        res = v1.list_namespaced_pod("chroma", label_selector="app=query-service")
        new_ids = {item.metadata.uid for item in res.items}
        if len(new_ids) == len(seen_ids) and not new_ids.intersection(seen_ids):
            break
        if time.time() - start_time > timeout_secs:
            assert False, "Timed out waiting for new pods to start"
        time.sleep(1)
    # Wait for the query service to be ready, or timeout.
    # BUG FIX: give this wait its own timeout budget; previously it reused
    # start_time from the loop above, so its budget could already be exhausted
    # and the test would time out spuriously.
    start_time = time.time()
    while True:
        res = v1.list_namespaced_pod("chroma", label_selector="app=query-service")
        if all(item.status.phase == "Running" for item in res.items):
            break
        if time.time() - start_time > timeout_secs:
            assert False, "Timed out waiting for new pods to be ready"
        time.sleep(1)
    time.sleep(1)
    collection.query(query_embeddings=[embeddings[0]])

View File

@@ -0,0 +1,102 @@
# This tests a very minimal of test_add in test_add.py as a example based test
# instead of a property based test. We can use the delta to get the property
# test working and then enable
import random
import time
from chromadb.api import ClientAPI
from chromadb.test.conftest import (
reset,
skip_if_not_cluster,
)
from chromadb.test.property import invariants
from chromadb.test.utils.wait_for_version_increase import (
wait_for_version_increase,
get_collection_version,
)
import numpy as np
@skip_if_not_cluster()
def test_add(
    client: ClientAPI,
) -> None:
    """Minimal example-based counterpart of the property-based test_add.

    Inserts 1000 single-record batches of 3-dim random embeddings and checks
    ANN accuracy for one random query.
    """
    seed = time.time()
    random.seed(seed)
    print("Generating data with seed ", seed)
    reset(client)
    collection = client.create_collection(
        name="test",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    # Insert 1000 records one at a time; each embedding is 3 floats in [0, 1).
    ids = []
    embeddings = []
    for record_index in range(1000):
        record_id = str(record_index)
        embedding = np.random.rand(1, 3)[0]
        ids.append(record_id)
        embeddings.append(embedding)
        collection.add(
            ids=[record_id],
            embeddings=[embedding],
        )
    random_query = np.random.rand(1, 3)[0]
    print("Generated data with seed ", seed)
    invariants.ann_accuracy(
        collection,
        {
            "ids": ids,
            "embeddings": embeddings,
            "metadatas": None,
            "documents": None,
        },
        10,
        query_embeddings=[random_query],
    )
@skip_if_not_cluster()
def test_add_include_all_with_compaction_delay(client: ClientAPI) -> None:
    """Insert records with documents, wait for compaction, then check ANN
    accuracy for two random queries."""
    seed = time.time()
    random.seed(seed)
    print("Generating data with seed ", seed)
    reset(client)
    collection = client.create_collection(
        name="test_add_include_all_with_compaction_delay",
        metadata={"hnsw:construction_ef": 128, "hnsw:search_ef": 128, "hnsw:M": 128},
    )
    initial_version = get_collection_version(client, collection.name)
    # Insert 1000 records one at a time, each with a 3-dim embedding and a doc.
    ids = []
    embeddings = []
    documents = []
    for record_index in range(1000):
        record_id = str(record_index)
        embedding = np.random.rand(1, 3)[0]
        document = f"document_{record_index}"
        ids.append(record_id)
        embeddings.append(embedding)
        documents.append(document)
        collection.add(
            ids=[record_id],
            embeddings=[embedding],
            documents=[document],
        )
    # Block until compaction has produced a new collection version.
    wait_for_version_increase(client, collection.name, initial_version, 120)
    random_query_1 = np.random.rand(1, 3)[0]
    random_query_2 = np.random.rand(1, 3)[0]
    print("Generated data with seed ", seed)
    # Query the collection with a random query
    invariants.ann_accuracy(
        collection,
        {
            "ids": ids,
            "embeddings": embeddings,
            "metadatas": None,
            "documents": documents,
        },
        10,
        query_embeddings=[random_query_1, random_query_2],
    )

View File

@@ -0,0 +1,205 @@
"""
Integration test for Chroma's Task API
Tests the task creation, execution, and removal functionality
for automatically processing collections.
"""
import pytest
from chromadb.api.client import Client as ClientCreator
from chromadb.config import System
from chromadb.errors import ChromaError, NotFoundError
def test_function_attach_and_detach(basic_http_client: System) -> None:
    """Test creating and removing a function with the record_counter operator"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()
    # Seed a collection with three sample documents.
    collection = client.get_or_create_collection(
        name="my_document",
        metadata={"description": "Sample documents for task processing"},
    )
    sample_docs = [
        "The quick brown fox jumps over the lazy dog",
        "Machine learning is a subset of artificial intelligence",
        "Python is a popular programming language",
    ]
    collection.add(
        ids=["doc1", "doc2", "doc3"],
        documents=sample_docs,
        metadatas=[{"source": "proverb"}, {"source": "tech"}, {"source": "tech"}],
    )
    assert collection.count() == 3
    # Attach the built-in record-counting operator to the collection.
    attached_fn = collection.attach_function(
        name="count_my_docs",
        function_id="record_counter",  # Built-in operator that counts records
        output_collection="my_documents_counts",
        params=None,
    )
    assert attached_fn is not None
    # Add two more documents while the function is attached.
    collection.add(
        ids=["doc4", "doc5"],
        documents=[
            "Chroma is a vector database",
            "Tasks automate data processing",
        ],
    )
    assert collection.count() == 5
    # Detach the function, dropping its output collection as well.
    success = attached_fn.detach(
        delete_output_collection=True,
    )
    assert success is True
def test_task_with_invalid_function(basic_http_client: System) -> None:
    """Test that creating a task with an invalid function raises an error"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()
    collection = client.get_or_create_collection(name="test_invalid_function")
    collection.add(ids=["id1"], documents=["test document"])
    # Referencing a function id that does not exist must raise ChromaError.
    with pytest.raises(ChromaError, match="function not found"):
        collection.attach_function(
            name="invalid_task",
            function_id="nonexistent_function",
            output_collection="output_collection",
            params=None,
        )
def test_function_multiple_collections(basic_http_client: System) -> None:
    """Test attaching functions on multiple collections"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()
    # (collection name, record ids, documents, task name, output collection)
    specs = [
        ("collection_1", ["id1", "id2"], ["doc1", "doc2"], "task_1", "output_1"),
        ("collection_2", ["id3", "id4"], ["doc3", "doc4"], "task_2", "output_2"),
    ]
    attached = []
    for coll_name, record_ids, docs, task_name, output_name in specs:
        coll = client.create_collection(name=coll_name)
        coll.add(ids=record_ids, documents=docs)
        fn = coll.attach_function(
            name=task_name,
            function_id="record_counter",
            output_collection=output_name,
            params=None,
        )
        assert fn is not None
        attached.append(fn)
    # Each attachment gets its own distinct id.
    assert attached[0].id != attached[1].id
    # Clean up both attachments.
    for fn in attached:
        assert fn.detach(delete_output_collection=True) is True
def test_functions_multiple_attached_functions(basic_http_client: System) -> None:
    """Test attaching multiple functions on the same collection"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()
    # One collection hosting several attached functions.
    collection = client.create_collection(name="multi_task_collection")
    collection.add(ids=["id1", "id2", "id3"], documents=["doc1", "doc2", "doc3"])
    # Attach three functions with distinct names and output collections.
    attached = []
    for task_name, output_name in (
        ("task_1", "output_1"),
        ("task_2", "output_2"),
        ("task_3", "output_3"),
    ):
        fn = collection.attach_function(
            name=task_name,
            function_id="record_counter",
            output_collection=output_name,
            params=None,
        )
        assert fn is not None
        attached.append(fn)
    # Ids are pairwise distinct even though all live on the same collection.
    assert attached[0].id != attached[1].id
    assert attached[2].id != attached[0].id
    assert attached[2].id != attached[1].id
    # Reusing an existing name on the same collection must fail.
    with pytest.raises(ChromaError, match="already exists"):
        collection.attach_function(
            name="task_1",  # Duplicate name
            function_id="record_counter",
            output_collection="output_duplicate",
            params=None,
        )
    # Clean up - remove each attached function individually.
    for fn in attached:
        assert fn.detach(delete_output_collection=True) is True
def test_function_remove_nonexistent(basic_http_client: System) -> None:
    """Test removing a task that doesn't exist raises NotFoundError"""
    client = ClientCreator.from_system(basic_http_client)
    client.reset()
    collection = client.create_collection(name="test_collection")
    collection.add(ids=["id1"], documents=["test"])
    attached_fn = collection.attach_function(
        name="test_function",
        function_id="record_counter",
        output_collection="output_collection",
        params=None,
    )
    # First detach succeeds and removes the attachment.
    attached_fn.detach(delete_output_collection=True)
    # A second detach of the same handle must raise NotFoundError.
    with pytest.raises(NotFoundError, match="does not exist"):
        attached_fn.detach(delete_output_collection=True)