chore: 添加虚拟环境到仓库
- 添加 backend_service/venv 虚拟环境 - 包含所有Python依赖包 - 注意:虚拟环境约393MB,包含12655个文件
This commit is contained in:
@@ -0,0 +1,118 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from .core import Encoding
|
||||
from .registry import get_encoding
|
||||
|
||||
# TODO: these will likely be replaced by an API endpoint
#
# Maps a model-name *prefix* to its encoding.  Order matters: the first prefix
# that matches wins, so entries must stay in their current order.
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
    # reasoning
    "o1-": "o200k_base",
    "o3-": "o200k_base",
    "o4-mini-": "o200k_base",
    # chat
    "gpt-5-": "o200k_base",
    "gpt-4.5-": "o200k_base",
    "gpt-4.1-": "o200k_base",
    "chatgpt-4o-": "o200k_base",
    "gpt-4o-": "o200k_base",  # e.g., gpt-4o-2024-05-13
    "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
    "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
    "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
    "gpt-oss-": "o200k_harmony",
    # fine-tuned
    "ft:gpt-4o": "o200k_base",
    "ft:gpt-4": "cl100k_base",
    "ft:gpt-3.5-turbo": "cl100k_base",
    "ft:davinci-002": "cl100k_base",
    "ft:babbage-002": "cl100k_base",
}
|
||||
|
||||
# Maps an exact model name to its encoding.  Consulted before the prefix table.
MODEL_TO_ENCODING: dict[str, str] = {
    # reasoning
    "o1": "o200k_base",
    "o3": "o200k_base",
    "o4-mini": "o200k_base",
    # chat
    "gpt-5": "o200k_base",
    "gpt-4.1": "o200k_base",
    "gpt-4o": "o200k_base",
    "gpt-4": "cl100k_base",
    "gpt-3.5-turbo": "cl100k_base",
    "gpt-3.5": "cl100k_base",  # Common shorthand
    "gpt-35-turbo": "cl100k_base",  # Azure deployment name
    # base
    "davinci-002": "cl100k_base",
    "babbage-002": "cl100k_base",
    # embeddings
    "text-embedding-ada-002": "cl100k_base",
    "text-embedding-3-small": "cl100k_base",
    "text-embedding-3-large": "cl100k_base",
    # DEPRECATED MODELS
    # text (DEPRECATED)
    "text-davinci-003": "p50k_base",
    "text-davinci-002": "p50k_base",
    "text-davinci-001": "r50k_base",
    "text-curie-001": "r50k_base",
    "text-babbage-001": "r50k_base",
    "text-ada-001": "r50k_base",
    "davinci": "r50k_base",
    "curie": "r50k_base",
    "babbage": "r50k_base",
    "ada": "r50k_base",
    # code (DEPRECATED)
    "code-davinci-002": "p50k_base",
    "code-davinci-001": "p50k_base",
    "code-cushman-002": "p50k_base",
    "code-cushman-001": "p50k_base",
    "davinci-codex": "p50k_base",
    "cushman-codex": "p50k_base",
    # edit (DEPRECATED)
    "text-davinci-edit-001": "p50k_edit",
    "code-davinci-edit-001": "p50k_edit",
    # old embeddings (DEPRECATED)
    "text-similarity-davinci-001": "r50k_base",
    "text-similarity-curie-001": "r50k_base",
    "text-similarity-babbage-001": "r50k_base",
    "text-similarity-ada-001": "r50k_base",
    "text-search-davinci-doc-001": "r50k_base",
    "text-search-curie-doc-001": "r50k_base",
    "text-search-babbage-doc-001": "r50k_base",
    "text-search-ada-doc-001": "r50k_base",
    "code-search-babbage-code-001": "r50k_base",
    "code-search-ada-code-001": "r50k_base",
    # open source
    "gpt2": "gpt2",
    "gpt-2": "gpt2",  # Maintains consistency with gpt-4
}
|
||||
|
||||
|
||||
def encoding_name_for_model(model_name: str) -> str:
    """Returns the name of the encoding used by a model.

    Raises a KeyError if the model name is not recognised.
    """
    # Exact model names take priority over prefix matches.
    exact_match = MODEL_TO_ENCODING.get(model_name)
    if exact_match is not None:
        return exact_match

    # Fall back to prefix matching.  This avoids needing a library update for
    # every dated model release, but it can also match model names that do not
    # actually exist (e.g., gpt-3.5-turbo-FAKE).
    for model_prefix, prefix_encoding in MODEL_PREFIX_TO_ENCODING.items():
        if model_name.startswith(model_prefix):
            return prefix_encoding

    raise KeyError(
        f"Could not automatically map {model_name} to a tokeniser. "
        "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
    ) from None
|
||||
|
||||
|
||||
def encoding_for_model(model_name: str) -> Encoding:
    """Returns the encoding used by a model.

    Raises a KeyError if the model name is not recognised.
    """
    # Resolve the encoding name first, then load the actual Encoding object.
    encoding_name = encoding_name_for_model(model_name)
    return get_encoding(encoding_name)
|
||||
Reference in New Issue
Block a user