chore: 添加虚拟环境到仓库
- 添加 backend_service/venv 虚拟环境 - 包含所有Python依赖包 - 注意:虚拟环境约393MB,包含12655个文件
This commit is contained in:
@@ -0,0 +1,118 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from .core import Encoding
|
||||
from .registry import get_encoding
|
||||
|
||||
# TODO: these will likely be replaced by an API endpoint
#
# Maps a model-name *prefix* to its encoding.  Order matters: the first prefix
# that matches wins, so entries must stay in their current order.
MODEL_PREFIX_TO_ENCODING: dict[str, str] = {
    # reasoning
    "o1-": "o200k_base",
    "o3-": "o200k_base",
    "o4-mini-": "o200k_base",
    # chat
    "gpt-5-": "o200k_base",
    "gpt-4.5-": "o200k_base",
    "gpt-4.1-": "o200k_base",
    "chatgpt-4o-": "o200k_base",
    "gpt-4o-": "o200k_base",  # e.g., gpt-4o-2024-05-13
    "gpt-4-": "cl100k_base",  # e.g., gpt-4-0314, etc., plus gpt-4-32k
    "gpt-3.5-turbo-": "cl100k_base",  # e.g, gpt-3.5-turbo-0301, -0401, etc.
    "gpt-35-turbo-": "cl100k_base",  # Azure deployment name
    "gpt-oss-": "o200k_harmony",
    # fine-tuned
    "ft:gpt-4o": "o200k_base",
    "ft:gpt-4": "cl100k_base",
    "ft:gpt-3.5-turbo": "cl100k_base",
    "ft:davinci-002": "cl100k_base",
    "ft:babbage-002": "cl100k_base",
}
|
||||
|
||||
# Maps an exact model name to its encoding.  Consulted before the prefix table.
MODEL_TO_ENCODING: dict[str, str] = {
    # reasoning
    "o1": "o200k_base",
    "o3": "o200k_base",
    "o4-mini": "o200k_base",
    # chat
    "gpt-5": "o200k_base",
    "gpt-4.1": "o200k_base",
    "gpt-4o": "o200k_base",
    "gpt-4": "cl100k_base",
    "gpt-3.5-turbo": "cl100k_base",
    "gpt-3.5": "cl100k_base",  # Common shorthand
    "gpt-35-turbo": "cl100k_base",  # Azure deployment name
    # base
    "davinci-002": "cl100k_base",
    "babbage-002": "cl100k_base",
    # embeddings
    "text-embedding-ada-002": "cl100k_base",
    "text-embedding-3-small": "cl100k_base",
    "text-embedding-3-large": "cl100k_base",
    # DEPRECATED MODELS
    # text (DEPRECATED)
    "text-davinci-003": "p50k_base",
    "text-davinci-002": "p50k_base",
    "text-davinci-001": "r50k_base",
    "text-curie-001": "r50k_base",
    "text-babbage-001": "r50k_base",
    "text-ada-001": "r50k_base",
    "davinci": "r50k_base",
    "curie": "r50k_base",
    "babbage": "r50k_base",
    "ada": "r50k_base",
    # code (DEPRECATED)
    "code-davinci-002": "p50k_base",
    "code-davinci-001": "p50k_base",
    "code-cushman-002": "p50k_base",
    "code-cushman-001": "p50k_base",
    "davinci-codex": "p50k_base",
    "cushman-codex": "p50k_base",
    # edit (DEPRECATED)
    "text-davinci-edit-001": "p50k_edit",
    "code-davinci-edit-001": "p50k_edit",
    # old embeddings (DEPRECATED)
    "text-similarity-davinci-001": "r50k_base",
    "text-similarity-curie-001": "r50k_base",
    "text-similarity-babbage-001": "r50k_base",
    "text-similarity-ada-001": "r50k_base",
    "text-search-davinci-doc-001": "r50k_base",
    "text-search-curie-doc-001": "r50k_base",
    "text-search-babbage-doc-001": "r50k_base",
    "text-search-ada-doc-001": "r50k_base",
    "code-search-babbage-code-001": "r50k_base",
    "code-search-ada-code-001": "r50k_base",
    # open source
    "gpt2": "gpt2",
    "gpt-2": "gpt2",  # Maintains consistency with gpt-4
}
|
||||
|
||||
|
||||
def encoding_name_for_model(model_name: str) -> str:
    """Returns the name of the encoding used by a model.

    Raises a KeyError if the model name is not recognised.
    """
    # Exact model names take priority over prefix matches.
    exact_match = MODEL_TO_ENCODING.get(model_name)
    if exact_match is not None:
        return exact_match

    # Fall back to prefix matching.  This avoids needing a library update for
    # every dated model release, but it can also match model names that do not
    # actually exist (e.g., gpt-3.5-turbo-FAKE).
    for model_prefix, prefix_encoding in MODEL_PREFIX_TO_ENCODING.items():
        if model_name.startswith(model_prefix):
            return prefix_encoding

    raise KeyError(
        f"Could not automatically map {model_name} to a tokeniser. "
        "Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect."
    ) from None
|
||||
|
||||
|
||||
def encoding_for_model(model_name: str) -> Encoding:
    """Returns the encoding used by a model.

    Raises a KeyError if the model name is not recognised.
    """
    # Resolve the encoding name first, then load the actual Encoding object.
    encoding_name = encoding_name_for_model(model_name)
    return get_encoding(encoding_name)
|
||||
Reference in New Issue
Block a user