chore: add the virtual environment to the repository
- Add the backend_service/venv virtual environment
- Includes all Python dependency packages
- Note: the virtual environment is about 393 MB and contains 12,655 files
@@ -0,0 +1,407 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import copy
import json
from typing import Any, Dict, Generator, List, Union, AsyncGenerator

from dashscope.api_entities.dashscope_response import (GenerationResponse,
                                                       Message, Role)
from dashscope.client.base_api import BaseAioApi, BaseApi
from dashscope.common.constants import (CUSTOMIZED_MODEL_ID,
                                        DEPRECATED_MESSAGE, HISTORY, MESSAGES,
                                        PROMPT)
from dashscope.common.error import InputRequired, ModelRequired
from dashscope.common.logging import logger
from dashscope.common.utils import _get_task_group_and_task
from dashscope.utils.param_utils import ParamUtil
from dashscope.utils.message_utils import merge_single_response


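# Editorial note (not part of the original module): ParamUtil.should_modify_incremental_output
# and merge_single_response are helpers from this repository's dashscope.utils package. Below,
# they are used to transparently switch qualifying streaming calls to incremental_output=True
# and then merge the incremental chunks back into cumulative responses.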
class Generation(BaseApi):
    task = 'text-generation'
    """API for AI-Generated Content(AIGC) models.

    """
    class Models:
        """@deprecated, use qwen_turbo instead"""
        qwen_v1 = 'qwen-v1'
        """@deprecated, use qwen_plus instead"""
        qwen_plus_v1 = 'qwen-plus-v1'

        bailian_v1 = 'bailian-v1'
        dolly_12b_v2 = 'dolly-12b-v2'
        qwen_turbo = 'qwen-turbo'
        qwen_plus = 'qwen-plus'
        qwen_max = 'qwen-max'

    @classmethod
    def call(
        cls,
        model: str,
        prompt: Any = None,
        history: list = None,
        api_key: str = None,
        messages: List[Message] = None,
        plugins: Union[str, Dict[str, Any]] = None,
        workspace: str = None,
        **kwargs
    ) -> Union[GenerationResponse, Generator[GenerationResponse, None, None]]:
        """Call generation model service.

        Args:
            model (str): The requested model, such as qwen-turbo.
            prompt (Any): The input prompt.
            history (list): The user provided history, deprecated
                examples:
                    [{'user': 'The weather is fine today.',
                      'bot': 'Suitable for outings'}].
                Defaults to None.
            api_key (str, optional): The api key, can be None,
                if None, will get by default rule(TODO: api key doc).
            messages (list): The generation messages.
                examples:
                    [{'role': 'user',
                      'content': 'The weather is fine today.'},
                     {'role': 'assistant', 'content': 'Suitable for outings'}]
            plugins (Any): The plugin config. Can be a plugins config str or dict.
            **kwargs:
                stream(bool, `optional`): Enable server-sent events
                    (ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events) # noqa E501
                    the result is returned partially[qwen-turbo,bailian-v1].
                temperature(float, `optional`): Used to control the degree
                    of randomness and diversity. Specifically, the temperature
                    value controls the degree to which the probability distribution
                    of each candidate word is smoothed when generating text.
                    A higher temperature value will reduce the peak value of
                    the probability, allowing more low-probability words to be
                    selected, and the generated results will be more diverse;
                    while a lower temperature value will enhance the peak value
                    of the probability, making it easier for high-probability
                    words to be selected, the generated results are more
                    deterministic, range(0, 2). [qwen-turbo,qwen-plus].
                top_p(float, `optional`): A sampling strategy, called nucleus
                    sampling, where the model considers the results of the
                    tokens with top_p probability mass. So 0.1 means only
                    the tokens comprising the top 10% probability mass are
                    considered[qwen-turbo,bailian-v1].
                top_k(int, `optional`): The size of the sample candidate set when generating. # noqa E501
                    For example, when the value is 50, only the 50 highest-scoring tokens # noqa E501
                    in a single generation form a randomly sampled candidate set. # noqa E501
                    The larger the value, the higher the randomness generated; # noqa E501
                    the smaller the value, the higher the certainty generated. # noqa E501
                    The default value is 0, which means the top_k policy is # noqa E501
                    not enabled; in that case, only the top_p policy takes effect. # noqa E501
                enable_search(bool, `optional`): Whether to enable web search(quark). # noqa E501
                    Currently works best only on the first round of conversation.
                    Defaults to False, support model: [qwen-turbo].
                customized_model_id(str, required): The enterprise-specific
                    large model id, which needs to be generated from the
                    operation background of the enterprise-specific
                    large model product, support model: [bailian-v1].
                result_format(str, `optional`): [message|text] Set the result format. # noqa E501
                    Default is text.
                incremental_output(bool, `optional`): Used to control the streaming output mode. # noqa E501
                    If false, each subsequent output includes the content that has # noqa E501
                    already been output; if true, each output only contains the newly # noqa E501
                    generated increment. Default false.
                stop(list[str] or list[list[int]], `optional`): Used to control the generation to stop # noqa E501
                    when encountering the set strings or token ids; the result will not include # noqa E501
                    the stop words or tokens.
                max_tokens(int, `optional`): The maximum token num expected to be output. It should be # noqa E501
                    noted that the length generated by the model will only be less than max_tokens, # noqa E501
                    not necessarily equal to it. If max_tokens is set too large, the service will # noqa E501
                    directly prompt that the length exceeds the limit. It is generally # noqa E501
                    not recommended to set this value.
                repetition_penalty(float, `optional`): Used to control repetition when generating. # noqa E501
                    Increasing repetition_penalty can reduce the duplication in model generation. # noqa E501
                    1.0 means no penalty.
            workspace (str): The dashscope workspace id.

        Raises:
            InputRequired: Neither prompt nor messages was provided.
            ModelRequired: No model was provided.

        Returns:
            Union[GenerationResponse,
                  Generator[GenerationResponse, None, None]]: If
            stream is True, return a Generator, otherwise a GenerationResponse.
        """
        if (prompt is None or not prompt) and (messages is None
                                               or not messages):
            raise InputRequired('prompt or messages is required!')
        if model is None or not model:
            raise ModelRequired('Model is required!')
        task_group, function = _get_task_group_and_task(__name__)
        if plugins is not None:
            headers = kwargs.pop('headers', {})
            if isinstance(plugins, str):
                headers['X-DashScope-Plugin'] = plugins
            else:
                headers['X-DashScope-Plugin'] = json.dumps(plugins)
            kwargs['headers'] = headers
        input, parameters = cls._build_input_parameters(
            model, prompt, history, messages, **kwargs)

        is_stream = parameters.get('stream', False)
        # Check if we need to merge incremental output
        is_incremental_output = kwargs.get('incremental_output', None)
        to_merge_incremental_output = False
        if (ParamUtil.should_modify_incremental_output(model) and
                is_stream and is_incremental_output is False):
            to_merge_incremental_output = True
            parameters['incremental_output'] = True

        # Pass incremental_to_full flag via headers user-agent
        if 'headers' not in parameters:
            parameters['headers'] = {}
        flag = '1' if to_merge_incremental_output else '0'
        parameters['headers']['user-agent'] = f'incremental_to_full/{flag}'

        response = super().call(model=model,
                                task_group=task_group,
                                task=Generation.task,
                                function=function,
                                api_key=api_key,
                                input=input,
                                workspace=workspace,
                                **parameters)
        if is_stream:
            if to_merge_incremental_output:
                # Extract n parameter for merge logic
                n = parameters.get('n', 1)
                return cls._merge_generation_response(response, n)
            else:
                return (GenerationResponse.from_api_response(rsp)
                        for rsp in response)
        else:
            return GenerationResponse.from_api_response(response)

    @classmethod
    def _build_input_parameters(cls, model, prompt, history, messages,
                                **kwargs):
        if model == Generation.Models.qwen_v1:
            logger.warning(
                'Model %s is deprecated, use %s instead!' %
                (Generation.Models.qwen_v1, Generation.Models.qwen_turbo))
        if model == Generation.Models.qwen_plus_v1:
            logger.warning(
                'Model %s is deprecated, use %s instead!' %
                (Generation.Models.qwen_plus_v1, Generation.Models.qwen_plus))
        parameters = {}
        input = {}
        if history is not None:
            logger.warning(DEPRECATED_MESSAGE)
            input[HISTORY] = history
            if prompt is not None and prompt:
                input[PROMPT] = prompt
        elif messages is not None:
            msgs = copy.deepcopy(messages)
            if prompt is not None and prompt:
                msgs.append({'role': Role.USER, 'content': prompt})
            input = {MESSAGES: msgs}
        else:
            input[PROMPT] = prompt

        if model.startswith('qwen'):
            enable_search = kwargs.pop('enable_search', False)
            if enable_search:
                parameters['enable_search'] = enable_search
        elif model.startswith('bailian'):
            customized_model_id = kwargs.pop('customized_model_id', None)
            if customized_model_id is None:
                raise InputRequired('customized_model_id is required for %s' %
                                    model)
            input[CUSTOMIZED_MODEL_ID] = customized_model_id

        return input, {**parameters, **kwargs}

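    # Illustrative sketch (not part of the original file), assuming the
    # constants MESSAGES == 'messages' and PROMPT == 'prompt':
    #
    #     input, parameters = Generation._build_input_parameters(
    #         'qwen-turbo', None, None,
    #         [{'role': 'user', 'content': 'Hi'}],
    #         stream=True, result_format='message')
    #     # input      -> {'messages': [{'role': 'user', 'content': 'Hi'}]}
    #     # parameters -> {'stream': True, 'result_format': 'message'}
    #
    # Recognized options such as enable_search are popped out of **kwargs into
    # `parameters`; any remaining keyword arguments are passed through unchanged.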
    @classmethod
    def _merge_generation_response(cls, response, n=1) -> Generator[GenerationResponse, None, None]:
        """Merge incremental response chunks to simulate non-incremental output."""
        accumulated_data = {}
        for rsp in response:
            parsed_response = GenerationResponse.from_api_response(rsp)
            result = merge_single_response(parsed_response, accumulated_data, n)
            if result is True:
                yield parsed_response
            elif isinstance(result, list):
                # Multiple responses to yield (for n>1 non-stop cases)
                for resp in result:
                    yield resp


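# Usage sketch (illustrative, not part of the original file). It assumes a valid
# API key is configured, e.g. via the DASHSCOPE_API_KEY environment variable, and
# that the class is importable from the top-level dashscope package:
#
#     from dashscope import Generation
#
#     # Blocking call: a single GenerationResponse is returned.
#     rsp = Generation.call(model=Generation.Models.qwen_turbo,
#                           messages=[{'role': 'user', 'content': 'Hello'}],
#                           result_format='message')
#     print(rsp.output)
#
#     # Streaming call: a generator of GenerationResponse chunks. For models where
#     # ParamUtil.should_modify_incremental_output(...) returns True and
#     # incremental_output=False, the chunks are merged by _merge_generation_response,
#     # so each chunk carries the full output generated so far.
#     for chunk in Generation.call(model=Generation.Models.qwen_turbo,
#                                  prompt='Hello',
#                                  stream=True,
#                                  incremental_output=False):
#         print(chunk.output)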
class AioGeneration(BaseAioApi):
    task = 'text-generation'
    """API for AI-Generated Content(AIGC) models.

    """
    class Models:
        """@deprecated, use qwen_turbo instead"""
        qwen_v1 = 'qwen-v1'
        """@deprecated, use qwen_plus instead"""
        qwen_plus_v1 = 'qwen-plus-v1'

        bailian_v1 = 'bailian-v1'
        dolly_12b_v2 = 'dolly-12b-v2'
        qwen_turbo = 'qwen-turbo'
        qwen_plus = 'qwen-plus'
        qwen_max = 'qwen-max'

    @classmethod
    async def call(
        cls,
        model: str,
        prompt: Any = None,
        history: list = None,
        api_key: str = None,
        messages: List[Message] = None,
        plugins: Union[str, Dict[str, Any]] = None,
        workspace: str = None,
        **kwargs
    ) -> Union[GenerationResponse, AsyncGenerator[GenerationResponse, None]]:
        """Call generation model service.

        Args:
            model (str): The requested model, such as qwen-turbo.
            prompt (Any): The input prompt.
            history (list): The user provided history, deprecated
                examples:
                    [{'user': 'The weather is fine today.',
                      'bot': 'Suitable for outings'}].
                Defaults to None.
            api_key (str, optional): The api key, can be None,
                if None, will get by default rule(TODO: api key doc).
            messages (list): The generation messages.
                examples:
                    [{'role': 'user',
                      'content': 'The weather is fine today.'},
                     {'role': 'assistant', 'content': 'Suitable for outings'}]
            plugins (Any): The plugin config. Can be a plugins config str or dict.
            **kwargs:
                stream(bool, `optional`): Enable server-sent events
                    (ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events) # noqa E501
                    the result is returned partially[qwen-turbo,bailian-v1].
                temperature(float, `optional`): Used to control the degree
                    of randomness and diversity. Specifically, the temperature
                    value controls the degree to which the probability distribution
                    of each candidate word is smoothed when generating text.
                    A higher temperature value will reduce the peak value of
                    the probability, allowing more low-probability words to be
                    selected, and the generated results will be more diverse;
                    while a lower temperature value will enhance the peak value
                    of the probability, making it easier for high-probability
                    words to be selected, the generated results are more
                    deterministic, range(0, 2). [qwen-turbo,qwen-plus].
                top_p(float, `optional`): A sampling strategy, called nucleus
                    sampling, where the model considers the results of the
                    tokens with top_p probability mass. So 0.1 means only
                    the tokens comprising the top 10% probability mass are
                    considered[qwen-turbo,bailian-v1].
                top_k(int, `optional`): The size of the sample candidate set when generating. # noqa E501
                    For example, when the value is 50, only the 50 highest-scoring tokens # noqa E501
                    in a single generation form a randomly sampled candidate set. # noqa E501
                    The larger the value, the higher the randomness generated; # noqa E501
                    the smaller the value, the higher the certainty generated. # noqa E501
                    The default value is 0, which means the top_k policy is # noqa E501
                    not enabled; in that case, only the top_p policy takes effect. # noqa E501
                enable_search(bool, `optional`): Whether to enable web search(quark). # noqa E501
                    Currently works best only on the first round of conversation.
                    Defaults to False, support model: [qwen-turbo].
                customized_model_id(str, required): The enterprise-specific
                    large model id, which needs to be generated from the
                    operation background of the enterprise-specific
                    large model product, support model: [bailian-v1].
                result_format(str, `optional`): [message|text] Set the result format. # noqa E501
                    Default is text.
                incremental_output(bool, `optional`): Used to control the streaming output mode. # noqa E501
                    If false, each subsequent output includes the content that has # noqa E501
                    already been output; if true, each output only contains the newly # noqa E501
                    generated increment. Default false.
                stop(list[str] or list[list[int]], `optional`): Used to control the generation to stop # noqa E501
                    when encountering the set strings or token ids; the result will not include # noqa E501
                    the stop words or tokens.
                max_tokens(int, `optional`): The maximum token num expected to be output. It should be # noqa E501
                    noted that the length generated by the model will only be less than max_tokens, # noqa E501
                    not necessarily equal to it. If max_tokens is set too large, the service will # noqa E501
                    directly prompt that the length exceeds the limit. It is generally # noqa E501
                    not recommended to set this value.
                repetition_penalty(float, `optional`): Used to control repetition when generating. # noqa E501
                    Increasing repetition_penalty can reduce the duplication in model generation. # noqa E501
                    1.0 means no penalty.
            workspace (str): The dashscope workspace id.

        Raises:
            InputRequired: Neither prompt nor messages was provided.
            ModelRequired: No model was provided.

        Returns:
            Union[GenerationResponse,
                  AsyncGenerator[GenerationResponse, None]]: If
            stream is True, return an AsyncGenerator, otherwise a GenerationResponse.
        """
        if (prompt is None or not prompt) and (messages is None
                                               or not messages):
            raise InputRequired('prompt or messages is required!')
        if model is None or not model:
            raise ModelRequired('Model is required!')
        task_group, function = _get_task_group_and_task(__name__)
        if plugins is not None:
            headers = kwargs.pop('headers', {})
            if isinstance(plugins, str):
                headers['X-DashScope-Plugin'] = plugins
            else:
                headers['X-DashScope-Plugin'] = json.dumps(plugins)
            kwargs['headers'] = headers
        input, parameters = Generation._build_input_parameters(
            model, prompt, history, messages, **kwargs)

        is_stream = parameters.get('stream', False)
        # Check if we need to merge incremental output
        is_incremental_output = kwargs.get('incremental_output', None)
        to_merge_incremental_output = False
        if (ParamUtil.should_modify_incremental_output(model) and
                is_stream and is_incremental_output is False):
            to_merge_incremental_output = True
            parameters['incremental_output'] = True

        # Pass incremental_to_full flag via headers user-agent
        if 'headers' not in parameters:
            parameters['headers'] = {}
        flag = '1' if to_merge_incremental_output else '0'
        parameters['headers']['user-agent'] = f'incremental_to_full/{flag}'

        response = await super().call(model=model,
                                      task_group=task_group,
                                      task=Generation.task,
                                      function=function,
                                      api_key=api_key,
                                      input=input,
                                      workspace=workspace,
                                      **parameters)
        if is_stream:
            if to_merge_incremental_output:
                # Extract n parameter for merge logic
                n = parameters.get('n', 1)
                return cls._merge_generation_response(response, n)
            else:
                return cls._stream_responses(response)
        else:
            return GenerationResponse.from_api_response(response)

    @classmethod
    async def _stream_responses(cls, response) -> AsyncGenerator[GenerationResponse, None]:
        """Convert async response stream to GenerationResponse stream."""
        # Type hint: when stream=True, response is actually an AsyncIterable
        async for rsp in response:  # type: ignore
            yield GenerationResponse.from_api_response(rsp)

    @classmethod
    async def _merge_generation_response(cls, response, n=1) -> AsyncGenerator[GenerationResponse, None]:
        """Async version of merge incremental response chunks."""
        accumulated_data = {}

        async for rsp in response:  # type: ignore
            parsed_response = GenerationResponse.from_api_response(rsp)
            result = merge_single_response(parsed_response, accumulated_data, n)
            if result is True:
                yield parsed_response
            elif isinstance(result, list):
                # Multiple responses to yield (for n>1 non-stop cases)
                for resp in result:
                    yield resp
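# Async usage sketch (illustrative, not part of the original file); it makes the
# same assumptions as the synchronous example above:
#
#     import asyncio
#
#     from dashscope import AioGeneration
#
#     async def main():
#         # Non-streaming: await returns a single GenerationResponse.
#         rsp = await AioGeneration.call(model=AioGeneration.Models.qwen_turbo,
#                                        prompt='Hello')
#         print(rsp.output)
#
#         # Streaming: await returns an AsyncGenerator of GenerationResponse chunks.
#         stream = await AioGeneration.call(model=AioGeneration.Models.qwen_turbo,
#                                           prompt='Hello',
#                                           stream=True)
#         async for chunk in stream:
#             print(chunk.output)
#
#     asyncio.run(main())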