# Copyright (c) Alibaba, Inc. and its affiliates.

import copy
import json
from typing import Any, Dict, Generator, List, Union, AsyncGenerator

from dashscope.api_entities.dashscope_response import (GenerationResponse,
                                                       Message, Role)
from dashscope.client.base_api import BaseAioApi, BaseApi
from dashscope.common.constants import (CUSTOMIZED_MODEL_ID,
                                        DEPRECATED_MESSAGE, HISTORY, MESSAGES,
                                        PROMPT)
from dashscope.common.error import InputRequired, ModelRequired
from dashscope.common.logging import logger
from dashscope.common.utils import _get_task_group_and_task
from dashscope.utils.param_utils import ParamUtil
from dashscope.utils.message_utils import merge_single_response


class Generation(BaseApi):
    task = 'text-generation'
    """API for AI-Generated Content (AIGC) models.

    """
    class Models:
        """@deprecated, use qwen_turbo instead"""
        qwen_v1 = 'qwen-v1'
        """@deprecated, use qwen_plus instead"""
        qwen_plus_v1 = 'qwen-plus-v1'

        bailian_v1 = 'bailian-v1'
        dolly_12b_v2 = 'dolly-12b-v2'
        qwen_turbo = 'qwen-turbo'
        qwen_plus = 'qwen-plus'
        qwen_max = 'qwen-max'

    @classmethod
    def call(
            cls,
            model: str,
            prompt: Any = None,
            history: list = None,
            api_key: str = None,
            messages: List[Message] = None,
            plugins: Union[str, Dict[str, Any]] = None,
            workspace: str = None,
            **kwargs
    ) -> Union[GenerationResponse, Generator[GenerationResponse, None, None]]:
"""Call generation model service.
|
|
|
|
Args:
|
|
model (str): The requested model, such as qwen-turbo
|
|
prompt (Any): The input prompt.
|
|
history (list):The user provided history, deprecated
|
|
examples:
|
|
[{'user':'The weather is fine today.',
|
|
'bot': 'Suitable for outings'}].
|
|
Defaults to None.
|
|
api_key (str, optional): The api api_key, can be None,
|
|
if None, will get by default rule(TODO: api key doc).
|
|
messages (list): The generation messages.
|
|
examples:
|
|
[{'role': 'user',
|
|
'content': 'The weather is fine today.'},
|
|
{'role': 'assistant', 'content': 'Suitable for outings'}]
|
|
plugins (Any): The plugin config. Can be plugins config str, or dict.
|
|
**kwargs:
|
|
stream(bool, `optional`): Enable server-sent events
|
|
(ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events) # noqa E501
|
|
the result will back partially[qwen-turbo,bailian-v1].
|
|
temperature(float, `optional`): Used to control the degree
|
|
of randomness and diversity. Specifically, the temperature
|
|
value controls the degree to which the probability distribution
|
|
of each candidate word is smoothed when generating text.
|
|
A higher temperature value will reduce the peak value of
|
|
the probability, allowing more low-probability words to be
|
|
selected, and the generated results will be more diverse;
|
|
while a lower temperature value will enhance the peak value
|
|
of the probability, making it easier for high-probability
|
|
words to be selected, the generated results are more
|
|
deterministic, range(0, 2) .[qwen-turbo,qwen-plus].
|
|
top_p(float, `optional`): A sampling strategy, called nucleus
|
|
sampling, where the model considers the results of the
|
|
tokens with top_p probability mass. So 0.1 means only
|
|
the tokens comprising the top 10% probability mass are
|
|
considered[qwen-turbo,bailian-v1].
|
|
top_k(int, `optional`): The size of the sample candidate set when generated. # noqa E501
|
|
For example, when the value is 50, only the 50 highest-scoring tokens # noqa E501
|
|
in a single generation form a randomly sampled candidate set. # noqa E501
|
|
The larger the value, the higher the randomness generated; # noqa E501
|
|
the smaller the value, the higher the certainty generated. # noqa E501
|
|
The default value is 0, which means the top_k policy is # noqa E501
|
|
not enabled. At this time, only the top_p policy takes effect. # noqa E501
|
|
enable_search(bool, `optional`): Whether to enable web search(quark). # noqa E501
|
|
Currently works best only on the first round of conversation.
|
|
Default to False, support model: [qwen-turbo].
|
|
customized_model_id(str, required) The enterprise-specific
|
|
large model id, which needs to be generated from the
|
|
operation background of the enterprise-specific
|
|
large model product, support model: [bailian-v1].
|
|
result_format(str, `optional`): [message|text] Set result result format. # noqa E501
|
|
Default result is text
|
|
incremental_output(bool, `optional`): Used to control the streaming output mode. # noqa E501
|
|
If true, the subsequent output will include the previously input content. # noqa E501
|
|
Otherwise, the subsequent output will not include the previously output # noqa E501
|
|
content. Default false.
|
|
stop(list[str] or list[list[int]], `optional`): Used to control the generation to stop # noqa E501
|
|
when encountering setting str or token ids, the result will not include # noqa E501
|
|
stop words or tokens.
|
|
max_tokens(int, `optional`): The maximum token num expected to be output. It should be # noqa E501
|
|
noted that the length generated by the model will only be less than max_tokens, # noqa E501
|
|
not necessarily equal to it. If max_tokens is set too large, the service will # noqa E501
|
|
directly prompt that the length exceeds the limit. It is generally # noqa E501
|
|
not recommended to set this value.
|
|
repetition_penalty(float, `optional`): Used to control the repeatability when generating models. # noqa E501
|
|
Increasing repetition_penalty can reduce the duplication of model generation. # noqa E501
|
|
1.0 means no punishment.
|
|
workspace (str): The dashscope workspace id.
|
|
Raises:
|
|
InvalidInput: The history and auto_history are mutually exclusive.
|
|
|
|
Returns:
|
|
Union[GenerationResponse,
|
|
Generator[GenerationResponse, None, None]]: If
|
|
stream is True, return Generator, otherwise GenerationResponse.
|
|
"""
|
|
        if (prompt is None or not prompt) and (messages is None
                                               or not messages):
            raise InputRequired('prompt or messages is required!')
        if model is None or not model:
            raise ModelRequired('Model is required!')
        task_group, function = _get_task_group_and_task(__name__)
        if plugins is not None:
            headers = kwargs.pop('headers', {})
            if isinstance(plugins, str):
                headers['X-DashScope-Plugin'] = plugins
            else:
                headers['X-DashScope-Plugin'] = json.dumps(plugins)
            kwargs['headers'] = headers
        input, parameters = cls._build_input_parameters(
            model, prompt, history, messages, **kwargs)

        is_stream = parameters.get('stream', False)
        # Check if we need to merge incremental output
        is_incremental_output = kwargs.get('incremental_output', None)
        to_merge_incremental_output = False
        if (ParamUtil.should_modify_incremental_output(model) and
                is_stream and is_incremental_output is False):
            to_merge_incremental_output = True
            parameters['incremental_output'] = True

        # Pass incremental_to_full flag via headers user-agent
        if 'headers' not in parameters:
            parameters['headers'] = {}
        flag = '1' if to_merge_incremental_output else '0'
        parameters['headers']['user-agent'] = f'incremental_to_full/{flag}'

        response = super().call(model=model,
                                task_group=task_group,
                                task=Generation.task,
                                function=function,
                                api_key=api_key,
                                input=input,
                                workspace=workspace,
                                **parameters)
        if is_stream:
            if to_merge_incremental_output:
                # Extract n parameter for merge logic
                n = parameters.get('n', 1)
                return cls._merge_generation_response(response, n)
            else:
                return (GenerationResponse.from_api_response(rsp)
                        for rsp in response)
        else:
            return GenerationResponse.from_api_response(response)
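
    # Note on incremental_output handling in call(): for models flagged by
    # ParamUtil.should_modify_incremental_output, a streaming request with
    # incremental_output=False is rewritten to incremental_output=True before
    # it is sent, and the incremental chunks are merged back into full
    # responses locally by _merge_generation_response, so callers still see
    # non-incremental output.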

    @classmethod
    def _build_input_parameters(cls, model, prompt, history, messages,
                                **kwargs):
        if model == Generation.Models.qwen_v1:
            logger.warning(
                'Model %s is deprecated, use %s instead!' %
                (Generation.Models.qwen_v1, Generation.Models.qwen_turbo))
        if model == Generation.Models.qwen_plus_v1:
            logger.warning(
                'Model %s is deprecated, use %s instead!' %
                (Generation.Models.qwen_plus_v1, Generation.Models.qwen_plus))
        parameters = {}
        input = {}
        if history is not None:
            logger.warning(DEPRECATED_MESSAGE)
            input[HISTORY] = history
            if prompt is not None and prompt:
                input[PROMPT] = prompt
        elif messages is not None:
            msgs = copy.deepcopy(messages)
            if prompt is not None and prompt:
                msgs.append({'role': Role.USER, 'content': prompt})
            input = {MESSAGES: msgs}
        else:
            input[PROMPT] = prompt

        if model.startswith('qwen'):
            enable_search = kwargs.pop('enable_search', False)
            if enable_search:
                parameters['enable_search'] = enable_search
        elif model.startswith('bailian'):
            customized_model_id = kwargs.pop('customized_model_id', None)
            if customized_model_id is None:
                raise InputRequired('customized_model_id is required for %s' %
                                    model)
            input[CUSTOMIZED_MODEL_ID] = customized_model_id

        return input, {**parameters, **kwargs}

    @classmethod
    def _merge_generation_response(cls, response, n=1) -> Generator[GenerationResponse, None, None]:
        """Merge incremental response chunks to simulate non-incremental output."""
        accumulated_data = {}
        for rsp in response:
            parsed_response = GenerationResponse.from_api_response(rsp)
            result = merge_single_response(parsed_response, accumulated_data, n)
            if result is True:
                yield parsed_response
            elif isinstance(result, list):
                # Multiple responses to yield (for n>1 non-stop cases)
                for resp in result:
                    yield resp
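

# Usage sketch (illustrative, not executed): a minimal synchronous example of
# Generation.call, assuming a valid DashScope API key is configured (e.g. via
# the DASHSCOPE_API_KEY environment variable) and that Generation is exported
# from the top-level dashscope package.
#
#     from dashscope import Generation
#
#     messages = [{'role': 'user', 'content': 'The weather is fine today.'}]
#
#     # Non-streaming call; returns a single GenerationResponse.
#     rsp = Generation.call(model=Generation.Models.qwen_turbo,
#                           messages=messages,
#                           result_format='message')
#
#     # Streaming call; returns a generator of partial GenerationResponse
#     # objects.
#     for partial in Generation.call(model=Generation.Models.qwen_turbo,
#                                    messages=messages,
#                                    result_format='message',
#                                    stream=True):
#         print(partial)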


class AioGeneration(BaseAioApi):
    task = 'text-generation'
    """API for AI-Generated Content (AIGC) models.

    """
    class Models:
        """@deprecated, use qwen_turbo instead"""
        qwen_v1 = 'qwen-v1'
        """@deprecated, use qwen_plus instead"""
        qwen_plus_v1 = 'qwen-plus-v1'

        bailian_v1 = 'bailian-v1'
        dolly_12b_v2 = 'dolly-12b-v2'
        qwen_turbo = 'qwen-turbo'
        qwen_plus = 'qwen-plus'
        qwen_max = 'qwen-max'

    @classmethod
    async def call(
            cls,
            model: str,
            prompt: Any = None,
            history: list = None,
            api_key: str = None,
            messages: List[Message] = None,
            plugins: Union[str, Dict[str, Any]] = None,
            workspace: str = None,
            **kwargs
    ) -> Union[GenerationResponse, AsyncGenerator[GenerationResponse, None]]:
"""Call generation model service.
|
|
|
|
Args:
|
|
model (str): The requested model, such as qwen-turbo
|
|
prompt (Any): The input prompt.
|
|
history (list):The user provided history, deprecated
|
|
examples:
|
|
[{'user':'The weather is fine today.',
|
|
'bot': 'Suitable for outings'}].
|
|
Defaults to None.
|
|
api_key (str, optional): The api api_key, can be None,
|
|
if None, will get by default rule(TODO: api key doc).
|
|
messages (list): The generation messages.
|
|
examples:
|
|
[{'role': 'user',
|
|
'content': 'The weather is fine today.'},
|
|
{'role': 'assistant', 'content': 'Suitable for outings'}]
|
|
plugins (Any): The plugin config. Can be plugins config str, or dict.
|
|
**kwargs:
|
|
stream(bool, `optional`): Enable server-sent events
|
|
(ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events) # noqa E501
|
|
the result will back partially[qwen-turbo,bailian-v1].
|
|
temperature(float, `optional`): Used to control the degree
|
|
of randomness and diversity. Specifically, the temperature
|
|
value controls the degree to which the probability distribution
|
|
of each candidate word is smoothed when generating text.
|
|
A higher temperature value will reduce the peak value of
|
|
the probability, allowing more low-probability words to be
|
|
selected, and the generated results will be more diverse;
|
|
while a lower temperature value will enhance the peak value
|
|
of the probability, making it easier for high-probability
|
|
words to be selected, the generated results are more
|
|
deterministic, range(0, 2) .[qwen-turbo,qwen-plus].
|
|
top_p(float, `optional`): A sampling strategy, called nucleus
|
|
sampling, where the model considers the results of the
|
|
tokens with top_p probability mass. So 0.1 means only
|
|
the tokens comprising the top 10% probability mass are
|
|
considered[qwen-turbo,bailian-v1].
|
|
top_k(int, `optional`): The size of the sample candidate set when generated. # noqa E501
|
|
For example, when the value is 50, only the 50 highest-scoring tokens # noqa E501
|
|
in a single generation form a randomly sampled candidate set. # noqa E501
|
|
The larger the value, the higher the randomness generated; # noqa E501
|
|
the smaller the value, the higher the certainty generated. # noqa E501
|
|
The default value is 0, which means the top_k policy is # noqa E501
|
|
not enabled. At this time, only the top_p policy takes effect. # noqa E501
|
|
enable_search(bool, `optional`): Whether to enable web search(quark). # noqa E501
|
|
Currently works best only on the first round of conversation.
|
|
Default to False, support model: [qwen-turbo].
|
|
customized_model_id(str, required) The enterprise-specific
|
|
large model id, which needs to be generated from the
|
|
operation background of the enterprise-specific
|
|
large model product, support model: [bailian-v1].
|
|
result_format(str, `optional`): [message|text] Set result result format. # noqa E501
|
|
Default result is text
|
|
incremental_output(bool, `optional`): Used to control the streaming output mode. # noqa E501
|
|
If true, the subsequent output will include the previously input content. # noqa E501
|
|
Otherwise, the subsequent output will not include the previously output # noqa E501
|
|
content. Default false.
|
|
stop(list[str] or list[list[int]], `optional`): Used to control the generation to stop # noqa E501
|
|
when encountering setting str or token ids, the result will not include # noqa E501
|
|
stop words or tokens.
|
|
max_tokens(int, `optional`): The maximum token num expected to be output. It should be # noqa E501
|
|
noted that the length generated by the model will only be less than max_tokens, # noqa E501
|
|
not necessarily equal to it. If max_tokens is set too large, the service will # noqa E501
|
|
directly prompt that the length exceeds the limit. It is generally # noqa E501
|
|
not recommended to set this value.
|
|
repetition_penalty(float, `optional`): Used to control the repeatability when generating models. # noqa E501
|
|
Increasing repetition_penalty can reduce the duplication of model generation. # noqa E501
|
|
1.0 means no punishment.
|
|
workspace (str): The dashscope workspace id.
|
|
Raises:
|
|
InvalidInput: The history and auto_history are mutually exclusive.
|
|
|
|
Returns:
|
|
Union[GenerationResponse,
|
|
AsyncGenerator[GenerationResponse, None]]: If
|
|
stream is True, return AsyncGenerator, otherwise GenerationResponse.
|
|
"""
|
|
        if (prompt is None or not prompt) and (messages is None
                                               or not messages):
            raise InputRequired('prompt or messages is required!')
        if model is None or not model:
            raise ModelRequired('Model is required!')
        task_group, function = _get_task_group_and_task(__name__)
        if plugins is not None:
            headers = kwargs.pop('headers', {})
            if isinstance(plugins, str):
                headers['X-DashScope-Plugin'] = plugins
            else:
                headers['X-DashScope-Plugin'] = json.dumps(plugins)
            kwargs['headers'] = headers
        input, parameters = Generation._build_input_parameters(
            model, prompt, history, messages, **kwargs)

        is_stream = parameters.get('stream', False)
        # Check if we need to merge incremental output
        is_incremental_output = kwargs.get('incremental_output', None)
        to_merge_incremental_output = False
        if (ParamUtil.should_modify_incremental_output(model) and
                is_stream and is_incremental_output is False):
            to_merge_incremental_output = True
            parameters['incremental_output'] = True

        # Pass incremental_to_full flag via headers user-agent
        if 'headers' not in parameters:
            parameters['headers'] = {}
        flag = '1' if to_merge_incremental_output else '0'
        parameters['headers']['user-agent'] = f'incremental_to_full/{flag}'

        response = await super().call(model=model,
                                      task_group=task_group,
                                      task=Generation.task,
                                      function=function,
                                      api_key=api_key,
                                      input=input,
                                      workspace=workspace,
                                      **parameters)
        if is_stream:
            if to_merge_incremental_output:
                # Extract n parameter for merge logic
                n = parameters.get('n', 1)
                return cls._merge_generation_response(response, n)
            else:
                return cls._stream_responses(response)
        else:
            return GenerationResponse.from_api_response(response)

    @classmethod
    async def _stream_responses(cls, response) -> AsyncGenerator[GenerationResponse, None]:
        """Convert async response stream to GenerationResponse stream."""
        # Type hint: when stream=True, response is actually an AsyncIterable
        async for rsp in response:  # type: ignore
            yield GenerationResponse.from_api_response(rsp)

    @classmethod
    async def _merge_generation_response(cls, response, n=1) -> AsyncGenerator[GenerationResponse, None]:
        """Async version of merging incremental response chunks."""
        accumulated_data = {}

        async for rsp in response:  # type: ignore
            parsed_response = GenerationResponse.from_api_response(rsp)
            result = merge_single_response(parsed_response, accumulated_data, n)
            if result is True:
                yield parsed_response
            elif isinstance(result, list):
                # Multiple responses to yield (for n>1 non-stop cases)
                for resp in result:
                    yield resp
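

# Async usage sketch (illustrative, not executed): a minimal example of
# AioGeneration.call, assuming AioGeneration is exported from the top-level
# dashscope package and an API key is configured in the environment.
#
#     import asyncio
#     from dashscope import AioGeneration
#
#     async def main():
#         # With stream=True, the awaited call returns an async generator of
#         # partial GenerationResponse objects.
#         responses = await AioGeneration.call(
#             model='qwen-turbo',
#             messages=[{'role': 'user', 'content': 'Hello'}],
#             result_format='message',
#             stream=True)
#         async for partial in responses:
#             print(partial)
#
#     asyncio.run(main())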