# Copyright (c) Alibaba, Inc. and its affiliates.

import copy
import json
from typing import Any, Dict, Generator, List, Union, AsyncGenerator

from dashscope.api_entities.dashscope_response import (GenerationResponse,
                                                       Message, Role)
from dashscope.client.base_api import BaseAioApi, BaseApi
from dashscope.common.constants import (CUSTOMIZED_MODEL_ID,
                                        DEPRECATED_MESSAGE, HISTORY, MESSAGES,
                                        PROMPT)
from dashscope.common.error import InputRequired, ModelRequired
from dashscope.common.logging import logger
from dashscope.common.utils import _get_task_group_and_task
from dashscope.utils.param_utils import ParamUtil
from dashscope.utils.message_utils import merge_single_response


class Generation(BaseApi):
    task = 'text-generation'
    """API for AI-Generated Content(AIGC) models.

    """
    class Models:
        """@deprecated, use qwen_turbo instead"""
        qwen_v1 = 'qwen-v1'
        """@deprecated, use qwen_plus instead"""
        qwen_plus_v1 = 'qwen-plus-v1'

        bailian_v1 = 'bailian-v1'
        dolly_12b_v2 = 'dolly-12b-v2'
        qwen_turbo = 'qwen-turbo'
        qwen_plus = 'qwen-plus'
        qwen_max = 'qwen-max'

    @classmethod
    def call(
        cls,
        model: str,
        prompt: Any = None,
        history: list = None,
        api_key: str = None,
        messages: List[Message] = None,
        plugins: Union[str, Dict[str, Any]] = None,
        workspace: str = None,
        **kwargs
    ) -> Union[GenerationResponse, Generator[GenerationResponse, None, None]]:
        """Call generation model service.

        Args:
            model (str): The requested model, such as qwen-turbo
            prompt (Any): The input prompt.
            history (list):The user provided history, deprecated
                examples:
                    [{'user':'The weather is fine today.',
                    'bot': 'Suitable for outings'}].
                Defaults to None.
            api_key (str, optional): The api api_key, can be None,
                if None, will get by default rule(TODO: api key doc).
            messages (list): The generation messages.
                examples:
                    [{'role': 'user',
                      'content': 'The weather is fine today.'},
                      {'role': 'assistant', 'content': 'Suitable for outings'}]
            plugins (Any): The plugin config. Can be plugins config str, or dict.
            **kwargs:
                stream(bool, `optional`): Enable server-sent events
                    (ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events)  # noqa E501
                    the result will back partially[qwen-turbo,bailian-v1].
                temperature(float, `optional`): Used to control the degree
                    of randomness and diversity. Specifically, the temperature
                    value controls the degree to which the probability distribution
                    of each candidate word is smoothed when generating text.
                    A higher temperature value will reduce the peak value of
                    the probability, allowing more low-probability words to be
                    selected, and the generated results will be more diverse;
                    while a lower temperature value will enhance the peak value
                    of the probability, making it easier for high-probability
                    words to be selected, the generated results are more
                    deterministic, range(0, 2) .[qwen-turbo,qwen-plus].
                top_p(float, `optional`): A sampling strategy, called nucleus
                    sampling, where the model considers the results of the
                    tokens with top_p probability mass. So 0.1 means only
                    the tokens comprising the top 10% probability mass are
                    considered[qwen-turbo,bailian-v1].
                top_k(int, `optional`): The size of the sample candidate set when generated.  # noqa E501
                    For example, when the value is 50, only the 50 highest-scoring tokens  # noqa E501
                    in a single generation form a randomly sampled candidate set. # noqa E501
                    The larger the value, the higher the randomness generated;  # noqa E501
                    the smaller the value, the higher the certainty generated. # noqa E501
                    The default value is 0, which means the top_k policy is  # noqa E501
                    not enabled. At this time, only the top_p policy takes effect. # noqa E501
                enable_search(bool, `optional`): Whether to enable web search(quark).  # noqa E501
                    Currently works best only on the first round of conversation.
                    Default to False, support model: [qwen-turbo].
                customized_model_id(str, required) The enterprise-specific
                    large model id, which needs to be generated from the
                    operation background of the enterprise-specific
                    large model product, support model: [bailian-v1].
                result_format(str, `optional`): [message|text] Set result result format. # noqa E501
                    Default result is text
                incremental_output(bool, `optional`): Used to control the streaming output mode. # noqa E501
                    If true, the subsequent output will include the previously input content. # noqa E501
                    Otherwise, the subsequent output will not include the previously output # noqa E501
                    content. Default false.
                stop(list[str] or list[list[int]], `optional`): Used to control the generation to stop  # noqa E501
                    when encountering setting str or token ids, the result will not include # noqa E501
                    stop words or tokens.
                max_tokens(int, `optional`): The maximum token num expected to be output. It should be # noqa E501
                    noted that the length generated by the model will only be less than max_tokens,  # noqa E501
                    not necessarily equal to it. If max_tokens is set too large, the service will # noqa E501
                    directly prompt that the length exceeds the limit. It is generally # noqa E501
                    not recommended to set this value.
                repetition_penalty(float, `optional`): Used to control the repeatability when generating models.  # noqa E501
                    Increasing repetition_penalty can reduce the duplication of model generation.  # noqa E501
                    1.0 means no punishment.
            workspace (str): The dashscope workspace id.
        Raises:
            InvalidInput: The history and auto_history are mutually exclusive.

        Returns:
            Union[GenerationResponse,
                  Generator[GenerationResponse, None, None]]: If
            stream is True, return Generator, otherwise GenerationResponse.
        """
        if (prompt is None or not prompt) and (messages is None
                                               or not messages):
            raise InputRequired('prompt or messages is required!')
        if model is None or not model:
            raise ModelRequired('Model is required!')
        task_group, function = _get_task_group_and_task(__name__)
        if plugins is not None:
            headers = kwargs.pop('headers', {})
            if isinstance(plugins, str):
                headers['X-DashScope-Plugin'] = plugins
            else:
                headers['X-DashScope-Plugin'] = json.dumps(plugins)
            kwargs['headers'] = headers
        input, parameters = cls._build_input_parameters(
            model, prompt, history, messages, **kwargs)

        is_stream = parameters.get('stream', False)
        # Check if we need to merge incremental output
        is_incremental_output = kwargs.get('incremental_output', None)
        to_merge_incremental_output = False
        if (ParamUtil.should_modify_incremental_output(model) and
                is_stream and is_incremental_output is False):
            to_merge_incremental_output = True
            parameters['incremental_output'] = True

        # Pass incremental_to_full flag via headers user-agent
        if 'headers' not in parameters:
            parameters['headers'] = {}
        flag = '1' if to_merge_incremental_output else '0'
        parameters['headers']['user-agent'] = f'incremental_to_full/{flag}'

        response = super().call(model=model,
                                task_group=task_group,
                                task=Generation.task,
                                function=function,
                                api_key=api_key,
                                input=input,
                                workspace=workspace,
                                **parameters)
        if is_stream:
            if to_merge_incremental_output:
                # Extract n parameter for merge logic
                n = parameters.get('n', 1)
                return cls._merge_generation_response(response, n)
            else:
                return (GenerationResponse.from_api_response(rsp)
                        for rsp in response)
        else:
            return GenerationResponse.from_api_response(response)

    @classmethod
    def _build_input_parameters(cls, model, prompt, history, messages,
                                **kwargs):
        if model == Generation.Models.qwen_v1:
            logger.warning(
                'Model %s is deprecated, use %s instead!' %
                (Generation.Models.qwen_v1, Generation.Models.qwen_turbo))
        if model == Generation.Models.qwen_plus_v1:
            logger.warning(
                'Model %s is deprecated, use %s instead!' %
                (Generation.Models.qwen_plus_v1, Generation.Models.qwen_plus))
        parameters = {}
        input = {}
        if history is not None:
            logger.warning(DEPRECATED_MESSAGE)
            input[HISTORY] = history
            if prompt is not None and prompt:
                input[PROMPT] = prompt
        elif messages is not None:
            msgs = copy.deepcopy(messages)
            if prompt is not None and prompt:
                msgs.append({'role': Role.USER, 'content': prompt})
            input = {MESSAGES: msgs}
        else:
            input[PROMPT] = prompt

        if model.startswith('qwen'):
            enable_search = kwargs.pop('enable_search', False)
            if enable_search:
                parameters['enable_search'] = enable_search
        elif model.startswith('bailian'):
            customized_model_id = kwargs.pop('customized_model_id', None)
            if customized_model_id is None:
                raise InputRequired('customized_model_id is required for %s' %
                                    model)
            input[CUSTOMIZED_MODEL_ID] = customized_model_id

        return input, {**parameters, **kwargs}

    @classmethod
    def _merge_generation_response(cls, response, n=1) -> Generator[GenerationResponse, None, None]:
        """Merge incremental response chunks to simulate non-incremental output."""
        accumulated_data = {}
        for rsp in response:
            parsed_response = GenerationResponse.from_api_response(rsp)
            result = merge_single_response(parsed_response, accumulated_data, n)
            if result is True:
                yield parsed_response
            elif isinstance(result, list):
                # Multiple responses to yield (for n>1 non-stop cases)
                for resp in result:
                    yield resp


class AioGeneration(BaseAioApi):
    task = 'text-generation'
    """API for AI-Generated Content(AIGC) models.

    """
    class Models:
        """@deprecated, use qwen_turbo instead"""
        qwen_v1 = 'qwen-v1'
        """@deprecated, use qwen_plus instead"""
        qwen_plus_v1 = 'qwen-plus-v1'

        bailian_v1 = 'bailian-v1'
        dolly_12b_v2 = 'dolly-12b-v2'
        qwen_turbo = 'qwen-turbo'
        qwen_plus = 'qwen-plus'
        qwen_max = 'qwen-max'

    @classmethod
    async def call(
        cls,
        model: str,
        prompt: Any = None,
        history: list = None,
        api_key: str = None,
        messages: List[Message] = None,
        plugins: Union[str, Dict[str, Any]] = None,
        workspace: str = None,
        **kwargs
    ) -> Union[GenerationResponse, AsyncGenerator[GenerationResponse, None]]:
        """Call generation model service.

        Args:
            model (str): The requested model, such as qwen-turbo
            prompt (Any): The input prompt.
            history (list):The user provided history, deprecated
                examples:
                    [{'user':'The weather is fine today.',
                    'bot': 'Suitable for outings'}].
                Defaults to None.
            api_key (str, optional): The api api_key, can be None,
                if None, will get by default rule(TODO: api key doc).
            messages (list): The generation messages.
                examples:
                    [{'role': 'user',
                      'content': 'The weather is fine today.'},
                      {'role': 'assistant', 'content': 'Suitable for outings'}]
            plugins (Any): The plugin config. Can be plugins config str, or dict.
            **kwargs:
                stream(bool, `optional`): Enable server-sent events
                    (ref: https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events)  # noqa E501
                    the result will back partially[qwen-turbo,bailian-v1].
                temperature(float, `optional`): Used to control the degree
                    of randomness and diversity. Specifically, the temperature
                    value controls the degree to which the probability distribution
                    of each candidate word is smoothed when generating text.
                    A higher temperature value will reduce the peak value of
                    the probability, allowing more low-probability words to be
                    selected, and the generated results will be more diverse;
                    while a lower temperature value will enhance the peak value
                    of the probability, making it easier for high-probability
                    words to be selected, the generated results are more
                    deterministic, range(0, 2) .[qwen-turbo,qwen-plus].
                top_p(float, `optional`): A sampling strategy, called nucleus
                    sampling, where the model considers the results of the
                    tokens with top_p probability mass. So 0.1 means only
                    the tokens comprising the top 10% probability mass are
                    considered[qwen-turbo,bailian-v1].
                top_k(int, `optional`): The size of the sample candidate set when generated.  # noqa E501
                    For example, when the value is 50, only the 50 highest-scoring tokens  # noqa E501
                    in a single generation form a randomly sampled candidate set. # noqa E501
                    The larger the value, the higher the randomness generated;  # noqa E501
                    the smaller the value, the higher the certainty generated. # noqa E501
                    The default value is 0, which means the top_k policy is  # noqa E501
                    not enabled. At this time, only the top_p policy takes effect. # noqa E501
                enable_search(bool, `optional`): Whether to enable web search(quark).  # noqa E501
                    Currently works best only on the first round of conversation.
                    Default to False, support model: [qwen-turbo].
                customized_model_id(str, required) The enterprise-specific
                    large model id, which needs to be generated from the
                    operation background of the enterprise-specific
                    large model product, support model: [bailian-v1].
                result_format(str, `optional`): [message|text] Set result result format. # noqa E501
                    Default result is text
                incremental_output(bool, `optional`): Used to control the streaming output mode. # noqa E501
                    If true, the subsequent output will include the previously input content. # noqa E501
                    Otherwise, the subsequent output will not include the previously output # noqa E501
                    content. Default false.
                stop(list[str] or list[list[int]], `optional`): Used to control the generation to stop  # noqa E501
                    when encountering setting str or token ids, the result will not include # noqa E501
                    stop words or tokens.
                max_tokens(int, `optional`): The maximum token num expected to be output. It should be # noqa E501
                    noted that the length generated by the model will only be less than max_tokens,  # noqa E501
                    not necessarily equal to it. If max_tokens is set too large, the service will # noqa E501
                    directly prompt that the length exceeds the limit. It is generally # noqa E501
                    not recommended to set this value.
                repetition_penalty(float, `optional`): Used to control the repeatability when generating models.  # noqa E501
                    Increasing repetition_penalty can reduce the duplication of model generation.  # noqa E501
                    1.0 means no punishment.
            workspace (str): The dashscope workspace id.
        Raises:
            InvalidInput: The history and auto_history are mutually exclusive.

        Returns:
            Union[GenerationResponse,
                  AsyncGenerator[GenerationResponse, None]]: If
            stream is True, return AsyncGenerator, otherwise GenerationResponse.
        """
        if (prompt is None or not prompt) and (messages is None
                                               or not messages):
            raise InputRequired('prompt or messages is required!')
        if model is None or not model:
            raise ModelRequired('Model is required!')
        task_group, function = _get_task_group_and_task(__name__)
        if plugins is not None:
            headers = kwargs.pop('headers', {})
            if isinstance(plugins, str):
                headers['X-DashScope-Plugin'] = plugins
            else:
                headers['X-DashScope-Plugin'] = json.dumps(plugins)
            kwargs['headers'] = headers
        input, parameters = Generation._build_input_parameters(
            model, prompt, history, messages, **kwargs)

        is_stream = parameters.get('stream', False)
        # Check if we need to merge incremental output
        is_incremental_output = kwargs.get('incremental_output', None)
        to_merge_incremental_output = False
        if (ParamUtil.should_modify_incremental_output(model) and
                is_stream and is_incremental_output is False):
            to_merge_incremental_output = True
            parameters['incremental_output'] = True

        # Pass incremental_to_full flag via headers user-agent
        if 'headers' not in parameters:
            parameters['headers'] = {}
        flag = '1' if to_merge_incremental_output else '0'
        parameters['headers']['user-agent'] = f'incremental_to_full/{flag}'

        response = await super().call(model=model,
                                      task_group=task_group,
                                      task=Generation.task,
                                      function=function,
                                      api_key=api_key,
                                      input=input,
                                      workspace=workspace,
                                      **parameters)
        if is_stream:
            if to_merge_incremental_output:
                # Extract n parameter for merge logic
                n = parameters.get('n', 1)
                return cls._merge_generation_response(response, n)
            else:
                return cls._stream_responses(response)
        else:
            return GenerationResponse.from_api_response(response)

    @classmethod
    async def _stream_responses(cls, response) -> AsyncGenerator[GenerationResponse, None]:
        """Convert async response stream to GenerationResponse stream."""
        # Type hint: when stream=True, response is actually an AsyncIterable
        async for rsp in response:  # type: ignore
            yield GenerationResponse.from_api_response(rsp)

    @classmethod
    async def _merge_generation_response(cls, response, n=1) -> AsyncGenerator[GenerationResponse, None]:
        """Async version of merge incremental response chunks."""
        accumulated_data = {}

        async for rsp in response:  # type: ignore
            parsed_response = GenerationResponse.from_api_response(rsp)
            result = merge_single_response(parsed_response, accumulated_data, n)
            if result is True:
                yield parsed_response
            elif isinstance(result, list):
                # Multiple responses to yield (for n>1 non-stop cases)
                for resp in result:
                    yield resp