Add adaptation for the surround reconnaissance scenario
@@ -75,7 +75,7 @@ class SplitDelimiterBehavior(Enum):
|
||||
CONTIGUOUS = "contiguous"
|
||||
|
||||
|
||||
from .tokenizers import (
|
||||
from .tokenizers import ( # type: ignore[import]
|
||||
AddedToken,
|
||||
Encoding,
|
||||
NormalizedString,
|
||||
|
||||
@@ -34,7 +34,15 @@ class AddedToken:
|
||||
Defines whether this token should be skipped when decoding.
|
||||
|
||||
"""
|
||||
def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
|
||||
def __init__(self, content=None, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
@@ -44,6 +52,13 @@ class AddedToken:
|
||||
"""
|
||||
pass
|
||||
|
||||
@content.setter
|
||||
def content(self, value):
|
||||
"""
|
||||
Get the content of this :obj:`AddedToken`
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def lstrip(self):
|
||||
"""
|
||||
@@ -51,6 +66,13 @@ class AddedToken:
|
||||
"""
|
||||
pass
|
||||
|
||||
@lstrip.setter
|
||||
def lstrip(self, value):
|
||||
"""
|
||||
Get the value of the :obj:`lstrip` option
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def normalized(self):
|
||||
"""
|
||||
@@ -58,6 +80,13 @@ class AddedToken:
|
||||
"""
|
||||
pass
|
||||
|
||||
@normalized.setter
|
||||
def normalized(self, value):
|
||||
"""
|
||||
Get the value of the :obj:`normalized` option
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def rstrip(self):
|
||||
"""
|
||||
@@ -65,6 +94,13 @@ class AddedToken:
|
||||
"""
|
||||
pass
|
||||
|
||||
@rstrip.setter
|
||||
def rstrip(self, value):
|
||||
"""
|
||||
Get the value of the :obj:`rstrip` option
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def single_word(self):
|
||||
"""
|
||||
@@ -72,6 +108,13 @@ class AddedToken:
|
||||
"""
|
||||
pass
|
||||
|
||||
@single_word.setter
|
||||
def single_word(self, value):
|
||||
"""
|
||||
Get the value of the :obj:`single_word` option
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def special(self):
|
||||
"""
|
||||
@@ -79,10 +122,28 @@ class AddedToken:
|
||||
"""
|
||||
pass
|
||||
|
||||
@special.setter
|
||||
def special(self, value):
|
||||
"""
|
||||
Get the value of the :obj:`special` option
|
||||
"""
|
||||
pass
|
||||
|
||||
class Encoding:
|
||||
"""
|
||||
The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def attention_mask(self):
|
||||
"""
|
||||
@@ -97,6 +158,20 @@ class Encoding:
|
||||
"""
|
||||
pass
|
||||
|
||||
@attention_mask.setter
|
||||
def attention_mask(self, value):
|
||||
"""
|
||||
The attention mask
|
||||
|
||||
This indicates to the LM which tokens should be attended to, and which should not.
|
||||
This is especially important when batching sequences, where we need to apply
|
||||
padding.
|
||||
|
||||
Returns:
|
||||
:obj:`List[int]`: The attention mask
|
||||
"""
|
||||
pass
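# Illustrative sketch (not part of the generated stub): how the attention mask
# reflects padding. The "tokenizer.json" path is an assumption for the example.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
tok.enable_padding(length=8, pad_id=0, pad_token="[PAD]")
enc = tok.encode("Hello world")
print(enc.attention_mask)  # 1 for real tokens, 0 for the [PAD] positions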
|
||||
|
||||
def char_to_token(self, char_pos, sequence_index=0):
|
||||
"""
|
||||
Get the token that contains the char at the given position in the input sequence.
|
||||
@@ -140,6 +215,19 @@ class Encoding:
|
||||
"""
|
||||
pass
|
||||
|
||||
@ids.setter
|
||||
def ids(self, value):
|
||||
"""
|
||||
The generated IDs
|
||||
|
||||
The IDs are the main input to a Language Model. They are the token indices,
|
||||
the numerical representations that a LM understands.
|
||||
|
||||
Returns:
|
||||
:obj:`List[int]`: The list of IDs
|
||||
"""
|
||||
pass
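# Sketch: ids and tokens are parallel views of the same encoding; the tokenizer
# file path below is only an assumption for the example.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
enc = tok.encode("Hello world")
for token_id, token in zip(enc.ids, enc.tokens):
    print(token_id, token)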
|
||||
|
||||
@staticmethod
|
||||
def merge(encodings, growing_offsets=True):
|
||||
"""
|
||||
@@ -167,6 +255,16 @@ class Encoding:
|
||||
"""
|
||||
pass
|
||||
|
||||
@n_sequences.setter
|
||||
def n_sequences(self, value):
|
||||
"""
|
||||
The number of sequences represented
|
||||
|
||||
Returns:
|
||||
:obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def offsets(self):
|
||||
"""
|
||||
@@ -180,6 +278,19 @@ class Encoding:
|
||||
"""
|
||||
pass
|
||||
|
||||
@offsets.setter
|
||||
def offsets(self, value):
|
||||
"""
|
||||
The offsets associated to each token
|
||||
|
||||
These offsets let you slice the input string, and thus retrieve the original
|
||||
part that led to producing the corresponding token.
|
||||
|
||||
Returns:
|
||||
A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
|
||||
"""
|
||||
pass
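# Sketch: each offset is a (start, end) pair that slices the corresponding token
# back out of the input text; assumes a tokenizer loaded from a hypothetical file.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
text = "Hello world"
enc = tok.encode(text)
for (start, end), token in zip(enc.offsets, enc.tokens):
    print(token, "->", text[start:end])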
|
||||
|
||||
@property
|
||||
def overflowing(self):
|
||||
"""
|
||||
@@ -195,6 +306,21 @@ class Encoding:
|
||||
"""
|
||||
pass
|
||||
|
||||
@overflowing.setter
|
||||
def overflowing(self, value):
|
||||
"""
|
||||
A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
|
||||
|
||||
When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
|
||||
the output into as many pieces as required to match the specified maximum length.
|
||||
This field lets you retrieve all the subsequent pieces.
|
||||
|
||||
When you use pairs of sequences, the overflowing pieces will contain enough
|
||||
variations to cover all the possible combinations, while respecting the provided
|
||||
maximum length.
|
||||
"""
|
||||
pass
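# Sketch: with truncation enabled, anything beyond max_length ends up in
# .overflowing as additional Encoding objects (path and lengths are example values).
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
tok.enable_truncation(max_length=4, stride=1)
enc = tok.encode("a sentence that is clearly longer than four tokens")
print(len(enc.ids))          # at most 4
print(len(enc.overflowing))  # the remaining pieces, each an Encoding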
|
||||
|
||||
def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
|
||||
"""
|
||||
Pad the :class:`~tokenizers.Encoding` at the given length
|
||||
@@ -231,6 +357,20 @@ class Encoding:
|
||||
"""
|
||||
pass
|
||||
|
||||
@sequence_ids.setter
|
||||
def sequence_ids(self, value):
|
||||
"""
|
||||
The generated sequence indices.
|
||||
|
||||
They represent the index of the input sequence associated to each token.
|
||||
The sequence id can be None if the token is not related to any input sequence,
|
||||
like for example with special tokens.
|
||||
|
||||
Returns:
|
||||
A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
|
||||
"""
|
||||
pass
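# Sketch: when encoding a pair, sequence_ids marks which input each token came
# from (None for special tokens). The exact layout depends on the post-processor.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
enc = tok.encode("How are you?", "Fine, thanks.")
print(enc.sequence_ids)  # e.g. [None, 0, 0, 0, None, 1, 1, 1, None]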
|
||||
|
||||
def set_sequence_id(self, sequence_id):
|
||||
"""
|
||||
Set the given sequence index
|
||||
@@ -252,6 +392,18 @@ class Encoding:
|
||||
"""
|
||||
pass
|
||||
|
||||
@special_tokens_mask.setter
|
||||
def special_tokens_mask(self, value):
|
||||
"""
|
||||
The special token mask
|
||||
|
||||
This indicates which tokens are special tokens, and which are not.
|
||||
|
||||
Returns:
|
||||
:obj:`List[int]`: The special tokens mask
|
||||
"""
|
||||
pass
|
||||
|
||||
def token_to_chars(self, token_index):
|
||||
"""
|
||||
Get the offsets of the token at the given index.
|
||||
@@ -314,6 +466,18 @@ class Encoding:
|
||||
"""
|
||||
pass
|
||||
|
||||
@tokens.setter
|
||||
def tokens(self, value):
|
||||
"""
|
||||
The generated tokens
|
||||
|
||||
They are the string representation of the IDs.
|
||||
|
||||
Returns:
|
||||
:obj:`List[str]`: The list of tokens
|
||||
"""
|
||||
pass
|
||||
|
||||
def truncate(self, max_length, stride=0, direction="right"):
|
||||
"""
|
||||
Truncate the :class:`~tokenizers.Encoding` at the given length
|
||||
@@ -346,6 +510,19 @@ class Encoding:
|
||||
"""
|
||||
pass
|
||||
|
||||
@type_ids.setter
|
||||
def type_ids(self, value):
|
||||
"""
|
||||
The generated type IDs
|
||||
|
||||
Generally used for tasks like sequence classification or question answering,
|
||||
these tokens let the LM know which input sequence corresponds to each token.
|
||||
|
||||
Returns:
|
||||
:obj:`List[int]`: The list of type ids
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def word_ids(self):
|
||||
"""
|
||||
@@ -364,6 +541,24 @@ class Encoding:
|
||||
"""
|
||||
pass
|
||||
|
||||
@word_ids.setter
|
||||
def word_ids(self, value):
|
||||
"""
|
||||
The generated word indices.
|
||||
|
||||
They represent the index of the word associated to each token.
|
||||
When the input is pre-tokenized, they correspond to the ID of the given input label,
|
||||
otherwise they correspond to the word indices as defined by the
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
|
||||
|
||||
For special tokens and such (any token that was generated from something that was
|
||||
not part of the input), the output is :obj:`None`
|
||||
|
||||
Returns:
|
||||
A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
|
||||
"""
|
||||
pass
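# Sketch: word_ids maps every token back to the word it was produced from,
# or None for special tokens; the tokenizer path is an assumption.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
enc = tok.encode("unbelievable results")
for token, word_id in zip(enc.tokens, enc.word_ids):
    print(token, word_id)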
|
||||
|
||||
def word_to_chars(self, word_index, sequence_index=0):
|
||||
"""
|
||||
Get the offsets of the word at the given index in one of the input sequences.
|
||||
@@ -417,6 +612,28 @@ class Encoding:
|
||||
"""
|
||||
pass
|
||||
|
||||
@words.setter
|
||||
def words(self, value):
|
||||
"""
|
||||
The generated word indices.
|
||||
|
||||
.. warning::
|
||||
This is deprecated and will be removed in a future version.
|
||||
Please use :obj:`~tokenizers.Encoding.word_ids` instead.
|
||||
|
||||
They represent the index of the word associated to each token.
|
||||
When the input is pre-tokenized, they correspond to the ID of the given input label,
|
||||
otherwise they correspond to the word indices as defined by the
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
|
||||
|
||||
For special tokens and such (any token that was generated from something that was
|
||||
not part of the input), the output is :obj:`None`
|
||||
|
||||
Returns:
|
||||
A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
|
||||
"""
|
||||
pass
|
||||
|
||||
class NormalizedString:
|
||||
"""
|
||||
NormalizedString
|
||||
@@ -429,6 +646,21 @@ class NormalizedString:
|
||||
sequence: str:
|
||||
The string sequence used to initialize this NormalizedString
|
||||
"""
|
||||
def __init__(self, sequence):
|
||||
pass
|
||||
|
||||
def __getitem__(self, key):
|
||||
"""
|
||||
Return self[key].
|
||||
"""
|
||||
pass
|
||||
|
||||
def __getstate__(self, /):
|
||||
"""
|
||||
Helper for pickle.
|
||||
"""
|
||||
pass
|
||||
|
||||
def append(self, s):
|
||||
"""
|
||||
Append the given sequence to the string
|
||||
@@ -505,6 +737,23 @@ class NormalizedString:
|
||||
"""
|
||||
pass
|
||||
|
||||
@normalized.setter
|
||||
def normalized(self, value):
|
||||
"""
|
||||
The normalized part of the string
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def original(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@original.setter
|
||||
def original(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def prepend(self, s):
|
||||
"""
|
||||
Prepend the given sequence to the string
|
||||
@@ -587,6 +836,12 @@ class PreTokenizedString:
|
||||
def __init__(self, sequence):
|
||||
pass
|
||||
|
||||
def __getstate__(self, /):
|
||||
"""
|
||||
Helper for pickle.
|
||||
"""
|
||||
pass
|
||||
|
||||
def get_splits(self, offset_referential="original", offset_type="char"):
|
||||
"""
|
||||
Get the splits currently managed by the PreTokenizedString
|
||||
@@ -671,8 +926,55 @@ class Regex:
|
||||
def __init__(self, pattern):
|
||||
pass
|
||||
|
||||
def __getstate__(self, /):
|
||||
"""
|
||||
Helper for pickle.
|
||||
"""
|
||||
pass
|
||||
|
||||
class Token:
|
||||
pass
|
||||
def __init__(self, id, value, offsets):
|
||||
pass
|
||||
|
||||
def __getstate__(self, /):
|
||||
"""
|
||||
Helper for pickle.
|
||||
"""
|
||||
pass
|
||||
|
||||
def as_tuple(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def id(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@id.setter
|
||||
def id(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def offsets(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@offsets.setter
|
||||
def offsets(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def value(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@value.setter
|
||||
def value(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class Tokenizer:
|
||||
"""
|
||||
@@ -687,6 +989,18 @@ class Tokenizer:
|
||||
def __init__(self, model):
|
||||
pass
|
||||
|
||||
def __getnewargs__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def add_special_tokens(self, tokens):
|
||||
"""
|
||||
Add the given special tokens to the Tokenizer.
|
||||
@@ -890,6 +1204,13 @@ class Tokenizer:
|
||||
"""
|
||||
pass
|
||||
|
||||
@decoder.setter
|
||||
def decoder(self, value):
|
||||
"""
|
||||
The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
|
||||
"""
|
||||
pass
|
||||
|
||||
def enable_padding(
|
||||
self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
|
||||
):
|
||||
@@ -1067,6 +1388,19 @@ class Tokenizer:
|
||||
"""
|
||||
pass
|
||||
|
||||
@encode_special_tokens.setter
|
||||
def encode_special_tokens(self, value):
|
||||
"""
|
||||
Modifies the tokenizer so that special tokens are or are not used
|
||||
during encoding.
|
||||
|
||||
Args:
|
||||
value (:obj:`bool`):
|
||||
Whether to use the special tokens or not
|
||||
|
||||
"""
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def from_buffer(buffer):
|
||||
"""
|
||||
@@ -1187,6 +1521,13 @@ class Tokenizer:
|
||||
"""
|
||||
pass
|
||||
|
||||
@model.setter
|
||||
def model(self, value):
|
||||
"""
|
||||
The :class:`~tokenizers.models.Model` in use by the Tokenizer
|
||||
"""
|
||||
pass
|
||||
|
||||
def no_padding(self):
|
||||
"""
|
||||
Disable padding
|
||||
@@ -1206,6 +1547,13 @@ class Tokenizer:
|
||||
"""
|
||||
pass
|
||||
|
||||
@normalizer.setter
|
||||
def normalizer(self, value):
|
||||
"""
|
||||
The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
|
||||
"""
|
||||
pass
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
@@ -1227,6 +1575,19 @@ class Tokenizer:
|
||||
"""
|
||||
pass
|
||||
|
||||
@padding.setter
|
||||
def padding(self, value):
|
||||
"""
|
||||
Get the current padding parameters
|
||||
|
||||
`Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
|
||||
|
||||
Returns:
|
||||
(:obj:`dict`, `optional`):
|
||||
A dict with the current padding parameters if padding is enabled
|
||||
"""
|
||||
pass
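# Sketch: padding is read-only here; it is configured with enable_padding() and
# cleared with no_padding(). The dict contents shown are illustrative only.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
tok.enable_padding(pad_id=3, pad_token="[PAD]")
print(tok.padding)  # e.g. {'pad_id': 3, 'pad_token': '[PAD]', 'length': None, ...}
tok.no_padding()
print(tok.padding)  # None once padding is disabled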
|
||||
|
||||
def post_process(self, encoding, pair=None, add_special_tokens=True):
|
||||
"""
|
||||
Apply all the post-processing steps to the given encodings.
|
||||
@@ -1261,6 +1622,13 @@ class Tokenizer:
|
||||
"""
|
||||
pass
|
||||
|
||||
@post_processor.setter
|
||||
def post_processor(self, value):
|
||||
"""
|
||||
The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def pre_tokenizer(self):
|
||||
"""
|
||||
@@ -1268,6 +1636,13 @@ class Tokenizer:
|
||||
"""
|
||||
pass
|
||||
|
||||
@pre_tokenizer.setter
|
||||
def pre_tokenizer(self, value):
|
||||
"""
|
||||
The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
|
||||
"""
|
||||
pass
|
||||
|
||||
def save(self, path, pretty=True):
|
||||
"""
|
||||
Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
|
||||
@@ -1360,3 +1735,66 @@ class Tokenizer:
|
||||
A dict with the current truncation parameters if truncation is enabled
|
||||
"""
|
||||
pass
|
||||
|
||||
@truncation.setter
|
||||
def truncation(self, value):
|
||||
"""
|
||||
Get the currently set truncation parameters
|
||||
|
||||
`Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
|
||||
|
||||
Returns:
|
||||
(:obj:`dict`, `optional`):
|
||||
A dict with the current truncation parameters if truncation is enabled
|
||||
"""
|
||||
pass
|
||||
|
||||
from enum import Enum
|
||||
from typing import List, Tuple, Union, Any
|
||||
|
||||
Offsets = Tuple[int, int]
|
||||
TextInputSequence = str
|
||||
PreTokenizedInputSequence = Union[List[str], Tuple[str, ...]]
|
||||
TextEncodeInput = Union[
|
||||
TextInputSequence,
|
||||
Tuple[TextInputSequence, TextInputSequence],
|
||||
List[TextInputSequence],
|
||||
]
|
||||
PreTokenizedEncodeInput = Union[
|
||||
PreTokenizedInputSequence,
|
||||
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
|
||||
List[PreTokenizedInputSequence],
|
||||
]
|
||||
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
|
||||
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
|
||||
|
||||
class OffsetReferential(Enum):
|
||||
ORIGINAL = "original"
|
||||
NORMALIZED = "normalized"
|
||||
|
||||
class OffsetType(Enum):
|
||||
BYTE = "byte"
|
||||
CHAR = "char"
|
||||
|
||||
class SplitDelimiterBehavior(Enum):
|
||||
REMOVED = "removed"
|
||||
ISOLATED = "isolated"
|
||||
MERGED_WITH_PREVIOUS = "merged_with_previous"
|
||||
MERGED_WITH_NEXT = "merged_with_next"
|
||||
CONTIGUOUS = "contiguous"
|
||||
|
||||
from .implementations import (
|
||||
BertWordPieceTokenizer,
|
||||
ByteLevelBPETokenizer,
|
||||
CharBPETokenizer,
|
||||
SentencePieceBPETokenizer,
|
||||
SentencePieceUnigramTokenizer,
|
||||
)
|
||||
|
||||
def __getattr__(name: str) -> Any: ...
|
||||
|
||||
BertWordPieceTokenizer: Any
|
||||
ByteLevelBPETokenizer: Any
|
||||
CharBPETokenizer: Any
|
||||
SentencePieceBPETokenizer: Any
|
||||
SentencePieceUnigramTokenizer: Any
|
||||
|
||||
@@ -7,6 +7,29 @@ class DecodeStream:
|
||||
def __init__(self, ids=None, skip_special_tokens=False):
|
||||
pass
|
||||
|
||||
def __getstate__(self, /):
|
||||
"""
|
||||
Helper for pickle.
|
||||
"""
|
||||
pass
|
||||
|
||||
def step(self, tokenizer, id):
|
||||
"""
|
||||
Streaming decode step
|
||||
|
||||
Args:
|
||||
tokenizer (:class:`~tokenizers.Tokenizer`):
|
||||
The tokenizer to use for decoding
|
||||
id (:obj:`int` or `List[int]`):
|
||||
The next token id or list of token ids to add to the stream
|
||||
|
||||
|
||||
Returns:
|
||||
:obj:`Optional[str]`: The next decoded string chunk, or None if not enough
|
||||
tokens have been provided yet.
|
||||
"""
|
||||
pass
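# Sketch: streaming decode, feeding one id at a time; assumes DecodeStream is
# importable from tokenizers.decoders as in recent versions of the library.
from tokenizers import Tokenizer
from tokenizers.decoders import DecodeStream

tok = Tokenizer.from_file("tokenizer.json")
stream = DecodeStream(skip_special_tokens=True)
for token_id in tok.encode("Hello world").ids:
    chunk = stream.step(tok, token_id)
    if chunk is not None:
        print(chunk, end="")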
|
||||
|
||||
class Decoder:
|
||||
"""
|
||||
Base class for all decoders
|
||||
@@ -14,6 +37,19 @@ class Decoder:
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
a Decoder will return an instance of this class when instantiated.
|
||||
"""
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(decoder):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
@@ -39,6 +75,19 @@ class BPEDecoder(Decoder):
|
||||
def __init__(self, suffix="</w>"):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(decoder):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
@@ -52,6 +101,16 @@ class BPEDecoder(Decoder):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def suffix(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@suffix.setter
|
||||
def suffix(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class ByteFallback(Decoder):
|
||||
"""
|
||||
ByteFallback Decoder
|
||||
@@ -63,6 +122,19 @@ class ByteFallback(Decoder):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(decoder):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
@@ -86,6 +158,19 @@ class ByteLevel(Decoder):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(decoder):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
@@ -115,6 +200,29 @@ class CTC(Decoder):
|
||||
def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
|
||||
pass
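# Sketch of CTC decoding: repeated symbols are collapsed, the pad token is
# dropped and the word delimiter becomes a space (toy inputs, illustrative only).
from tokenizers.decoders import CTC

decoder = CTC(pad_token="<pad>", word_delimiter_token="|", cleanup=True)
print(decoder.decode(["h", "h", "e", "l", "l", "<pad>", "l", "o"]))  # expected: "hello"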
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def cleanup(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@cleanup.setter
|
||||
def cleanup(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(decoder):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
@@ -128,6 +236,26 @@ class CTC(Decoder):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def pad_token(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@pad_token.setter
|
||||
def pad_token(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def word_delimiter_token(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@word_delimiter_token.setter
|
||||
def word_delimiter_token(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class Fuse(Decoder):
|
||||
"""
|
||||
Fuse Decoder
|
||||
@@ -138,6 +266,19 @@ class Fuse(Decoder):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(decoder):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
@@ -169,6 +310,19 @@ class Metaspace(Decoder):
|
||||
def __init__(self, replacement="▁", prepend_scheme="always", split=True):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(decoder):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
@@ -182,6 +336,36 @@ class Metaspace(Decoder):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def prepend_scheme(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@prepend_scheme.setter
|
||||
def prepend_scheme(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def replacement(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@replacement.setter
|
||||
def replacement(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def split(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@split.setter
|
||||
def split(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class Replace(Decoder):
|
||||
"""
|
||||
Replace Decoder
|
||||
@@ -192,6 +376,19 @@ class Replace(Decoder):
|
||||
def __init__(self, pattern, content):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(decoder):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
@@ -216,6 +413,23 @@ class Sequence(Decoder):
|
||||
def __init__(self, decoders):
|
||||
pass
|
||||
|
||||
def __getnewargs__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(decoder):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
@@ -234,7 +448,30 @@ class Strip(Decoder):
|
||||
Strip decoder
|
||||
Strips n left characters of each token, or n right characters of each token
|
||||
"""
|
||||
def __init__(self, content, left=0, right=0):
|
||||
def __init__(self, content=" ", left=0, right=0):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def content(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@content.setter
|
||||
def content(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(decoder):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
@@ -250,6 +487,26 @@ class Strip(Decoder):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def start(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@start.setter
|
||||
def start(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def stop(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@stop.setter
|
||||
def stop(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class WordPiece(Decoder):
|
||||
"""
|
||||
WordPiece Decoder
|
||||
@@ -265,6 +522,29 @@ class WordPiece(Decoder):
|
||||
def __init__(self, prefix="##", cleanup=True):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def cleanup(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@cleanup.setter
|
||||
def cleanup(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(decoder):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
Decode the given list of tokens to a final string
|
||||
@@ -277,3 +557,13 @@ class WordPiece(Decoder):
|
||||
:obj:`str`: The decoded string
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def prefix(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@prefix.setter
|
||||
def prefix(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@@ -187,7 +187,7 @@ class BaseTokenizer:
|
||||
Returns:
|
||||
The normalized string
|
||||
"""
|
||||
return self._tokenizer.normalize(sequence)
|
||||
return self._tokenizer.normalizer.normalize_str(sequence)
|
||||
|
||||
def encode(
|
||||
self,
|
||||
|
||||
@@ -150,7 +150,7 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
|
||||
|
||||
sys.path.append(".")
|
||||
|
||||
import sentencepiece_model_pb2 as model
|
||||
import sentencepiece_model_pb2 as model # type: ignore[import]
|
||||
except Exception:
|
||||
raise Exception(
|
||||
"You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
|
||||
@@ -191,6 +191,6 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
|
||||
"model": "SentencePieceUnigram",
|
||||
}
|
||||
|
||||
obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
|
||||
obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters) # type: ignore[arg-type]
|
||||
BaseTokenizer.__init__(obj, tokenizer, parameters)
|
||||
return obj
|
||||
|
||||
@@ -8,6 +8,17 @@ class Model:
|
||||
|
||||
This class cannot be constructed directly. Please use one of the concrete models.
|
||||
"""
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def get_trainer(self):
|
||||
"""
|
||||
Get the associated :class:`~tokenizers.trainers.Trainer`
|
||||
@@ -131,8 +142,56 @@ class BPE(Model):
|
||||
):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def byte_fallback(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@byte_fallback.setter
|
||||
def byte_fallback(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def continuing_subword_prefix(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@continuing_subword_prefix.setter
|
||||
def continuing_subword_prefix(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def dropout(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@dropout.setter
|
||||
def dropout(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def end_of_word_suffix(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@end_of_word_suffix.setter
|
||||
def end_of_word_suffix(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def from_file(cls, vocab, merge, **kwargs):
|
||||
def from_file(vocab, merges, **kwargs):
|
||||
"""
|
||||
Instantiate a BPE model from the given files.
|
||||
|
||||
@@ -157,6 +216,16 @@ class BPE(Model):
|
||||
"""
|
||||
pass
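# Sketch: loading a BPE model from vocab/merges files (hypothetical paths) and
# wrapping it in a Tokenizer.
from tokenizers import Tokenizer
from tokenizers.models import BPE

bpe = BPE.from_file("vocab.json", "merges.txt", unk_token="<unk>")
tokenizer = Tokenizer(bpe)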
|
||||
|
||||
@property
|
||||
def fuse_unk(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@fuse_unk.setter
|
||||
def fuse_unk(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def get_trainer(self):
|
||||
"""
|
||||
Get the associated :class:`~tokenizers.trainers.Trainer`
|
||||
@@ -182,8 +251,18 @@ class BPE(Model):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def ignore_merges(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@ignore_merges.setter
|
||||
def ignore_merges(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def read_file(self, vocab, merges):
|
||||
def read_file(vocab, merges):
|
||||
"""
|
||||
Read a :obj:`vocab.json` and a :obj:`merges.txt` files
|
||||
|
||||
@@ -250,6 +329,16 @@ class BPE(Model):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def unk_token(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@unk_token.setter
|
||||
def unk_token(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class Unigram(Model):
|
||||
"""
|
||||
An implementation of the Unigram algorithm
|
||||
@@ -258,7 +347,15 @@ class Unigram(Model):
|
||||
vocab (:obj:`List[Tuple[str, float]]`, `optional`):
|
||||
A list of vocabulary items and their relative score [("am", -0.2442),...]
|
||||
"""
|
||||
def __init__(self, vocab, unk_id, byte_fallback):
|
||||
def __init__(self, vocab=None, unk_id=None, byte_fallback=None):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def get_trainer(self):
|
||||
@@ -345,11 +442,19 @@ class WordLevel(Model):
|
||||
unk_token (:obj:`str`, `optional`):
|
||||
The unknown token to be used by the model.
|
||||
"""
|
||||
def __init__(self, vocab, unk_token):
|
||||
def __init__(self, vocab=None, unk_token=None):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def from_file(vocab, unk_token):
|
||||
def from_file(vocab, unk_token=None):
|
||||
"""
|
||||
Instantiate a WordLevel model from the given file
|
||||
|
||||
@@ -460,6 +565,16 @@ class WordLevel(Model):
|
||||
"""
|
||||
pass
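# Sketch: a WordLevel model built from a plain vocabulary file (hypothetical path).
from tokenizers import Tokenizer
from tokenizers.models import WordLevel

model = WordLevel.from_file("vocab.json", unk_token="[UNK]")
tokenizer = Tokenizer(model)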
|
||||
|
||||
@property
|
||||
def unk_token(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@unk_token.setter
|
||||
def unk_token(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class WordPiece(Model):
|
||||
"""
|
||||
An implementation of the WordPiece algorithm
|
||||
@@ -474,7 +589,25 @@ class WordPiece(Model):
|
||||
max_input_chars_per_word (:obj:`int`, `optional`):
|
||||
The maximum number of characters to authorize in a single word.
|
||||
"""
|
||||
def __init__(self, vocab, unk_token, max_input_chars_per_word):
|
||||
def __init__(self, vocab=None, unk_token="[UNK]", max_input_chars_per_word=100, continuing_subword_prefix="##"):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def continuing_subword_prefix(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@continuing_subword_prefix.setter
|
||||
def continuing_subword_prefix(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@@ -525,6 +658,16 @@ class WordPiece(Model):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def max_input_chars_per_word(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@max_input_chars_per_word.setter
|
||||
def max_input_chars_per_word(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def read_file(vocab):
|
||||
"""
|
||||
@@ -589,3 +732,13 @@ class WordPiece(Model):
|
||||
A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def unk_token(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@unk_token.setter
|
||||
def unk_token(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@@ -6,6 +6,19 @@ class Normalizer:
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
Normalizer will return an instance of this class when instantiated.
|
||||
"""
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -65,6 +78,49 @@ class BertNormalizer(Normalizer):
|
||||
def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def clean_text(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@clean_text.setter
|
||||
def clean_text(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def handle_chinese_chars(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@handle_chinese_chars.setter
|
||||
def handle_chinese_chars(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def lowercase(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@lowercase.setter
|
||||
def lowercase(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -99,6 +155,16 @@ class BertNormalizer(Normalizer):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def strip_accents(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@strip_accents.setter
|
||||
def strip_accents(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class ByteLevel(Normalizer):
|
||||
"""
|
||||
Bytelevel Normalizer
|
||||
@@ -106,6 +172,19 @@ class ByteLevel(Normalizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -147,6 +226,19 @@ class Lowercase(Normalizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -188,6 +280,19 @@ class NFC(Normalizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -229,6 +334,19 @@ class NFD(Normalizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -270,6 +388,19 @@ class NFKC(Normalizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -311,6 +442,19 @@ class NFKD(Normalizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -352,6 +496,19 @@ class Nmt(Normalizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -394,6 +551,19 @@ class Precompiled(Normalizer):
|
||||
def __init__(self, precompiled_charsmap):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -435,6 +605,19 @@ class Prepend(Normalizer):
|
||||
def __init__(self, prepend):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -469,6 +652,16 @@ class Prepend(Normalizer):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def prepend(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@prepend.setter
|
||||
def prepend(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class Replace(Normalizer):
|
||||
"""
|
||||
Replace normalizer
|
||||
@@ -476,6 +669,29 @@ class Replace(Normalizer):
|
||||
def __init__(self, pattern, content):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def content(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@content.setter
|
||||
def content(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -510,6 +726,16 @@ class Replace(Normalizer):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def pattern(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@pattern.setter
|
||||
def pattern(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class Sequence(Normalizer):
|
||||
"""
|
||||
Allows concatenating multiple other Normalizer as a Sequence.
|
||||
@@ -519,6 +745,38 @@ class Sequence(Normalizer):
|
||||
normalizers (:obj:`List[Normalizer]`):
|
||||
A list of Normalizer to be run as a sequence
|
||||
"""
|
||||
def __init__(self, normalizers):
|
||||
pass
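# Sketch: normalizers composed in order; NFD + StripAccents + Lowercase is the
# usual accent-insensitive lowercasing pipeline.
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase

norm = normalizers.Sequence([NFD(), StripAccents(), Lowercase()])
print(norm.normalize_str("Héllò Wörld"))  # expected: "hello world"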
|
||||
|
||||
def __getitem__(self, key):
|
||||
"""
|
||||
Return self[key].
|
||||
"""
|
||||
pass
|
||||
|
||||
def __getnewargs__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
"""
|
||||
Set self[key] to value.
|
||||
"""
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -560,6 +818,29 @@ class Strip(Normalizer):
|
||||
def __init__(self, left=True, right=True):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def left(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@left.setter
|
||||
def left(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -594,6 +875,16 @@ class Strip(Normalizer):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def right(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@right.setter
|
||||
def right(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class StripAccents(Normalizer):
|
||||
"""
|
||||
StripAccents normalizer
|
||||
@@ -601,6 +892,19 @@ class StripAccents(Normalizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(normalizer):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
Normalize a :class:`~tokenizers.NormalizedString` in-place
|
||||
@@ -634,3 +938,9 @@ class StripAccents(Normalizer):
|
||||
:obj:`str`: A string after normalization
|
||||
"""
|
||||
pass
|
||||
|
||||
from typing import Dict
|
||||
|
||||
NORMALIZERS: Dict[str, Normalizer]
|
||||
|
||||
def unicode_normalizer_from_str(normalizer: str) -> Normalizer: ...
|
||||
|
||||
@@ -6,6 +6,19 @@ class PreTokenizer:
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
PreTokenizer will return an instance of this class when instantiated.
|
||||
"""
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
@@ -53,6 +66,19 @@ class BertPreTokenizer(PreTokenizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
@@ -105,7 +131,25 @@ class ByteLevel(PreTokenizer):
|
||||
Set this to :obj:`False` to prevent this `pre_tokenizer` from using
|
||||
the GPT2-specific regexp for splitting on whitespace.
|
||||
"""
|
||||
def __init__(self, add_prefix_space=True, use_regex=True):
|
||||
def __init__(self, add_prefix_space=True, trim_offsets=True, use_regex=True):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def add_prefix_space(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@add_prefix_space.setter
|
||||
def add_prefix_space(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
@@ -122,6 +166,11 @@ class ByteLevel(PreTokenizer):
|
||||
"""
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
@@ -159,6 +208,26 @@ class ByteLevel(PreTokenizer):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def trim_offsets(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@trim_offsets.setter
|
||||
def trim_offsets(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def use_regex(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@use_regex.setter
|
||||
def use_regex(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class CharDelimiterSplit(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
|
||||
@@ -167,6 +236,36 @@ class CharDelimiterSplit(PreTokenizer):
|
||||
delimiter: str:
|
||||
The delimiter char that will be used to split input
|
||||
"""
|
||||
def __init__(self, delimiter):
|
||||
pass
|
||||
|
||||
def __getnewargs__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def delimiter(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@delimiter.setter
|
||||
def delimiter(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
@@ -221,6 +320,29 @@ class Digits(PreTokenizer):
|
||||
def __init__(self, individual_digits=False):
|
||||
pass
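# Sketch: Digits pulls numbers out of the surrounding text; with
# individual_digits=True every digit becomes its own piece.
from tokenizers.pre_tokenizers import Digits

pre = Digits(individual_digits=True)
print(pre.pre_tokenize_str("Call 911"))  # digits split into separate (piece, offsets) tuples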
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def individual_digits(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@individual_digits.setter
|
||||
def individual_digits(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
@@ -273,6 +395,29 @@ class FixedLength(PreTokenizer):
|
||||
def __init__(self, length=5):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def length(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@length.setter
|
||||
def length(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
@@ -332,6 +477,19 @@ class Metaspace(PreTokenizer):
|
||||
def __init__(self, replacement="▁", prepend_scheme="always", split=True):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
@@ -369,6 +527,36 @@ class Metaspace(PreTokenizer):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def prepend_scheme(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@prepend_scheme.setter
|
||||
def prepend_scheme(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def replacement(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@replacement.setter
|
||||
def replacement(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def split(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@split.setter
|
||||
def split(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class Punctuation(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits on punctuation as individual characters.
|
||||
@@ -382,6 +570,29 @@ class Punctuation(PreTokenizer):
|
||||
def __init__(self, behavior="isolated"):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def behavior(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@behavior.setter
|
||||
def behavior(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
@@ -426,6 +637,35 @@ class Sequence(PreTokenizer):
|
||||
def __init__(self, pretokenizers):
|
||||
pass
|
||||
|
||||
def __getitem__(self, key):
|
||||
"""
|
||||
Return self[key].
|
||||
"""
|
||||
pass
|
||||
|
||||
def __getnewargs__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
"""
|
||||
Set self[key] to value.
|
||||
"""
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
@@ -489,6 +729,53 @@ class Split(PreTokenizer):
|
||||
def __init__(self, pattern, behavior, invert=False):
|
||||
pass
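# Sketch: Split with a Regex pattern; behavior follows SplitDelimiterBehavior
# ("removed", "isolated", "merged_with_previous", ...).
from tokenizers import Regex
from tokenizers.pre_tokenizers import Split

pre = Split(Regex(r"\s+"), behavior="removed")
print(pre.pre_tokenize_str("hello   world"))  # whitespace runs removed from the pieces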
|
||||
|
||||
def __getnewargs__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def behavior(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@behavior.setter
|
||||
def behavior(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def invert(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@invert.setter
|
||||
def invert(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def pattern(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@pattern.setter
|
||||
def pattern(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
@@ -536,6 +823,19 @@ class UnicodeScripts(PreTokenizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
@@ -607,6 +907,19 @@ class Whitespace(PreTokenizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
@@ -651,6 +964,19 @@ class WhitespaceSplit(PreTokenizer):
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def custom(pretok):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
|
||||
|
||||
@@ -6,6 +6,14 @@ class PostProcessor:
|
||||
This class is not supposed to be instantiated directly. Instead, any implementation of
|
||||
a PostProcessor will return an instance of this class when instantiated.
|
||||
"""
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
@@ -56,6 +64,28 @@ class BertProcessing(PostProcessor):
|
||||
def __init__(self, sep, cls):
|
||||
pass
|
||||
|
||||
def __getnewargs__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def cls(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@cls.setter
|
||||
def cls(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
"""
|
||||
Return the number of special tokens that would be added for single/pair sentences.
|
||||
@@ -88,6 +118,16 @@ class BertProcessing(PostProcessor):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def sep(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@sep.setter
|
||||
def sep(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class ByteLevel(PostProcessor):
|
||||
"""
|
||||
This post-processor takes care of trimming the offsets.
|
||||
@@ -98,8 +138,31 @@ class ByteLevel(PostProcessor):
|
||||
Args:
|
||||
trim_offsets (:obj:`bool`):
|
||||
Whether to trim the whitespaces from the produced offsets.
|
||||
|
||||
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
|
||||
the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
|
||||
is set to :obj:`True`.
|
||||
"""
|
||||
def __init__(self, trim_offsets=True):
|
||||
def __init__(self, add_prefix_space=None, trim_offsets=None, use_regex=None):
|
||||
pass
|
||||
|
||||
def __getstate__(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def __setstate__(self, state):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def add_prefix_space(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@add_prefix_space.setter
|
||||
def add_prefix_space(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair):
|
||||
@@ -134,6 +197,26 @@ class ByteLevel(PostProcessor):
|
||||
"""
|
||||
pass
|
||||
|
||||
@property
|
||||
def trim_offsets(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@trim_offsets.setter
|
||||
def trim_offsets(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@property
|
||||
def use_regex(self):
|
||||
""" """
|
||||
pass
|
||||
|
||||
@use_regex.setter
|
||||
def use_regex(self, value):
|
||||
""" """
|
||||
pass
|
||||
|
||||
class RobertaProcessing(PostProcessor):
    """
    This post-processor takes care of adding the special tokens needed by
@@ -164,6 +247,38 @@ class RobertaProcessing(PostProcessor):
    def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
        pass

    def __getnewargs__(self):
        """ """
        pass

    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

    @property
    def add_prefix_space(self):
        """ """
        pass

    @add_prefix_space.setter
    def add_prefix_space(self, value):
        """ """
        pass

    @property
    def cls(self):
        """ """
        pass

    @cls.setter
    def cls(self, value):
        """ """
        pass

    def num_special_tokens_to_add(self, is_pair):
        """
        Return the number of special tokens that would be added for single/pair sentences.
@@ -196,6 +311,26 @@ class RobertaProcessing(PostProcessor):
        """
        pass

    @property
    def sep(self):
        """ """
        pass

    @sep.setter
    def sep(self, value):
        """ """
        pass

    @property
    def trim_offsets(self):
        """ """
        pass

    @trim_offsets.setter
    def trim_offsets(self, value):
        """ """
        pass
class Sequence(PostProcessor):
    """
    Sequence Processor
@@ -207,6 +342,30 @@ class Sequence(PostProcessor):
    def __init__(self, processors):
        pass

    def __getitem__(self, key):
        """
        Return self[key].
        """
        pass

    def __getnewargs__(self):
        """ """
        pass

    def __getstate__(self):
        """ """
        pass

    def __setitem__(self, key, value):
        """
        Set self[key] to value.
        """
        pass

    def __setstate__(self, state):
        """ """
        pass

    def num_special_tokens_to_add(self, is_pair):
        """
        Return the number of special tokens that would be added for single/pair sentences.
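
A short sketch of chaining post-processors with the Sequence stub above; the combination is illustrative, and the indexing relies on the __getitem__/__setitem__ methods added in this diff:

from tokenizers import processors

# Trim byte-level offsets first, then add the RoBERTa special tokens.
post = processors.Sequence(
    [
        processors.ByteLevel(trim_offsets=True),
        processors.RobertaProcessing(sep=("</s>", 2), cls=("<s>", 0)),
    ]
)
first = post[0]  # access an individual processor in the chain
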
@@ -306,7 +465,15 @@ class TemplateProcessing(PostProcessor):
        The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
        the same length.
    """
    def __init__(self, single, pair, special_tokens):
    def __init__(self, single=None, pair=None, special_tokens=None):
        pass

    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

    def num_special_tokens_to_add(self, is_pair):
@@ -340,3 +507,13 @@ class TemplateProcessing(PostProcessor):
        :class:`~tokenizers.Encoding`: The final encoding
        """
        pass

    @property
    def single(self):
        """ """
        pass

    @single.setter
    def single(self, value):
        """ """
        pass
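
A minimal sketch of the template syntax the TemplateProcessing stub above accepts, using its single/pair/special_tokens parameters; the ids 101 and 102 are placeholders for the real vocabulary ids:

from tokenizers.processors import TemplateProcessing

post = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]",
    # Each special token is listed with the id it maps to in the vocabulary.
    special_tokens=[("[CLS]", 101), ("[SEP]", 102)],
)
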
Binary file not shown.
@@ -16,7 +16,7 @@ with open(css_filename) as f:
class Annotation:
    start: int
    end: int
    label: int
    label: str

    def __init__(self, start: int, end: int, label: str):
        self.start = start
@@ -91,7 +91,7 @@ class EncodingVisualizer:
    ):
        if default_to_notebook:
            try:
                from IPython.core.display import HTML, display
                from IPython.core.display import HTML, display  # type: ignore[attr-defined]
            except ImportError:
                raise Exception(
                    """We couldn't import IPython utils for html display.
@@ -108,7 +108,7 @@ class EncodingVisualizer:
    def __call__(
        self,
        text: str,
        annotations: AnnotationList = [],
        annotations: Optional[List[Any]] = None,
        default_to_notebook: Optional[bool] = None,
    ) -> Optional[str]:
        """
@@ -135,12 +135,14 @@ class EncodingVisualizer:
        final_default_to_notebook = default_to_notebook
        if final_default_to_notebook:
            try:
                from IPython.core.display import HTML, display
                from IPython.core.display import HTML, display  # type: ignore[attr-defined]
            except ImportError:
                raise Exception(
                    """We couldn't import IPython utils for html display.
                    Are you running in a notebook?"""
                )
        if annotations is None:
            annotations = []
        if self.annotation_coverter is not None:
            annotations = list(map(self.annotation_coverter, annotations))
        encoding = self.tokenizer.encode(text)
@@ -213,6 +215,8 @@ class EncodingVisualizer:
            return f'<span class="special-token" data-stoken={stoken}></span>'
        # We're not in a special token so this group has a start and end.
        last = consecutive_chars_list[-1]
        assert first.char_ix is not None
        assert last.char_ix is not None
        start = first.char_ix
        end = last.char_ix + 1
        span_text = text[start:end]
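
A sketch of how the Annotation and EncodingVisualizer changes above are used, assuming a notebook environment and an already-built tokenizer (the pretrained name is only an example):

from tokenizers import Tokenizer
from tokenizers.tools import Annotation, EncodingVisualizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
viz = EncodingVisualizer(tokenizer)
# Labels are now typed as str; omitting annotations falls back to an empty list.
viz("Hello world", annotations=[Annotation(start=0, end=5, label="greeting")])
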
@@ -6,6 +6,13 @@ class Trainer:
    This class is not supposed to be instantiated directly. Instead, any implementation of a
    Trainer will return an instance of this class when instantiated.
    """
    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

class BpeTrainer(Trainer):
    """
@@ -60,6 +67,104 @@ class BpeTrainer(Trainer):
    ):
        pass

    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

    @property
    def continuing_subword_prefix(self):
        """ """
        pass

    @continuing_subword_prefix.setter
    def continuing_subword_prefix(self, value):
        """ """
        pass

    @property
    def end_of_word_suffix(self):
        """ """
        pass

    @end_of_word_suffix.setter
    def end_of_word_suffix(self, value):
        """ """
        pass

    @property
    def initial_alphabet(self):
        """ """
        pass

    @initial_alphabet.setter
    def initial_alphabet(self, value):
        """ """
        pass

    @property
    def limit_alphabet(self):
        """ """
        pass

    @limit_alphabet.setter
    def limit_alphabet(self, value):
        """ """
        pass

    @property
    def max_token_length(self):
        """ """
        pass

    @max_token_length.setter
    def max_token_length(self, value):
        """ """
        pass

    @property
    def min_frequency(self):
        """ """
        pass

    @min_frequency.setter
    def min_frequency(self, value):
        """ """
        pass

    @property
    def show_progress(self):
        """ """
        pass

    @show_progress.setter
    def show_progress(self, value):
        """ """
        pass

    @property
    def special_tokens(self):
        """ """
        pass

    @special_tokens.setter
    def special_tokens(self, value):
        """ """
        pass

    @property
    def vocab_size(self):
        """ """
        pass

    @vocab_size.setter
    def vocab_size(self, value):
        """ """
        pass
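
A sketch of the standard training flow with the BpeTrainer stubbed above; the corpus path is a placeholder, and the special-token list is the common BERT-style set shown only as an example:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)
tokenizer.train(files=["corpus.txt"], trainer=trainer)  # "corpus.txt" is a placeholder path
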
class UnigramTrainer(Trainer):
    """
    Trainer capable of training a Unigram model
@@ -107,6 +212,54 @@ class UnigramTrainer(Trainer):
    ):
        pass

    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

    @property
    def initial_alphabet(self):
        """ """
        pass

    @initial_alphabet.setter
    def initial_alphabet(self, value):
        """ """
        pass

    @property
    def show_progress(self):
        """ """
        pass

    @show_progress.setter
    def show_progress(self, value):
        """ """
        pass

    @property
    def special_tokens(self):
        """ """
        pass

    @special_tokens.setter
    def special_tokens(self, value):
        """ """
        pass

    @property
    def vocab_size(self):
        """ """
        pass

    @vocab_size.setter
    def vocab_size(self, value):
        """ """
        pass
class WordLevelTrainer(Trainer):
    """
    Trainer capable of training a WordLevel model
@@ -127,6 +280,54 @@ class WordLevelTrainer(Trainer):
    def __init__(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[]):
        pass

    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

    @property
    def min_frequency(self):
        """ """
        pass

    @min_frequency.setter
    def min_frequency(self, value):
        """ """
        pass

    @property
    def show_progress(self):
        """ """
        pass

    @show_progress.setter
    def show_progress(self, value):
        """ """
        pass

    @property
    def special_tokens(self):
        """ """
        pass

    @special_tokens.setter
    def special_tokens(self, value):
        """ """
        pass

    @property
    def vocab_size(self):
        """ """
        pass

    @vocab_size.setter
    def vocab_size(self, value):
        """ """
        pass
class WordPieceTrainer(Trainer):
    """
    Trainer capable of training a WordPiece model
@@ -171,3 +372,91 @@ class WordPieceTrainer(Trainer):
        end_of_word_suffix=None,
    ):
        pass

    def __getstate__(self):
        """ """
        pass

    def __setstate__(self, state):
        """ """
        pass

    @property
    def continuing_subword_prefix(self):
        """ """
        pass

    @continuing_subword_prefix.setter
    def continuing_subword_prefix(self, value):
        """ """
        pass

    @property
    def end_of_word_suffix(self):
        """ """
        pass

    @end_of_word_suffix.setter
    def end_of_word_suffix(self, value):
        """ """
        pass

    @property
    def initial_alphabet(self):
        """ """
        pass

    @initial_alphabet.setter
    def initial_alphabet(self, value):
        """ """
        pass

    @property
    def limit_alphabet(self):
        """ """
        pass

    @limit_alphabet.setter
    def limit_alphabet(self, value):
        """ """
        pass

    @property
    def min_frequency(self):
        """ """
        pass

    @min_frequency.setter
    def min_frequency(self, value):
        """ """
        pass

    @property
    def show_progress(self):
        """ """
        pass

    @show_progress.setter
    def show_progress(self, value):
        """ """
        pass

    @property
    def special_tokens(self):
        """ """
        pass

    @special_tokens.setter
    def special_tokens(self, value):
        """ """
        pass

    @property
    def vocab_size(self):
        """ """
        pass

    @vocab_size.setter
    def vocab_size(self, value):
        """ """
        pass
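
A closing sketch, under the same assumptions as the BpeTrainer example, of the WordPieceTrainer parameters exposed above; the training sentences are placeholder data and the "##" prefix mirrors the common WordPiece convention:

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
trainer = WordPieceTrainer(
    vocab_size=30000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    continuing_subword_prefix="##",  # prefix attached to non-initial sub-words
    limit_alphabet=1000,             # cap the number of distinct initial characters
)
tokenizer.train_from_iterator(["example sentence one", "example sentence two"], trainer=trainer)
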