Add adaptation for the surround-reconnaissance scenario

2026-01-08 15:44:38 +08:00
parent 3eba1f962b
commit 10c5bb5a8a
5441 changed files with 40219 additions and 379695 deletions

View File

@@ -75,7 +75,7 @@ class SplitDelimiterBehavior(Enum):
CONTIGUOUS = "contiguous"
from .tokenizers import (
from .tokenizers import ( # type: ignore[import]
AddedToken,
Encoding,
NormalizedString,

View File

@@ -34,7 +34,15 @@ class AddedToken:
Defines whether this token should be skipped when decoding.
"""
def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
def __init__(self, content=None, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
@@ -44,6 +52,13 @@ class AddedToken:
"""
pass
@content.setter
def content(self, value):
"""
Get the content of this :obj:`AddedToken`
"""
pass
@property
def lstrip(self):
"""
@@ -51,6 +66,13 @@ class AddedToken:
"""
pass
@lstrip.setter
def lstrip(self, value):
"""
Get the value of the :obj:`lstrip` option
"""
pass
@property
def normalized(self):
"""
@@ -58,6 +80,13 @@ class AddedToken:
"""
pass
@normalized.setter
def normalized(self, value):
"""
Get the value of the :obj:`normalized` option
"""
pass
@property
def rstrip(self):
"""
@@ -65,6 +94,13 @@ class AddedToken:
"""
pass
@rstrip.setter
def rstrip(self, value):
"""
Get the value of the :obj:`rstrip` option
"""
pass
@property
def single_word(self):
"""
@@ -72,6 +108,13 @@ class AddedToken:
"""
pass
@single_word.setter
def single_word(self, value):
"""
Get the value of the :obj:`single_word` option
"""
pass
@property
def special(self):
"""
@@ -79,10 +122,28 @@ class AddedToken:
"""
pass
@special.setter
def special(self, value):
"""
Get the value of the :obj:`special` option
"""
pass
class Encoding:
"""
The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`.
"""
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def attention_mask(self):
"""
@@ -97,6 +158,20 @@ class Encoding:
"""
pass
@attention_mask.setter
def attention_mask(self, value):
"""
The attention mask
This indicates to the LM which tokens should be attended to, and which should not.
This is especially important when batching sequences, where we need to apply
padding.
Returns:
:obj:`List[int]`: The attention mask
"""
pass
def char_to_token(self, char_pos, sequence_index=0):
"""
Get the token that contains the char at the given position in the input sequence.
@@ -140,6 +215,19 @@ class Encoding:
"""
pass
@ids.setter
def ids(self, value):
"""
The generated IDs
The IDs are the main input to a Language Model. They are the token indices,
the numerical representations that a LM understands.
Returns:
:obj:`List[int]`: The list of IDs
"""
pass
@staticmethod
def merge(encodings, growing_offsets=True):
"""
@@ -167,6 +255,16 @@ class Encoding:
"""
pass
@n_sequences.setter
def n_sequences(self, value):
"""
The number of sequences represented
Returns:
:obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding`
"""
pass
@property
def offsets(self):
"""
@@ -180,6 +278,19 @@ class Encoding:
"""
pass
@offsets.setter
def offsets(self, value):
"""
The offsets associated to each token
These offsets let you slice the input string, and thus retrieve the original
part that led to producing the corresponding token.
Returns:
A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets
"""
pass
@property
def overflowing(self):
"""
@@ -195,6 +306,21 @@ class Encoding:
"""
pass
@overflowing.setter
def overflowing(self, value):
"""
A :obj:`List` of overflowing :class:`~tokenizers.Encoding`
When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting
the output into as many pieces as required to match the specified maximum length.
This field lets you retrieve all the subsequent pieces.
When you use pairs of sequences, the overflowing pieces will contain enough
variations to cover all the possible combinations, while respecting the provided
maximum length.
"""
pass
def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"):
"""
Pad the :class:`~tokenizers.Encoding` at the given length
@@ -231,6 +357,20 @@ class Encoding:
"""
pass
@sequence_ids.setter
def sequence_ids(self, value):
"""
The generated sequence indices.
They represent the index of the input sequence associated to each token.
The sequence id can be None if the token is not related to any input sequence,
like for example with special tokens.
Returns:
A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index.
"""
pass
def set_sequence_id(self, sequence_id):
"""
Set the given sequence index
@@ -252,6 +392,18 @@ class Encoding:
"""
pass
@special_tokens_mask.setter
def special_tokens_mask(self, value):
"""
The special token mask
This indicates which tokens are special tokens, and which are not.
Returns:
:obj:`List[int]`: The special tokens mask
"""
pass
def token_to_chars(self, token_index):
"""
Get the offsets of the token at the given index.
@@ -314,6 +466,18 @@ class Encoding:
"""
pass
@tokens.setter
def tokens(self, value):
"""
The generated tokens
They are the string representation of the IDs.
Returns:
:obj:`List[str]`: The list of tokens
"""
pass
def truncate(self, max_length, stride=0, direction="right"):
"""
Truncate the :class:`~tokenizers.Encoding` at the given length
@@ -346,6 +510,19 @@ class Encoding:
"""
pass
@type_ids.setter
def type_ids(self, value):
"""
The generated type IDs
Generally used for tasks like sequence classification or question answering,
these tokens let the LM know which input sequence corresponds to each token.
Returns:
:obj:`List[int]`: The list of type ids
"""
pass
@property
def word_ids(self):
"""
@@ -364,6 +541,24 @@ class Encoding:
"""
pass
@word_ids.setter
def word_ids(self, value):
"""
The generated word indices.
They represent the index of the word associated to each token.
When the input is pre-tokenized, they correspond to the ID of the given input label,
otherwise they correspond to the words indices as defined by the
:class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
For special tokens and such (any token that was generated from something that was
not part of the input), the output is :obj:`None`
Returns:
A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
"""
pass
def word_to_chars(self, word_index, sequence_index=0):
"""
Get the offsets of the word at the given index in one of the input sequences.
@@ -417,6 +612,28 @@ class Encoding:
"""
pass
@words.setter
def words(self, value):
"""
The generated word indices.
.. warning::
This is deprecated and will be removed in a future version.
Please use :obj:`~tokenizers.Encoding.word_ids` instead.
They represent the index of the word associated to each token.
When the input is pre-tokenized, they correspond to the ID of the given input label,
otherwise they correspond to the words indices as defined by the
:class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used.
For special tokens and such (any token that was generated from something that was
not part of the input), the output is :obj:`None`
Returns:
A :obj:`List` of :obj:`Optional[int]`: A list of optional word index.
"""
pass
class NormalizedString:
"""
NormalizedString
@@ -429,6 +646,21 @@ class NormalizedString:
sequence: str:
The string sequence used to initialize this NormalizedString
"""
def __init__(self, sequence):
pass
def __getitem__(self, key):
"""
Return self[key].
"""
pass
def __getstate__(self, /):
"""
Helper for pickle.
"""
pass
def append(self, s):
"""
Append the given sequence to the string
@@ -505,6 +737,23 @@ class NormalizedString:
"""
pass
@normalized.setter
def normalized(self, value):
"""
The normalized part of the string
"""
pass
@property
def original(self):
""" """
pass
@original.setter
def original(self, value):
""" """
pass
def prepend(self, s):
"""
Prepend the given sequence to the string
@@ -587,6 +836,12 @@ class PreTokenizedString:
def __init__(self, sequence):
pass
def __getstate__(self, /):
"""
Helper for pickle.
"""
pass
def get_splits(self, offset_referential="original", offset_type="char"):
"""
Get the splits currently managed by the PreTokenizedString
@@ -671,8 +926,55 @@ class Regex:
def __init__(self, pattern):
pass
def __getstate__(self, /):
"""
Helper for pickle.
"""
pass
class Token:
pass
def __init__(self, id, value, offsets):
pass
def __getstate__(self, /):
"""
Helper for pickle.
"""
pass
def as_tuple(self):
""" """
pass
@property
def id(self):
""" """
pass
@id.setter
def id(self, value):
""" """
pass
@property
def offsets(self):
""" """
pass
@offsets.setter
def offsets(self, value):
""" """
pass
@property
def value(self):
""" """
pass
@value.setter
def value(self, value):
""" """
pass
class Tokenizer:
"""
@@ -687,6 +989,18 @@ class Tokenizer:
def __init__(self, model):
pass
def __getnewargs__(self):
""" """
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
def add_special_tokens(self, tokens):
"""
Add the given special tokens to the Tokenizer.
@@ -890,6 +1204,13 @@ class Tokenizer:
"""
pass
@decoder.setter
def decoder(self, value):
"""
The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
"""
pass
def enable_padding(
self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
):
@@ -1067,6 +1388,19 @@ class Tokenizer:
"""
pass
@encode_special_tokens.setter
def encode_special_tokens(self, value):
"""
Modifies the tokenizer in order to use or not the special tokens
during encoding.
Args:
value (:obj:`bool`):
Whether to use the special tokens or not
"""
pass
@staticmethod
def from_buffer(buffer):
"""
@@ -1187,6 +1521,13 @@ class Tokenizer:
"""
pass
@model.setter
def model(self, value):
"""
The :class:`~tokenizers.models.Model` in use by the Tokenizer
"""
pass
def no_padding(self):
"""
Disable padding
@@ -1206,6 +1547,13 @@ class Tokenizer:
"""
pass
@normalizer.setter
def normalizer(self, value):
"""
The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
"""
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
@@ -1227,6 +1575,19 @@ class Tokenizer:
"""
pass
@padding.setter
def padding(self, value):
"""
Get the current padding parameters
`Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
Returns:
(:obj:`dict`, `optional`):
A dict with the current padding parameters if padding is enabled
"""
pass
def post_process(self, encoding, pair=None, add_special_tokens=True):
"""
Apply all the post-processing steps to the given encodings.
@@ -1261,6 +1622,13 @@ class Tokenizer:
"""
pass
@post_processor.setter
def post_processor(self, value):
"""
The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
"""
pass
@property
def pre_tokenizer(self):
"""
@@ -1268,6 +1636,13 @@ class Tokenizer:
"""
pass
@pre_tokenizer.setter
def pre_tokenizer(self, value):
"""
The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
"""
pass
def save(self, path, pretty=True):
"""
Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
@@ -1360,3 +1735,66 @@ class Tokenizer:
A dict with the current truncation parameters if truncation is enabled
"""
pass
@truncation.setter
def truncation(self, value):
"""
Get the currently set truncation parameters
`Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
Returns:
(:obj:`dict`, `optional`):
A dict with the current truncation parameters if truncation is enabled
"""
pass
from enum import Enum
from typing import List, Tuple, Union, Any
Offsets = Tuple[int, int]
TextInputSequence = str
PreTokenizedInputSequence = Union[List[str], Tuple[str, ...]]
TextEncodeInput = Union[
TextInputSequence,
Tuple[TextInputSequence, TextInputSequence],
List[TextInputSequence],
]
PreTokenizedEncodeInput = Union[
PreTokenizedInputSequence,
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
List[PreTokenizedInputSequence],
]
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
class OffsetReferential(Enum):
ORIGINAL = "original"
NORMALIZED = "normalized"
class OffsetType(Enum):
BYTE = "byte"
CHAR = "char"
class SplitDelimiterBehavior(Enum):
REMOVED = "removed"
ISOLATED = "isolated"
MERGED_WITH_PREVIOUS = "merged_with_previous"
MERGED_WITH_NEXT = "merged_with_next"
CONTIGUOUS = "contiguous"
from .implementations import (
BertWordPieceTokenizer,
ByteLevelBPETokenizer,
CharBPETokenizer,
SentencePieceBPETokenizer,
SentencePieceUnigramTokenizer,
)
def __getattr__(name: str) -> Any: ...
BertWordPieceTokenizer: Any
ByteLevelBPETokenizer: Any
CharBPETokenizer: Any
SentencePieceBPETokenizer: Any
SentencePieceUnigramTokenizer: Any

View File

@@ -7,6 +7,29 @@ class DecodeStream:
def __init__(self, ids=None, skip_special_tokens=False):
pass
def __getstate__(self, /):
"""
Helper for pickle.
"""
pass
def step(self, tokenizer, id):
"""
Streaming decode step
Args:
tokenizer (:class:`~tokenizers.Tokenizer`):
The tokenizer to use for decoding
id (:obj:`int` or `List[int]`):
The next token id or list of token ids to add to the stream
Returns:
:obj:`Optional[str]`: The next decoded string chunk, or None if not enough
tokens have been provided yet.
"""
pass
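The `step` docstring above describes the incremental decoding contract. A minimal streaming sketch, assuming a serialized `tokenizer.json` is available and that `DecodeStream` is exposed by this decoders module as the stub implies:

from tokenizers import Tokenizer
from tokenizers.decoders import DecodeStream

tokenizer = Tokenizer.from_file("tokenizer.json")  # placeholder path
stream = DecodeStream(skip_special_tokens=True)

# Feed ids one at a time and print each chunk as soon as it becomes decodable.
for token_id in tokenizer.encode("streaming decode example").ids:
    chunk = stream.step(tokenizer, token_id)
    if chunk is not None:
        print(chunk, end="")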
class Decoder:
"""
Base class for all decoders
@@ -14,6 +37,19 @@ class Decoder:
This class is not supposed to be instantiated directly. Instead, any implementation of
a Decoder will return an instance of this class when instantiated.
"""
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(decoder):
""" """
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@@ -39,6 +75,19 @@ class BPEDecoder(Decoder):
def __init__(self, suffix="</w>"):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(decoder):
""" """
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@@ -52,6 +101,16 @@ class BPEDecoder(Decoder):
"""
pass
@property
def suffix(self):
""" """
pass
@suffix.setter
def suffix(self, value):
""" """
pass
class ByteFallback(Decoder):
"""
ByteFallback Decoder
@@ -63,6 +122,19 @@ class ByteFallback(Decoder):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(decoder):
""" """
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@@ -86,6 +158,19 @@ class ByteLevel(Decoder):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(decoder):
""" """
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@@ -115,6 +200,29 @@ class CTC(Decoder):
def __init__(self, pad_token="<pad>", word_delimiter_token="|", cleanup=True):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def cleanup(self):
""" """
pass
@cleanup.setter
def cleanup(self, value):
""" """
pass
@staticmethod
def custom(decoder):
""" """
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@@ -128,6 +236,26 @@ class CTC(Decoder):
"""
pass
@property
def pad_token(self):
""" """
pass
@pad_token.setter
def pad_token(self, value):
""" """
pass
@property
def word_delimiter_token(self):
""" """
pass
@word_delimiter_token.setter
def word_delimiter_token(self, value):
""" """
pass
class Fuse(Decoder):
"""
Fuse Decoder
@@ -138,6 +266,19 @@ class Fuse(Decoder):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(decoder):
""" """
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@@ -169,6 +310,19 @@ class Metaspace(Decoder):
def __init__(self, replacement="", prepend_scheme="always", split=True):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(decoder):
""" """
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@@ -182,6 +336,36 @@ class Metaspace(Decoder):
"""
pass
@property
def prepend_scheme(self):
""" """
pass
@prepend_scheme.setter
def prepend_scheme(self, value):
""" """
pass
@property
def replacement(self):
""" """
pass
@replacement.setter
def replacement(self, value):
""" """
pass
@property
def split(self):
""" """
pass
@split.setter
def split(self, value):
""" """
pass
class Replace(Decoder):
"""
Replace Decoder
@@ -192,6 +376,19 @@ class Replace(Decoder):
def __init__(self, pattern, content):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(decoder):
""" """
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@@ -216,6 +413,23 @@ class Sequence(Decoder):
def __init__(self, decoders):
pass
def __getnewargs__(self):
""" """
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(decoder):
""" """
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@@ -234,7 +448,30 @@ class Strip(Decoder):
Strip Decoder
Strips n left characters of each token, or n right characters of each token
"""
def __init__(self, content, left=0, right=0):
def __init__(self, content=" ", left=0, right=0):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def content(self):
""" """
pass
@content.setter
def content(self, value):
""" """
pass
@staticmethod
def custom(decoder):
""" """
pass
def decode(self, tokens):
@@ -250,6 +487,26 @@ class Strip(Decoder):
"""
pass
@property
def start(self):
""" """
pass
@start.setter
def start(self, value):
""" """
pass
@property
def stop(self):
""" """
pass
@stop.setter
def stop(self, value):
""" """
pass
class WordPiece(Decoder):
"""
WordPiece Decoder
@@ -265,6 +522,29 @@ class WordPiece(Decoder):
def __init__(self, prefix="##", cleanup=True):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def cleanup(self):
""" """
pass
@cleanup.setter
def cleanup(self, value):
""" """
pass
@staticmethod
def custom(decoder):
""" """
pass
def decode(self, tokens):
"""
Decode the given list of tokens to a final string
@@ -277,3 +557,13 @@ class WordPiece(Decoder):
:obj:`str`: The decoded string
"""
pass
@property
def prefix(self):
""" """
pass
@prefix.setter
def prefix(self, value):
""" """
pass
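All of the decoders above share the `decode(tokens)` entry point and add their own configuration properties. A short sketch with the `WordPiece` decoder, used both standalone and attached to a tokenizer (the token list is made up for illustration):

from tokenizers import Tokenizer, decoders

# Standalone use: merge WordPiece sub-tokens back into readable text.
decoder = decoders.WordPiece(prefix="##", cleanup=True)
print(decoder.decode(["un", "##believ", "##able"]))  # -> "unbelievable"

# Or attach it so Tokenizer.decode uses it implicitly.
tokenizer = Tokenizer.from_file("tokenizer.json")  # placeholder path
tokenizer.decoder = decoder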

View File

@@ -187,7 +187,7 @@ class BaseTokenizer:
Returns:
The normalized string
"""
return self._tokenizer.normalize(sequence)
return self._tokenizer.normalizer.normalize_str(sequence)
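The replacement above routes `BaseTokenizer.normalize` through the attached normalizer's `normalize_str`, presumably because the bound `Tokenizer` object exposes no `normalize` method of its own. The delegated call, shown standalone as a sketch:

from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents, Lowercase

normalizer = normalizers.Sequence([NFD(), StripAccents(), Lowercase()])
print(normalizer.normalize_str("Héllò Wörld"))  # -> "hello world"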
def encode(
self,

View File

@@ -150,7 +150,7 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
sys.path.append(".")
import sentencepiece_model_pb2 as model
import sentencepiece_model_pb2 as model # type: ignore[import]
except Exception:
raise Exception(
"You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/src/sentencepiece/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
@@ -191,6 +191,6 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
"model": "SentencePieceUnigram",
}
obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters) # type: ignore[arg-type]
BaseTokenizer.__init__(obj, tokenizer, parameters)
return obj

View File

@@ -8,6 +8,17 @@ class Model:
This class cannot be constructed directly. Please use one of the concrete models.
"""
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
def get_trainer(self):
"""
Get the associated :class:`~tokenizers.trainers.Trainer`
@@ -131,8 +142,56 @@ class BPE(Model):
):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def byte_fallback(self):
""" """
pass
@byte_fallback.setter
def byte_fallback(self, value):
""" """
pass
@property
def continuing_subword_prefix(self):
""" """
pass
@continuing_subword_prefix.setter
def continuing_subword_prefix(self, value):
""" """
pass
@property
def dropout(self):
""" """
pass
@dropout.setter
def dropout(self, value):
""" """
pass
@property
def end_of_word_suffix(self):
""" """
pass
@end_of_word_suffix.setter
def end_of_word_suffix(self, value):
""" """
pass
@staticmethod
def from_file(cls, vocab, merge, **kwargs):
def from_file(vocab, merges, **kwargs):
"""
Instantiate a BPE model from the given files.
@@ -157,6 +216,16 @@ class BPE(Model):
"""
pass
@property
def fuse_unk(self):
""" """
pass
@fuse_unk.setter
def fuse_unk(self, value):
""" """
pass
def get_trainer(self):
"""
Get the associated :class:`~tokenizers.trainers.Trainer`
@@ -182,8 +251,18 @@ class BPE(Model):
"""
pass
@property
def ignore_merges(self):
""" """
pass
@ignore_merges.setter
def ignore_merges(self, value):
""" """
pass
@staticmethod
def read_file(self, vocab, merges):
def read_file(vocab, merges):
"""
Read a :obj:`vocab.json` and a :obj:`merges.txt` files
@@ -250,6 +329,16 @@ class BPE(Model):
"""
pass
@property
def unk_token(self):
""" """
pass
@unk_token.setter
def unk_token(self, value):
""" """
pass
class Unigram(Model):
"""
An implementation of the Unigram algorithm
@@ -258,7 +347,15 @@ class Unigram(Model):
vocab (:obj:`List[Tuple[str, float]]`, `optional`):
A list of vocabulary items and their relative score [("am", -0.2442),...]
"""
def __init__(self, vocab, unk_id, byte_fallback):
def __init__(self, vocab=None, unk_id=None, byte_fallback=None):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
def get_trainer(self):
@@ -345,11 +442,19 @@ class WordLevel(Model):
unk_token (:obj:`str`, `optional`):
The unknown token to be used by the model.
"""
def __init__(self, vocab, unk_token):
def __init__(self, vocab=None, unk_token=None):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def from_file(vocab, unk_token):
def from_file(vocab, unk_token=None):
"""
Instantiate a WordLevel model from the given file
@@ -460,6 +565,16 @@ class WordLevel(Model):
"""
pass
@property
def unk_token(self):
""" """
pass
@unk_token.setter
def unk_token(self, value):
""" """
pass
class WordPiece(Model):
"""
An implementation of the WordPiece algorithm
@@ -474,7 +589,25 @@ class WordPiece(Model):
max_input_chars_per_word (:obj:`int`, `optional`):
The maximum number of characters to authorize in a single word.
"""
def __init__(self, vocab, unk_token, max_input_chars_per_word):
def __init__(self, vocab=None, unk_token="[UNK]", max_input_chars_per_word=100, continuing_subword_prefix="##"):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def continuing_subword_prefix(self):
""" """
pass
@continuing_subword_prefix.setter
def continuing_subword_prefix(self, value):
""" """
pass
@staticmethod
@@ -525,6 +658,16 @@ class WordPiece(Model):
"""
pass
@property
def max_input_chars_per_word(self):
""" """
pass
@max_input_chars_per_word.setter
def max_input_chars_per_word(self, value):
""" """
pass
@staticmethod
def read_file(vocab):
"""
@@ -589,3 +732,13 @@ class WordPiece(Model):
A :obj:`List` of :class:`~tokenizers.Token`: The generated tokens
"""
pass
@property
def unk_token(self):
""" """
pass
@unk_token.setter
def unk_token(self, value):
""" """
pass
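The corrected static signatures above (`BPE.from_file(vocab, merges, **kwargs)`, `WordPiece.read_file(vocab)`, and the relaxed constructors) drop the spurious `self`/`cls` parameters. A sketch of typical use; the vocabulary file names are placeholders:

from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece

# Build a BPE model from existing vocab/merges files and wrap it in a Tokenizer.
bpe = BPE.from_file("vocab.json", "merges.txt", unk_token="<unk>")
tokenizer = Tokenizer(bpe)

# WordPiece loads from a single vocab file in the same way.
wordpiece = WordPiece.from_file("vocab.txt", unk_token="[UNK]")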

View File

@@ -6,6 +6,19 @@ class Normalizer:
This class is not supposed to be instantiated directly. Instead, any implementation of a
Normalizer will return an instance of this class when instantiated.
"""
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -65,6 +78,49 @@ class BertNormalizer(Normalizer):
def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def clean_text(self):
""" """
pass
@clean_text.setter
def clean_text(self, value):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
@property
def handle_chinese_chars(self):
""" """
pass
@handle_chinese_chars.setter
def handle_chinese_chars(self, value):
""" """
pass
@property
def lowercase(self):
""" """
pass
@lowercase.setter
def lowercase(self, value):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -99,6 +155,16 @@ class BertNormalizer(Normalizer):
"""
pass
@property
def strip_accents(self):
""" """
pass
@strip_accents.setter
def strip_accents(self, value):
""" """
pass
class ByteLevel(Normalizer):
"""
Bytelevel Normalizer
@@ -106,6 +172,19 @@ class ByteLevel(Normalizer):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -147,6 +226,19 @@ class Lowercase(Normalizer):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -188,6 +280,19 @@ class NFC(Normalizer):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -229,6 +334,19 @@ class NFD(Normalizer):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -270,6 +388,19 @@ class NFKC(Normalizer):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -311,6 +442,19 @@ class NFKD(Normalizer):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -352,6 +496,19 @@ class Nmt(Normalizer):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -394,6 +551,19 @@ class Precompiled(Normalizer):
def __init__(self, precompiled_charsmap):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -435,6 +605,19 @@ class Prepend(Normalizer):
def __init__(self, prepend):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -469,6 +652,16 @@ class Prepend(Normalizer):
"""
pass
@property
def prepend(self):
""" """
pass
@prepend.setter
def prepend(self, value):
""" """
pass
class Replace(Normalizer):
"""
Replace normalizer
@@ -476,6 +669,29 @@ class Replace(Normalizer):
def __init__(self, pattern, content):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def content(self):
""" """
pass
@content.setter
def content(self, value):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -510,6 +726,16 @@ class Replace(Normalizer):
"""
pass
@property
def pattern(self):
""" """
pass
@pattern.setter
def pattern(self, value):
""" """
pass
class Sequence(Normalizer):
"""
Allows concatenating multiple other Normalizer as a Sequence.
@@ -519,6 +745,38 @@ class Sequence(Normalizer):
normalizers (:obj:`List[Normalizer]`):
A list of Normalizer to be run as a sequence
"""
def __init__(self, normalizers):
pass
def __getitem__(self, key):
"""
Return self[key].
"""
pass
def __getnewargs__(self):
""" """
pass
def __getstate__(self):
""" """
pass
def __setitem__(self, key, value):
"""
Set self[key] to value.
"""
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -560,6 +818,29 @@ class Strip(Normalizer):
def __init__(self, left=True, right=True):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
@property
def left(self):
""" """
pass
@left.setter
def left(self, value):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -594,6 +875,16 @@ class Strip(Normalizer):
"""
pass
@property
def right(self):
""" """
pass
@right.setter
def right(self, value):
""" """
pass
class StripAccents(Normalizer):
"""
StripAccents normalizer
@@ -601,6 +892,19 @@ class StripAccents(Normalizer):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(normalizer):
""" """
pass
def normalize(self, normalized):
"""
Normalize a :class:`~tokenizers.NormalizedString` in-place
@@ -634,3 +938,9 @@ class StripAccents(Normalizer):
:obj:`str`: A string after normalization
"""
pass
from typing import Dict
NORMALIZERS: Dict[str, Normalizer]
def unicode_normalizer_from_str(normalizer: str) -> Normalizer: ...
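A short sketch exercising the normalizer stubs above: `BertNormalizer` options are plain read/write properties, and `Sequence` is indexable per the new `__getitem__`/`__setitem__` stubs (assuming the installed version supports that):

from tokenizers import normalizers
from tokenizers.normalizers import BertNormalizer, NFD, StripAccents

# Options map directly onto the properties stubbed above.
bert_norm = BertNormalizer(clean_text=True, handle_chinese_chars=True, lowercase=True)
bert_norm.lowercase = False          # toggled via the property setter

seq = normalizers.Sequence([NFD(), StripAccents()])
print(seq[0])                        # the NFD member, via __getitem__
print(seq.normalize_str("Héllò"))    # -> "Hello" (accents stripped, case kept)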

View File

@@ -6,6 +6,19 @@ class PreTokenizer:
This class is not supposed to be instantiated directly. Instead, any implementation of a
PreTokenizer will return an instance of this class when instantiated.
"""
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(pretok):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@@ -53,6 +66,19 @@ class BertPreTokenizer(PreTokenizer):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(pretok):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@@ -105,7 +131,25 @@ class ByteLevel(PreTokenizer):
Set this to :obj:`False` to prevent this `pre_tokenizer` from using
the GPT2 specific regexp for splitting on whitespace.
"""
def __init__(self, add_prefix_space=True, use_regex=True):
def __init__(self, add_prefix_space=True, trim_offsets=True, use_regex=True):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def add_prefix_space(self):
""" """
pass
@add_prefix_space.setter
def add_prefix_space(self, value):
""" """
pass
@staticmethod
@@ -122,6 +166,11 @@ class ByteLevel(PreTokenizer):
"""
pass
@staticmethod
def custom(pretok):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@@ -159,6 +208,26 @@ class ByteLevel(PreTokenizer):
"""
pass
@property
def trim_offsets(self):
""" """
pass
@trim_offsets.setter
def trim_offsets(self, value):
""" """
pass
@property
def use_regex(self):
""" """
pass
@use_regex.setter
def use_regex(self, value):
""" """
pass
class CharDelimiterSplit(PreTokenizer):
"""
This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
@@ -167,6 +236,36 @@ class CharDelimiterSplit(PreTokenizer):
delimiter: str:
The delimiter char that will be used to split input
"""
def __init__(self, delimiter):
pass
def __getnewargs__(self):
""" """
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(pretok):
""" """
pass
@property
def delimiter(self):
""" """
pass
@delimiter.setter
def delimiter(self, value):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@@ -221,6 +320,29 @@ class Digits(PreTokenizer):
def __init__(self, individual_digits=False):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(pretok):
""" """
pass
@property
def individual_digits(self):
""" """
pass
@individual_digits.setter
def individual_digits(self, value):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@@ -273,6 +395,29 @@ class FixedLength(PreTokenizer):
def __init__(self, length=5):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(pretok):
""" """
pass
@property
def length(self):
""" """
pass
@length.setter
def length(self, value):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@@ -332,6 +477,19 @@ class Metaspace(PreTokenizer):
def __init__(self, replacement="_", prepend_scheme="always", split=True):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(pretok):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@@ -369,6 +527,36 @@ class Metaspace(PreTokenizer):
"""
pass
@property
def prepend_scheme(self):
""" """
pass
@prepend_scheme.setter
def prepend_scheme(self, value):
""" """
pass
@property
def replacement(self):
""" """
pass
@replacement.setter
def replacement(self, value):
""" """
pass
@property
def split(self):
""" """
pass
@split.setter
def split(self, value):
""" """
pass
class Punctuation(PreTokenizer):
"""
This pre-tokenizer simply splits on punctuation as individual characters.
@@ -382,6 +570,29 @@ class Punctuation(PreTokenizer):
def __init__(self, behavior="isolated"):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def behavior(self):
""" """
pass
@behavior.setter
def behavior(self, value):
""" """
pass
@staticmethod
def custom(pretok):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@@ -426,6 +637,35 @@ class Sequence(PreTokenizer):
def __init__(self, pretokenizers):
pass
def __getitem__(self, key):
"""
Return self[key].
"""
pass
def __getnewargs__(self):
""" """
pass
def __getstate__(self):
""" """
pass
def __setitem__(self, key, value):
"""
Set self[key] to value.
"""
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(pretok):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@@ -489,6 +729,53 @@ class Split(PreTokenizer):
def __init__(self, pattern, behavior, invert=False):
pass
def __getnewargs__(self):
""" """
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def behavior(self):
""" """
pass
@behavior.setter
def behavior(self, value):
""" """
pass
@staticmethod
def custom(pretok):
""" """
pass
@property
def invert(self):
""" """
pass
@invert.setter
def invert(self, value):
""" """
pass
@property
def pattern(self):
""" """
pass
@pattern.setter
def pattern(self, value):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@@ -536,6 +823,19 @@ class UnicodeScripts(PreTokenizer):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(pretok):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@@ -607,6 +907,19 @@ class Whitespace(PreTokenizer):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(pretok):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
@@ -651,6 +964,19 @@ class WhitespaceSplit(PreTokenizer):
def __init__(self):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@staticmethod
def custom(pretok):
""" """
pass
def pre_tokenize(self, pretok):
"""
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
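A sketch combining two of the pre-tokenizers stubbed above and inspecting the resulting splits and offsets with `pre_tokenize_str`:

from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Whitespace, Digits

pre_tok = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])
print(pre_tok.pre_tokenize_str("Call 911 now"))
# -> [('Call', (0, 4)), ('9', (5, 6)), ('1', (6, 7)), ('1', (7, 8)), ('now', (9, 12))]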

View File

@@ -6,6 +6,14 @@ class PostProcessor:
This class is not supposed to be instantiated directly. Instead, any implementation of
a PostProcessor will return an instance of this class when instantiated.
"""
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
@@ -56,6 +64,28 @@ class BertProcessing(PostProcessor):
def __init__(self, sep, cls):
pass
def __getnewargs__(self):
""" """
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def cls(self):
""" """
pass
@cls.setter
def cls(self, value):
""" """
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
@@ -88,6 +118,16 @@ class BertProcessing(PostProcessor):
"""
pass
@property
def sep(self):
""" """
pass
@sep.setter
def sep(self, value):
""" """
pass
class ByteLevel(PostProcessor):
"""
This post-processor takes care of trimming the offsets.
@@ -98,8 +138,31 @@ class ByteLevel(PostProcessor):
Args:
trim_offsets (:obj:`bool`):
Whether to trim the whitespaces from the produced offsets.
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
If :obj:`True`, keeps the first token's offset as is. If :obj:`False`, increments
the start of the first token's offset by 1. Only has an effect if :obj:`trim_offsets`
is set to :obj:`True`.
"""
def __init__(self, trim_offsets=True):
def __init__(self, add_prefix_space=None, trim_offsets=None, use_regex=None):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def add_prefix_space(self):
""" """
pass
@add_prefix_space.setter
def add_prefix_space(self, value):
""" """
pass
def num_special_tokens_to_add(self, is_pair):
@@ -134,6 +197,26 @@ class ByteLevel(PostProcessor):
"""
pass
@property
def trim_offsets(self):
""" """
pass
@trim_offsets.setter
def trim_offsets(self, value):
""" """
pass
@property
def use_regex(self):
""" """
pass
@use_regex.setter
def use_regex(self, value):
""" """
pass
class RobertaProcessing(PostProcessor):
"""
This post-processor takes care of adding the special tokens needed by
@@ -164,6 +247,38 @@ class RobertaProcessing(PostProcessor):
def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True):
pass
def __getnewargs__(self):
""" """
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def add_prefix_space(self):
""" """
pass
@add_prefix_space.setter
def add_prefix_space(self, value):
""" """
pass
@property
def cls(self):
""" """
pass
@cls.setter
def cls(self, value):
""" """
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
@@ -196,6 +311,26 @@ class RobertaProcessing(PostProcessor):
"""
pass
@property
def sep(self):
""" """
pass
@sep.setter
def sep(self, value):
""" """
pass
@property
def trim_offsets(self):
""" """
pass
@trim_offsets.setter
def trim_offsets(self, value):
""" """
pass
class Sequence(PostProcessor):
"""
Sequence Processor
@@ -207,6 +342,30 @@ class Sequence(PostProcessor):
def __init__(self, processors):
pass
def __getitem__(self, key):
"""
Return self[key].
"""
pass
def __getnewargs__(self):
""" """
pass
def __getstate__(self):
""" """
pass
def __setitem__(self, key, value):
"""
Set self[key] to value.
"""
pass
def __setstate__(self, state):
""" """
pass
def num_special_tokens_to_add(self, is_pair):
"""
Return the number of special tokens that would be added for single/pair sentences.
@@ -306,7 +465,15 @@ class TemplateProcessing(PostProcessor):
The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
the same length.
"""
def __init__(self, single, pair, special_tokens):
def __init__(self, single=None, pair=None, special_tokens=None):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
def num_special_tokens_to_add(self, is_pair):
@@ -340,3 +507,13 @@ class TemplateProcessing(PostProcessor):
:class:`~tokenizers.Encoding`: The final encoding
"""
pass
@property
def single(self):
""" """
pass
@single.setter
def single(self, value):
""" """
pass
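The `TemplateProcessing` docstring above describes `single`/`pair` templates plus a `special_tokens` list of (token, id) pairs. A typical BERT-style configuration, as a sketch (ids 101/102 must match the actual vocabulary in use):

from tokenizers.processors import TemplateProcessing

post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 101), ("[SEP]", 102)],
)
# Attach it so Tokenizer.encode inserts [CLS]/[SEP] and sets the type ids:
# tokenizer.post_processor = post_processor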

View File

@@ -16,7 +16,7 @@ with open(css_filename) as f:
class Annotation:
start: int
end: int
label: int
label: str
def __init__(self, start: int, end: int, label: str):
self.start = start
@@ -91,7 +91,7 @@ class EncodingVisualizer:
):
if default_to_notebook:
try:
from IPython.core.display import HTML, display
from IPython.core.display import HTML, display # type: ignore[attr-defined]
except ImportError:
raise Exception(
"""We couldn't import IPython utils for html display.
@@ -108,7 +108,7 @@ class EncodingVisualizer:
def __call__(
self,
text: str,
annotations: AnnotationList = [],
annotations: Optional[List[Any]] = None,
default_to_notebook: Optional[bool] = None,
) -> Optional[str]:
"""
@@ -135,12 +135,14 @@ class EncodingVisualizer:
final_default_to_notebook = default_to_notebook
if final_default_to_notebook:
try:
from IPython.core.display import HTML, display
from IPython.core.display import HTML, display # type: ignore[attr-defined]
except ImportError:
raise Exception(
"""We couldn't import IPython utils for html display.
Are you running in a notebook?"""
)
if annotations is None:
annotations = []
if self.annotation_coverter is not None:
annotations = list(map(self.annotation_coverter, annotations))
encoding = self.tokenizer.encode(text)
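The change above replaces the mutable default `annotations: AnnotationList = []` with `None` plus an in-body check. A toy illustration (not part of the library) of the shared-default pitfall this avoids:

def bad(items=[]):        # one list object is shared by every call
    items.append(1)
    return items

def good(items=None):     # fresh list per call, as the diff now does
    if items is None:
        items = []
    items.append(1)
    return items

print(bad())   # [1]
print(bad())   # [1, 1]  -- state leaked from the first call
print(good())  # [1]
print(good())  # [1]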
@@ -213,6 +215,8 @@ class EncodingVisualizer:
return f'<span class="special-token" data-stoken={stoken}></span>'
# We're not in a special token so this group has a start and end.
last = consecutive_chars_list[-1]
assert first.char_ix is not None
assert last.char_ix is not None
start = first.char_ix
end = last.char_ix + 1
span_text = text[start:end]

View File

@@ -6,6 +6,13 @@ class Trainer:
This class is not supposed to be instantiated directly. Instead, any implementation of a
Trainer will return an instance of this class when instantiated.
"""
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
class BpeTrainer(Trainer):
"""
@@ -60,6 +67,104 @@ class BpeTrainer(Trainer):
):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def continuing_subword_prefix(self):
""" """
pass
@continuing_subword_prefix.setter
def continuing_subword_prefix(self, value):
""" """
pass
@property
def end_of_word_suffix(self):
""" """
pass
@end_of_word_suffix.setter
def end_of_word_suffix(self, value):
""" """
pass
@property
def initial_alphabet(self):
""" """
pass
@initial_alphabet.setter
def initial_alphabet(self, value):
""" """
pass
@property
def limit_alphabet(self):
""" """
pass
@limit_alphabet.setter
def limit_alphabet(self, value):
""" """
pass
@property
def max_token_length(self):
""" """
pass
@max_token_length.setter
def max_token_length(self, value):
""" """
pass
@property
def min_frequency(self):
""" """
pass
@min_frequency.setter
def min_frequency(self, value):
""" """
pass
@property
def show_progress(self):
""" """
pass
@show_progress.setter
def show_progress(self, value):
""" """
pass
@property
def special_tokens(self):
""" """
pass
@special_tokens.setter
def special_tokens(self, value):
""" """
pass
@property
def vocab_size(self):
""" """
pass
@vocab_size.setter
def vocab_size(self, value):
""" """
pass
class UnigramTrainer(Trainer):
"""
Trainer capable of training a Unigram model
@@ -107,6 +212,54 @@ class UnigramTrainer(Trainer):
):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def initial_alphabet(self):
""" """
pass
@initial_alphabet.setter
def initial_alphabet(self, value):
""" """
pass
@property
def show_progress(self):
""" """
pass
@show_progress.setter
def show_progress(self, value):
""" """
pass
@property
def special_tokens(self):
""" """
pass
@special_tokens.setter
def special_tokens(self, value):
""" """
pass
@property
def vocab_size(self):
""" """
pass
@vocab_size.setter
def vocab_size(self, value):
""" """
pass
class WordLevelTrainer(Trainer):
"""
Trainer capable of training a WordLevel model
@@ -127,6 +280,54 @@ class WordLevelTrainer(Trainer):
def __init__(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[]):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def min_frequency(self):
""" """
pass
@min_frequency.setter
def min_frequency(self, value):
""" """
pass
@property
def show_progress(self):
""" """
pass
@show_progress.setter
def show_progress(self, value):
""" """
pass
@property
def special_tokens(self):
""" """
pass
@special_tokens.setter
def special_tokens(self, value):
""" """
pass
@property
def vocab_size(self):
""" """
pass
@vocab_size.setter
def vocab_size(self, value):
""" """
pass
class WordPieceTrainer(Trainer):
"""
Trainer capable of training a WordPiece model
@@ -171,3 +372,91 @@ class WordPieceTrainer(Trainer):
end_of_word_suffix=None,
):
pass
def __getstate__(self):
""" """
pass
def __setstate__(self, state):
""" """
pass
@property
def continuing_subword_prefix(self):
""" """
pass
@continuing_subword_prefix.setter
def continuing_subword_prefix(self, value):
""" """
pass
@property
def end_of_word_suffix(self):
""" """
pass
@end_of_word_suffix.setter
def end_of_word_suffix(self, value):
""" """
pass
@property
def initial_alphabet(self):
""" """
pass
@initial_alphabet.setter
def initial_alphabet(self, value):
""" """
pass
@property
def limit_alphabet(self):
""" """
pass
@limit_alphabet.setter
def limit_alphabet(self, value):
""" """
pass
@property
def min_frequency(self):
""" """
pass
@min_frequency.setter
def min_frequency(self, value):
""" """
pass
@property
def show_progress(self):
""" """
pass
@show_progress.setter
def show_progress(self, value):
""" """
pass
@property
def special_tokens(self):
""" """
pass
@special_tokens.setter
def special_tokens(self, value):
""" """
pass
@property
def vocab_size(self):
""" """
pass
@vocab_size.setter
def vocab_size(self, value):
""" """
pass
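A short end-to-end sketch using `BpeTrainer` with the options stubbed above (`vocab_size`, `min_frequency`, `special_tokens`), training from an in-memory toy corpus:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=1000,
    min_frequency=2,
    special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"],
)

corpus = ["a tiny toy corpus", "just enough text to train on"] * 100
tokenizer.train_from_iterator(corpus, trainer=trainer)
print(tokenizer.encode("toy corpus").tokens)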