mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Update docstrings and API docs for Tokenizer
This commit is contained in:
parent
f216422ac5
commit
c5a653fa48
|
@ -2,8 +2,6 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import ujson
|
|
||||||
|
|
||||||
from cython.operator cimport dereference as deref
|
from cython.operator cimport dereference as deref
|
||||||
from cython.operator cimport preincrement as preinc
|
from cython.operator cimport preincrement as preinc
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
@ -12,32 +10,31 @@ from preshed.maps cimport PreshMap
|
||||||
from .strings cimport hash_string
|
from .strings cimport hash_string
|
||||||
cimport cython
|
cimport cython
|
||||||
|
|
||||||
from . import util
|
|
||||||
from .tokens.doc cimport Doc
|
from .tokens.doc cimport Doc
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokenizer:
|
cdef class Tokenizer:
|
||||||
|
"""Segment text, and create Doc objects with the discovered segment
|
||||||
|
boundaries.
|
||||||
"""
|
"""
|
||||||
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
|
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
|
||||||
"""
|
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
|
||||||
Create a Tokenizer, to create Doc objects given unicode text.
|
|
||||||
|
|
||||||
Arguments:
|
vocab (Vocab): A storage container for lexical types.
|
||||||
vocab (Vocab):
|
rules (dict): Exceptions and special-cases for the tokenizer.
|
||||||
A storage container for lexical types.
|
prefix_search (callable): A function matching the signature of
|
||||||
rules (dict):
|
`re.compile(string).search` to match prefixes.
|
||||||
Exceptions and special-cases for the tokenizer.
|
suffix_search (callable): A function matching the signature of
|
||||||
prefix_search:
|
`re.compile(string).search` to match suffixes.
|
||||||
A function matching the signature of re.compile(string).search
|
infix_finditer (callable): A function matching the signature of
|
||||||
to match prefixes.
|
`re.compile(string).finditer` to find infixes.
|
||||||
suffix_search:
|
token_match (callable): A boolean function matching strings to be
|
||||||
A function matching the signature of re.compile(string).search
|
recognised as tokens.
|
||||||
to match suffixes.
|
RETURNS (Tokenizer): The newly constructed object.
|
||||||
infix_finditer:
|
|
||||||
A function matching the signature of re.compile(string).finditer
|
EXAMPLE:
|
||||||
to find infixes.
|
>>> tokenizer = Tokenizer(nlp.vocab)
|
||||||
token_match:
|
>>> tokenizer = English().Defaults.create_tokenizer(nlp)
|
||||||
A boolean function matching strings that becomes tokens.
|
|
||||||
"""
|
"""
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._cache = PreshMap()
|
self._cache = PreshMap()
|
||||||
|
@ -69,13 +66,10 @@ cdef class Tokenizer:
|
||||||
|
|
||||||
@cython.boundscheck(False)
|
@cython.boundscheck(False)
|
||||||
def __call__(self, unicode string):
|
def __call__(self, unicode string):
|
||||||
"""
|
"""Tokenize a string.
|
||||||
Tokenize a string.
|
|
||||||
|
|
||||||
Arguments:
|
string (unicode): The string to tokenize.
|
||||||
string (unicode): The string to tokenize.
|
RETURNS (Doc): A container for linguistic annotations.
|
||||||
Returns:
|
|
||||||
Doc A container for linguistic annotations.
|
|
||||||
"""
|
"""
|
||||||
if len(string) >= (2 ** 30):
|
if len(string) >= (2 ** 30):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -123,18 +117,13 @@ cdef class Tokenizer:
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
def pipe(self, texts, batch_size=1000, n_threads=2):
|
def pipe(self, texts, batch_size=1000, n_threads=2):
|
||||||
"""
|
"""Tokenize a stream of texts.
|
||||||
Tokenize a stream of texts.
|
|
||||||
|
|
||||||
Arguments:
|
texts: A sequence of unicode texts.
|
||||||
texts: A sequence of unicode texts.
|
batch_size (int): The number of texts to accumulate in an internal buffer.
|
||||||
batch_size (int):
|
n_threads (int): The number of threads to use, if the implementation
|
||||||
The number of texts to accumulate in an internal buffer.
|
supports multi-threading. The default tokenizer is single-threaded.
|
||||||
n_threads (int):
|
YIELDS (Doc): A sequence of Doc objects, in order.
|
||||||
The number of threads to use, if the implementation supports
|
|
||||||
multi-threading. The default tokenizer is single-threaded.
|
|
||||||
Yields:
|
|
||||||
Doc A sequence of Doc objects, in order.
|
|
||||||
"""
|
"""
|
||||||
for text in texts:
|
for text in texts:
|
||||||
yield self(text)
|
yield self(text)
|
||||||
|
@ -278,27 +267,23 @@ cdef class Tokenizer:
|
||||||
self._cache.set(key, cached)
|
self._cache.set(key, cached)
|
||||||
|
|
||||||
def find_infix(self, unicode string):
|
def find_infix(self, unicode string):
|
||||||
"""
|
"""Find internal split points of the string, such as hyphens.
|
||||||
Find internal split points of the string, such as hyphens.
|
|
||||||
|
|
||||||
string (unicode): The string to segment.
|
string (unicode): The string to segment.
|
||||||
|
RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
|
||||||
Returns List[re.MatchObject]
|
and `.end()` methods, denoting the placement of internal segment
|
||||||
A list of objects that have .start() and .end() methods, denoting the
|
separators, e.g. hyphens.
|
||||||
placement of internal segment separators, e.g. hyphens.
|
|
||||||
"""
|
"""
|
||||||
if self.infix_finditer is None:
|
if self.infix_finditer is None:
|
||||||
return 0
|
return 0
|
||||||
return list(self.infix_finditer(string))
|
return list(self.infix_finditer(string))
|
||||||
|
|
||||||
def find_prefix(self, unicode string):
|
def find_prefix(self, unicode string):
|
||||||
"""
|
"""Find the length of a prefix that should be segmented from the string,
|
||||||
Find the length of a prefix that should be segmented from the string,
|
|
||||||
or None if no prefix rules match.
|
or None if no prefix rules match.
|
||||||
|
|
||||||
Arguments:
|
string (unicode): The string to segment.
|
||||||
string (unicode): The string to segment.
|
RETURNS (int): The length of the prefix if present, otherwise `None`.
|
||||||
Returns (int or None): The length of the prefix if present, otherwise None.
|
|
||||||
"""
|
"""
|
||||||
if self.prefix_search is None:
|
if self.prefix_search is None:
|
||||||
return 0
|
return 0
|
||||||
|
@ -306,13 +291,11 @@ cdef class Tokenizer:
|
||||||
return (match.end() - match.start()) if match is not None else 0
|
return (match.end() - match.start()) if match is not None else 0
|
||||||
|
|
||||||
def find_suffix(self, unicode string):
|
def find_suffix(self, unicode string):
|
||||||
"""
|
"""Find the length of a suffix that should be segmented from the string,
|
||||||
Find the length of a suffix that should be segmented from the string,
|
|
||||||
or None if no suffix rules match.
|
or None if no suffix rules match.
|
||||||
|
|
||||||
Arguments:
|
string (unicode): The string to segment.
|
||||||
string (unicode): The string to segment.
|
RETURNS (int): The length of the suffix if present, otherwise `None`.
|
||||||
Returns (int or None): The length of the suffix if present, otherwise None.
|
|
||||||
"""
|
"""
|
||||||
if self.suffix_search is None:
|
if self.suffix_search is None:
|
||||||
return 0
|
return 0
|
||||||
|
@ -320,23 +303,17 @@ cdef class Tokenizer:
|
||||||
return (match.end() - match.start()) if match is not None else 0
|
return (match.end() - match.start()) if match is not None else 0
|
||||||
|
|
||||||
def _load_special_tokenization(self, special_cases):
|
def _load_special_tokenization(self, special_cases):
|
||||||
"""
|
"""Add special-case tokenization rules."""
|
||||||
Add special-case tokenization rules.
|
|
||||||
"""
|
|
||||||
for chunk, substrings in sorted(special_cases.items()):
|
for chunk, substrings in sorted(special_cases.items()):
|
||||||
self.add_special_case(chunk, substrings)
|
self.add_special_case(chunk, substrings)
|
||||||
|
|
||||||
def add_special_case(self, unicode string, substrings):
|
def add_special_case(self, unicode string, substrings):
|
||||||
"""
|
"""Add a special-case tokenization rule.
|
||||||
Add a special-case tokenization rule.
|
|
||||||
|
|
||||||
Arguments:
|
string (unicode): The string to specially tokenize.
|
||||||
string (unicode): The string to specially tokenize.
|
token_attrs (iterable): A sequence of dicts, where each dict describes
|
||||||
token_attrs:
|
a token and its attributes. The `ORTH` fields of the attributes must
|
||||||
A sequence of dicts, where each dict describes a token and its
|
exactly match the string when they are concatenated.
|
||||||
attributes. The ORTH fields of the attributes must exactly match
|
|
||||||
the string when they are concatenated.
|
|
||||||
Returns None
|
|
||||||
"""
|
"""
|
||||||
substrings = list(substrings)
|
substrings = list(substrings)
|
||||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||||
|
@ -347,3 +324,38 @@ cdef class Tokenizer:
|
||||||
self._specials.set(key, cached)
|
self._specials.set(key, cached)
|
||||||
self._cache.set(key, cached)
|
self._cache.set(key, cached)
|
||||||
self._rules[string] = substrings
|
self._rules[string] = substrings
|
||||||
|
|
||||||
|
def to_disk(self, path):
|
||||||
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
|
path (unicode or Path): A path to a directory, which will be created if
|
||||||
|
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def from_disk(self, path):
|
||||||
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
|
returns it.
|
||||||
|
|
||||||
|
path (unicode or Path): A path to a directory. Paths may be either
|
||||||
|
strings or `Path`-like objects.
|
||||||
|
RETURNS (Tokenizer): The modified `Tokenizer` object.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def to_bytes(self, **exclude):
|
||||||
|
"""Serialize the current state to a binary string.
|
||||||
|
|
||||||
|
**exclude: Named attributes to prevent from being serialized.
|
||||||
|
RETURNS (bytes): The serialized form of the `Tokenizer` object.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
|
"""Load state from a binary string.
|
||||||
|
|
||||||
|
bytes_data (bytes): The data to load from.
|
||||||
|
**exclude: Named attributes to prevent from being loaded.
|
||||||
|
RETURNS (Tokenizer): The `Tokenizer` object.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError()
|
||||||
|
|
|
@ -11,6 +11,15 @@ p
|
||||||
|
|
||||||
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
|
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
# Construction 1
|
||||||
|
from spacy.tokenizer import Tokenizer
|
||||||
|
tokenizer = Tokenizer(nlp.vocab)
|
||||||
|
|
||||||
|
# Construction 2
|
||||||
|
from spacy.lang.en import English
|
||||||
|
tokenizer = English().Defaults.create_tokenizer(nlp)
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code vocab]
|
+cell #[code vocab]
|
||||||
|
@ -43,6 +52,11 @@ p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
|
||||||
| A function matching the signature of
|
| A function matching the signature of
|
||||||
| #[code re.compile(string).finditer] to find infixes.
|
| #[code re.compile(string).finditer] to find infixes.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code token_match]
|
||||||
|
+cell callable
|
||||||
|
+cell A boolean function matching strings to be recognised as tokens.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code Tokenizer]
|
+cell #[code Tokenizer]
|
||||||
|
@ -53,6 +67,10 @@ p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
|
||||||
|
|
||||||
p Tokenize a string.
|
p Tokenize a string.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
tokens = tokenizer(u'This is a sentence')
|
||||||
|
assert len(tokens) == 4
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code string]
|
+cell #[code string]
|
||||||
|
@ -69,6 +87,11 @@ p Tokenize a string.
|
||||||
|
|
||||||
p Tokenize a stream of texts.
|
p Tokenize a stream of texts.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
texts = [u'One document.', u'...', u'Lots of documents']
|
||||||
|
for doc in tokenizer.pipe(texts, batch_size=50):
|
||||||
|
pass
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
+cell #[code texts]
|
+cell #[code texts]
|
||||||
|
@ -105,11 +128,11 @@ p Find internal split points of the string.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code List[re.MatchObject]]
|
+cell list
|
||||||
+cell
|
+cell
|
||||||
| A list of objects that have #[code .start()] and #[code .end()]
|
| A list of #[code re.MatchObject] objects that have #[code .start()]
|
||||||
| methods, denoting the placement of internal segment separators,
|
| and #[code .end()] methods, denoting the placement of internal
|
||||||
| e.g. hyphens.
|
| segment separators, e.g. hyphens.
|
||||||
|
|
||||||
+h(2, "find_prefix") Tokenizer.find_prefix
|
+h(2, "find_prefix") Tokenizer.find_prefix
|
||||||
+tag method
|
+tag method
|
||||||
|
@ -126,7 +149,7 @@ p
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell int / #[code None]
|
+cell int
|
||||||
+cell The length of the prefix if present, otherwise #[code None].
|
+cell The length of the prefix if present, otherwise #[code None].
|
||||||
|
|
||||||
+h(2, "find_suffix") Tokenizer.find_suffix
|
+h(2, "find_suffix") Tokenizer.find_suffix
|
||||||
|
@ -150,7 +173,16 @@ p
|
||||||
+h(2, "add_special_case") Tokenizer.add_special_case
|
+h(2, "add_special_case") Tokenizer.add_special_case
|
||||||
+tag method
|
+tag method
|
||||||
|
|
||||||
p Add a special-case tokenization rule.
|
p
|
||||||
|
| Add a special-case tokenization rule. This mechanism is also used to add
|
||||||
|
| custom tokenizer exceptions to the language data. See the usage workflow
|
||||||
|
| on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
|
||||||
|
| for more details and examples.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.attrs import ORTH, LEMMA
|
||||||
|
case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]
|
||||||
|
tokenizer.add_special_case("don't", case)
|
||||||
|
|
||||||
+table(["Name", "Type", "Description"])
|
+table(["Name", "Type", "Description"])
|
||||||
+row
|
+row
|
||||||
|
@ -160,16 +192,98 @@ p Add a special-case tokenization rule.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code token_attrs]
|
+cell #[code token_attrs]
|
||||||
+cell -
|
+cell iterable
|
||||||
+cell
|
+cell
|
||||||
| A sequence of dicts, where each dict describes a token and its
|
| A sequence of dicts, where each dict describes a token and its
|
||||||
| attributes. The #[code ORTH] fields of the attributes must
|
| attributes. The #[code ORTH] fields of the attributes must
|
||||||
| exactly match the string when they are concatenated.
|
| exactly match the string when they are concatenated.
|
||||||
|
|
||||||
|
|
||||||
|
+h(2, "to_disk") Tokenizer.to_disk
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Save the current state to a directory.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
tokenizer.to_disk('/path/to/tokenizer')
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code path]
|
||||||
|
+cell unicode or #[code Path]
|
||||||
|
+cell
|
||||||
|
| A path to a directory, which will be created if it doesn't exist.
|
||||||
|
| Paths may be either strings or #[code Path]-like objects.
|
||||||
|
|
||||||
|
+h(2, "from_disk") Tokenizer.from_disk
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Loads state from a directory. Modifies the object in place and returns it.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.tokenizer import Tokenizer
|
||||||
|
tokenizer = Tokenizer(nlp.vocab)
|
||||||
|
tokenizer = tokenizer.from_disk('/path/to/tokenizer')
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code path]
|
||||||
|
+cell unicode or #[code Path]
|
||||||
|
+cell
|
||||||
|
| A path to a directory. Paths may be either strings or
|
||||||
|
| #[code Path]-like objects.
|
||||||
|
|
||||||
+footrow
|
+footrow
|
||||||
+cell returns
|
+cell returns
|
||||||
+cell #[code None]
|
+cell #[code Tokenizer]
|
||||||
|
+cell The modified #[code Tokenizer] object.
|
||||||
|
|
||||||
|
+h(2, "to_bytes") Tokenizer.to_bytes
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Serialize the current state to a binary string.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
tokenizer_bytes = tokenizer.to_bytes()
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code **exclude]
|
||||||
+cell -
|
+cell -
|
||||||
|
+cell Named attributes to prevent from being serialized.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell returns
|
||||||
|
+cell bytes
|
||||||
|
+cell The serialized form of the #[code Tokenizer] object.
|
||||||
|
|
||||||
|
+h(2, "from_bytes") Tokenizer.from_bytes
|
||||||
|
+tag method
|
||||||
|
|
||||||
|
p Load state from a binary string.
|
||||||
|
|
||||||
|
+aside-code("Example").
|
||||||
|
from spacy.tokenizer import Tokenizer
|
||||||
|
tokenizer_bytes = tokenizer.to_bytes()
|
||||||
|
new_tokenizer = Tokenizer(nlp.vocab)
|
||||||
|
new_tokenizer.from_bytes(tokenizer_bytes)
|
||||||
|
|
||||||
|
+table(["Name", "Type", "Description"])
|
||||||
|
+row
|
||||||
|
+cell #[code bytes_data]
|
||||||
|
+cell bytes
|
||||||
|
+cell The data to load from.
|
||||||
|
|
||||||
|
+row
|
||||||
|
+cell #[code **exclude]
|
||||||
|
+cell -
|
||||||
|
+cell Named attributes to prevent from being loaded.
|
||||||
|
|
||||||
|
+footrow
|
||||||
|
+cell returns
|
||||||
|
+cell #[code Tokenizer]
|
||||||
|
+cell The #[code Tokenizer] object.
|
||||||
|
|
||||||
|
|
||||||
+h(2, "attributes") Attributes
|
+h(2, "attributes") Attributes
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user