Merge docstrings

Matthew Honnibal 2017-05-21 13:46:23 -05:00
commit 5db89053aa
68 changed files with 4137 additions and 3113 deletions

View File

@ -14,3 +14,4 @@ regex==2017.4.5
ftfy>=4.4.2,<5.0.0
pytest>=3.0.6,<4.0.0
pip>=9.0.0,<10.0.0
mock>=2.0.0,<3.0.0

View File

@ -20,7 +20,17 @@ def download(model, direct=False):
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
link(model_name, model, force=True)
try:
link(model_name, model, force=True)
except:
# Dirty, but since spacy.download and the auto-linking is mostly
# a convenience wrapper, it's best to show a success message and
# loading instructions, even if linking fails.
prints("Creating a shortcut link for 'en' didn't work (maybe you "
"don't have admin permissions?), but you can still load "
"the model via its full package name:",
"nlp = spacy.load('%s')" % model_name,
title="Download successful")
def get_json(url, desc):
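As the fallback message above explains, a model that downloads correctly but cannot be linked is still importable under its full package name. A minimal sketch, assuming a package such as en_core_web_sm has already been installed by the download command:
>>> import spacy
>>> nlp = spacy.load('en_core_web_sm')   # full package name works without a shortcut link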

View File

@ -11,15 +11,14 @@ from .. import util
def info(model=None, markdown=False):
if model:
data_path = util.get_data_path()
data = util.parse_package_meta(data_path / model, require=True)
model_path = Path(__file__).parent / data_path / model
model_path = util.resolve_model_path(model)
meta = util.parse_package_meta(model_path)
if model_path.resolve() != model_path:
data['link'] = path2str(model_path)
data['source'] = path2str(model_path.resolve())
meta['link'] = path2str(model_path)
meta['source'] = path2str(model_path.resolve())
else:
data['source'] = path2str(model_path)
print_info(data, 'model %s' % model, markdown)
meta['source'] = path2str(model_path)
print_info(meta, 'model %s' % model, markdown)
else:
data = {'spaCy version': about.__version__,
'Location': path2str(Path(__file__).parent.parent),

View File

@ -306,25 +306,17 @@ cdef class GoldParse:
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False):
"""
Create a GoldParse.
"""Create a GoldParse.
Arguments:
doc (Doc):
The document the annotations refer to.
words:
A sequence of unicode word strings.
tags:
A sequence of strings, representing tag annotations.
heads:
A sequence of integers, representing syntactic head offsets.
deps:
A sequence of strings, representing the syntactic relation types.
entities:
A sequence of named entity annotations, either as BILUO tag strings,
or as (start_char, end_char, label) tuples, representing the entity
positions.
Returns (GoldParse): The newly constructed object.
doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings.
tags (iterable): A sequence of strings, representing tag annotations.
heads (iterable): A sequence of integers, representing syntactic head offsets.
deps (iterable): A sequence of strings, representing the syntactic relation types.
entities (iterable): A sequence of named entity annotations, either as
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
representing the entity positions.
RETURNS (GoldParse): The newly constructed object.
"""
if words is None:
words = [token.text for token in doc]
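For illustration, a hedged sketch of building a GoldParse with the keyword arguments documented above (the tag, head and dependency values are made up):
>>> doc = nlp.tokenizer(u'I like London')
>>> gold = GoldParse(doc, words=[u'I', u'like', u'London'],
>>>                  tags=[u'PRP', u'VBP', u'NNP'],
>>>                  heads=[1, 1, 1], deps=[u'nsubj', u'ROOT', u'dobj'])
>>> assert len(gold) == 3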
@ -389,55 +381,45 @@ cdef class GoldParse:
self.heads = proj_heads
def __len__(self):
"""
Get the number of gold-standard tokens.
"""Get the number of gold-standard tokens.
Returns (int): The number of gold-standard tokens.
RETURNS (int): The number of gold-standard tokens.
"""
return self.length
@property
def is_projective(self):
"""
Whether the provided syntactic annotations form a projective dependency
tree.
"""Whether the provided syntactic annotations form a projective
dependency tree.
"""
return not nonproj.is_nonproj_tree(self.heads)
def biluo_tags_from_offsets(doc, entities):
"""
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
scheme (biluo).
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
scheme (BILUO).
Arguments:
doc (Doc):
The document that the entity offsets refer to. The output tags will
refer to the token boundaries within the document.
doc (Doc): The document that the entity offsets refer to. The output tags
will refer to the token boundaries within the document.
entities (iterable): A sequence of `(start, end, label)` triples. `start` and
`end` should be character-offset integers denoting the slice into the
original string.
entities (sequence):
A sequence of (start, end, label) triples. start and end should be
character-offset integers denoting the slice into the original string.
RETURNS (list): A list of unicode strings, describing the tags. Each tag
string will be of the form either "", "O" or "{action}-{label}", where
action is one of "B", "I", "L", "U". The string "-" is used where the
entity offsets don't align with the tokenization in the `Doc` object. The
training algorithm will view these as missing values. "O" denotes a
non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
Returns:
tags (list):
A list of unicode strings, describing the tags. Each tag string will
be of the form either "", "O" or "{action}-{label}", where action is one
of "B", "I", "L", "U". The string "-" is used where the entity
offsets don't align with the tokenization in the Doc object. The
training algorithm will view these as missing values. "O" denotes
a non-entity token. "B" denotes the beginning of a multi-token entity,
"I" the inside of an entity of three or more tokens, and "L" the end
of an entity of two or more tokens. "U" denotes a single-token entity.
Example:
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = nlp.tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
EXAMPLE:
>>> text = 'I like London.'
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
>>> doc = nlp.tokenizer(text)
>>> tags = biluo_tags_from_offsets(doc, entities)
>>> assert tags == ['O', 'O', 'U-LOC', 'O']
"""
starts = {token.idx: token.i for token in doc}
ends = {token.idx+len(token): token.i for token in doc}

View File

@ -13,21 +13,23 @@ from ...attrs import LANG
from ...util import update_exc
class BengaliDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'bn'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
lemma_rules = LEMMA_RULES
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
class Bengali(Language):
lang = 'bn'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'bn'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = TAG_MAP
stop_words = STOP_WORDS
lemma_rules = LEMMA_RULES
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
Defaults = BengaliDefaults
__all__ = ['Bengali']
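With the defaults pulled out into a module-level BengaliDefaults class, they can be inspected or subclassed without instantiating the language. A rough sketch, assuming the spacy.lang.bn module path:
>>> from spacy.lang.bn import Bengali
>>> nlp = Bengali()
>>> assert nlp.lang == 'bn'
>>> Bengali.Defaults.stop_words   # class-level, shared defaults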

View File

@ -10,15 +10,17 @@ from ...attrs import LANG
from ...util import update_exc
class DanishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'da'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Danish(Language):
lang = 'da'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'da'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = DanishDefaults
__all__ = ['Danish']

View File

@ -14,21 +14,23 @@ from ...attrs import LANG
from ...util import update_exc
class GermanDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'de'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class German(Language):
lang = 'de'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'de'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
tag_map = dict(TAG_MAP)
stop_words = set(STOP_WORDS)
syntax_iterators = dict(SYNTAX_ITERATORS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = GermanDefaults
__all__ = ['German']

View File

@ -32,7 +32,6 @@ class EnglishDefaults(Language.Defaults):
class English(Language):
lang = 'en'
Defaults = EnglishDefaults

View File

@ -28,7 +28,7 @@ class SpanishDefaults(Language.Defaults):
class Spanish(Language):
lang = 'es'
Defaults = SpanishDefaults
__all__ = ['Spanish']

View File

@ -10,15 +10,17 @@ from ...attrs import LANG
from ...util import update_exc
class FinnishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fi'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Finnish(Language):
lang = 'fi'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fi'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = FinnishDefaults
__all__ = ['Finnish']

View File

@ -13,22 +13,24 @@ from ...attrs import LANG
from ...util import update_exc
class FrenchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fr'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
infixes = tuple(TOKENIZER_INFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class French(Language):
lang = 'fr'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'fr'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
infixes = tuple(TOKENIZER_INFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = FrenchDefaults
__all__ = ['French']

View File

@ -9,15 +9,17 @@ from ...attrs import LANG
from ...util import update_exc
class HebrewDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'he'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Hebrew(Language):
lang = 'he'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'he'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = HebrewDefaults
__all__ = ['Hebrew']

View File

@ -13,23 +13,25 @@ from ...attrs import LANG
from ...util import update_exc
class HungarianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'hu'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Hungarian(Language):
lang = 'hu'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'hu'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
infixes = tuple(TOKENIZER_INFIXES)
token_match = TOKEN_MATCH
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = HungarianDefaults
__all__ = ['Hungarian']

View File

@ -11,19 +11,21 @@ from ...attrs import LANG
from ...util import update_exc
class ItalianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'it'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Italian(Language):
lang = 'it'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'it'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = ItalianDefaults
__all__ = ['Italian']

View File

@ -11,15 +11,17 @@ from ...attrs import LANG
from ...util import update_exc
class NorwegianDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nb'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Norwegian(Language):
lang = 'nb'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nb'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = NorwegianDefaults
__all__ = ['Norwegian']

View File

@ -9,16 +9,17 @@ from ...attrs import LANG
from ...util import update_exc
class DutchDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nl'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Dutch(Language):
lang = 'nl'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'nl'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = DutchDefaults
__all__ = ['Dutch']

View File

@ -9,15 +9,17 @@ from ...attrs import LANG
from ...util import update_exc
class PolishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pl'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
class Polish(Language):
lang = 'pl'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pl'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = PolishDefaults
__all__ = ['Polish']

View File

@ -13,20 +13,22 @@ from ...attrs import LANG
from ...util import update_exc
class PortugueseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pt'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Portuguese(Language):
lang = 'pt'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'pt'
lex_attr_getters.update(LEX_ATTRS)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = PortugueseDefaults
__all__ = ['Portuguese']

View File

@ -13,19 +13,21 @@ from ...attrs import LANG
from ...util import update_exc
class SwedishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'sv'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
class Swedish(Language):
lang = 'sv'
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'sv'
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
@classmethod
def create_lemmatizer(cls, nlp=None):
return Lemmatizer(LOOKUP)
Defaults = SwedishDefaults
__all__ = ['Swedish']

View File

@ -116,14 +116,30 @@ class BaseDefaults(object):
class Language(object):
"""
A text-processing pipeline. Usually you'll load this once per process, and
pass the instance around your program.
"""A text-processing pipeline. Usually you'll load this once per process,
and pass the instance around your application.
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
lang (unicode): Two-letter language ID, i.e. ISO code.
"""
Defaults = BaseDefaults
lang = None
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
`Language.Defaults.create_vocab`.
make_doc (callable): A function that takes text and returns a `Doc`
object. Usually a `Tokenizer`.
pipeline (list): A list of annotation processes or IDs of annotation
processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
up in `Language.Defaults.factories`.
meta (dict): Custom meta data for the Language class. Is written to by
models to add model meta data.
RETURNS (Language): The newly constructed object.
"""
self.meta = dict(meta)
if vocab is True:
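A minimal construction sketch matching the signature above (the meta values are arbitrary placeholders):
>>> from spacy.language import Language
>>> nlp = Language(meta={'name': 'sketch', 'lang': 'xx'})
>>> assert nlp.meta['name'] == 'sketch'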
@ -147,22 +163,17 @@ class Language(object):
self.pipeline = []
def __call__(self, text, **disabled):
"""
Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
Args:
text (unicode): The text to be processed.
text (unicode): The text to be processed.
**disabled: Elements of the pipeline that should not be run.
RETURNS (Doc): A container for accessing the annotations.
Returns:
doc (Doc): A container for accessing the annotations.
Example:
>>> from spacy.en import English
>>> nlp = English()
EXAMPLE:
>>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].orth_, tokens[0].head.tag_
>>> tokens[0].text, tokens[0].head.tag_
('An', 'NN')
"""
doc = self.make_doc(text)
@ -174,6 +185,21 @@ class Language(object):
return doc
def update(self, docs, golds, drop=0., sgd=None):
"""Update the models in the pipeline.
docs (iterable): A batch of `Doc` objects.
golds (iterable): A batch of `GoldParse` objects.
drop (float): The dropout rate.
sgd (callable): An optimizer.
RETURNS (dict): Results from the update.
EXAMPLE:
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
@ -204,7 +230,20 @@ class Language(object):
for doc, gold in docs_golds:
yield doc, gold
def begin_training(self, get_gold_tuples, **cfg):
def begin_training(self, gold_tuples, **cfg):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer. Used as a contextmanager.
gold_tuples (iterable): Gold-standard training data.
**cfg: Config parameters.
YIELDS (tuple): A trainer and an optimizer.
EXAMPLE:
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
# Populate vocab
for _, annots_brackets in get_gold_tuples():
for annots, _ in annots_brackets:
@ -233,6 +272,17 @@ class Language(object):
@contextmanager
def use_params(self, params, **cfg):
"""Replace weights of models in the pipeline with those provided in the
params dictionary. Can be used as a contextmanager, in which case,
models go back to their original weights after the block.
params (dict): A dictionary of parameters keyed by model ID.
**cfg: Config parameters.
EXAMPLE:
>>> with nlp.use_params(optimizer.averages):
>>> nlp.to_disk('/tmp/checkpoint')
"""
contexts = [pipe.use_params(params) for pipe
in self.pipeline if hasattr(pipe, 'use_params')]
# TODO: Having trouble with contextlib
@ -250,16 +300,20 @@ class Language(object):
pass
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
"""
Process texts as a stream, and yield Doc objects in order.
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
Supports GIL-free multi-threading.
texts (iterator): A sequence of texts to process.
n_threads (int): The number of worker threads to use. If -1, OpenMP will
decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
**disabled: Pipeline components to exclude.
YIELDS (Doc): Documents in the order of the original text.
Arguments:
texts (iterator)
tag (bool)
parse (bool)
entity (bool)
EXAMPLE:
>>> texts = [u'One document.', u'...', u'Lots of documents']
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
>>> assert doc.is_parsed
"""
#docs = (self.make_doc(text) for text in texts)
docs = texts
@ -267,7 +321,6 @@ class Language(object):
name = getattr(proc, 'name', None)
if name in disabled and not disabled[name]:
continue
if hasattr(proc, 'pipe'):
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
else:
@ -278,11 +331,12 @@ class Language(object):
def to_disk(self, path, **exclude):
"""Save the current state to a directory.
Args:
path: A path to a directory, which will be created if it doesn't
exist. Paths may be either strings or pathlib.Path-like
objects.
**exclude: Prevent named attributes from being saved.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
**exclude: Named attributes to prevent from being saved.
EXAMPLE:
>>> nlp.to_disk('/path/to/models')
"""
path = util.ensure_path(path)
if not path.exists():
@ -301,12 +355,17 @@ class Language(object):
dill.dump(props, file_)
def from_disk(self, path, **exclude):
"""Load the current state from a directory.
"""Loads state from a directory. Modifies the object in place and
returns it.
Args:
path: A path to a directory. Paths may be either strings or
pathlib.Path-like objects.
**exclude: Prevent named attributes from being saved.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Language): The modified `Language` object.
EXAMPLE:
>>> from spacy.language import Language
>>> nlp = Language().from_disk('/path/to/models')
"""
path = util.ensure_path(path)
for name in path.iterdir():
@ -320,10 +379,8 @@ class Language(object):
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
Args:
path: A path to a directory. Paths may be either strings or
pathlib.Path-like objects.
**exclude: Prevent named attributes from being serialized.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Language` object.
"""
props = dict(self.__dict__)
for key in exclude:
@ -334,13 +391,12 @@ class Language(object):
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
Args:
bytes_data (bytes): The data to load from.
**exclude: Prevent named attributes from being loaded.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Language): The `Language` object.
"""
props = dill.loads(bytes_data)
for key, value in props.items():
if key not in exclude:
setattr(self, key, value)
return self
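The two byte-serialization methods are intended to round-trip; a hedged sketch:
>>> data = nlp.to_bytes()                   # serialize everything not named in **exclude
>>> nlp2 = Language().from_bytes(data)      # restore into a fresh instance
>>> assert nlp2.meta == nlp.meta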

View File

@ -30,19 +30,16 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
cdef class Lexeme:
"""
An entry in the vocabulary. A Lexeme has no string context --- it's a
"""An entry in the vocabulary. A `Lexeme` has no string context it's a
word-type, as opposed to a word token. It therefore has no part-of-speech
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
tag).
"""
def __init__(self, Vocab vocab, int orth):
"""
Create a Lexeme object.
"""Create a Lexeme object.
Arguments:
vocab (Vocab): The parent vocabulary
orth (int): The orth id of the lexeme.
vocab (Vocab): The parent vocabulary
orth (int): The orth id of the lexeme.
RETURNS (Lexeme): The newly constructed object.
"""
self.vocab = vocab
@ -82,35 +79,28 @@ cdef class Lexeme:
return self.c.orth
def set_flag(self, attr_id_t flag_id, bint value):
"""
Change the value of a boolean flag.
"""Change the value of a boolean flag.
Arguments:
flag_id (int): The attribute ID of the flag to set.
value (bool): The new value of the flag.
flag_id (int): The attribute ID of the flag to set.
value (bool): The new value of the flag.
"""
Lexeme.c_set_flag(self.c, flag_id, value)
def check_flag(self, attr_id_t flag_id):
"""
Check the value of a boolean flag.
"""Check the value of a boolean flag.
Arguments:
flag_id (int): The attribute ID of the flag to query.
Returns (bool): The value of the flag.
flag_id (int): The attribute ID of the flag to query.
RETURNS (bool): The value of the flag.
"""
return True if Lexeme.c_check_flag(self.c, flag_id) else False
def similarity(self, other):
"""
Compute a semantic similarity estimate. Defaults to cosine over vectors.
"""Compute a semantic similarity estimate. Defaults to cosine over
vectors.
Arguments:
other:
The object to compare with. By default, accepts Doc, Span,
Token and Lexeme objects.
Returns:
score (float): A scalar similarity score. Higher is more similar.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
@ -140,6 +130,11 @@ cdef class Lexeme:
self.orth = self.c.orth
property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self):
cdef int i
for i in range(self.vocab.vectors_length):
@ -149,6 +144,10 @@ cdef class Lexeme:
return False
property vector_norm:
"""The L2 norm of the lexeme's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self):
return self.c.l2_norm
@ -156,6 +155,11 @@ cdef class Lexeme:
self.c.l2_norm = value
property vector:
"""A real-valued meaning representation.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the lexeme's semantics.
"""
def __get__(self):
cdef int length = self.vocab.vectors_length
if length == 0:
@ -196,6 +200,14 @@ cdef class Lexeme:
def __get__(self):
return self.vocab.strings[self.c.orth]
property text:
"""A unicode representation of the token text.
RETURNS (unicode): The original verbatim text of the token.
"""
def __get__(self):
return self.orth_
property lower:
def __get__(self): return self.c.lower
def __set__(self, int x): self.c.lower = x
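A short usage sketch for the flag and similarity APIs documented above, assuming an nlp object with word vectors loaded:
>>> from spacy.attrs import IS_ALPHA
>>> apple = nlp.vocab[u'apple']
>>> assert apple.check_flag(IS_ALPHA)
>>> apple.similarity(nlp.vocab[u'orange'])  # cosine over vectors; 0.0 if either norm is 0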

View File

@ -87,7 +87,7 @@ ctypedef TokenPatternC* TokenPatternC_ptr
ctypedef pair[int, TokenPatternC_ptr] StateC
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
object token_specs) except NULL:
pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
cdef int i
@ -99,15 +99,21 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
pattern[i].attrs[j].attr = attr
pattern[i].attrs[j].value = value
i = len(token_specs)
pattern[i].attrs = <AttrValueC*>mem.alloc(3, sizeof(AttrValueC))
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
pattern[i].attrs[0].attr = ID
pattern[i].attrs[0].value = entity_id
pattern[i].attrs[1].attr = ENT_TYPE
pattern[i].attrs[1].value = label
pattern[i].nr_attr = 0
return pattern
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
while pattern.nr_attr != 0:
pattern += 1
id_attr = pattern[0].attrs[0]
assert id_attr.attr == ID
return id_attr.value
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
for attr in pattern.attrs[:pattern.nr_attr]:
if get_token_attr(token, attr.attr) != attr.value:
@ -159,14 +165,14 @@ def _convert_strings(token_specs, string_store):
def merge_phrase(matcher, doc, i, matches):
'''Callback to merge a phrase on match'''
"""Callback to merge a phrase on match."""
ent_id, label, start, end = matches[i]
span = doc[start : end]
span.merge(ent_type=label, ent_id=ent_id)
cdef class Matcher:
'''Match sequences of tokens, based on pattern rules.'''
"""Match sequences of tokens, based on pattern rules."""
cdef Pool mem
cdef vector[TokenPatternC*] patterns
cdef readonly Vocab vocab
@ -175,37 +181,12 @@ cdef class Matcher:
cdef public object _callbacks
cdef public object _acceptors
@classmethod
def load(cls, path, vocab):
"""
Load the matcher and patterns from a file path.
def __init__(self, vocab):
"""Create the Matcher.
Arguments:
path (Path):
Path to a JSON-formatted patterns file.
vocab (Vocab):
The vocabulary that the documents to match over will refer to.
Returns:
Matcher: The newly constructed object.
"""
if (path / 'gazetteer.json').exists():
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
patterns = ujson.load(file_)
else:
patterns = {}
return cls(vocab, patterns)
def __init__(self, vocab, patterns={}):
"""
Create the Matcher.
Arguments:
vocab (Vocab):
The vocabulary object, which must be shared with the documents
the matcher will operate on.
patterns (dict): Patterns to add to the matcher.
Returns:
The newly constructed object.
vocab (Vocab): The vocabulary object, which must be shared with the
documents the matcher will operate on.
RETURNS (Matcher): The newly constructed object.
"""
self._patterns = {}
self._entities = {}
@ -213,144 +194,111 @@ cdef class Matcher:
self._callbacks = {}
self.vocab = vocab
self.mem = Pool()
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
self.add_entity(entity_key, attrs)
for spec in specs:
self.add_pattern(entity_key, spec, label=etype)
def __reduce__(self):
return (self.__class__, (self.vocab, self._patterns), None, None)
property n_patterns:
def __get__(self): return self.patterns.size()
def __len__(self):
"""Get the number of rules added to the matcher. Note that this only
returns the number of rules (identical with the number of IDs), not the
number of individual patterns.
def add_entity(self, entity_key, attrs=None, if_exists='raise',
acceptor=None, on_match=None):
RETURNS (int): The number of rules.
"""
Add an entity to the matcher.
return len(self._patterns)
Arguments:
entity_key (unicode or int):
An ID for the entity.
attrs:
Attributes to associate with the Matcher.
if_exists ('raise', 'ignore' or 'update'):
Controls what happens if the entity ID already exists. Defaults to 'raise'.
acceptor:
Callback function to filter matches of the entity.
on_match:
Callback function to act on matches of the entity.
Returns:
None
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
key (unicode): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
if if_exists not in ('raise', 'ignore', 'update'):
raise ValueError(
"Unexpected value for if_exists: %s.\n"
"Expected one of: ['raise', 'ignore', 'update']" % if_exists)
if attrs is None:
attrs = {}
entity_key = self.normalize_entity_key(entity_key)
if self.has_entity(entity_key):
if if_exists == 'raise':
raise KeyError(
"Tried to add entity %s. Entity exists, and if_exists='raise'.\n"
"Set if_exists='ignore' or if_exists='update', or check with "
"matcher.has_entity()")
elif if_exists == 'ignore':
return
self._entities[entity_key] = dict(attrs)
self._patterns.setdefault(entity_key, [])
self._acceptors[entity_key] = acceptor
self._callbacks[entity_key] = on_match
return len(self._patterns)
def add_pattern(self, entity_key, token_specs, label=""):
def add(self, key, on_match, *patterns):
"""Add a match-rule to the matcher.
A match-rule consists of: an ID key, an on_match callback, and one or
more patterns. If the key exists, the patterns are appended to the
previous ones, and the previous on_match callback is replaced. The
`on_match` callback will receive the arguments `(matcher, doc, i,
matches)`. You can also set `on_match` to `None` to not perform any
actions. A pattern consists of one or more `token_specs`, where a
`token_spec` is a dictionary mapping attribute IDs to values. Token
descriptors can also include quantifiers. There are currently important
known problems with the quantifiers; see the docs.
"""
Add a pattern to the matcher.
for pattern in patterns:
if len(pattern) == 0:
msg = ("Cannot add pattern for zero tokens to matcher.\n"
"key: {key}\n")
raise ValueError(msg.format(key=key))
key = self._normalize_key(key)
self._patterns.setdefault(key, [])
self._callbacks[key] = on_match
Arguments:
entity_key (unicode or int):
An ID for the entity.
token_specs:
Description of the pattern to be matched.
label:
Label to assign to the matched pattern. Defaults to "".
Returns:
None
for pattern in patterns:
specs = _convert_strings(pattern, self.vocab.strings)
self.patterns.push_back(init_pattern(self.mem, key, specs))
self._patterns[key].append(specs)
def remove(self, key):
"""Remove a rule from the matcher. A KeyError is raised if the key does
not exist.
key (unicode): The ID of the match rule.
"""
token_specs = list(token_specs)
if len(token_specs) == 0:
msg = ("Cannot add pattern for zero tokens to matcher.\n"
"entity_key: {entity_key}\n"
"label: {label}")
raise ValueError(msg.format(entity_key=entity_key, label=label))
entity_key = self.normalize_entity_key(entity_key)
if not self.has_entity(entity_key):
self.add_entity(entity_key)
if isinstance(label, basestring):
label = self.vocab.strings[label]
elif label is None:
label = 0
spec = _convert_strings(token_specs, self.vocab.strings)
key = self._normalize_key(key)
self._patterns.pop(key)
self._callbacks.pop(key)
cdef int i = 0
while i < self.patterns.size():
pattern_key = get_pattern_key(self.patterns.at(i))
if pattern_key == key:
self.patterns.erase(self.patterns.begin()+i)
else:
i += 1
self.patterns.push_back(init_pattern(self.mem, entity_key, label, spec))
self._patterns[entity_key].append((label, token_specs))
def has_key(self, key):
"""Check whether the matcher has a rule with a given key.
def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None):
self.add_entity(entity_key, attrs=attrs, if_exists='update',
acceptor=acceptor, on_match=on_match)
for spec in specs:
self.add_pattern(entity_key, spec, label=label)
def normalize_entity_key(self, entity_key):
if isinstance(entity_key, basestring):
return self.vocab.strings[entity_key]
else:
return entity_key
def has_entity(self, entity_key):
key (string or int): The key to check.
RETURNS (bool): Whether the matcher has the rule.
"""
Check whether the matcher has an entity.
key = self._normalize_key(key)
return key in self._patterns
Arguments:
entity_key (string or int): The entity key to check.
Returns:
bool: Whether the matcher has the entity.
"""
entity_key = self.normalize_entity_key(entity_key)
return entity_key in self._entities
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
def get_entity(self, entity_key):
key (unicode or int): The key to retrieve.
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
"""
Retrieve the attributes stored for an entity.
key = self._normalize_key(key)
if key not in self._patterns:
return default
return (self._callbacks[key], self._patterns[key])
Arguments:
entity_key (unicode or int): The entity to retrieve.
Returns:
The entity attributes if present, otherwise None.
"""
entity_key = self.normalize_entity_key(entity_key)
if entity_key in self._entities:
return self._entities[entity_key]
else:
return None
def pipe(self, docs, batch_size=1000, n_threads=2):
"""Match a stream of documents, yielding them in turn.
def __call__(self, Doc doc, acceptor=None):
docs (iterable): A stream of documents.
batch_size (int): The number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the `Matcher` implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
"""
Find all token sequences matching the supplied patterns on the Doc.
for doc in docs:
self(doc)
yield doc
Arguments:
doc (Doc):
The document to match over.
Returns:
list
A list of (entity_key, label_id, start, end) tuples,
describing the matches. A match tuple describes a span doc[start:end].
The label_id and entity_key are both integers.
def __call__(self, Doc doc):
"""Find all token sequences matching the supplied patterns on the `Doc`.
doc (Doc): The document to match over.
RETURNS (list): A list of `(key, label_id, start, end)` tuples,
describing the matches. A match tuple describes a span
`doc[start:end]`. The `label_id` and `key` are both integers.
"""
if acceptor is not None:
raise ValueError(
"acceptor keyword argument to Matcher deprecated. Specify acceptor "
"functions when you add patterns instead.")
cdef vector[StateC] partials
cdef int n_partials = 0
cdef int q = 0
@ -388,13 +336,7 @@ cdef class Matcher:
end = token_i+1
ent_id = state.second[1].attrs[0].value
label = state.second[1].attrs[1].value
acceptor = self._acceptors.get(ent_id)
if acceptor is None:
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
matches.append((ent_id, start, end))
partials.resize(q)
# Check whether we open any new patterns on this token
for pattern in self.patterns:
@ -419,13 +361,7 @@ cdef class Matcher:
end = token_i+1
ent_id = pattern[1].attrs[0].value
label = pattern[1].attrs[1].value
acceptor = self._acceptors.get(ent_id)
if acceptor is None:
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
matches.append((ent_id, start, end))
# Look for open patterns that are actually satisfied
for state in partials:
while state.second.quantifier in (ZERO, ZERO_PLUS):
@ -435,36 +371,19 @@ cdef class Matcher:
end = len(doc)
ent_id = state.second.attrs[0].value
label = state.second.attrs[0].value
acceptor = self._acceptors.get(ent_id)
if acceptor is None:
matches.append((ent_id, label, start, end))
else:
match = acceptor(doc, ent_id, label, start, end)
if match:
matches.append(match)
matches.append((ent_id, start, end))
for i, (ent_id, label, start, end) in enumerate(matches):
on_match = self._callbacks.get(ent_id)
if on_match is not None:
on_match(self, doc, i, matches)
# TODO: only return (match_id, start, end)
return matches
def pipe(self, docs, batch_size=1000, n_threads=2):
"""
Match a stream of documents, yielding them in turn.
Arguments:
docs: A stream of documents.
batch_size (int):
The number of documents to accumulate into a working set.
n_threads (int):
The number of threads with which to work on the buffer in parallel,
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
"""
for doc in docs:
self(doc)
yield doc
def _normalize_key(self, key):
if isinstance(key, basestring):
return self.vocab.strings[key]
else:
return key
def get_bilou(length):
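Tying the rewritten API together, a hedged usage sketch of the new Matcher (the token attribute names and the pattern itself are illustrative):
>>> from spacy.matcher import Matcher
>>> matcher = Matcher(nlp.vocab)
>>> matcher.add('HELLO_WORLD', None, [{'LOWER': 'hello'}, {'LOWER': 'world'}])
>>> doc = nlp(u'Hello world!')
>>> matches = matcher(doc)                  # match tuples describing spans doc[start:end]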

View File

@ -38,33 +38,71 @@ from .parts_of_speech import X
class TokenVectorEncoder(object):
'''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
name = 'tok2vec'
@classmethod
def Model(cls, width=128, embed_size=5000, **cfg):
"""Create a new statistical model for the class.
width (int): Output size of the model.
embed_size (int): Number of vectors in the embedding table.
**cfg: Config parameters.
RETURNS (Model): A `thinc.neural.Model` or similar instance.
"""
width = util.env_opt('token_vector_width', width)
embed_size = util.env_opt('embed_size', embed_size)
return Tok2Vec(width, embed_size, preprocess=None)
def __init__(self, vocab, model=True, **cfg):
"""Construct a new statistical model. Weights are not allocated on
initialisation.
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
instance with the `Doc` objects it will process.
model (Model): A `Model` instance or `True` to allocate one later.
**cfg: Config parameters.
EXAMPLE:
>>> from spacy.pipeline import TokenVectorEncoder
>>> tok2vec = TokenVectorEncoder(nlp.vocab)
>>> tok2vec.model = tok2vec.Model(128, 5000)
"""
self.vocab = vocab
self.doc2feats = doc2feats()
self.model = model
def __call__(self, docs):
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
model. Vectors are set to the `Doc.tensor` attribute.
docs (Doc or iterable): One or more documents to add vectors to.
RETURNS (dict or None): Intermediate computations.
"""
if isinstance(docs, Doc):
docs = [docs]
tokvecses = self.predict(docs)
self.set_annotations(docs, tokvecses)
def pipe(self, stream, batch_size=128, n_threads=-1):
"""Process `Doc` objects as a stream.
stream (iterator): A sequence of `Doc` objects to process.
batch_size (int): Number of `Doc` objects to group.
n_threads (int): Number of threads.
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
"""
for docs in cytoolz.partition_all(batch_size, stream):
tokvecses = self.predict(docs)
self.set_annotations(docs, tokvecses)
yield from docs
def predict(self, docs):
"""Return a single tensor for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the documents.
"""
feats = self.doc2feats(docs)
tokvecs = self.model(feats)
return tokvecs
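A hedged sketch of running the encoder over a stream, following the pipe and Model signatures above (the width and embedding size are the documented defaults):
>>> tok2vec = TokenVectorEncoder(nlp.vocab)
>>> tok2vec.model = tok2vec.Model(128, 5000)            # allocate weights explicitly
>>> for doc in tok2vec.pipe(nlp.tokenizer.pipe(texts), batch_size=128):
>>>     assert doc.tensor is not None                   # vectors written to Doc.tensor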
@ -73,7 +111,26 @@ class TokenVectorEncoder(object):
for doc, tokvecs in zip(docs, tokvecses):
doc.tensor = tokvecs
def begin_update(self, docs, drop=0.):
def set_annotations(self, docs, tokvecs):
"""Set the tensor attribute for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
tokvecs (object): Vector representation for each token in the documents.
"""
start = 0
for doc in docs:
doc.tensor = tokvecs[start : start + len(doc)]
start += len(doc)
def update(self, docs, golds, state=None, drop=0., sgd=None):
"""Update the model.
docs (iterable): A batch of `Doc` objects.
golds (iterable): A batch of `GoldParse` objects.
drop (float): The dropout rate.
sgd (callable): An optimizer.
RETURNS (dict): Results from the update.
"""
if isinstance(docs, Doc):
docs = [docs]
feats = self.doc2feats(docs)
@ -81,14 +138,26 @@ class TokenVectorEncoder(object):
return tokvecs, bp_tokvecs
def get_loss(self, docs, golds, scores):
# TODO: implement
raise NotImplementedError
def begin_training(self, gold_tuples, pipeline=None):
"""Allocate models, pre-process training data and acquire a trainer and
optimizer.
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
self.doc2feats = doc2feats()
if self.model is True:
self.model = self.Model()
def use_params(self, params):
"""Replace weights of models in the pipeline with those provided in the
params dictionary.
params (dict): A dictionary of parameters keyed by model ID.
"""
with self.model.use_params(params):
yield
@ -189,9 +258,7 @@ class NeuralTagger(object):
cdef class EntityRecognizer(LinearParser):
"""
Annotate named entities on Doc objects.
"""
"""Annotate named entities on Doc objects."""
TransitionSystem = BiluoPushDown
feature_templates = get_feature_templates('ner')
@ -203,9 +270,7 @@ cdef class EntityRecognizer(LinearParser):
cdef class BeamEntityRecognizer(BeamParser):
"""
Annotate named entities on Doc objects.
"""
"""Annotate named entities on Doc objects."""
TransitionSystem = BiluoPushDown
feature_templates = get_feature_templates('ner')

View File

@ -11,8 +11,6 @@ from preshed.maps cimport map_iter, key_t
from .typedefs cimport hash_t
from libc.stdint cimport uint32_t
import ujson
cpdef hash_t hash_string(unicode string) except 0:
chars = string.encode('utf8')
@ -72,15 +70,12 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
cdef class StringStore:
"""
Map strings to and from integer IDs.
"""
"""Map strings to and from integer IDs."""
def __init__(self, strings=None, freeze=False):
"""
Create the StringStore.
"""Create the StringStore.
Arguments:
strings: A sequence of unicode strings to add to the store.
strings (iterable): A sequence of unicode strings to add to the store.
RETURNS (StringStore): The newly constructed object.
"""
self.mem = Pool()
self._map = PreshMap()
@ -106,23 +101,17 @@ cdef class StringStore:
return (StringStore, (list(self),))
def __len__(self):
"""
The number of strings in the store.
"""The number of strings in the store.
Returns:
int The number of strings in the store.
RETURNS (int): The number of strings in the store.
"""
return self.size-1
def __getitem__(self, object string_or_id):
"""
Retrieve a string from a given integer ID, or vice versa.
"""Retrieve a string from a given integer ID, or vice versa.
Arguments:
string_or_id (bytes or unicode or int):
The value to encode.
Returns:
unicode or int: The value to retrieved.
string_or_id (bytes or unicode or int): The value to encode.
RETURNS (unicode or int): The value to be retrieved.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
return 0
@ -163,13 +152,10 @@ cdef class StringStore:
return utf8str - self.c
def __contains__(self, unicode string not None):
"""
Check whether a string is in the store.
"""Check whether a string is in the store.
Arguments:
string (unicode): The string to check.
Returns bool:
Whether the store contains the string.
string (unicode): The string to check.
RETURNS (bool): Whether the store contains the string.
"""
if len(string) == 0:
return True
@ -177,10 +163,9 @@ cdef class StringStore:
return self._map.get(key) is not NULL
def __iter__(self):
"""
Iterate over the strings in the store, in order.
"""Iterate over the strings in the store, in order.
Yields: unicode A string in the store.
YIELDS (unicode): A string in the store.
"""
cdef int i
for i in range(self.size):
@ -195,6 +180,41 @@ cdef class StringStore:
strings.append(py_string)
return (StringStore, (strings,), None, None, None)
def to_disk(self, path):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
"""
raise NotImplementedError()
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (StringStore): The modified `StringStore` object.
"""
raise NotImplementedError()
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `StringStore` object.
"""
raise NotImplementedError()
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (StringStore): The `StringStore` object.
"""
raise NotImplementedError()
def set_frozen(self, bint is_frozen):
# TODO
self.is_frozen = is_frozen
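The store maps in both directions, as the __getitem__ docstring above notes; a brief sketch:
>>> from spacy.strings import StringStore
>>> stringstore = StringStore([u'apple', u'orange'])
>>> apple_id = stringstore[u'apple']        # unicode -> integer ID
>>> assert stringstore[apple_id] == u'apple'
>>> assert u'apple' in stringstore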
@ -235,40 +255,6 @@ cdef class StringStore:
self.size += 1
return &self.c[self.size-1]
def dump(self, file_):
"""
Save the strings to a JSON file.
Arguments:
file_ (buffer): The file to save the strings.
Returns:
None
"""
string_data = ujson.dumps(list(self))
if not isinstance(string_data, unicode):
string_data = string_data.decode('utf8')
# TODO: OOV?
file_.write(string_data)
def load(self, file_):
"""
Load the strings from a JSON file.
Arguments:
file_ (buffer): The file from which to load the strings.
Returns:
None
"""
strings = ujson.load(file_)
if strings == ['']:
return None
cdef unicode string
for string in strings:
# explicit None/len check instead of simple truth testing
# (bug in Cython <= 0.23.4)
if string is not None and len(string):
self.intern_unicode(string)
def _realloc(self):
# We want to map straight to pointers, but they'll be invalidated if
# we resize our array. So, first we remap to indices, then we resize,

View File

@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import ujson
from collections import defaultdict
from cymem.cymem cimport Pool
@ -15,7 +14,6 @@ from .tokens.doc cimport Doc
from .attrs cimport TAG
from .gold cimport GoldParse
from .attrs cimport *
from . import util
cpdef enum:
@ -108,55 +106,15 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
cdef class Tagger:
"""
Annotate part-of-speech tags on Doc objects.
"""
@classmethod
def load(cls, path, vocab, require=False):
"""
Load the statistical model from the supplied path.
Arguments:
path (Path):
The path to load from.
vocab (Vocab):
The vocabulary. Must be shared by the documents to be processed.
require (bool):
Whether to raise an error if the files are not found.
Returns (Tagger):
The newly created object.
"""
# TODO: Change this to expect config.json when we don't have to
# support old data.
path = util.ensure_path(path)
if (path / 'templates.json').exists():
with (path / 'templates.json').open('r', encoding='utf8') as file_:
templates = ujson.load(file_)
elif require:
raise IOError(
"Required file %s/templates.json not found when loading Tagger" % str(path))
else:
templates = cls.feature_templates
self = cls(vocab, model=None, feature_templates=templates)
if (path / 'model').exists():
self.model.load(str(path / 'model'))
elif require:
raise IOError(
"Required file %s/model not found when loading Tagger" % str(path))
return self
"""Annotate part-of-speech tags on Doc objects."""
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
"""
Create a Tagger.
"""Create a Tagger.
Arguments:
vocab (Vocab):
The vocabulary object. Must be shared with documents to be processed.
model (thinc.linear.AveragedPerceptron):
The statistical model.
Returns (Tagger):
The newly constructed object.
vocab (Vocab): The vocabulary object. Must be shared with documents to
be processed.
model (thinc.linear.AveragedPerceptron): The statistical model.
RETURNS (Tagger): The newly constructed object.
"""
if model is None:
model = TaggerModel(cfg.get('features', self.feature_templates),
@ -186,13 +144,9 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length
def __call__(self, Doc tokens):
"""
Apply the tagger, setting the POS tags onto the Doc object.
"""Apply the tagger, setting the POS tags onto the Doc object.
Arguments:
doc (Doc): The tokens to be tagged.
Returns:
None
doc (Doc): The tokens to be tagged.
"""
if tokens.length == 0:
return 0
@ -215,34 +169,25 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length
def pipe(self, stream, batch_size=1000, n_threads=2):
"""
Tag a stream of documents.
"""Tag a stream of documents.
Arguments:
stream: The sequence of documents to tag.
batch_size (int):
The number of documents to accumulate into a working set.
n_threads (int):
The number of threads with which to work on the buffer in parallel,
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
stream: The sequence of documents to tag.
batch_size (int): The number of documents to accumulate into a working set.
n_threads (int): The number of threads with which to work on the buffer
in parallel, if the Matcher implementation supports multi-threading.
YIELDS (Doc): Documents, in order.
"""
for doc in stream:
self(doc)
yield doc
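A hedged sketch of the call patterns described above; the tagger must share its vocab with the documents it processes, and a freshly constructed model is of course untrained:
>>> tagger = Tagger(nlp.vocab)
>>> doc = nlp.tokenizer(u'A short sentence.')
>>> tagger(doc)                             # sets POS tags on the Doc in place
>>> docs = tagger.pipe(nlp.tokenizer.pipe(texts), batch_size=1000)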
def update(self, Doc tokens, GoldParse gold, itn=0):
"""
Update the statistical model, with tags supplied for the given document.
"""Update the statistical model, with tags supplied for the given document.
Arguments:
doc (Doc):
The document to update on.
gold (GoldParse):
Manager for the gold-standard tags.
Returns (int):
Number of tags correct.
doc (Doc): The document to update on.
gold (GoldParse): Manager for the gold-standard tags.
RETURNS (int): Number of tags predicted correctly.
"""
gold_tag_strs = gold.tags
assert len(tokens) == len(gold_tag_strs)

View File

@ -99,8 +99,8 @@ def test_doc_token_api_ancestors(en_tokenizer):
assert [t.text for t in doc[1].ancestors] == ["saw"]
assert [t.text for t in doc[2].ancestors] == []
assert doc[2].is_ancestor_of(doc[7])
assert not doc[6].is_ancestor_of(doc[2])
assert doc[2].is_ancestor(doc[7])
assert not doc[6].is_ancestor(doc[2])
def test_doc_token_api_head_setter(en_tokenizer):

View File

@ -2,8 +2,6 @@
# coding: utf8
from __future__ import unicode_literals
import ujson
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool
@ -12,75 +10,31 @@ from preshed.maps cimport PreshMap
from .strings cimport hash_string
cimport cython
from . import util
from .tokens.doc cimport Doc
cdef class Tokenizer:
"""Segment text, and create Doc objects with the discovered segment
boundaries.
"""
Segment text, and create Doc objects with the discovered segment boundaries.
"""
@classmethod
def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
infix_finditer=None, token_match=None):
"""
Load a Tokenizer, reading unsupplied components from the path.
Arguments:
path (Path):
The path to load from.
vocab (Vocab):
A storage container for lexical types.
rules (dict):
Exceptions and special-cases for the tokenizer.
token_match:
A boolean function matching strings that becomes tokens.
prefix_search:
Signature of re.compile(string).search
suffix_search:
Signature of re.compile(string).search
infix_finditer:
Signature of re.compile(string).finditer
Returns Tokenizer
"""
path = util.ensure_path(path)
if rules is None:
with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
rules = ujson.load(file_)
if prefix_search in (None, True):
with (path / 'tokenizer' / 'prefix.txt').open() as file_:
entries = file_.read().split('\n')
prefix_search = util.compile_prefix_regex(entries).search
if suffix_search in (None, True):
with (path / 'tokenizer' / 'suffix.txt').open() as file_:
entries = file_.read().split('\n')
suffix_search = util.compile_suffix_regex(entries).search
if infix_finditer in (None, True):
with (path / 'tokenizer' / 'infix.txt').open() as file_:
entries = file_.read().split('\n')
infix_finditer = util.compile_infix_regex(entries).finditer
return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
"""
Create a Tokenizer, to create Doc objects given unicode text.
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
Arguments:
vocab (Vocab):
A storage container for lexical types.
rules (dict):
Exceptions and special-cases for the tokenizer.
prefix_search:
A function matching the signature of re.compile(string).search
to match prefixes.
suffix_search:
A function matching the signature of re.compile(string).search
to match suffixes.
infix_finditer:
A function matching the signature of re.compile(string).finditer
to find infixes.
token_match:
A boolean function matching strings that becomes tokens.
vocab (Vocab): A storage container for lexical types.
rules (dict): Exceptions and special-cases for the tokenizer.
prefix_search (callable): A function matching the signature of
`re.compile(string).search` to match prefixes.
suffix_search (callable): A function matching the signature of
`re.compile(string).search` to match suffixes.
`infix_finditer` (callable): A function matching the signature of
`re.compile(string).finditer` to find infixes.
token_match (callable): A boolean function matching strings to be
recognised as tokens.
RETURNS (Tokenizer): The newly constructed object.
EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab)
>>> tokenizer = English().Defaults.create_tokenizer(nlp)
"""
self.mem = Pool()
self._cache = PreshMap()
@ -112,13 +66,10 @@ cdef class Tokenizer:
@cython.boundscheck(False)
def __call__(self, unicode string):
"""
Tokenize a string.
"""Tokenize a string.
Arguments:
string (unicode): The string to tokenize.
Returns:
Doc A container for linguistic annotations.
string (unicode): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations.
"""
if len(string) >= (2 ** 30):
raise ValueError(
@ -166,18 +117,13 @@ cdef class Tokenizer:
return tokens
def pipe(self, texts, batch_size=1000, n_threads=2):
"""
Tokenize a stream of texts.
"""Tokenize a stream of texts.
Arguments:
texts: A sequence of unicode texts.
batch_size (int):
The number of texts to accumulate in an internal buffer.
n_threads (int):
The number of threads to use, if the implementation supports
multi-threading. The default tokenizer is single-threaded.
Yields:
Doc A sequence of Doc objects, in order.
texts: A sequence of unicode texts.
batch_size (int): The number of texts to accumulate in an internal buffer.
n_threads (int): The number of threads to use, if the implementation
supports multi-threading. The default tokenizer is single-threaded.
YIELDS (Doc): A sequence of Doc objects, in order.
"""
for text in texts:
yield self(text)
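A small usage sketch of the streaming API (the `tokenizer` instance and the texts are assumed):
>>> texts = [u'One document.', u'Another document.']
>>> docs = list(tokenizer.pipe(texts, batch_size=50))
>>> assert len(docs) == 2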
@ -321,27 +267,23 @@ cdef class Tokenizer:
self._cache.set(key, cached)
def find_infix(self, unicode string):
"""
Find internal split points of the string, such as hyphens.
"""Find internal split points of the string, such as hyphens.
string (unicode): The string to segment.
Returns List[re.MatchObject]
A list of objects that have .start() and .end() methods, denoting the
placement of internal segment separators, e.g. hyphens.
RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
and `.end()` methods, denoting the placement of internal segment
separators, e.g. hyphens.
"""
if self.infix_finditer is None:
return 0
return list(self.infix_finditer(string))
def find_prefix(self, unicode string):
"""
Find the length of a prefix that should be segmented from the string,
"""Find the length of a prefix that should be segmented from the string,
or None if no prefix rules match.
Arguments:
string (unicode): The string to segment.
Returns (int or None): The length of the prefix if present, otherwise None.
string (unicode): The string to segment.
RETURNS (int): The length of the prefix if present, otherwise `None`.
"""
if self.prefix_search is None:
return 0
@ -349,13 +291,11 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0
def find_suffix(self, unicode string):
"""
Find the length of a suffix that should be segmented from the string,
"""Find the length of a suffix that should be segmented from the string,
or None if no suffix rules match.
Arguments:
string (unicode): The string to segment.
Returns (int or None): The length of the suffix if present, otherwise None.
string (unicode): The string to segment.
RETURNS (int): The length of the suffix if present, otherwise `None`.
"""
if self.suffix_search is None:
return 0
@ -363,23 +303,17 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0
def _load_special_tokenization(self, special_cases):
"""
Add special-case tokenization rules.
"""
"""Add special-case tokenization rules."""
for chunk, substrings in sorted(special_cases.items()):
self.add_special_case(chunk, substrings)
def add_special_case(self, unicode string, substrings):
"""
Add a special-case tokenization rule.
"""Add a special-case tokenization rule.
Arguments:
string (unicode): The string to specially tokenize.
token_attrs:
A sequence of dicts, where each dict describes a token and its
attributes. The ORTH fields of the attributes must exactly match
the string when they are concatenated.
Returns None
string (unicode): The string to specially tokenize.
token_attrs (iterable): A sequence of dicts, where each dict describes
a token and its attributes. The `ORTH` fields of the attributes must
exactly match the string when they are concatenated.
"""
substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
@ -390,3 +324,38 @@ cdef class Tokenizer:
self._specials.set(key, cached)
self._cache.set(key, cached)
self._rules[string] = substrings
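For illustration, a special-case rule along the lines described above (the `tokenizer` instance is assumed; note the `ORTH` values concatenate back to the original string):
>>> from spacy.attrs import ORTH, LEMMA
>>> tokenizer.add_special_case(u'gimme',
...     [{ORTH: u'gim', LEMMA: u'give'}, {ORTH: u'me'}])
>>> assert [t.text for t in tokenizer(u'gimme that')] == [u'gim', u'me', u'that']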
def to_disk(self, path):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
"""
raise NotImplementedError()
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (Tokenizer): The modified `Tokenizer` object.
"""
raise NotImplementedError()
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Tokenizer` object.
"""
raise NotImplementedError()
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Tokenizer): The `Tokenizer` object.
"""
raise NotImplementedError()

View File

@ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
cdef class Doc:
"""
A sequence of `Token` objects. Access sentences and named entities,
export annotations to numpy arrays, losslessly serialize to compressed
binary strings.
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary strings.
The `Doc` object holds an array of `TokenC` structs. The Python-level
`Token` and `Span` objects are views of this array, i.e. they don't own
the data themselves.
Aside: Internals
The `Doc` object holds an array of `TokenC` structs.
The Python-level `Token` and `Span` objects are views of this
array, i.e. they don't own the data themselves.
Code: Construction 1
doc = nlp.tokenizer(u'Some text')
Code: Construction 2
doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)])
EXAMPLE: Construction 1
>>> doc = nlp(u'Some text')
Construction 2
>>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
"""
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
"""
Create a Doc object.
"""Create a Doc object.
Arguments:
vocab:
A Vocabulary object, which must match any models you want to
use (e.g. tokenizer, parser, entity recognizer).
words:
A list of unicode strings to add to the document as words. If None,
defaults to empty list.
spaces:
A list of boolean values, of the same length as words. True
means that the word is followed by a space, False means it is not.
If None, defaults to [True]*len(words)
vocab (Vocab): A vocabulary object, which must match any models you want
to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings to add to the document
as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`
RETURNS (Doc): The newly constructed object.
"""
self.vocab = vocab
size = 20
@ -158,20 +148,26 @@ cdef class Doc:
self.is_parsed = True
def __getitem__(self, object i):
"""
doc[i]
Get the Token object at position i, where i is an integer.
"""Get a `Token` or `Span` object.
i (int or tuple): The index of the token, or the slice of the document to get.
RETURNS (Token or Span): The token at `doc[i]`, or the span at
`doc[start : end]`.
EXAMPLE:
>>> doc[i]
Get the `Token` object at position `i`, where `i` is an integer.
Negative indexing is supported, and follows the usual Python
semantics, i.e. doc[-2] is doc[len(doc) - 2].
doc[start : end]]
Get a `Span` object, starting at position `start`
and ending at position `end`, where `start` and
`end` are token indices. For instance,
`doc[2:5]` produces a span consisting of
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
are not supported, as `Span` objects must be contiguous (cannot have gaps).
You can use negative indices and open-ended ranges, which have their
normal Python semantics.
semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.
>>> doc[start : end]
Get a `Span` object, starting at position `start` and ending at
position `end`, where `start` and `end` are token indices. For
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
Stepped slices (e.g. `doc[start : end : step]`) are not supported,
as `Span` objects must be contiguous (cannot have gaps). You can use
negative indices and open-ended ranges, which have their normal
Python semantics.
"""
if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
@ -186,14 +182,14 @@ cdef class Doc:
return Token.cinit(self.vocab, &self.c[i], i, self)
def __iter__(self):
"""
for token in doc
Iterate over `Token` objects, from which the annotations can
be easily accessed. This is the main way of accessing Token
objects, which are the main way annotations are accessed from
Python. If faster-than-Python speeds are required, you can
instead access the annotations as a numpy array, or access the
underlying C data directly from Cython.
"""Iterate over `Token` objects, from which the annotations can be
easily accessed. This is the main way of accessing `Token` objects,
which are the main way annotations are accessed from Python. If faster-
than-Python speeds are required, you can instead access the annotations
as a numpy array, or access the underlying C data directly from Cython.
EXAMPLE:
>>> for token in doc
"""
cdef int i
for i in range(self.length):
@ -203,9 +199,12 @@ cdef class Doc:
yield Token.cinit(self.vocab, &self.c[i], i, self)
def __len__(self):
"""
len(doc)
The number of tokens in the document.
"""The number of tokens in the document.
RETURNS (int): The number of tokens in the document.
EXAMPLE:
>>> len(doc)
"""
return self.length
@ -228,16 +227,12 @@ cdef class Doc:
return self
def similarity(self, other):
"""
Make a semantic similarity estimate. The default estimate is cosine
"""Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
Arguments:
other (object): The object to compare with. By default, accepts Doc,
Span, Token and Lexeme objects.
Return:
score (float): A scalar similarity score. Higher is more similar.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
if 'similarity' in self.user_hooks:
return self.user_hooks['similarity'](self, other)
@ -246,8 +241,10 @@ cdef class Doc:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
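A usage sketch (assumes an `nlp` pipeline whose vocabulary has word vectors loaded; without vectors the score is not meaningful):
>>> apples = nlp(u'I like apples')
>>> oranges = nlp(u'I like oranges')
>>> score = apples.similarity(oranges)  # scalar; higher is more similar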
property has_vector:
"""
A boolean value indicating whether a word vector is associated with the object.
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self):
if 'has_vector' in self.user_hooks:
@ -256,10 +253,11 @@ cdef class Doc:
return any(token.has_vector for token in self)
property vector:
"""
A real-valued meaning representation. Defaults to an average of the token vectors.
"""A real-valued meaning representation. Defaults to an average of the
token vectors.
Type: numpy.ndarray[ndim=1, dtype='float32']
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the document's semantics.
"""
def __get__(self):
if 'vector' in self.user_hooks:
@ -275,6 +273,10 @@ cdef class Doc:
self._vector = value
property vector_norm:
"""The L2 norm of the document's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self):
if 'vector_norm' in self.user_hooks:
return self.user_hooks['vector_norm'](self)
@ -295,34 +297,37 @@ cdef class Doc:
return self.text
property text:
"""
A unicode representation of the document text.
"""A unicode representation of the document text.
RETURNS (unicode): The original verbatim text of the document.
"""
def __get__(self):
return u''.join(t.text_with_ws for t in self)
property text_with_ws:
"""
An alias of Doc.text, provided for duck-type compatibility with Span and Token.
"""An alias of `Doc.text`, provided for duck-type compatibility with
`Span` and `Token`.
RETURNS (unicode): The original verbatim text of the document.
"""
def __get__(self):
return self.text
property ents:
"""
Yields named-entity `Span` objects, if the entity recognizer
has been applied to the document. Iterate over the span to get
individual Token objects, or access the label:
"""Iterate over the entities in the document. Yields named-entity `Span`
objects, if the entity recognizer has been applied to the document.
Example:
from spacy.en import English
nlp = English()
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)
assert ents[0].label == 346
assert ents[0].label_ == 'PERSON'
assert ents[0].orth_ == 'Best'
assert ents[0].text == 'Mr. Best'
YIELDS (Span): Entities in the document.
EXAMPLE: Iterate over the span to get individual Token objects, or access
the label:
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
>>> ents = list(tokens.ents)
>>> assert ents[0].label == 346
>>> assert ents[0].label_ == 'PERSON'
>>> assert ents[0].orth_ == 'Best'
>>> assert ents[0].text == 'Mr. Best'
"""
def __get__(self):
cdef int i
@ -387,12 +392,13 @@ cdef class Doc:
self.c[start].ent_iob = 3
property noun_chunks:
"""
Yields base noun-phrase #[code Span] objects, if the document
has been syntactically parsed. A base noun phrase, or
'NP chunk', is a noun phrase that does not permit other NPs to
be nested within it so no NP-level coordination, no prepositional
phrases, and no relative clauses.
"""Iterate over the base noun phrases in the document. Yields base
noun-phrase `Span` objects, if the document has been syntactically
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
not permit other NPs to be nested within it, so no NP-level
coordination, no prepositional phrases, and no relative clauses.
YIELDS (Span): Noun chunks in the document.
"""
def __get__(self):
if not self.is_parsed:
@ -411,17 +417,15 @@ cdef class Doc:
yield span
property sents:
"""
Yields sentence `Span` objects. Sentence spans have no label.
To improve accuracy on informal texts, spaCy calculates sentence
boundaries from the syntactic dependency parse. If the parser is disabled,
`sents` iterator will be unavailable.
"""Iterate over the sentences in the document. Yields sentence `Span`
objects. Sentence spans have no label. To improve accuracy on informal
texts, spaCy calculates sentence boundaries from the syntactic
dependency parse. If the parser is disabled, the `sents` iterator will
be unavailable.
Example:
from spacy.en import English
nlp = English()
doc = nlp("This is a sentence. Here's another...")
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
EXAMPLE:
>>> doc = nlp("This is a sentence. Here's another...")
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
"""
def __get__(self):
if 'sents' in self.user_hooks:
@ -467,24 +471,20 @@ cdef class Doc:
@cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids):
"""
Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape (N, M), where `N` is the length
of the document. The values will be 32-bit integers.
"""Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
The values will be 32-bit integers.
Example:
from spacy import attrs
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
attr_ids (list[int]): A list of attribute ID ints.
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
per word, and one column per attribute indicated in the input
`attr_ids`.
Arguments:
attr_ids (list[int]): A list of attribute ID ints.
Returns:
feat_array (numpy.ndarray[long, ndim=2]):
A feature matrix, with one row per word, and one column per attribute
indicated in the input attr_ids.
EXAMPLE:
>>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
>>> doc = nlp(text)
>>> # All strings mapped to integers, for easy export to numpy
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
"""
cdef int i, j
cdef attr_id_t feature
@ -499,27 +499,20 @@ cdef class Doc:
return output
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
"""
Produce a dict of {attribute (int): count (ints)} frequencies, keyed
by the values of the given attribute ID.
"""Count the frequencies of a given attribute. Produces a dict of
`{attribute (int): count (ints)}` frequencies, keyed by the values of
the given attribute ID.
Example:
from spacy.en import English
from spacy import attrs
nlp = English()
tokens = nlp(u'apple apple orange banana')
tokens.count_by(attrs.ORTH)
# {12800L: 1, 11880L: 2, 7561L: 1}
tokens.to_array([attrs.ORTH])
# array([[11880],
# [11880],
# [ 7561],
# [12800]])
attr_id (int): The attribute ID to key the counts.
RETURNS (dict): A dictionary mapping attributes to integer counts.
Arguments:
attr_id
int
The attribute ID to key the counts.
EXAMPLE:
>>> from spacy import attrs
>>> doc = nlp(u'apple apple orange banana')
>>> doc.count_by(attrs.ORTH)
{12800L: 1, 11880L: 2, 7561L: 1}
>>> doc.to_array([attrs.ORTH])
array([[11880], [11880], [7561], [12800]])
"""
cdef int i
cdef attr_t attr
@ -567,8 +560,12 @@ cdef class Doc:
self.c[i] = parsed[i]
def from_array(self, attrs, int[:, :] array):
"""
Write to a `Doc` object, from an `(M, N)` array of attributes.
"""Load attributes from a numpy array. Write to a `Doc` object, from an
`(M, N)` array of attributes.
attrs (ints): A list of attribute ID ints.
array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values to load.
RETURNS (Doc): Itself.
"""
cdef int i, col
cdef attr_id_t attr_id
@ -597,8 +594,10 @@ cdef class Doc:
return self
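A round-trip sketch using `to_array` with the same attribute IDs, mirroring the pattern used by `parse_tree` below (the `nlp` object and a parsed `doc` are assumed):
>>> from spacy.tokens import Doc
>>> from spacy.attrs import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
>>> attr_ids = [HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]
>>> array = doc.to_array(attr_ids)
>>> doc2 = Doc(doc.vocab, words=[t.text for t in doc])
>>> doc2.from_array(attr_ids, array)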
def to_bytes(self):
"""
Serialize, producing a byte string.
"""Serialize, i.e. export the document contents to a binary string.
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
"""
return dill.dumps(
(self.text,
@ -611,8 +610,10 @@ cdef class Doc:
protocol=-1)
def from_bytes(self, data):
"""
Deserialize, loading from bytes.
"""Deserialize, i.e. import the document contents from a binary string.
data (bytes): The string to load from.
RETURNS (Doc): Itself.
"""
if self.length != 0:
raise ValueError("Cannot load into non-empty Doc")
@ -640,21 +641,16 @@ cdef class Doc:
return self
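A minimal serialization round trip (the `doc` is assumed; `from_bytes` requires an empty `Doc` created with the same vocab):
>>> from spacy.tokens import Doc
>>> data = doc.to_bytes()
>>> doc2 = Doc(doc.vocab)
>>> doc2.from_bytes(data)
>>> assert doc2.text == doc.text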
def merge(self, int start_idx, int end_idx, *args, **attributes):
"""
Retokenize the document, such that the span at doc.text[start_idx : end_idx]
is merged into a single token. If start_idx and end_idx do not mark start
and end token boundaries, the document remains unchanged.
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
is merged into a single token. If `start_idx` and `end_idx` do not mark
start and end token boundaries, the document remains unchanged.
Arguments:
start_idx (int): The character index of the start of the slice to merge.
end_idx (int): The character index after the end of the slice to merge.
**attributes:
Attributes to assign to the merged token. By default, attributes
are inherited from the syntactic root token of the span.
Returns:
token (Token):
The newly merged token, or None if the start and end indices did
not fall at token boundaries.
start_idx (int): The character index of the start of the slice to merge.
end_idx (int): The character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries.
"""
cdef unicode tag, lemma, ent_type
if len(args) == 3:
@ -758,7 +754,29 @@ cdef class Doc:
return self[start]
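A merge sketch using character offsets, as described above (the `nlp` object is assumed and the attribute values are illustrative):
>>> doc = nlp(u'Los Angeles start.')
>>> doc.merge(0, len(u'Los Angeles'), tag=u'NNP',
...           lemma=u'Los Angeles', ent_type=u'GPE')
>>> assert doc[0].text == u'Los Angeles'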
def print_tree(self, light=False, flat=False):
"""Returns the parse trees in the JSON (Dict) format."""
"""Returns the parse trees in JSON (dict) format.
light (bool): Don't include lemmas or entities.
flat (bool): Don't include arcs or modifiers.
RETURNS (dict): Parse tree as dict.
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
return parse_tree(self, light=light, flat=flat)

View File

@ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
def merge_ents(doc):
"""
Helper: merge adjacent entities into single tokens; modifies the doc.
"""
"""Helper: merge adjacent entities into single tokens; modifies the doc."""
for ent in doc.ents:
ent.merge(ent.root.tag_, ent.text, ent.label_)
return doc
def format_POS(token, light, flat):
"""
Helper: form the POS output for a token.
"""
"""Helper: form the POS output for a token."""
subtree = dict([
("word", token.text),
("lemma", token.lemma_), # trigger
@ -37,9 +33,8 @@ def format_POS(token, light, flat):
def POS_tree(root, light=False, flat=False):
"""
Helper: generate a POS tree for a root token. The doc must have
merge_ents(doc) ran on it.
"""Helper: generate a POS tree for a root token. The doc must have
`merge_ents(doc)` run on it.
"""
subtree = format_POS(root, light=light, flat=flat)
for c in root.children:
@ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False):
def parse_tree(doc, light=False, flat=False):
"""
Makes a copy of the doc, then construct a syntactic parse tree, similar to
"""Makes a copy of the doc, then construct a syntactic parse tree, similar to
the one used in displaCy. Generates the POS tree for all sentences in a doc.
Args:
doc: The doc for parsing.
doc (Doc): The doc for parsing.
RETURNS (dict): The parse tree.
Returns:
[parse_trees (Dict)]:
>>> from spacy.en import English
>>> nlp = English()
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
[{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],

View File

@ -20,22 +20,17 @@ from .. import about
cdef class Span:
"""
A slice from a Doc object.
"""
"""A slice from a Doc object."""
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
vector_norm=None):
"""
Create a Span object from the slice doc[start : end]
"""Create a `Span` object from the slice `doc[start : end]`.
Arguments:
doc (Doc): The parent document.
start (int): The index of the first token of the span.
end (int): The index of the first token after the span.
label (int): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
Returns:
Span The newly constructed object.
doc (Doc): The parent document.
start (int): The index of the first token of the span.
end (int): The index of the first token after the span.
label (int): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
RETURNS (Span): The newly constructed object.
"""
if not (0 <= start <= end <= len(doc)):
raise IndexError
@ -70,8 +65,11 @@ cdef class Span:
def __hash__(self):
return hash((self.doc, self.label, self.start_char, self.end_char))
def __len__(self):
"""Get the number of tokens in the span.
RETURNS (int): The number of tokens in the span.
"""
self._recalculate_indices()
if self.end < self.start:
return 0
@ -83,6 +81,16 @@ cdef class Span:
return self.text.encode('utf-8')
def __getitem__(self, object i):
"""Get a `Token` or a `Span` object
i (int or tuple): The index of the token within the span, or slice of
the span to get.
RETURNS (Token or Span): The token at `span[i]`.
EXAMPLE:
>>> span[0]
>>> span[1:3]
"""
self._recalculate_indices()
if isinstance(i, slice):
start, end = normalize_slice(len(self), i.start, i.stop, i.step)
@ -94,35 +102,31 @@ cdef class Span:
return self.doc[self.start + i]
def __iter__(self):
"""Iterate over `Token` objects.
YIELDS (Token): A `Token` object.
"""
self._recalculate_indices()
for i in range(self.start, self.end):
yield self.doc[i]
def merge(self, *args, **attributes):
"""
Retokenize the document, such that the span is merged into a single token.
"""Retokenize the document, such that the span is merged into a single
token.
Arguments:
**attributes:
Attributes to assign to the merged token. By default, attributes
are inherited from the syntactic root token of the span.
Returns:
token (Token):
The newly merged token.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token.
"""
return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
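A span-level merge sketch (the `nlp` object is assumed; the attributes are illustrative):
>>> doc = nlp(u'I like New York in Autumn.')
>>> span = doc[2:4]
>>> span.merge(tag=u'NNP', lemma=u'New York', ent_type=u'GPE')
>>> assert doc[2].text == u'New York'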
def similarity(self, other):
"""
Make a semantic similarity estimate. The default estimate is cosine
"""Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
Arguments:
other (object): The object to compare with. By default, accepts Doc,
Span, Token and Lexeme objects.
Return:
score (float): A scalar similarity score. Higher is more similar.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
if 'similarity' in self.doc.user_span_hooks:
self.doc.user_span_hooks['similarity'](self, other)
@ -145,11 +149,9 @@ cdef class Span:
self.end = end + 1
property sent:
"""
The sentence span that this span is a part of.
"""The sentence span that this span is a part of.
Returns:
Span The sentence this is part of.
RETURNS (Span): The sentence span that the span is a part of.
"""
def __get__(self):
if 'sent' in self.doc.user_span_hooks:
@ -166,12 +168,23 @@ cdef class Span:
return self.doc[root.l_edge : root.r_edge + 1]
property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self):
if 'has_vector' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['has_vector'](self)
return any(token.has_vector for token in self)
property vector:
"""A real-valued meaning representation. Defaults to an average of the
token vectors.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the span's semantics.
"""
def __get__(self):
if 'vector' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['vector'](self)
@ -180,6 +193,10 @@ cdef class Span:
return self._vector
property vector_norm:
"""The L2 norm of the document's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self):
if 'vector_norm' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['vector'](self)
@ -193,6 +210,7 @@ cdef class Span:
return self._vector_norm
property sentiment:
# TODO: docstring
def __get__(self):
if 'sentiment' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sentiment'](self)
@ -200,6 +218,10 @@ cdef class Span:
return sum([token.sentiment for token in self]) / len(self)
property text:
"""A unicode representation of the span text.
RETURNS (unicode): The original verbatim text of the span.
"""
def __get__(self):
text = self.text_with_ws
if self[-1].whitespace_:
@ -207,16 +229,21 @@ cdef class Span:
return text
property text_with_ws:
"""The text content of the span with a trailing whitespace character if
the last token has one.
RETURNS (unicode): The text content of the span (with trailing whitespace).
"""
def __get__(self):
return u''.join([t.text_with_ws for t in self])
property noun_chunks:
"""
Yields base noun-phrase #[code Span] objects, if the document
has been syntactically parsed. A base noun phrase, or
'NP chunk', is a noun phrase that does not permit other NPs to
be nested within it so no NP-level coordination, no prepositional
phrases, and no relative clauses. For example:
"""Yields base noun-phrase `Span` objects, if the document has been
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
phrase that does not permit other NPs to be nested within it, so no
NP-level coordination, no prepositional phrases, and no relative clauses.
YIELDS (Span): Base noun-phrase `Span` objects.
"""
def __get__(self):
if not self.doc.is_parsed:
@ -235,49 +262,47 @@ cdef class Span:
yield span
property root:
"""
The token within the span that's highest in the parse tree. If there's a
tie, the earlist is prefered.
"""The token within the span that's highest in the parse tree.
If there's a tie, the earliest is preferred.
Returns:
Token: The root token.
RETURNS (Token): The root token.
i.e. has the shortest path to the root of the sentence (or is the root
itself). If multiple words are equally high in the tree, the first word
is taken. For example:
EXAMPLE: The root token has the shortest path to the root of the sentence
(or is the root itself). If multiple words are equally high in the
tree, the first word is taken. For example:
>>> toks = nlp(u'I like New York in Autumn.')
>>> toks = nlp(u'I like New York in Autumn.')
Let's name the indices --- easier than writing "toks[4]" etc.
Let's name the indices (easier than writing `toks[4]` etc.)
>>> i, like, new, york, in_, autumn, dot = range(len(toks))
>>> i, like, new, york, in_, autumn, dot = range(len(toks))
The head of 'new' is 'York', and the head of 'York' is 'like'
The head of 'new' is 'York', and the head of "York" is "like"
>>> toks[new].head.orth_
'York'
>>> toks[york].head.orth_
'like'
>>> toks[new].head.text
'York'
>>> toks[york].head.text
'like'
Create a span for "New York". Its root is "York".
Create a span for "New York". Its root is "York".
>>> new_york = toks[new:york+1]
>>> new_york.root.orth_
'York'
>>> new_york = toks[new:york+1]
>>> new_york.root.text
'York'
Here's a more complicated case, raise by Issue #214
Here's a more complicated case, raised by issue #214:
>>> toks = nlp(u'to, north and south carolina')
>>> to, north, and_, south, carolina = toks
>>> south.head.text, carolina.head.text
('north', 'to')
>>> toks = nlp(u'to, north and south carolina')
>>> to, north, and_, south, carolina = toks
>>> south.head.text, carolina.head.text
('north', 'to')
Here 'south' is a child of 'north', which is a child of 'carolina'.
Carolina is the root of the span:
Here "south" is a child of "north", which is a child of "carolina".
Carolina is the root of the span:
>>> south_carolina = toks[-2:]
>>> south_carolina.root.text
'carolina'
>>> south_carolina = toks[-2:]
>>> south_carolina.root.text
'carolina'
"""
def __get__(self):
self._recalculate_indices()
@ -314,10 +339,10 @@ cdef class Span:
return self.doc[root]
property lefts:
"""
Tokens that are to the left of the span, whose head is within the Span.
""" Tokens that are to the left of the span, whose head is within the
`Span`.
Yields: Token A left-child of a token of the span.
YIELDS (Token): A left-child of a token of the span.
"""
def __get__(self):
for token in reversed(self): # Reverse, so we get the tokens in order
@ -326,10 +351,10 @@ cdef class Span:
yield left
property rights:
"""
Tokens that are to the right of the Span, whose head is within the Span.
"""Tokens that are to the right of the Span, whose head is within the
`Span`.
Yields: Token A right-child of a token of the span.
YIELDS (Token): A right-child of a token of the span.
"""
def __get__(self):
for token in self:
@ -338,10 +363,9 @@ cdef class Span:
yield right
property subtree:
"""
Tokens that descend from tokens in the span, but fall outside it.
"""Tokens that descend from tokens in the span, but fall outside it.
Yields: Token A descendant of a token within the span.
YIELDS (Token): A descendant of a token within the span.
"""
def __get__(self):
for word in self.lefts:
@ -351,8 +375,9 @@ cdef class Span:
yield from word.subtree
property ent_id:
"""
An (integer) entity ID. Usually assigned by patterns in the Matcher.
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
RETURNS (int): The entity ID.
"""
def __get__(self):
return self.root.ent_id
@ -362,9 +387,11 @@ cdef class Span:
raise NotImplementedError(
"Can't yet set ent_id from Span. Vote for this feature on the issue "
"tracker: http://github.com/explosion/spaCy/issues")
property ent_id_:
"""
A (string) entity ID. Usually assigned by patterns in the Matcher.
"""A (string) entity ID. Usually assigned by patterns in the `Matcher`.
RETURNS (unicode): The entity ID.
"""
def __get__(self):
return self.root.ent_id_
@ -376,26 +403,38 @@ cdef class Span:
"tracker: http://github.com/explosion/spaCy/issues")
property orth_:
# TODO: docstring
def __get__(self):
return ''.join([t.string for t in self]).strip()
property lemma_:
"""The span's lemma.
RETURNS (unicode): The span's lemma.
"""
def __get__(self):
return ' '.join([t.lemma_ for t in self]).strip()
property upper_:
# TODO: docstring
def __get__(self):
return ''.join([t.string.upper() for t in self]).strip()
property lower_:
# TODO: docstring
def __get__(self):
return ''.join([t.string.lower() for t in self]).strip()
property string:
# TODO: docstring
def __get__(self):
return ''.join([t.string for t in self])
property label_:
"""The span's label.
RETURNS (unicode): The span's label.
"""
def __get__(self):
return self.doc.vocab.strings[self.label]

View File

@ -23,10 +23,14 @@ from .. import about
cdef class Token:
"""
An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
"""
"""An individual token i.e. a word, punctuation symbol, whitespace, etc."""
def __cinit__(self, Vocab vocab, Doc doc, int offset):
"""Construct a `Token` object.
vocab (Vocab): A storage container for lexical types.
doc (Doc): The parent document.
offset (int): The index of the token within the document.
"""
self.vocab = vocab
self.doc = doc
self.c = &self.doc.c[offset]
@ -36,8 +40,9 @@ cdef class Token:
return hash((self.doc, self.i))
def __len__(self):
"""
Number of unicode characters in token.text.
"""The number of unicode characters in the token, i.e. `token.text`.
RETURNS (int): The number of unicode characters in the token.
"""
return self.c.lex.length
@ -75,37 +80,35 @@ cdef class Token:
raise ValueError(op)
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
"""
Check the value of a boolean flag.
"""Check the value of a boolean flag.
Arguments:
flag_id (int): The ID of the flag attribute.
Returns:
is_set (bool): Whether the flag is set.
flag_id (int): The ID of the flag attribute.
RETURNS (bool): Whether the flag is set.
EXAMPLE:
>>> from spacy.attrs import IS_TITLE
>>> doc = nlp(u'Give it back! He pleaded.')
>>> token = doc[0]
>>> token.check_flag(IS_TITLE)
True
"""
return Lexeme.c_check_flag(self.c.lex, flag_id)
def nbor(self, int i=1):
"""
Get a neighboring token.
"""Get a neighboring token.
Arguments:
i (int): The relative position of the token to get. Defaults to 1.
Returns:
neighbor (Token): The token at position self.doc[self.i+i]
i (int): The relative position of the token to get. Defaults to 1.
RETURNS (Token): The token at position `self.doc[self.i+i]`.
"""
return self.doc[self.i+i]
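For example (the `nlp` object is assumed):
>>> doc = nlp(u'Give it back! He pleaded.')
>>> assert doc[0].nbor().text == u'it'
>>> assert doc[0].nbor(2).text == u'back'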
def similarity(self, other):
"""
Compute a semantic similarity estimate. Defaults to cosine over vectors.
"""Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
Arguments:
other:
The object to compare with. By default, accepts Doc, Span,
Token and Lexeme objects.
Returns:
score (float): A scalar similarity score. Higher is more similar.
other (object): The object to compare with. By default, accepts `Doc`,
`Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.
"""
if 'similarity' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['similarity'](self)
@ -114,10 +117,14 @@ cdef class Token:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
property lex_id:
"""ID of the token's lexical type.
RETURNS (int): ID of the token's lexical type."""
def __get__(self):
return self.c.lex.id
property rank:
# TODO: add docstring
def __get__(self):
return self.c.lex.id
@ -126,10 +133,19 @@ cdef class Token:
return self.text_with_ws
property text:
"""A unicode representation of the token text.
RETURNS (unicode): The original verbatim text of the token.
"""
def __get__(self):
return self.orth_
property text_with_ws:
"""The text content of the token with a trailing whitespace character if
it has one.
RETURNS (unicode): The text content of the token (with trailing whitespace).
"""
def __get__(self):
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
if self.c.spacy:
@ -184,6 +200,10 @@ cdef class Token:
return self.c.lex.suffix
property lemma:
"""Base form of the word, with no inflectional suffixes.
RETURNS (int): Token lemma.
"""
def __get__(self):
return self.c.lemma
def __set__(self, int lemma):
@ -206,8 +226,10 @@ cdef class Token:
self.c.dep = label
property has_vector:
"""
A boolean value indicating whether a word vector is associated with the object.
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self):
if 'has_vector' in self.doc.user_token_hooks:
@ -220,10 +242,10 @@ cdef class Token:
return False
property vector:
"""
A real-valued meaning representation.
"""A real-valued meaning representation.
Type: numpy.ndarray[ndim=1, dtype='float32']
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the token's semantics.
"""
def __get__(self):
if 'vector' in self.doc.user_token_hooks:
@ -239,15 +261,11 @@ cdef class Token:
vector_view = <float[:length,]>self.c.lex.vector
return numpy.asarray(vector_view)
property repvec:
def __get__(self):
raise AttributeError("repvec was renamed to vector in v0.100")
property has_repvec:
def __get__(self):
raise AttributeError("has_repvec was renamed to has_vector in v0.100")
property vector_norm:
"""The L2 norm of the token's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self):
if 'vector_norm' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector_norm'](self)
@ -324,28 +342,26 @@ cdef class Token:
yield from word.subtree
property left_edge:
"""
The leftmost token of this token's syntactic descendents.
"""The leftmost token of this token's syntactic descendents.
Returns: Token The first token such that self.is_ancestor(token)
RETURNS (Token): The first token such that `self.is_ancestor(token)`.
"""
def __get__(self):
return self.doc[self.c.l_edge]
property right_edge:
"""
The rightmost token of this token's syntactic descendents.
"""The rightmost token of this token's syntactic descendents.
Returns: Token The last token such that self.is_ancestor(token)
RETURNS (Token): The last token such that `self.is_ancestor(token)`.
"""
def __get__(self):
return self.doc[self.c.r_edge]
property ancestors:
"""
A sequence of this token's syntactic ancestors.
"""A sequence of this token's syntactic ancestors.
Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
YIELDS (Token): A sequence of ancestor tokens such that
`ancestor.is_ancestor(self)`.
"""
def __get__(self):
cdef const TokenC* head_ptr = self.c
@ -357,33 +373,25 @@ cdef class Token:
yield self.doc[head_ptr - (self.c - self.i)]
i += 1
def is_ancestor_of(self, descendant):
# TODO: Remove after backward compatibility check.
return self.is_ancestor(descendant)
def is_ancestor(self, descendant):
"""
Check whether this token is a parent, grandparent, etc. of another
"""Check whether this token is a parent, grandparent, etc. of another
in the dependency tree.
Arguments:
descendant (Token): Another token.
Returns:
is_ancestor (bool): Whether this token is the ancestor of the descendant.
descendant (Token): Another token.
RETURNS (bool): Whether this token is the ancestor of the descendant.
"""
if self.doc is not descendant.doc:
return False
return any(ancestor.i == self.i for ancestor in descendant.ancestors)
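A quick illustration (the `nlp` object is assumed; the exact result depends on the parse produced by the model):
>>> doc = nlp(u'Give it back! He pleaded.')
>>> give, it = doc[0], doc[1]
>>> give.is_ancestor(it)  # True if 'it' is attached under 'Give'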
property head:
"""
The syntactic parent, or "governor", of this token.
"""The syntactic parent, or "governor", of this token.
Returns: Token
RETURNS (Token): The token head.
"""
def __get__(self):
"""
The token predicted by the parser to be the head of the current token.
"""The token predicted by the parser to be the head of the current
token.
"""
return self.doc[self.i + self.c.head]
def __set__(self, Token new_head):
@ -399,7 +407,7 @@ cdef class Token:
cdef int rel_newhead_i = new_head.i - self.i
# is the new head a descendant of the old head
cdef bint is_desc = old_head.is_ancestor_of(new_head)
cdef bint is_desc = old_head.is_ancestor(new_head)
cdef int new_edge
cdef Token anc, child
@ -477,10 +485,9 @@ cdef class Token:
self.c.head = rel_newhead_i
property conjuncts:
"""
A sequence of coordinated tokens, including the token itself.
"""A sequence of coordinated tokens, including the token itself.
Yields: Token A coordinated token
YIELDS (Token): A coordinated token.
"""
def __get__(self):
"""Get a list of conjoined words."""
@ -495,25 +502,46 @@ cdef class Token:
yield from word.conjuncts
property ent_type:
"""Named entity type.
RETURNS (int): Named entity type.
"""
def __get__(self):
return self.c.ent_type
property ent_iob:
"""IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
is assigned.
RETURNS (int): IOB code of named entity tag.
"""
def __get__(self):
return self.c.ent_iob
property ent_type_:
"""Named entity type.
RETURNS (unicode): Named entity type.
"""
def __get__(self):
return self.vocab.strings[self.c.ent_type]
property ent_iob_:
"""IOB code of named entity tag. "B" means the token begins an entity,
"I" means it is inside an entity, "O" means it is outside an entity, and
"" means no entity tag is set.
RETURNS (unicode): IOB code of named entity tag.
"""
def __get__(self):
iob_strings = ('', 'I', 'O', 'B')
return iob_strings[self.c.ent_iob]
property ent_id:
"""
An (integer) entity ID. Usually assigned by patterns in the Matcher.
"""ID of the entity the token is an instance of, if any. Usually
assigned by patterns in the Matcher.
RETURNS (int): ID of the entity.
"""
def __get__(self):
return self.c.ent_id
@ -522,8 +550,10 @@ cdef class Token:
self.c.ent_id = key
property ent_id_:
"""
A (string) entity ID. Usually assigned by patterns in the Matcher.
"""ID of the entity the token is an instance of, if any. Usually
assigned by patterns in the Matcher.
RETURNS (unicode): ID of the entity.
"""
def __get__(self):
return self.vocab.strings[self.c.ent_id]
@ -564,6 +594,10 @@ cdef class Token:
return self.vocab.strings[self.c.lex.lang]
property lemma_:
"""Base form of the word, with no inflectional suffixes.
RETURNS (unicode): Token lemma.
"""
def __get__(self):
return self.vocab.strings[self.c.lemma]
def __set__(self, unicode lemma_):

View File

@ -145,7 +145,8 @@ def parse_package_meta(package_path, require=True):
def is_in_jupyter():
"""Check if user is in a Jupyter notebook. Mainly used for displaCy.
"""Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.
RETURNS (bool): True if in Jupyter, False if not.
"""

View File

@ -36,79 +36,22 @@ EMPTY_LEXEME.vector = EMPTY_VEC
cdef class Vocab:
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
instance also provides access to the `StringStore`, and owns underlying
C-data that is shared between `Doc` objects.
"""
A map container for a language's LexemeC structs.
"""
@classmethod
def load(cls, path, lex_attr_getters=None, lemmatizer=True,
tag_map=True, oov_prob=True, **deprecated_kwargs):
"""
Deprecated --- replace in spaCy 2
Load the vocabulary from a path.
Arguments:
path (Path):
The path to load from.
lex_attr_getters (dict):
A dictionary mapping attribute IDs to functions to compute them.
Defaults to None.
lemmatizer (object):
A lemmatizer. Defaults to None.
tag_map (dict):
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
and optionally morphological attributes.
oov_prob (float):
The default probability for out-of-vocabulary words.
Returns:
Vocab: The newly constructed vocab object.
"""
path = util.ensure_path(path)
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
if 'vectors' in deprecated_kwargs:
raise AttributeError(
"vectors argument to Vocab.load() deprecated. "
"Install vectors after loading.")
if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
tag_map = ujson.load(file_)
elif tag_map is True:
tag_map = None
if lex_attr_getters is not None \
and oov_prob is True \
and (path / 'vocab' / 'oov_prob').exists():
with (path / 'vocab' / 'oov_prob').open('r', encoding='utf8') as file_:
oov_prob = float(file_.read())
lex_attr_getters[PROB] = lambda text: oov_prob
if lemmatizer is True:
lemmatizer = Lemmatizer.load(path)
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
strings_list = ujson.load(file_)
cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
lemmatizer=lemmatizer,
strings=strings_list)
self.load_lexemes(path / 'vocab' / 'lexemes.bin')
return self
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
strings=tuple(), **deprecated_kwargs):
"""
Create the vocabulary.
"""Create the vocabulary.
lex_attr_getters (dict):
A dictionary mapping attribute IDs to functions to compute them.
Defaults to None.
lemmatizer (object):
A lemmatizer. Defaults to None.
tag_map (dict):
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
and optionally morphological attributes.
oov_prob (float):
The default probability for out-of-vocabulary words.
Returns:
Vocab: The newly constructed vocab object.
lex_attr_getters (dict): A dictionary mapping attribute IDs to functions
to compute them. Defaults to `None`.
tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained
parts-of-speech, and optionally morphological attributes.
lemmatizer (object): A lemmatizer. Defaults to `None`.
strings (StringStore): StringStore that maps strings to integers, and
vice versa.
RETURNS (Vocab): The newly constructed vocab object.
"""
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
@ -148,33 +91,32 @@ cdef class Vocab:
return langfunc('_') if langfunc else ''
def __len__(self):
"""
The current number of lexemes stored.
"""The current number of lexemes stored.
RETURNS (int): The current number of lexemes stored.
"""
return self.length
def add_flag(self, flag_getter, int flag_id=-1):
"""
Set a new boolean flag to words in the vocabulary.
The flag_setter function will be called over the words currently in the
def add_flag(self, flag_getter, int flag_id=-1):
"""Set a new boolean flag to words in the vocabulary.
The flag_getter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able
to access the flag value on each token, using token.check_flag(flag_id).
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`.
See also:
Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.
flag_getter (callable): A function `f(unicode) -> bool`, to get the flag
value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest
available bit will be chosen.
RETURNS (int): The integer ID by which the flag value can be checked.
Arguments:
flag_getter:
A function f(unicode) -> bool, to get the flag value.
flag_id (int):
An integer between 1 and 63 (inclusive), specifying the bit at which the
flag will be stored. If -1, the lowest available bit will be
chosen.
Returns:
flag_id (int): The integer ID by which the flag value can be checked.
EXAMPLE:
>>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'displaCy'])
>>> doc = nlp(u'I like spaCy')
>>> assert doc[2].check_flag(MY_PRODUCT) == True
"""
if flag_id == -1:
for bit in range(1, 64):
@ -196,9 +138,8 @@ cdef class Vocab:
return flag_id
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
"""
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if string == u'':
@ -216,9 +157,8 @@ cdef class Vocab:
return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
"""
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
if necessary, using memory acquired from the given pool. If the pool
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon.
"""
if orth == 0:
@ -263,24 +203,19 @@ cdef class Vocab:
self.length += 1
def __contains__(self, unicode string):
"""
Check whether the string has an entry in the vocabulary.
"""Check whether the string has an entry in the vocabulary.
Arguments:
string (unicode): The ID string.
Returns:
bool Whether the string has an entry in the vocabulary.
string (unicode): The ID string.
RETURNS (bool): Whether the string has an entry in the vocabulary.
"""
key = hash_string(string)
lex = self._by_hash.get(key)
return lex is not NULL
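For example (the `nlp` object is assumed; processing text adds its words to the vocabulary):
>>> doc = nlp(u'I like apples')
>>> assert u'apples' in nlp.vocab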
def __iter__(self):
"""
Iterate over the lexemes in the vocabulary.
"""Iterate over the lexemes in the vocabulary.
Yields: Lexeme An entry in the vocabulary.
YIELDS (Lexeme): An entry in the vocabulary.
"""
cdef attr_t orth
cdef size_t addr
@ -288,19 +223,19 @@ cdef class Vocab:
yield Lexeme(self, orth)
def __getitem__(self, id_or_string):
"""
Retrieve a lexeme, given an int ID or a unicode string. If a previously
unseen unicode string is given, a new lexeme is created and stored.
"""Retrieve a lexeme, given an int ID or a unicode string. If a
previously unseen unicode string is given, a new lexeme is created and
stored.
Arguments:
id_or_string (int or unicode):
The integer ID of a word, or its unicode string.
id_or_string (int or unicode): The integer ID of a word, or its unicode
string. If `int >= Lexicon.size`, `IndexError` is raised. If
`id_or_string` is neither an int nor a unicode string, `ValueError`
is raised.
RETURNS (Lexeme): The lexeme indicated by the given ID.
If an int >= Lexicon.size, IndexError is raised. If id_or_string
is neither an int nor a unicode string, ValueError is raised.
Returns:
lexeme (Lexeme): The lexeme indicated by the given ID.
EXAMPLE:
>>> apple = nlp.vocab.strings['apple']
>>> assert nlp.vocab[apple] == nlp.vocab[u'apple']
"""
cdef attr_t orth
if type(id_or_string) == unicode:
@ -324,15 +259,29 @@ cdef class Vocab:
return tokens
def to_disk(self, path):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
"""
path = util.ensure_path(path)
if not path.exists():
path.mkdir()
strings_loc = path / 'strings.json'
with strings_loc.open('w', encoding='utf8') as file_:
self.strings.dump(file_)
self.dump(path / 'lexemes.bin')
# TODO: pickle
# self.dump(path / 'lexemes.bin')
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (Vocab): The modified `Vocab` object.
"""
path = util.ensure_path(path)
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
strings_list = ujson.load(file_)
@ -340,6 +289,23 @@ cdef class Vocab:
self.strings[string]
self.load_lexemes(path / 'lexemes.bin')
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Vocab` object.
"""
raise NotImplementedError()
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Vocab): The `Vocab` object.
"""
raise NotImplementedError()
def lexemes_to_bytes(self, **exclude):
cdef hash_t key
cdef size_t addr
@ -365,9 +331,7 @@ cdef class Vocab:
return byte_string
def lexemes_from_bytes(self, bytes bytes_data):
"""
Load the binary vocabulary data from the given string.
"""
"""Load the binary vocabulary data from the given string."""
cdef LexemeC* lexeme
cdef hash_t key
cdef unicode py_str
@ -391,16 +355,12 @@ cdef class Vocab:
self.length += 1
# Deprecated --- delete these once stable
def dump_vectors(self, out_loc):
"""
Save the word vectors to a binary file.
Arguments:
loc (Path): The path to save to.
Returns:
None
#"""
def dump_vectors(self, out_loc):
"""Save the word vectors to a binary file.
out_loc (Path): The path to save to.
"""
cdef int32_t vec_len = self.vectors_length
cdef int32_t word_len
cdef bytes word_str
@ -424,17 +384,14 @@ cdef class Vocab:
def load_vectors(self, file_):
"""
Load vectors from a text-based file.
"""Load vectors from a text-based file.
Arguments:
file_ (buffer): The file to read from. Entries should be separated by newlines,
and each entry should be whitespace delimited. The first value of the entry
should be the word string, and subsequent entries should be the values of the
vector.
file_ (buffer): The file to read from. Entries should be separated by
newlines, and each entry should be whitespace delimited. The first value of the entry
should be the word string, and subsequent entries should be the values of the
vector.
Returns:
vec_len (int): The length of the vectors loaded.
RETURNS (int): The length of the vectors loaded.
"""
cdef LexemeC* lexeme
cdef attr_t orth
@ -464,14 +421,11 @@ cdef class Vocab:
return vec_len
def load_vectors_from_bin_loc(self, loc):
"""
Load vectors from the location of a binary file.
"""Load vectors from the location of a binary file.
Arguments:
loc (unicode): The path of the binary file to load from.
loc (unicode): The path of the binary file to load from.
Returns:
vec_len (int): The length of the vectors loaded.
RETURNS (int): The length of the vectors loaded.
"""
cdef CFile file_ = CFile(loc, b'rb')
cdef int32_t word_len
@ -526,12 +480,10 @@ cdef class Vocab:
def resize_vectors(self, int new_size):
"""
Set vectors_length to a new size, and allocate more memory for the Lexeme
vectors if necessary. The memory will be zeroed.
"""Set vectors_length to a new size, and allocate more memory for the
`Lexeme` vectors if necessary. The memory will be zeroed.
Arguments:
new_size (int): The new size of the vectors.
new_size (int): The new size of the vectors.
"""
cdef hash_t key
cdef size_t addr
@ -633,237 +585,3 @@ class VectorReadError(Exception):
"Vector size: %d\n"
"Max size: %d\n"
"Min size: 1\n" % (loc, size, MAX_VEC_SIZE))
#
#Deprecated --- delete these once stable
#
# def dump_vectors(self, out_loc):
# """
# Save the word vectors to a binary file.
#
# Arguments:
# loc (Path): The path to save to.
# Returns:
# None
# #"""
# cdef int32_t vec_len = self.vectors_length
# cdef int32_t word_len
# cdef bytes word_str
# cdef char* chars
#
# cdef Lexeme lexeme
# cdef CFile out_file = CFile(out_loc, 'wb')
# for lexeme in self:
# word_str = lexeme.orth_.encode('utf8')
# vec = lexeme.c.vector
# word_len = len(word_str)
#
# out_file.write_from(&word_len, 1, sizeof(word_len))
# out_file.write_from(&vec_len, 1, sizeof(vec_len))
#
# chars = <char*>word_str
# out_file.write_from(chars, word_len, sizeof(char))
# out_file.write_from(vec, vec_len, sizeof(float))
# out_file.close()
#
#
#
# def load_vectors(self, file_):
# """
# Load vectors from a text-based file.
#
# Arguments:
# file_ (buffer): The file to read from. Entries should be separated by newlines,
# and each entry should be whitespace delimited. The first value of the entry
# should be the word string, and subsequent entries should be the values of the
# vector.
#
# Returns:
# vec_len (int): The length of the vectors loaded.
# """
# cdef LexemeC* lexeme
# cdef attr_t orth
# cdef int32_t vec_len = -1
# cdef double norm = 0.0
#
# whitespace_pattern = re.compile(r'\s', re.UNICODE)
#
# for line_num, line in enumerate(file_):
# pieces = line.split()
# word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
# if vec_len == -1:
# vec_len = len(pieces)
# elif vec_len != len(pieces):
# raise VectorReadError.mismatched_sizes(file_, line_num,
# vec_len, len(pieces))
# orth = self.strings[word_str]
# lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
# lexeme.vector = <float*>self.mem.alloc(vec_len, sizeof(float))
# for i, val_str in enumerate(pieces):
# lexeme.vector[i] = float(val_str)
# norm = 0.0
# for i in range(vec_len):
# norm += lexeme.vector[i] * lexeme.vector[i]
# lexeme.l2_norm = sqrt(norm)
# self.vectors_length = vec_len
# return vec_len
#
# def load_vectors_from_bin_loc(self, loc):
# """
# Load vectors from the location of a binary file.
#
# Arguments:
# loc (unicode): The path of the binary file to load from.
#
# Returns:
# vec_len (int): The length of the vectors loaded.
# """
# cdef CFile file_ = CFile(loc, b'rb')
# cdef int32_t word_len
# cdef int32_t vec_len = 0
# cdef int32_t prev_vec_len = 0
# cdef float* vec
# cdef Address mem
# cdef attr_t string_id
# cdef bytes py_word
# cdef vector[float*] vectors
# cdef int line_num = 0
# cdef Pool tmp_mem = Pool()
# while True:
# try:
# file_.read_into(&word_len, sizeof(word_len), 1)
# except IOError:
# break
# file_.read_into(&vec_len, sizeof(vec_len), 1)
# if prev_vec_len != 0 and vec_len != prev_vec_len:
# raise VectorReadError.mismatched_sizes(loc, line_num,
# vec_len, prev_vec_len)
# if 0 >= vec_len >= MAX_VEC_SIZE:
# raise VectorReadError.bad_size(loc, vec_len)
#
# chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
# vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
#
# string_id = self.strings[chars[:word_len]]
# # Insert words into vocab to add vector.
# self.get_by_orth(self.mem, string_id)
# while string_id >= vectors.size():
# vectors.push_back(EMPTY_VEC)
# assert vec != NULL
# vectors[string_id] = vec
# line_num += 1
# cdef LexemeC* lex
# cdef size_t lex_addr
# cdef double norm = 0.0
# cdef int i
# for orth, lex_addr in self._by_orth.items():
# lex = <LexemeC*>lex_addr
# if lex.lower < vectors.size():
# lex.vector = vectors[lex.lower]
# norm = 0.0
# for i in range(vec_len):
# norm += lex.vector[i] * lex.vector[i]
# lex.l2_norm = sqrt(norm)
# else:
# lex.vector = EMPTY_VEC
# self.vectors_length = vec_len
# return vec_len
#
#
#def write_binary_vectors(in_loc, out_loc):
# cdef CFile out_file = CFile(out_loc, 'wb')
# cdef Address mem
# cdef int32_t word_len
# cdef int32_t vec_len
# cdef char* chars
# with bz2.BZ2File(in_loc, 'r') as file_:
# for line in file_:
# pieces = line.split()
# word = pieces.pop(0)
# mem = Address(len(pieces), sizeof(float))
# vec = <float*>mem.ptr
# for i, val_str in enumerate(pieces):
# vec[i] = float(val_str)
#
# word_len = len(word)
# vec_len = len(pieces)
#
# out_file.write_from(&word_len, 1, sizeof(word_len))
# out_file.write_from(&vec_len, 1, sizeof(vec_len))
#
# chars = <char*>word
# out_file.write_from(chars, len(word), sizeof(char))
# out_file.write_from(vec, vec_len, sizeof(float))
#
#
# def resize_vectors(self, int new_size):
# """
# Set vectors_length to a new size, and allocate more memory for the Lexeme
# vectors if necessary. The memory will be zeroed.
#
# Arguments:
# new_size (int): The new size of the vectors.
# """
# cdef hash_t key
# cdef size_t addr
# if new_size > self.vectors_length:
# for key, addr in self._by_hash.items():
# lex = <LexemeC*>addr
# lex.vector = <float*>self.mem.realloc(lex.vector,
# new_size * sizeof(lex.vector[0]))
# self.vectors_length = new_size
#
#
#
# def dump(self, loc=None):
# """
# Save the lexemes binary data to the given location, or
# return a byte-string with the data if loc is None.
#
# Arguments:
# loc (Path or None): The path to save to, or None.
# """
# if loc is None:
# return self.to_bytes()
# else:
# return self.to_disk(loc)
#
# def load_lexemes(self, loc):
# """
# Load the binary vocabulary data from the given location.
#
# Arguments:
# loc (Path): The path to load from.
#
# Returns:
# None
# """
# fp = CFile(loc, 'rb',
# on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
# cdef LexemeC* lexeme = NULL
# cdef SerializedLexemeC lex_data
# cdef hash_t key
# cdef unicode py_str
# cdef attr_t orth = 0
# assert sizeof(orth) == sizeof(lexeme.orth)
# i = 0
# while True:
# try:
# fp.read_into(&orth, 1, sizeof(orth))
# except IOError:
# break
# lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
# # Copy data from the file into the lexeme
# fp.read_into(&lex_data.data, 1, sizeof(lex_data.data))
# Lexeme.c_from_bytes(lexeme, lex_data)
#
# lexeme.vector = EMPTY_VEC
# py_str = self.strings[lexeme.orth]
# key = hash_string(py_str)
# self._by_hash.set(key, lexeme)
# self._by_orth.set(lexeme.orth, lexeme)
# self.length += 1
# i += 1
# fp.close()

View File

@ -80,6 +80,7 @@
}
],
"ALPHA": true,
"V_CSS": "1.6",
"V_JS": "1.2",
"DEFAULT_SYNTAX": "python",

View File

@ -34,17 +34,17 @@ mixin src(url)
+a(url)
block
| #[+icon("code", 16).o-icon--inline.u-color-subtle]
| #[+icon("code", 16).o-icon--inline.u-color-theme]
//- API link (with added tag and automatically generated path)
path - [string] path to API docs page relative to /docs/api/
mixin api(path)
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
block
| #[+icon("book", 18).o-icon--inline.u-color-subtle]
| #[+icon("book", 18).o-icon--inline.u-color-theme]
//- Help icon with tooltip
@ -104,15 +104,31 @@ mixin button(url, trusted, ...style)
language - [string] language for syntax highlighting (default: "python")
supports basic relevant languages available for PrismJS
mixin code(label, language)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}")&attributes(attributes)
mixin code(label, language, icon)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "")&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
if icon
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
.c-code-block__icon(class=classes[icon] || "")
+icon(icon, 18)
code.c-code-block__content
block
//- Code blocks to display old/new versions
mixin code-old()
+code(false, false, "reject").o-block-small
block
mixin code-new()
+code(false, false, "accept").o-block-small
block
//- CodePen embed
slug - [string] ID of CodePen demo (taken from URL)
height - [integer] height of demo embed iframe
@ -164,6 +180,16 @@ mixin tag()
block
//- "Requires model" tag with tooltip and list of capabilities
...capabs - [string] Required model capabilities, e.g. "vectors".
mixin tag-model(...capabs)
- var intro = "To use this functionality, spaCy needs a model to be installed"
- var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : ""
+tag Requires model
+help(intro + ext + ".").u-color-theme
//- List
type - [string] "numbers", "letters", "roman" (bulleted list if none set)
start - [integer] start number

View File

@ -9,6 +9,9 @@ nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null)
.u-text-label.u-padding-small.u-hidden-xs=SUBSECTION
ul.c-nav__menu
if ALPHA
- var NAVIGATION = { "Usage": "/docs/usage", "Reference": "/docs/api" }
each url, item in NAVIGATION
li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null)
+a(url)=item

View File

@ -10,6 +10,14 @@ main.o-main.o-main--sidebar.o-main--aside
if tag
+tag=tag
if ALPHA
+infobox("⚠️ You are viewing the spaCy v2.0 alpha docs")
| This page is part of the alpha documentation for spaCy v2.0
| and does not reflect the state of the latest stable release.
| #[+a("#") See here] for more information on how to install
| and test the new version. To read the official docs for
| v1.x, #[+a("https://spacy.io/docs") go here].
!=yield
+grid.o-content.u-text

View File

@ -35,7 +35,10 @@ html(lang="en")
link(rel="shortcut icon" href="/assets/img/favicon.ico")
link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico")
if SUBSECTION == "usage"
if ALPHA && SECTION == "docs"
link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet")
else if SUBSECTION == "usage"
link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet")
else

View File

@ -13,6 +13,17 @@
white-space: pre
direction: ltr
&.c-code-block--has-icon
padding: 0
display: flex
.c-code-block__icon
padding: 0 0 0 1rem
display: flex
justify-content: center
align-items: center
border-left: 6px solid
//- Code block content
@ -26,8 +37,8 @@
*:not(.c-code-block) > code
font: normal 600 0.8em/#{1} $font-code
background: rgba($color-front, 0.05)
box-shadow: 1px 1px 0 rgba($color-front, 0.1)
background: darken($color-theme-light, 5)
box-shadow: 1px 1px 0 rgba($color-front, 0.05)
text-shadow: 1px 1px 0 rgba($color-back, 0.5)
color: $color-front
padding: 0.1em 0.5em

View File

@ -13,7 +13,7 @@
background: rgba($color-subtle-light, 0.35)
&.c-table__row--foot
background: rgba($color-theme, 0.025)
background: $color-theme-light
border-top: 2px solid $color-theme
.c-table__cell:first-child

View File

@ -11,9 +11,8 @@
background: $color-front
border-radius: 2px
color: $color-back
font-family: inherit
font-size: 1.3rem
line-height: 1.25
font: normal 1.3rem/#{1.25} $font-primary
text-transform: none
opacity: 0
padding: 0.5em 0.75em
transform: translateX(-50%) translateY(-2px)

View File

@ -26,8 +26,7 @@ $font-code: 'Source Code Pro', Consolas, 'Andale Mono', Menlo, Monaco, Courier,
// Colors
$colors: ( blue: #09a3d5, red: #d9515d )
$colors-light: (blue: #cceaf4, red: #f9d7da)
$colors: ( blue: #09a3d5, red: #d9515d, green: #08c35e )
$color-back: #fff !default
$color-front: #1a1e23 !default
@ -35,7 +34,7 @@ $color-dark: lighten($color-front, 20) !default
$color-theme: map-get($colors, $theme)
$color-theme-dark: darken(map-get($colors, $theme), 5)
$color-theme-light: map-get($colors-light, $theme)
$color-theme-light: rgba($color-theme, 0.05)
$color-subtle: #ddd !default
$color-subtle-light: #f6f6f6 !default

View File

@ -0,0 +1,4 @@
//- 💫 STYLESHEET (GREEN)
$theme: green
@import style

View File

@ -30,5 +30,11 @@
<symbol id="help" viewBox="0 0 24 24">
<path d="M12 6c2.203 0 3.984 1.781 3.984 3.984 0 2.484-3 2.766-3 5.016h-1.969c0-3.234 3-3 3-5.016 0-1.078-0.938-1.969-2.016-1.969s-2.016 0.891-2.016 1.969h-1.969c0-2.203 1.781-3.984 3.984-3.984zM12 20.016c4.406 0 8.016-3.609 8.016-8.016s-3.609-8.016-8.016-8.016-8.016 3.609-8.016 8.016 3.609 8.016 8.016 8.016zM12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984zM11.016 18v-2.016h1.969v2.016h-1.969z"/>
</symbol>
<symbol id="reject" viewBox="0 0 24 24">
<path d="M18.984 6.422l-5.578 5.578 5.578 5.578-1.406 1.406-5.578-5.578-5.578 5.578-1.406-1.406 5.578-5.578-5.578-5.578 1.406-1.406 5.578 5.578 5.578-5.578z"/>
</symbol>
<symbol id="accept" viewBox="0 0 24 24">
<path d="M9 16.172l10.594-10.594 1.406 1.406-12 12-5.578-5.578 1.406-1.406z"/>
</symbol>
</defs>
</svg>

Before: 5.4 KiB | After: 5.8 KiB

Binary file not shown.

After: 216 KiB

View File

@ -2,8 +2,13 @@
"sidebar": {
"Introduction": {
"Facts & Figures": "./",
"Languages": "language-models",
"Philosophy": "philosophy"
"Languages": "language-models"
},
"Top-level": {
"spacy": "spacy",
"displacy": "displacy",
"Utility Functions": "util",
"Command line": "cli"
},
"Classes": {
"Doc": "doc",
@ -21,9 +26,6 @@
"GoldParse": "goldparse"
},
"Other": {
"Command line": "cli",
"displaCy": "displacy",
"Utility Functions": "util",
"Annotation Specs": "annotation",
"Feature Scheme": "features"
}
@ -43,6 +45,26 @@
"title": "Philosophy"
},
"spacy": {
"title": "spaCy top-level functions",
"next": "displacy"
},
"displacy": {
"title": "displaCy",
"tag": "module",
"next": "util"
},
"util": {
"title": "Utility Functions",
"next": "cli"
},
"cli": {
"title": "Command Line Interface"
},
"language": {
"title": "Language",
"tag": "class"
@ -113,20 +135,6 @@
"tag": "class"
},
"cli": {
"title": "Command Line Interface",
"next": "displacy"
},
"displacy": {
"title": "displaCy",
"tag": "module"
},
"util": {
"title": "Utility Functions"
},
"annotation": {
"title": "Annotation Specifications"
},

View File

@ -71,6 +71,44 @@ include _annotation/_dep-labels
include _annotation/_named-entities
+h(3, "biluo") BILUO Scheme
p
| spaCy translates character offsets into the BILUO scheme, in order to
| decide the cost of each action given the current state of the entity
| recognizer. The costs are then used to calculate the gradient of the
| loss, to train the model.
+aside("Why BILUO, not IOB?")
| There are several coding schemes for encoding entity annotations as
| token tags. These coding schemes are equally expressive, but not
| necessarily equally learnable.
| #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
| showed that the minimal #[strong Begin], #[strong In], #[strong Out]
| scheme was more difficult to learn than the #[strong BILUO] scheme that
| we use, which explicitly marks boundary tokens.
+table([ "Tag", "Description" ])
+row
+cell #[code #[span.u-color-theme B] EGIN]
+cell The first token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme I] N]
+cell An inner token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme L] AST]
+cell The final token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme U] NIT]
+cell A single-token entity.
+row
+cell #[code #[span.u-color-theme O] UT]
+cell A non-entity token.
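//- Illustrative sketch (assumes an nlp object whose tokenizer splits the
//- text as shown; uses the gold.biluo_tags_from_offsets helper).
+aside-code("Example").
    from spacy.gold import biluo_tags_from_offsets
    doc = nlp(u'I flew to New York')
    entities = [(10, 18, 'GPE')]  # character offsets of "New York"
    tags = biluo_tags_from_offsets(doc, entities)
    assert tags == ['O', 'O', 'O', 'B-GPE', 'L-GPE']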
+h(2, "json-input") JSON input format for training
p

View File

@ -10,11 +10,11 @@ p
+aside("Why python -m?")
| The problem with a global entry point is that it's resolved by looking up
| entries in your #[code PATH] environment variable. This can give you
| unexpected results, especially when using #[code virtualenv]. For
| instance, you may have spaCy installed on your system but not in your
| current environment. The command will then execute the wrong
| spaCy installation. #[code python -m] prevents fallbacks to system modules
| and makes sure the correct version of spaCy is used.
| unexpected results, like executing the wrong spaCy installation
| (especially when using #[code virtualenv]). #[code python -m] prevents
| fallbacks to system modules and makes sure the correct spaCy version is
| used. If you hate typing it every time, we recommend creating an
| #[code alias] instead.
+h(2, "download") Download
@ -45,13 +45,24 @@ p
+cell flag
+cell Show help message and available arguments.
+infobox("Important note")
| The #[code download] command is mostly intended as a convenient,
| interactive wrapper: it performs compatibility checks and prints
| detailed messages in case things go wrong. It's #[strong not recommended]
| to use this command as part of an automated process. If you know which
| model your project needs, you should consider a
| #[+a("/docs/usage/models#download-pip") direct download via pip], or
| uploading the model to a local PyPI installation and fetching it straight
| from there. This will also allow you to add it as a versioned package
| dependency to your project.
+h(2, "link") Link
p
| Create a #[+a("/docs/usage/models#usage") shortcut link] for a model,
| either a Python package or a local directory. This will let you load
| models from any location via #[code spacy.load()].
| models from any location using a custom name via
| #[+api("spacy#load") #[code spacy.load()]].
+code(false, "bash").
python -m spacy link [origin] [link_name] [--force]
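//- Illustrative sketch; "my_model" stands in for whatever link name was
//- created with the command above.
+aside-code("Example").
    import spacy
    nlp = spacy.load('my_model')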
@ -92,7 +103,7 @@ p
+row
+cell #[code model]
+cell positional
+cell Shortcut link of model (optional).
+cell A model, i.e. shortcut link, package name or path (optional).
+row
+cell #[code --markdown], #[code -md]
@ -114,7 +125,7 @@ p
| the input file. Currently only supports #[code .conllu].
+code(false, "bash").
python -m spacy convert [input_file] [output_dir] [--n_sents] [--morphology]
python -m spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
+table(["Argument", "Type", "Description"])
+row
@ -128,7 +139,7 @@ p
+cell Output directory for converted JSON file.
+row
+cell #[code --n_sents], #[code -n]
+cell #[code --n-sents], #[code -n]
+cell option
+cell Number of sentences per document.
@ -191,7 +202,7 @@ p
| #[+a("/docs/api/annotation#json-input") JSON format].
+code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner]
+table(["Argument", "Type", "Description"])
+row
@ -215,27 +226,37 @@ p
+cell Location of JSON-formatted dev data (optional).
+row
+cell #[code --n_iter], #[code -n]
+cell #[code --n-iter], #[code -n]
+cell option
+cell Number of iterations (default: #[code 15]).
+row
+cell #[code --parser_L1], #[code -L]
+cell #[code --nsents]
+cell option
+cell Number of sentences (default: #[code 0]).
+row
+cell #[code --parser-L1], #[code -L]
+cell option
+cell L1 regularization penalty for parser (default: #[code 0.0]).
+row
+cell #[code --no_tagger], #[code -T]
+cell #[code --use-gpu], #[code -g]
+cell flag
+cell Use GPU.
+row
+cell #[code --no-tagger], #[code -T]
+cell flag
+cell Don't train tagger.
+row
+cell #[code --no_parser], #[code -P]
+cell #[code --no-parser], #[code -P]
+cell flag
+cell Don't train parser.
+row
+cell #[code --no_ner], #[code -N]
+cell #[code --no-ner], #[code -N]
+cell flag
+cell Don't train NER.

View File

@ -4,32 +4,6 @@ include ../../_includes/_mixins
p Annotate syntactic dependencies on #[code Doc] objects.
+h(2, "load") DependencyParser.load
+tag classmethod
p Load the statistical model from the supplied path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary. Must be shared by the documents to be processed.
+row
+cell #[code require]
+cell bool
+cell Whether to raise an error if the files are not found.
+footrow
+cell return
+cell #[code DependencyParser]
+cell The newly constructed object.
+h(2, "init") DependencyParser.__init__
+tag method
@ -47,7 +21,7 @@ p Create a #[code DependencyParser].
+cell The statistical model.
+footrow
+cell return
+cell returns
+cell #[code DependencyParser]
+cell The newly constructed object.
@ -65,7 +39,7 @@ p
+cell The document to be processed.
+footrow
+cell return
+cell returns
+cell #[code None]
+cell -
@ -93,7 +67,7 @@ p Process a stream of documents.
| parallel.
+footrow
+cell yield
+cell yields
+cell #[code Doc]
+cell Documents, in order.
@ -114,7 +88,7 @@ p Update the statistical model.
+cell The gold-standard annotations, to calculate the loss.
+footrow
+cell return
+cell returns
+cell int
+cell The loss on this example.
@ -130,6 +104,6 @@ p Set up a stepwise state, to introspect and control the transition sequence.
+cell The document to step through.
+footrow
+cell return
+cell returns
+cell #[code StepwiseState]
+cell A state object, to step through the annotation process.

View File

@ -8,7 +8,7 @@ p
| #[+a("/docs/usage/visualizers") visualizing spaCy].
+h(2, "serve") serve
+h(2, "serve") displacy.serve
+tag method
p
@ -60,7 +60,7 @@ p
+cell Port to serve visualization.
+cell #[code 5000]
+h(2, "render") render
+h(2, "render") displacy.render
+tag method
p Render a dependency parse tree or named entity visualization.
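//- Illustrative sketch (assumes the English model is installed as "en").
+aside-code("Example").
    import spacy
    from spacy import displacy
    nlp = spacy.load('en')
    doc = nlp(u'This is a sentence.')
    html = displacy.render(doc, style='dep')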
@ -112,7 +112,7 @@ p Render a dependency parse tree or named entity visualization.
+cell #[code {}]
+footrow
+cell return
+cell returns
+cell unicode
+cell Rendered HTML markup.
+cell
@ -218,7 +218,7 @@ p
+cell #[code colors]
+cell dict
+cell
| Color overrides. Entity types in lowercase should be mapped to
| Color overrides. Entity types in uppercase should be mapped to
| color names or values.
+cell #[code {}]

View File

@ -4,9 +4,508 @@ include ../../_includes/_mixins
p A container for accessing linguistic annotations.
p
| A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects.
| Access sentences and named entities, export annotations to numpy arrays,
| losslessly serialize to compressed binary strings. The #[code Doc] object
| holds an array of #[code TokenC] structs. The Python-level #[code Token]
| and #[+api("span") #[code Span]] objects are views of this array, i.e.
| they don't own the data themselves.
+aside-code("Example").
# Construction 1
doc = nlp(u'Some text')
# Construction 2
from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
spaces=[True, False, False])
+h(2, "init") Doc.__init__
+tag method
p
| Construct a #[code Doc] object. The most common way to get a #[code Doc]
| object is via the #[code nlp] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code words]
+cell -
+cell A list of strings to add to the container.
+row
+cell #[code spaces]
+cell -
+cell
| A list of boolean values indicating whether each word has a
| subsequent space. Must have the same length as #[code words], if
| specified. Defaults to a sequence of #[code True].
+footrow
+cell returns
+cell #[code Doc]
+cell The newly constructed object.
+h(2, "getitem") Doc.__getitem__
+tag method
p
| Get a #[+api("token") #[code Token]] object at position #[code i], where
| #[code i] is an integer. Negative indexing is supported, and follows the
| usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == 'Give'
assert doc[-1].text == '.'
span = doc[1:3]
assert span.text == 'it back'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The index of the token.
+footrow
+cell returns
+cell #[code Token]
+cell The token at #[code doc[i]].
p
| Get a #[+api("span") #[code Span]] object, starting at position
| #[code start] (token index) and ending at position #[code end] (token
| index).
p
| For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3
| and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not
| supported, as #[code Span] objects must be contiguous (cannot have gaps).
| You can use negative indices and open-ended ranges, which have their
| normal Python semantics.
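//- Illustrative sketch (assumes an nlp object is available).
+aside-code("Example").
    doc = nlp(u'Give it back! He pleaded.')
    assert doc[2:5].text == u'back! He'
    assert doc[-3:].text == u'He pleaded.'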
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
+cell tuple
+cell The slice of the document to get.
+footrow
+cell returns
+cell #[code Span]
+cell The span at #[code doc[start : end]].
+h(2, "iter") Doc.__iter__
+tag method
p
| Iterate over #[code Token] objects, from which the annotations can be
| easily accessed.
+aside-code("Example").
doc = nlp(u'Give it back')
assert [t.text for t in doc] == [u'Give', u'it', u'back']
p
| This is the main way of accessing #[+api("token") #[code Token]] objects,
| which are the main way annotations are accessed from Python. If
| faster-than-Python speeds are required, you can instead access the
| annotations as a numpy array, or access the underlying C data directly
| from Cython.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A #[code Token] object.
+h(2, "len") Doc.__len__
+tag method
p Get the number of tokens in the document.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert len(doc) == 7
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of tokens in the document.
+h(2, "similarity") Doc.similarity
+tag method
+tag-model("vectors")
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+aside-code("Example").
apples = nlp(u'I like apples')
oranges = nlp(u'I like oranges')
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "count_by") Doc.count_by
+tag method
p
| Count the frequencies of a given attribute. Produces a dict of
| #[code {attr (int): count (ints)}] frequencies, keyed by the values
| of the given attribute ID.
+aside-code("Example").
from spacy.attrs import ORTH
doc = nlp(u'apple apple orange banana')
assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
doc.to_array([ORTH])
# array([[11880], [11880], [7561], [12800]])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_id]
+cell int
+cell The attribute ID
+footrow
+cell returns
+cell dict
+cell A dictionary mapping attributes to integer counts.
+h(2, "to_array") Doc.to_array
+tag method
p
| Export the document annotations to a numpy array of shape #[code N*M]
| where #[code N] is the length of the document and #[code M] is the number
| of attribute IDs to export. The values will be 32-bit integers.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell
| The exported attributes as a 2D numpy array, with one row per
| token and one column per attribute.
+h(2, "from_array") Doc.from_array
+tag method
p
| Load attributes from a numpy array. Write to a #[code Doc] object, from
| an #[code (M, N)] array of attributes.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
doc = nlp(text)
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
doc2 = Doc(doc.vocab)
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
assert doc.text == doc2.text
+table(["Name", "Type", "Description"])
+row
+cell #[code attrs]
+cell ints
+cell A list of attribute ID ints.
+row
+cell #[code array]
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell The attribute values to load.
+footrow
+cell returns
+cell #[code Doc]
+cell Itself.
+h(2, "to_bytes") Doc.to_bytes
+tag method
p Serialize, i.e. export the document contents to a binary string.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
doc_bytes = doc.to_bytes()
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bytes
+cell
| A losslessly serialized copy of the #[code Doc], including all
| annotations.
+h(2, "from_bytes") Doc.from_bytes
+tag method
p Deserialize, i.e. import the document contents from a binary string.
+aside-code("Example").
from spacy.tokens import Doc
text = u'Give it back! He pleaded.'
doc = nlp(text)
bytes = doc.to_bytes()
doc2 = Doc(doc.vocab).from_bytes(bytes)
assert doc.text == doc2.text
+table(["Name", "Type", "Description"])
+row
+cell #[code data]
+cell bytes
+cell The string to load from.
+footrow
+cell returns
+cell #[code Doc]
+cell The #[code Doc] object.
+h(2, "merge") Doc.merge
+tag method
p
| Retokenize the document, such that the span at
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
| #[code start_idx] and #[code end_idx] do not mark start and end token
| boundaries, the document remains unchanged.
+aside-code("Example").
doc = nlp(u'Los Angeles start.')
doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
assert [t.text for t in doc] == [u'Los Angeles', u'start', u'.']
+table(["Name", "Type", "Description"])
+row
+cell #[code start_idx]
+cell int
+cell The character index of the start of the slice to merge.
+row
+cell #[code end_idx]
+cell int
+cell The character index after the end of the slice to merge.
+row
+cell #[code **attributes]
+cell -
+cell
| Attributes to assign to the merged token. By default,
| attributes are inherited from the syntactic root token of
| the span.
+footrow
+cell returns
+cell #[code Token]
+cell
| The newly merged token, or #[code None] if the start and end
| indices did not fall at token boundaries
+h(2, "print_tree") Doc.print_tree
+tag method
+tag-model("parse")
p
| Returns the parse trees in JSON (dict) format. Especially useful for
| web applications.
+aside-code("Example").
doc = nlp('Alice ate the pizza.')
trees = doc.print_tree()
# {'modifiers': [
# {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
# {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'},
# {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}
# ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}
+table(["Name", "Type", "Description"])
+row
+cell #[code light]
+cell bool
+cell Don't include lemmas or entities.
+row
+cell #[code flat]
+cell bool
+cell Don't include arcs or modifiers.
+footrow
+cell returns
+cell dict
+cell Parse tree as dict.
+h(2, "ents") Doc.ents
+tag property
+tag-model("NER")
p
| Iterate over the entities in the document. Yields named-entity
| #[code Span] objects, if the entity recognizer has been applied to the
| document.
+aside-code("Example").
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)
assert ents[0].label == 346
assert ents[0].label_ == 'PERSON'
assert ents[0].text == 'Mr. Best'
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Span]
+cell Entities in the document.
+h(2, "noun_chunks") Doc.noun_chunks
+tag property
+tag-model("parse")
p
| Iterate over the base noun phrases in the document. Yields base
| noun-phrase #[code Span] objects, if the document has been syntactically
| parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not
| permit other NPs to be nested within it, so no NP-level coordination, no
| prepositional phrases, and no relative clauses.
+aside-code("Example").
doc = nlp(u'A phrase with another phrase occurs.')
chunks = list(doc.noun_chunks)
assert chunks[0].text == "A phrase"
assert chunks[1].text == "another phrase"
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Span]
+cell Noun chunks in the document.
+h(2, "sents") Doc.sents
+tag property
+tag-model("parse")
p
| Iterate over the sentences in the document. Sentence spans have no label.
| To improve accuracy on informal texts, spaCy calculates sentence boundaries
| from the syntactic dependency parse. If the parser is disabled,
| the #[code sents] iterator will be unavailable.
+aside-code("Example").
doc = nlp(u"This is a sentence. Here's another...")
sents = list(doc.sents)
assert len(sents) == 2
assert [s.root.text for s in sents] == ["is", "'s"]
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Span]
+cell Sentences in the document.
+h(2, "has_vector") Doc.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| object.
+aside-code("Example").
doc = nlp(u'I like apples')
assert doc.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the document has a vector data attached.
+h(2, "vector") Doc.vector
+tag property
+tag-model("vectors")
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+aside-code("Example").
doc = nlp(u'I like apples')
assert doc.vector.dtype == 'float32'
assert doc.vector.shape == (300,)
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the document's semantics.
+h(2, "vector_norm") Doc.vector_norm
+tag property
+tag-model("vectors")
p
| The L2 norm of the document's vector representation.
+aside-code("Example").
doc1 = nlp(u'I like apples')
doc2 = nlp(u'I like oranges')
doc1.vector_norm # 4.54232424414368
doc2.vector_norm # 3.304373298575751
assert doc1.vector_norm != doc2.vector_norm
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell A unicode representation of the document text.
+row
+cell #[code text_with_ws]
+cell unicode
+cell
| An alias of #[code Doc.text], provided for duck-type compatibility
| with #[code Span] and #[code Token].
+row
+cell #[code mem]
+cell #[code Pool]
@ -17,6 +516,11 @@ p A container for accessing linguistic annotations.
+cell #[code Vocab]
+cell The store of lexical types.
+row
+cell #[code tensor]
+cell object
+cell Container for dense vector representations.
+row
+cell #[code user_data]
+cell -
@ -59,358 +563,3 @@ p A container for accessing linguistic annotations.
+cell
| A dictionary that allows customisation of properties of
| #[code Span] children.
+h(2, "init") Doc.__init__
+tag method
p Construct a #[code Doc] object.
+aside("Note")
| The most common way to get a #[code Doc] object is via the #[code nlp]
| object. This method is usually only used for deserialization or preset
| tokenization.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code words]
+cell -
+cell A list of strings to add to the container.
+row
+cell #[code spaces]
+cell -
+cell
| A list of boolean values indicating whether each word has a
| subsequent space. Must have the same length as #[code words], if
| specified. Defaults to a sequence of #[code True].
+footrow
+cell return
+cell #[code Doc]
+cell The newly constructed object.
+h(2, "getitem") Doc.__getitem__
+tag method
p Get a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == 'Give'
assert doc[-1].text == '.'
span = doc[1:3]
assert span.text == 'it back'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The index of the token.
+footrow
+cell return
+cell #[code Token]
+cell The token at #[code doc[i]].
p Get a #[code Span] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
+cell tuple
+cell The slice of the document to get.
+footrow
+cell return
+cell #[code Span]
+cell The span at #[code doc[start : end]].
+h(2, "iter") Doc.__iter__
+tag method
p Iterate over #[code Token] objects.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A #[code Token] object.
+h(2, "len") Doc.__len__
+tag method
p Get the number of tokens in the document.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of tokens in the document.
+h(2, "similarity") Doc.similarity
+tag method
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "to_array") Doc.to_array
+tag method
p
| Export the document annotations to a numpy array of shape #[code N*M]
| where #[code N] is the length of the document and #[code M] is the number
| of attribute IDs to export. The values will be 32-bit integers.
+aside-code("Example").
from spacy import attrs
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([attrs.LOWER, attrs.POS,
attrs.ENT_TYPE, attrs.IS_ALPHA])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell
| The exported attributes as a 2D numpy array, with one row per
| token and one column per attribute.
+h(2, "count_by") Doc.count_by
+tag method
p Count the frequencies of a given attribute.
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_id]
+cell int
+cell The attribute ID
+footrow
+cell return
+cell dict
+cell A dictionary mapping attributes to integer counts.
+h(2, "from_array") Doc.from_array
+tag method
p Load attributes from a numpy array.
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+row
+cell #[code values]
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell The attribute values to load.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "to_bytes") Doc.to_bytes
+tag method
p Export the document contents to a binary string.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bytes
+cell
| A losslessly serialized copy of the #[code Doc] including all
| annotations.
+h(2, "from_bytes") Doc.from_bytes
+tag method
p Import the document contents from a binary string.
+table(["Name", "Type", "Description"])
+row
+cell #[code byte_string]
+cell bytes
+cell The string to load from.
+footrow
+cell return
+cell #[code Doc]
+cell The #[code self] variable.
+h(2, "merge") Doc.merge
+tag method
p
| Retokenize the document, such that the span at
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
| #[code start_idx] and #[code end_idx] do not mark start and end token
| boundaries, the document remains unchanged.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_idx]
+cell int
+cell The character index of the start of the slice to merge.
+row
+cell #[code end_idx]
+cell int
+cell The character index after the end of the slice to merge.
+row
+cell #[code **attributes]
+cell -
+cell
| Attributes to assign to the merged token. By default,
| attributes are inherited from the syntactic root token of
| the span.
+footrow
+cell return
+cell #[code Token]
+cell
| The newly merged token, or None if the start and end
| indices did not fall at token boundaries
+h(2, "read_bytes") Doc.read_bytes
+tag staticmethod
p A static method, used to read serialized #[code Doc] objects from a file.
+aside-code("Example").
from spacy.tokens.doc import Doc
loc = 'test_serialize.bin'
with open(loc, 'wb') as file_:
file_.write(nlp(u'This is a document.').to_bytes())
file_.write(nlp(u'This is another.').to_bytes())
docs = []
with open(loc, 'rb') as file_:
for byte_string in Doc.read_bytes(file_):
docs.append(Doc(nlp.vocab).from_bytes(byte_string))
assert len(docs) == 2
+table(["Name", "Type", "Description"])
+row
+cell file
+cell buffer
+cell A binary buffer to read the serialized annotations from.
+footrow
+cell yield
+cell bytes
+cell Binary strings from which documents can be loaded.
+h(2, "text") Doc.text
+tag property
p A unicode representation of the document text.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "text_with_ws") Doc.text_with_ws
+tag property
p
| An alias of #[code Doc.text], provided for duck-type compatibility with
| #[code Span] and #[code Token].
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "sents") Doc.sents
+tag property
p Iterate over the sentences in the document.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Sentences in the document.
+h(2, "ents") Doc.ents
+tag property
p Iterate over the entities in the document.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Entities in the document.
+h(2, "noun_chunks") Doc.noun_chunks
+tag property
p
| Iterate over the base noun phrases in the document. A base noun phrase,
| or "NP chunk", is a noun phrase that does not permit other NPs to be
| nested within it.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Noun chunks in the document
+h(2, "vector") Doc.vector
+tag property
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the document's semantics.
+h(2, "has_vector") Doc.has_vector
+tag property
p
| A boolean value indicating whether a word vector is associated with the
| object.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether the document has a vector data attached.

View File

@ -4,32 +4,6 @@ include ../../_includes/_mixins
p Annotate named entities on #[code Doc] objects.
+h(2, "load") EntityRecognizer.load
+tag classmethod
p Load the statistical model from the supplied path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary. Must be shared by the documents to be processed.
+row
+cell #[code require]
+cell bool
+cell Whether to raise an error if the files are not found.
+footrow
+cell return
+cell #[code EntityRecognizer]
+cell The newly constructed object.
+h(2, "init") EntityRecognizer.__init__
+tag method
@ -47,7 +21,7 @@ p Create an #[code EntityRecognizer].
+cell The statistical model.
+footrow
+cell return
+cell returns
+cell #[code EntityRecognizer]
+cell The newly constructed object.
@ -63,7 +37,7 @@ p Apply the entity recognizer, setting the NER tags onto the #[code Doc] object.
+cell The document to be processed.
+footrow
+cell return
+cell returns
+cell #[code None]
+cell -
@ -91,7 +65,7 @@ p Process a stream of documents.
| parallel.
+footrow
+cell yield
+cell yields
+cell #[code Doc]
+cell Documents, in order.
@ -112,7 +86,7 @@ p Update the statistical model.
+cell The gold-standard annotations, to calculate the loss.
+footrow
+cell return
+cell returns
+cell int
+cell The loss on this example.
@ -128,6 +102,6 @@ p Set up a stepwise state, to introspect and control the transition sequence.
+cell The document to step through.
+footrow
+cell return
+cell returns
+cell #[code StepwiseState]
+cell A state object, to step through the annotation process.

View File

@ -4,6 +4,72 @@ include ../../_includes/_mixins
p Collection for training annotations.
+h(2, "init") GoldParse.__init__
+tag method
p Create a GoldParse.
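//- Illustrative sketch (assumes an nlp object is available; the entity
//- offsets refer to the string "Berlin").
+aside-code("Example").
    from spacy.gold import GoldParse
    doc = nlp.make_doc(u'I flew to Berlin')
    gold = GoldParse(doc, entities=[(10, 16, 'LOC')])
    assert len(gold) == len(doc)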
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The document the annotations refer to.
+row
+cell #[code words]
+cell iterable
+cell A sequence of unicode word strings.
+row
+cell #[code tags]
+cell iterable
+cell A sequence of strings, representing tag annotations.
+row
+cell #[code heads]
+cell iterable
+cell A sequence of integers, representing syntactic head offsets.
+row
+cell #[code deps]
+cell iterable
+cell A sequence of strings, representing the syntactic relation types.
+row
+cell #[code entities]
+cell iterable
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
+footrow
+cell returns
+cell #[code GoldParse]
+cell The newly constructed object.
+h(2, "len") GoldParse.__len__
+tag method
p Get the number of gold-standard tokens.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of gold-standard tokens.
+h(2, "is_projective") GoldParse.is_projective
+tag property
p
| Whether the provided syntactic annotations form a projective dependency
| tree.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether annotations form projective tree.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
@ -37,67 +103,57 @@ p Collection for training annotations.
+cell list
+cell The alignment from gold tokenization to candidate tokenization.
+h(2, "init") GoldParse.__init__
+tag method
p Create a GoldParse.
+h(2, "util") Utilities
+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
+tag function
p
| Encode labelled spans into per-token tags, using the
| #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
p
| Returns a list of unicode strings, describing the tags. Each tag string
| will be of the form either #[code ""], #[code "O"] or
| #[code "{action}-{label}"], where action is one of #[code "B"],
| #[code "I"], #[code "L"], #[code "U"]. The string #[code &quot;-&quot;]
| is used where the entity offsets don't align with the tokenization in the
| #[code Doc] object. The training algorithm will view these as missing
| values. #[code O] denotes a non-entity token. #[code B] denotes the
| beginning of a multi-token entity, #[code I] the inside of an entity
| of three or more tokens, and #[code L] the end of an entity of two or
| more tokens. #[code U] denotes a single-token entity.
+aside-code("Example").
from spacy.gold import biluo_tags_from_offsets
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The document the annotations refer to.
+row
+cell #[code words]
+cell -
+cell A sequence of unicode word strings.
+row
+cell #[code tags]
+cell -
+cell A sequence of strings, representing tag annotations.
+row
+cell #[code heads]
+cell -
+cell A sequence of integers, representing syntactic head offsets.
+row
+cell #[code deps]
+cell -
+cell A sequence of strings, representing the syntactic relation types.
+cell
| The document that the entity offsets refer to. The output tags
| will refer to the token boundaries within the document.
+row
+cell #[code entities]
+cell -
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
+cell iterable
+cell
| A sequence of #[code (start, end, label)] triples. #[code start]
| and #[code end] should be character-offset integers denoting the
| slice into the original string.
+footrow
+cell return
+cell #[code GoldParse]
+cell The newly constructed object.
+cell returns
+cell list
+cell
| Unicode strings, describing the
| #[+a("/docs/api/annotation#biluo") BILUO] tags.
+h(2, "len") GoldParse.__len__
+tag method
p Get the number of gold-standard tokens.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of gold-standard tokens.
+h(2, "is_projective") GoldParse.is_projective
+tag property
p
| Whether the provided syntactic annotations form a projective dependency
| tree.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether annotations form projective tree.

View File

@ -2,79 +2,69 @@
include ../../_includes/_mixins
p A text processing pipeline.
p
| A text-processing pipeline. Usually you'll load this once per process,
| and pass the instance around your application.
+h(2, "attributes") Attributes
+h(2, "init") Language.__init__
+tag method
p Initialise a #[code Language] object.
+aside-code("Example").
from spacy.language import Language
nlp = Language(pipeline=['token_vectors', 'tags',
'dependencies'])
from spacy.lang.en import English
nlp = English()
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A container for the lexical types.
+row
+cell #[code tokenizer]
+cell #[code Tokenizer]
+cell Find word boundaries and create #[code Doc] object.
+row
+cell #[code tagger]
+cell #[code Tagger]
+cell Annotate #[code Doc] objects with POS tags.
+row
+cell #[code parser]
+cell #[code DependencyParser]
+cell Annotate #[code Doc] objects with syntactic dependencies.
+row
+cell #[code entity]
+cell #[code EntityRecognizer]
+cell Annotate #[code Doc] objects with named entities.
+row
+cell #[code matcher]
+cell #[code Matcher]
+cell Rule-based sequence matcher.
+cell
| A #[code Vocab] object. If #[code True], a vocab is created via
| #[code Language.Defaults.create_vocab].
+row
+cell #[code make_doc]
+cell #[code lambda text: Doc]
+cell Create a #[code Doc] object from unicode text.
+cell callable
+cell
| A function that takes text and returns a #[code Doc] object.
| Usually a #[code Tokenizer].
+row
+cell #[code pipeline]
+cell -
+cell Sequence of annotation functions.
+cell list
+cell
| A list of annotation processes or IDs of annotation processes,
| e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
| up in #[code Language.Defaults.factories].
+h(2, "init") Language.__init__
+tag method
p Create or load the pipeline.
+table(["Name", "Type", "Description"])
+row
+cell #[code **overrides]
+cell -
+cell Keyword arguments indicating which defaults to override.
+cell #[code meta]
+cell dict
+cell
| Custom meta data for the #[code Language] class. Is written to by
| models to add model meta data.
+footrow
+cell return
+cell returns
+cell #[code Language]
+cell The newly constructed object.
+h(2, "call") Language.__call__
+tag method
p Apply the pipeline to a single text.
p
| Apply the pipeline to some text. The text can span multiple sentences,
| and can contain arbitrary whitespace. Alignment into the original string
| is preserved.
+aside-code("Example").
from spacy.en import English
nlp = English()
doc = nlp('An example sentence. Another example sentence.')
doc[0].orth_, doc[0].head.tag_
# ('An', 'NN')
doc = nlp(u'An example sentence. Another sentence.')
assert (doc[0].text, doc[0].head.tag_) == ('An', 'NN')
+table(["Name", "Type", "Description"])
+row
@ -83,24 +73,104 @@ p Apply the pipeline to a single text.
+cell The text to be processed.
+row
+cell #[code tag]
+cell bool
+cell Whether to apply the part-of-speech tagger.
+row
+cell #[code parse]
+cell bool
+cell Whether to apply the syntactic dependency parser.
+row
+cell #[code entity]
+cell bool
+cell Whether to apply the named entity recognizer.
+cell #[code **disabled]
+cell -
+cell Elements of the pipeline that should not be run.
+footrow
+cell return
+cell returns
+cell #[code Doc]
+cell A container for accessing the linguistic annotations.
+cell A container for accessing the annotations.
+h(2, "update") Language.update
+tag method
p Update the models in the pipeline.
+aside-code("Example").
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
for epoch in trainer.epochs(gold):
for docs, golds in epoch:
state = nlp.update(docs, golds, sgd=optimizer)
+table(["Name", "Type", "Description"])
+row
+cell #[code docs]
+cell iterable
+cell A batch of #[code Doc] objects.
+row
+cell #[code golds]
+cell iterable
+cell A batch of #[code GoldParse] objects.
+row
+cell #[code drop]
+cell float
+cell The dropout rate.
+row
+cell #[code sgd]
+cell callable
+cell An optimizer.
+footrow
+cell returns
+cell dict
+cell Results from the update.
+h(2, "begin_training") Language.begin_training
+tag contextmanager
p
| Allocate models, pre-process training data and acquire a trainer and
| optimizer. Used as a contextmanager.
+aside-code("Example").
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
for epoch in trainer.epochs(gold):
for docs, golds in epoch:
state = nlp.update(docs, golds, sgd=optimizer)
+table(["Name", "Type", "Description"])
+row
+cell #[code gold_tuples]
+cell iterable
+cell Gold-standard training data.
+row
+cell #[code **cfg]
+cell -
+cell Config parameters.
+footrow
+cell yields
+cell tuple
+cell A trainer and an optimizer.
+h(2, "use_params") Language.use_params
+tag contextmanager
+tag method
p
| Replace weights of models in the pipeline with those provided in the
| params dictionary. Can be used as a contextmanager, in which case, models
| go back to their original weights after the block.
+aside-code("Example").
with nlp.use_params(optimizer.averages):
nlp.to_disk('/tmp/checkpoint')
+table(["Name", "Type", "Description"])
+row
+cell #[code params]
+cell dict
+cell A dictionary of parameters keyed by model ID.
+row
+cell #[code **cfg]
+cell -
+cell Config parameters.
+h(2, "pipe") Language.pipe
+tag method
@ -133,22 +203,142 @@ p
+cell The number of texts to buffer.
+footrow
+cell yield
+cell yields
+cell #[code Doc]
+cell Containers for accessing the linguistic annotations.
+cell Documents in the order of the original text.
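//- Illustrative sketch (assumes a model with a parser is loaded).
+aside-code("Example").
    texts = [u'One document.', u'A second one.', u'And many more.']
    for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
        assert doc.is_parsed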
+h(2, "save_to_directory") Language.save_to_directory
+h(2, "to_disk") Language.to_disk
+tag method
p Save the #[code Vocab], #[code StringStore] and pipeline to a directory.
p Save the current state to a directory.
+aside-code("Example").
nlp.to_disk('/path/to/models')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell string or pathlib path
+cell Path to save the model.
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
+h(2, "from_disk") Language.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.language import Language
nlp = Language().from_disk('/path/to/models')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell return
+cell #[code None]
+cell returns
+cell #[code Language]
+cell The modified #[code Language] object.
+h(2, "to_bytes") Language.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
nlp_bytes = nlp.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Language] object.
+h(2, "from_bytes") Language.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.lang.en import English
nlp_bytes = nlp.to_bytes()
nlp2 = English()
nlp2.from_bytes(nlp_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Language]
+cell The #[code Language] object.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A container for the lexical types.
+row
+cell #[code make_doc]
+cell #[code lambda text: Doc]
+cell Create a #[code Doc] object from unicode text.
+row
+cell #[code pipeline]
+cell list
+cell Sequence of annotation functions.
+row
+cell #[code meta]
+cell dict
+cell
| Custom meta data for the Language class. If a model is loaded,
| contains meta data of the model.
+h(2, "class-attributes") Class attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code Defaults]
+cell class
+cell
| Settings, data and factory methods for creating the
| #[code nlp] object and processing pipeline.
+row
+cell #[code lang]
+cell unicode
+cell
| Two-letter language ID, i.e.
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code].

View File

@ -2,7 +2,154 @@
include ../../_includes/_mixins
p An entry in the vocabulary.
p
| An entry in the vocabulary. A #[code Lexeme] has no string context it's
| a word-type, as opposed to a word token. It therefore has no
| part-of-speech tag, dependency parse, or lemma (if lemmatization depends
| on the part-of-speech tag).
+h(2, "init") Lexeme.__init__
+tag method
p Create a #[code Lexeme] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The parent vocabulary.
+row
+cell #[code orth]
+cell int
+cell The orth id of the lexeme.
+footrow
+cell returns
+cell #[code Lexeme]
+cell The newly constructed object.
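p
| In practice, a #[code Lexeme] is rarely constructed directly. It's usually
| retrieved from the vocabulary, as in this sketch:
+aside-code("Example").
lexeme = nlp.vocab[u'apple']
assert lexeme.orth == nlp.vocab.strings[u'apple']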
+h(2, "set_flag") Lexeme.set_flag
+tag method
p Change the value of a boolean flag.
+aside-code("Example").
COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True)
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to set.
+row
+cell #[code value]
+cell bool
+cell The new value of the flag.
+h(2, "check_flag") Lexeme.check_flag
+tag method
p Check the value of a boolean flag.
+aside-code("Example").
is_my_library = lambda text: text in ['spaCy', 'Thinc']
MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
assert nlp.vocab[u'spaCy'].check_flag(MY_LIBRARY) == True
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to query.
+footrow
+cell returns
+cell bool
+cell The value of the flag.
+h(2, "similarity") Lexeme.similarity
+tag method
+tag-model("vectors")
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+aside-code("Example").
apple = nlp.vocab[u'apple']
orange = nlp.vocab[u'orange']
apple_orange = apple.similarity(orange)
orange_apple = orange.similarity(apple)
assert apple_orange == orange_apple
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "has_vector") Lexeme.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| lexeme.
+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the lexeme has a vector data attached.
+h(2, "vector") Lexeme.vector
+tag property
+tag-model("vectors")
p A real-valued meaning representation.
+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.vector.dtype == 'float32'
assert apple.vector.shape == (300,)
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the lexeme's semantics.
+h(2, "vector_norm") Lexeme.vector_norm
+tag property
+tag-model("vectors")
p The L2 norm of the lexeme's vector representation.
+aside-code("Example").
apple = nlp.vocab[u'apple']
pasta = nlp.vocab[u'pasta']
apple.vector_norm # 7.1346845626831055
pasta.vector_norm # 7.759851932525635
assert apple.vector_norm != pasta.vector_norm
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes
@ -12,6 +159,16 @@ p An entry in the vocabulary.
+cell #[code Vocab]
+cell
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code lex_id]
+cell int
+cell ID of the lexeme's lexical type.
+row
+cell #[code lower]
+cell int
@ -124,116 +281,9 @@ p An entry in the vocabulary.
+row
+cell #[code prob]
+cell float
+cell Smoothed log probability estimate of token's type.
+cell Smoothed log probability estimate of lexeme's type.
+row
+cell #[code sentiment]
+cell float
+cell A scalar value indicating the positivity or negativity of the token.
+row
+cell #[code lex_id]
+cell int
+cell ID of the token's lexical type.
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+h(2, "init") Lexeme.__init__
+tag method
p Create a #[code Lexeme] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The parent vocabulary.
+row
+cell #[code orth]
+cell int
+cell The orth id of the lexeme.
+footrow
+cell return
+cell #[code Lexeme]
+cell The newly constructed object.
+h(2, "set_flag") Lexeme.set_flag
+tag method
p Change the value of a boolean flag.
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to set.
+row
+cell #[code value]
+cell bool
+cell The new value of the flag.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "check_flag") Lexeme.check_flag
+tag method
p Check the value of a boolean flag.
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to query.
+footrow
+cell return
+cell bool
+cell The value of the flag.
+h(2, "similarity") Lexeme.similarity
+tag method
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "vector") Lexeme.vector
+tag property
p A real-valued meaning representation.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A real-valued meaning representation.
+h(2, "has_vector") Lexeme.has_vector
+tag property
p A boolean value indicating whether a word vector is associated with the object.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether a word vector is associated with the object.
+cell A scalar value indicating the positivity or negativity of the lexeme.

View File

@ -4,31 +4,26 @@ include ../../_includes/_mixins
p Match sequences of tokens, based on pattern rules.
+h(2, "load") Matcher.load
+tag classmethod
p Load the matcher and patterns from a file path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell Path to a JSON-formatted patterns file.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary that the documents to match over will refer to.
+footrow
+cell return
+cell #[code Matcher]
+cell The newly constructed object.
+infobox("⚠️ Deprecation note")
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
| is now called #[+api("matcher#get") #[code matcher.get]].
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
| and #[code Matcher.has_entity] (now redundant) have been removed.
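p
| A rough migration sketch; the #[code 'GoogleNow'] ID, pattern and
| #[code merge_phrases] callback are illustrative only:
+aside-code("Example").
# before (spaCy 1.x, deprecated):
# matcher.add_entity('GoogleNow', on_match=merge_phrases)
# matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
# after (spaCy 2.x):
matcher.add('GoogleNow', merge_phrases, [{ORTH: 'Google'}, {ORTH: 'Now'}])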
+h(2, "init") Matcher.__init__
+tag method
p Create the Matcher.
p Create the rule-based #[code Matcher].
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER
patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
matcher = Matcher(nlp.vocab)
+table(["Name", "Type", "Description"])
+row
@ -41,17 +36,38 @@ p Create the Matcher.
+row
+cell #[code patterns]
+cell dict
+cell Patterns to add to the matcher.
+cell Patterns to add to the matcher, keyed by ID.
+footrow
+cell return
+cell returns
+cell #[code Matcher]
+cell The newly constructed object.
+h(2, "call") Matcher.__call__
+tag method
p Find all token sequences matching the supplied patterns on the Doc.
p Find all token sequences matching the supplied patterns on the #[code Doc].
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER
matcher = Matcher(nlp.vocab)
pattern = [{LOWER: "hello"}, {LOWER: "world"}]
matcher.add("HelloWorld", None, pattern)
doc = nlp(u'hello world!')
matches = matcher(doc)
+infobox("Important note")
| By default, the matcher #[strong does not perform any action] on matches,
| like tagging matched phrases with entity types. Instead, actions need to
| be specified when #[strong adding patterns or entities], by
| passing in a callback function as the #[code on_match] argument on
| #[+api("matcher#add") #[code add]]. This allows you to define custom
| actions per pattern within the same matcher. For example, you might only
| want to merge some entity types, and set custom flags for other matched
| patterns. For more details and examples, see the usage workflow on
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
+table(["Name", "Type", "Description"])
+row
@ -60,23 +76,28 @@ p Find all token sequences matching the supplied patterns on the Doc.
+cell The document to match over.
+footrow
+cell return
+cell returns
+cell list
+cell
| A list of#[code (entity_key, label_id, start, end)] tuples,
| describing the matches. A match tuple describes a
| #[code span doc[start:end]]. The #[code label_id] and
| #[code entity_key] are both integers.
| A list of #[code (match_id, start, end)] tuples, describing the
| matches. A match tuple describes a span #[code doc[start:end]].
| The #[code match_id] is the ID of the added match pattern.
+h(2, "pipe") Matcher.pipe
+tag method
p Match a stream of documents, yielding them in turn.
+aside-code("Example").
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
for doc in matcher.pipe(texts, batch_size=50, n_threads=4):
pass
+table(["Name", "Type", "Description"])
+row
+cell #[code docs]
+cell -
+cell iterable
+cell A stream of documents.
+row
@ -93,87 +114,132 @@ p Match a stream of documents, yielding them in turn.
| multi-threading.
+footrow
+cell yield
+cell yields
+cell #[code Doc]
+cell Documents, in order.
+h(2, "add_entity") Matcher.add_entity
+h(2, "len") Matcher.__len__
+tag method
p Add an entity to the matcher.
p
| Get the number of rules added to the matcher. Note that this only returns
| the number of rules (identical with the number of IDs), not the number
| of individual patterns.
+aside-code("Example").
matcher = Matcher(nlp.vocab)
assert len(matcher) == 0
matcher.add('Rule', None, [{ORTH: 'test'}])
assert len(matcher) == 1
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of rules.
+h(2, "contains") Matcher.__contains__
+tag method
p Check whether the matcher contains rules for a match ID.
+aside-code("Example").
matcher = Matcher(nlp.vocab)
assert 'Rule' not in matcher
matcher.add('Rule', None, [{ORTH: 'test'}])
assert 'Rule' in matcher
+table(["Name", "Type", "Description"])
+row
+cell #[code entity_key]
+cell unicode / int
+cell An ID for the entity.
+row
+cell #[code attrs]
+cell -
+cell Attributes to associate with the Matcher.
+row
+cell #[code if_exists]
+cell #[code key]
+cell unicode
+cell
| #[code 'raise'], #[code 'ignore'] or #[code 'update']. Controls
| what happens if the entity ID already exists. Defaults to
| #[code 'raise'].
+cell The match ID.
+footrow
+cell returns
+cell bool
+cell Whether the matcher contains rules for this match ID.
+h(2, "add") Matcher.add
+tag method
p
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and
| a callback function to act on the matches. The callback function will
| receive the arguments #[code matcher], #[code doc], #[code i] and
| #[code matches]. If a pattern already exists for the given ID, the
| patterns will be extended. An #[code on_match] callback will be
| overwritten.
+aside-code("Example").
def on_match(matcher, doc, id, matches):
print('Matched!', matches)
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', on_match, [{LOWER: "hello"}, {LOWER: "world"}])
matcher.add('GoogleMaps', on_match, [{ORTH: "Google"}, {ORTH: "Maps"}])
doc = nlp(u'HELLO WORLD on Google Maps.')
matches = matcher(doc)
+table(["Name", "Type", "Description"])
+row
+cell #[code acceptor]
+cell -
+cell Callback function to filter matches of the entity.
+cell #[code match_id]
+cell unicode
+cell An ID for the thing you're matching.
+row
+cell #[code on_match]
+cell -
+cell Callback function to act on matches of the entity.
+cell callable or #[code None]
+cell
| Callback function to act on matches. Takes the arguments
| #[code matcher], #[code doc], #[code i] and #[code matches].
+footrow
+cell return
+cell #[code None]
+cell -
+row
+cell #[code *patterns]
+cell list
+cell
| Match pattern. A pattern consists of a list of dicts, where each
| dict describes a token.
+h(2, "add_pattern") Matcher.add_pattern
+h(2, "remove") Matcher.remove
+tag method
p Add a pattern to the matcher.
p
| Remove a rule from the matcher. A #[code KeyError] is raised if the match
| ID does not exist.
+aside-code("Example").
matcher.add('Rule', None, [{ORTH: 'test'}])
assert 'Rule' in matcher
matcher.remove('Rule')
assert 'Rule' not in matcher
+table(["Name", "Type", "Description"])
+row
+cell #[code entity_key]
+cell unicode / int
+cell An ID for the entity.
+cell #[code key]
+cell unicode
+cell The ID of the match rule.
+row
+cell #[code token_specs]
+cell -
+cell Description of the pattern to be matched.
+row
+cell #[code label]
+cell unicode / int
+cell Label to assign to the matched pattern. Defaults to #[code ""].
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "has_entity") Matcher.has_entity
+h(2, "get") Matcher.get
+tag method
p Check whether the matcher has an entity.
p
| Retrieve the pattern stored for a key. Returns the rule as an
| #[code (on_match, patterns)] tuple containing the callback and available
| patterns.
+aside-code("Example").
pattern = [{ORTH: 'test'}]
matcher.add('Rule', None, pattern)
(on_match, patterns) = matcher.get('Rule')
assert patterns == [pattern]
+table(["Name", "Type", "Description"])
+row
+cell #[code entity_key]
+cell unicode / int
+cell The entity key to check.
+cell #[code key]
+cell unicode
+cell The ID of the match rule.
+footrow
+cell return
+cell bool
+cell Whether the matcher has the entity.
+cell returns
+cell tuple
+cell The rule, as an #[code (on_match, patterns)] tuple.

View File

@ -0,0 +1,95 @@
//- 💫 DOCS > API > SPACY
include ../../_includes/_mixins
+h(2, "load") spacy.load
+tag function
+tag-model
p
| Load a model via its #[+a("/docs/usage/models#usage") shortcut link],
| the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. The #[code Language] class to initialise will be
| determined based on the model's settings.
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
nlp = spacy.load('en_core_web_sm') # package
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
+infobox("⚠️ Deprecation note")
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
| will also raise an error if no model could be loaded and never just
| return an empty #[code Language] object. If you need a blank language,
| you need to import it explicitly (#[code from spacy.lang.en import English])
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode or #[code Path]
+cell Model to load, i.e. shortcut link, package name or path.
+footrow
+cell returns
+cell #[code Language]
+cell A #[code Language] object with the loaded model.
+h(2, "info") spacy.info
+tag function
p
| The same as the #[+api("cli#info") #[code info] command]. Pretty-print
| information about your installation, models and local setup from within
| spaCy. To get the model meta data as a dictionary instead, you can
| use the #[code meta] attribute on your #[code nlp] object with a
| loaded model, e.g. #[code nlp.meta].
+aside-code("Example").
spacy.info()
spacy.info('en')
spacy.info('de', markdown=True)
+table(["Name", "Type", "Description"])
+row
+cell #[code model]
+cell unicode
+cell A model, i.e. shortcut link, package name or path (optional).
+row
+cell #[code markdown]
+cell bool
+cell Print information as Markdown.
+h(2, "explain") spacy.explain
+tag function
p
| Get a description for a given POS tag, dependency label or entity type.
| For a list of available terms, see
| #[+src(gh("spacy", "spacy/glossary.py")) glossary.py].
+aside-code("Example").
spacy.explain('NORP')
# Nationalities or religious or political groups
doc = nlp(u'Hello world')
for word in doc:
print(word.text, word.tag_, spacy.explain(word.tag_))
# Hello UH interjection
# world NN noun, singular or mass
+table(["Name", "Type", "Description"])
+row
+cell #[code term]
+cell unicode
+cell Term to explain.
+footrow
+cell returns
+cell unicode
+cell The explanation, or #[code None] if not found in the glossary.

View File

@ -2,66 +2,18 @@
include ../../_includes/_mixins
p A slice from a #[code Doc] object.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code start]
+cell int
+cell The token offset for the start of the span.
+row
+cell #[code end]
+cell int
+cell The token offset for the end of the span.
+row
+cell #[code start_char]
+cell int
+cell The character offset for the start of the span.
+row
+cell #[code end_char]
+cell int
+cell The character offset for the end of the span.
+row
+cell #[code label]
+cell int
+cell The span's label.
+row
+cell #[code label_]
+cell unicode
+cell The span's label.
+row
+cell #[code lemma_]
+cell unicode
+cell The span's lemma.
+row
+cell #[code ent_id]
+cell int
+cell The integer ID of the named entity the token is an instance of.
+row
+cell #[code ent_id_]
+cell unicode
+cell The string ID of the named entity the token is an instance of.
p A slice from a #[+api("doc") #[code Doc]] object.
+h(2, "init") Span.__init__
+tag method
p Create a Span object from the #[code slice doc[start : end]].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert [t.text for t in span] == [u'it', u'back', u'!']
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
@ -89,7 +41,7 @@ p Create a Span object from the #[code slice doc[start : end]].
+cell A meaning representation of the span.
+footrow
+cell return
+cell returns
+cell #[code Span]
+cell The newly constructed object.
@ -98,6 +50,11 @@ p Create a Span object from the #[code slice doc[start : end]].
p Get a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert span[1].text == 'back'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
@ -105,12 +62,17 @@ p Get a #[code Token] object.
+cell The index of the token within the span.
+footrow
+cell return
+cell returns
+cell #[code Token]
+cell The token at #[code span[i]].
p Get a #[code Span] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert span[1:3].text == 'back!'
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
@ -118,7 +80,7 @@ p Get a #[code Span] object.
+cell The slice of the span to get.
+footrow
+cell return
+cell returns
+cell #[code Span]
+cell The span at #[code span[start : end]].
@ -127,9 +89,14 @@ p Get a #[code Span] object.
p Iterate over #[code Token] objects.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert [t.text for t in span] == ['it', 'back', '!']
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell yields
+cell #[code Token]
+cell A #[code Token] object.
@ -138,19 +105,33 @@ p Iterate over #[code Token] objects.
p Get the number of tokens in the span.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert len(span) == 3
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell returns
+cell int
+cell The number of tokens in the span.
+h(2, "similarity") Span.similarity
+tag method
+tag-model("vectors")
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+aside-code("Example").
doc = nlp(u'green apples and red oranges')
green_apples = doc[:2]
red_oranges = doc[3:]
apples_oranges = green_apples.similarity(red_oranges)
oranges_apples = red_oranges.similarity(green_apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
@ -160,7 +141,7 @@ p
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
@ -178,87 +159,205 @@ p Retokenize the document, such that the span is merged into a single token.
| are inherited from the syntactic root token of the span.
+footrow
+cell return
+cell returns
+cell #[code Token]
+cell The newly merged token.
+h(2, "text") Span.text
+tag property
p A unicode representation of the span text.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the span.
+h(2, "text_with_ws") Span.text_with_ws
+tag property
p
| The text content of the span with a trailing whitespace character if the
| last token has one.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The text content of the span (with trailing whitespace).
+h(2, "sent") Span.sent
+tag property
p The sentence span that this span is a part of.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code Span]
+cell The sentence this is part of.
+h(2, "root") Span.root
+tag property
+tag-model("parse")
p
| The token within the span that's highest in the parse tree. If there's a
| tie, the earliest is preferred.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
i, like, new, york, in_, autumn, dot = range(len(doc))
assert doc[new].head.text == 'York'
assert doc[york].head.text == 'like'
new_york = doc[new&#58;york+1]
assert new_york.root.text == 'York'
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell returns
+cell #[code Token]
+cell The root token.
+h(2, "lefts") Span.lefts
+tag property
+tag-model("parse")
p Tokens that are to the left of the span, whose head is within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
lefts = [t.text for t in doc[3:7].lefts]
assert lefts == [u'New']
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell yields
+cell #[code Token]
+cell A left-child of a token of the span.
+h(2, "rights") Span.rights
+tag property
+tag-model("parse")
p Tokens that are to the right of the span, whose head is within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
rights = [t.text for t in doc[2:4].rights]
assert rights == [u'in']
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell yields
+cell #[code Token]
+cell A right-child of a token of the span.
+h(2, "subtree") Span.subtree
+tag property
+tag-model("parse")
p Tokens that descend from tokens in the span, but fall outside it.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
subtree = [t.text for t in doc[:3].subtree]
assert subtree == [u'Give', u'it', u'back', u'!']
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell yields
+cell #[code Token]
+cell A descendant of a token within the span.
+h(2, "has_vector") Span.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| object.
+aside-code("Example").
doc = nlp(u'I like apples')
assert doc[1:].has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the span has a vector data attached.
+h(2, "vector") Span.vector
+tag property
+tag-model("vectors")
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+aside-code("Example").
doc = nlp(u'I like apples')
assert doc[1:].vector.dtype == 'float32'
assert doc[1:].vector.shape == (300,)
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the span's semantics.
+h(2, "vector_norm") Span.vector_norm
+tag property
+tag-model("vectors")
p
| The L2 norm of the span's vector representation.
+aside-code("Example").
doc = nlp(u'I like apples')
doc[1:].vector_norm # 4.800883928527915
doc[2:].vector_norm # 6.895897646384268
assert doc[1:].vector_norm != doc[2:].vector_norm
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code sent]
+cell #[code Span]
+cell The sentence span that this span is a part of.
+row
+cell #[code start]
+cell int
+cell The token offset for the start of the span.
+row
+cell #[code end]
+cell int
+cell The token offset for the end of the span.
+row
+cell #[code start_char]
+cell int
+cell The character offset for the start of the span.
+row
+cell #[code end_char]
+cell int
+cell The character offset for the end of the span.
+row
+cell #[code text]
+cell unicode
+cell A unicode representation of the span text.
+row
+cell #[code text_with_ws]
+cell unicode
+cell
| The text content of the span with a trailing whitespace character
| if the last token has one.
+row
+cell #[code label]
+cell int
+cell The span's label.
+row
+cell #[code label_]
+cell unicode
+cell The span's label.
+row
+cell #[code lemma_]
+cell unicode
+cell The span's lemma.
+row
+cell #[code ent_id]
+cell int
+cell The integer ID of the named entity the token is an instance of.
+row
+cell #[code ent_id_]
+cell unicode
+cell The string ID of the named entity the token is an instance of.

View File

@ -7,16 +7,22 @@ p Map strings to and from integer IDs.
+h(2, "init") StringStore.__init__
+tag method
p Create the #[code StringStore].
p
| Create the #[code StringStore]. Note that a newly initialised store will
| always include an empty string #[code ''] at position #[code 0].
+aside-code("Example").
from spacy.strings import StringStore
stringstore = StringStore([u'apple', u'orange'])
+table(["Name", "Type", "Description"])
+row
+cell #[code strings]
+cell -
+cell iterable
+cell A sequence of unicode strings to add to the store.
+footrow
+cell return
+cell returns
+cell #[code StringStore]
+cell The newly constructed object.
@ -25,9 +31,13 @@ p Create the #[code StringStore].
p Get the number of strings in the store.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
assert len(stringstore) == 2
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell returns
+cell int
+cell The number of strings in the store.
@ -36,22 +46,32 @@ p Get the number of strings in the store.
p Retrieve a string from a given integer ID, or vice versa.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
int_id = stringstore[u'apple'] # 1
assert stringstore[int_id] == u'apple'
+table(["Name", "Type", "Description"])
+row
+cell #[code string_or_id]
+cell bytes / unicode / int
+cell bytes, unicode or int
+cell The value to encode.
+footrow
+cell return
+cell unicode / int
+cell The value to retrieved.
+cell returns
+cell unicode or int
+cell The value to be retrieved.
+h(2, "contains") StringStore.__contains__
+tag method
p Check whether a string is in the store.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
assert u'apple' in stringstore
assert u'cherry' not in stringstore
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
@ -59,49 +79,108 @@ p Check whether a string is in the store.
+cell The string to check.
+footrow
+cell return
+cell returns
+cell bool
+cell Whether the store contains the string.
+h(2, "iter") StringStore.__iter__
+tag method
p Iterate over the strings in the store, in order.
p
| Iterate over the strings in the store, in order. Note that a newly
| initialised store will always include an empty string #[code ''] at
| position #[code 0].
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
all_strings = [s for s in stringstore]
assert all_strings == [u'', u'apple', u'orange']
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell yields
+cell unicode
+cell A string in the store.
+h(2, "dump") StringStore.dump
+h(2, "to_disk") StringStore.to_disk
+tag method
p Save the strings to a JSON file.
p Save the current state to a directory.
+aside-code("Example").
stringstore.to_disk('/path/to/strings')
+table(["Name", "Type", "Description"])
+row
+cell #[code file]
+cell buffer
+cell The file to save the strings.
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "load") StringStore.load
+h(2, "from_disk") Tokenizer.from_disk
+tag method
p Load the strings from a JSON file.
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.strings import StringStore
stringstore = StringStore().from_disk('/path/to/strings')
+table(["Name", "Type", "Description"])
+row
+cell #[code file]
+cell buffer
+cell The file from which to load the strings.
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell return
+cell #[code None]
+cell returns
+cell #[code StringStore]
+cell The modified #[code StringStore] object.
+h(2, "to_bytes") StringStore.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
store_bytes = stringstore.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code StringStore] object.
+h(2, "from_bytes") StringStore.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.strings import StringStore
store_bytes = stringstore.to_bytes()
new_store = StringStore().from_bytes(store_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code StringStore]
+cell The #[code StringStore] object.

View File

@ -4,32 +4,6 @@ include ../../_includes/_mixins
p Annotate part-of-speech tags on #[code Doc] objects.
+h(2, "load") Tagger.load
+tag classmethod
p Load the statistical model from the supplied path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary. Must be shared by the documents to be processed.
+row
+cell #[code require]
+cell bool
+cell Whether to raise an error if the files are not found.
+footrow
+cell return
+cell #[code Tagger]
+cell The newly constructed object.
+h(2, "init") Tagger.__init__
+tag method
@ -47,7 +21,7 @@ p Create a #[code Tagger].
+cell The statistical model.
+footrow
+cell return
+cell returns
+cell #[code Tagger]
+cell The newly constructed object.
@ -63,7 +37,7 @@ p Apply the tagger, setting the POS tags onto the #[code Doc] object.
+cell The tokens to be tagged.
+footrow
+cell return
+cell returns
+cell #[code None]
+cell -
@ -91,7 +65,7 @@ p Tag a stream of documents.
| parallel.
+footrow
+cell yield
+cell yields
+cell #[code Doc]
+cell Documents, in order.
@ -112,6 +86,6 @@ p Update the statistical model, with tags supplied for the given document.
+cell Manager for the gold-standard tags.
+footrow
+cell return
+cell returns
+cell int
+cell Number of tags predicted correctly.

View File

@ -4,9 +4,296 @@ include ../../_includes/_mixins
p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+h(2, "init") Token.__init__
+tag method
p Construct a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
assert token.text == u'Give'
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code offset]
+cell int
+cell The index of the token within the document.
+footrow
+cell returns
+cell #[code Token]
+cell The newly constructed object.
+h(2, "len") Token.__len__
+tag method
p The number of unicode characters in the token, i.e. #[code token.text].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
assert len(token) == 4
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of unicode characters in the token.
+h(2, "check_flag") Token.check_flag
+tag method
p Check the value of a boolean flag.
+aside-code("Example").
from spacy.attrs import IS_TITLE
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
assert token.check_flag(IS_TITLE) == True
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to check.
+footrow
+cell returns
+cell bool
+cell Whether the flag is set.
+h(2, "similarity") Token.similarity
+tag method
+tag-model("vectors")
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+aside-code("Example").
apples, _, oranges = nlp(u'apples and oranges')
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "nbor") Token.nbor
+tag method
p Get a neighboring token.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_nbor = doc[0].nbor()
assert give_nbor.text == u'it'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The relative position of the token to get. Defaults to #[code 1].
+footrow
+cell returns
+cell #[code Token]
+cell The token at position #[code self.doc[self.i+i]].
+h(2, "is_ancestor") Token.is_ancestor
+tag method
+tag-model("parse")
p
| Check whether this token is a parent, grandparent, etc. of another
| in the dependency tree.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give = doc[0]
it = doc[1]
assert give.is_ancestor(it)
+table(["Name", "Type", "Description"])
+row
+cell descendant
+cell #[code Token]
+cell Another token.
+footrow
+cell returns
+cell bool
+cell Whether this token is the ancestor of the descendant.
+h(2, "ancestors") Token.ancestors
+tag property
+tag-model("parse")
p A sequence of this token's syntactic ancestors (parents, grandparents, etc.).
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
it_ancestors = doc[1].ancestors
assert [t.text for t in it_ancestors] == [u'Give']
he_ancestors = doc[4].ancestors
assert [t.text for t in he_ancestors] == [u'pleaded']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell
| A sequence of ancestor tokens such that
| #[code ancestor.is_ancestor(self)].
+h(2, "conjuncts") Token.conjuncts
+tag property
+tag-model("parse")
p A sequence of coordinated tokens, including the token itself.
+aside-code("Example").
doc = nlp(u'I like apples and oranges')
apples_conjuncts = doc[2].conjuncts
assert [t.text for t in apples_conjuncts] == [u'oranges']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A coordinated token.
+h(2, "children") Token.children
+tag property
+tag-model("parse")
p A sequence of the token's immediate syntactic children.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_children = doc[0].children
assert [t.text for t in give_children] == [u'it', u'back', u'!']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "subtree") Token.subtree
+tag property
+tag-model("parse")
p A sequence of all the token's syntactic descendants.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_subtree = doc[0].subtree
assert [t.text for t in give_subtree] == [u'Give', u'it', u'back', u'!']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A descendant token such that #[code self.is_ancestor(descendant)].
+h(2, "has_vector") Token.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| token.
+aside-code("Example").
doc = nlp(u'I like apples')
apples = doc[2]
assert apples.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the token has a vector data attached.
+h(2, "vector") Token.vector
+tag property
+tag-model("vectors")
p A real-valued meaning representation.
+aside-code("Example").
doc = nlp(u'I like apples')
apples = doc[2]
assert apples.vector.dtype == 'float32'
assert apples.vector.shape == (300,)
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the token's semantics.
+h(2, "vector_norm") Span.vector_norm
+tag property
+tag-model("vectors")
p The L2 norm of the token's vector representation.
+aside-code("Example").
doc = nlp(u'I like apples and pasta')
apples = doc[2]
pasta = doc[4]
apples.vector_norm # 6.89589786529541
pasta.vector_norm # 7.759851932525635
assert apples.vector_norm != pasta.vector_norm
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code text_with_ws]
+cell unicode
+cell Text content, with trailing space character if present.
+row
+cell #[code whitespace]
+cell int
+cell Trailing space character if present.
+row
+cell #[code whitespace_]
+cell unicode
+cell Trailing space character if present.
+row
+cell #[code vocab]
+cell #[code Vocab]
@ -17,14 +304,31 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code head]
+cell #[code Token]
+cell The syntactic parent, or "governor", of this token.
+row
+cell #[code left_edge]
+cell #[code Token]
+cell The leftmost token of this token's syntactic descendants.
+row
+cell #[code right_edge]
+cell #[code Token]
+cell The rightmost token of this token's syntactic descendants.
+row
+cell #[code i]
+cell int
+cell The index of the token within the parent document.
+row
+cell #[code ent_type]
+cell int
+cell Named entity type.
+row
+cell #[code ent_type_]
+cell unicode
@ -42,19 +346,23 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+cell unicode
+cell
| IOB code of named entity tag. #[code "B"]
| means the token begins an entity, #[code "I"] means it inside an
| entity, #[code "O"] means it is outside an entity, and
| means the token begins an entity, #[code "I"] means it is inside
| an entity, #[code "O"] means it is outside an entity, and
| #[code ""] means no entity tag is set.
+row
+cell #[code ent_id]
+cell int
+cell ID of the entity the token is an instance of, if any.
+cell
| ID of the entity the token is an instance of, if any. Usually
| assigned by patterns in the Matcher.
+row
+cell #[code ent_id_]
+cell unicode
+cell ID of the entity the token is an instance of, if any.
+cell
| ID of the entity the token is an instance of, if any. Usually
| assigned by patterns in the Matcher.
+row
+cell #[code lemma]
@ -229,232 +537,3 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+cell #[code lex_id]
+cell int
+cell ID of the token's lexical type.
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code text_with_ws]
+cell unicode
+cell Text content, with trailing space character if present.
+row
+cell #[code whitespace]
+cell int
+cell Trailing space character if present.
+row
+cell #[code whitespace_]
+cell unicode
+cell Trailing space character if present.
+h(2, "init") Token.__init__
+tag method
p Construct a #[code Token] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code offset]
+cell int
+cell The index of the token within the document.
+footrow
+cell return
+cell #[code Token]
+cell The newly constructed object.
+h(2, "len") Token.__len__
+tag method
p Get the number of unicode characters in the token.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of unicode characters in the token.
+h(2, "check_flag") Token.check_flag
+tag method
p Check the value of a boolean flag.
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to check.
+footrow
+cell return
+cell bool
+cell Whether the flag is set.
+h(2, "nbor") Token.nbor
+tag method
p Get a neighboring token.
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The relative position of the token to get. Defaults to #[code 1].
+footrow
+cell return
+cell #[code Token]
+cell The token at position #[code self.doc[self.i+i]]
+h(2, "similarity") Token.similarity
+tag method
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "is_ancestor") Token.is_ancestor
+tag method
p
| Check whether this token is a parent, grandparent, etc. of another
| in the dependency tree.
+table(["Name", "Type", "Description"])
+row
+cell descendant
+cell #[code Token]
+cell Another token.
+footrow
+cell return
+cell bool
+cell Whether this token is the ancestor of the descendant.
+h(2, "vector") Token.vector
+tag property
p A real-valued meaning representation.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the token's semantics.
+h(2, "has_vector") Token.has_vector
+tag property
p
| A boolean value indicating whether a word vector is associated with the
| object.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether the token has a vector data attached.
+h(2, "head") Token.head
+tag property
p The syntactic parent, or "governor", of this token.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code Token]
+cell The head.
+h(2, "conjuncts") Token.conjuncts
+tag property
p A sequence of coordinated tokens, including the token itself.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A coordinated token.
+h(2, "children") Token.children
+tag property
p A sequence of the token's immediate syntactic children.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "subtree") Token.subtree
+tag property
p A sequence of all the token's syntactic descendents.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A descendant token such that #[code self.is_ancestor(descendant)].
+h(2, "left_edge") Token.left_edge
+tag property
p The leftmost token of this token's syntactic descendants.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code Token]
+cell The first token such that #[code self.is_ancestor(token)].
+h(2, "right_edge") Token.right_edge
+tag property
p The rightmost token of this token's syntactic descendents.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code Token]
+cell The last token such that #[code self.is_ancestor(token)].
+h(2, "ancestors") Token.ancestors
+tag property
p The rightmost token of this token's syntactic descendants.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell
| A sequence of ancestor tokens such that
| #[code ancestor.is_ancestor(self)].

View File

@ -6,6 +6,283 @@ p
| Segment text, and create #[code Doc] objects with the discovered segment
| boundaries.
+h(2, "init") Tokenizer.__init__
+tag method
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
+aside-code("Example").
# Construction 1
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
# Construction 2
from spacy.lang.en import English
tokenizer = English().Defaults.create_tokenizer(nlp)
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code rules]
+cell dict
+cell Exceptions and special-cases for the tokenizer.
+row
+cell #[code prefix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match prefixes.
+row
+cell #[code suffix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match suffixes.
+row
+cell #[code infix_finditer]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).finditer] to find infixes.
+row
+cell #[code token_match]
+cell callable
+cell A boolean function matching strings to be recognised as tokens.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The newly constructed object.
+h(2, "call") Tokenizer.__call__
+tag method
p Tokenize a string.
+aside-code("Example").
tokens = tokenizer(u'This is a sentence')
assert len(tokens) == 4
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to tokenize.
+footrow
+cell returns
+cell #[code Doc]
+cell A container for linguistic annotations.
+h(2, "pipe") Tokenizer.pipe
+tag method
p Tokenize a stream of texts.
+aside-code("Example").
texts = [u'One document.', u'...', u'Lots of documents']
for doc in tokenizer.pipe(texts, batch_size=50):
pass
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode texts.
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to accumulate in an internal buffer.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of threads to use, if the implementation supports
| multi-threading. The default tokenizer is single-threaded.
+footrow
+cell yields
+cell #[code Doc]
+cell A sequence of Doc objects, in order.
+h(2, "find_infix") Tokenizer.find_infix
+tag method
p Find internal split points of the string.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to split.
+footrow
+cell returns
+cell list
+cell
| A list of #[code re.MatchObject] objects that have #[code .start()]
| and #[code .end()] methods, denoting the placement of internal
| segment separators, e.g. hyphens.
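p
| A small sketch; the exact matches depend on the language's infix rules:
+aside-code("Example").
matches = nlp.tokenizer.find_infix(u'well-known')
# e.g. a single match object covering the hyphen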
+h(2, "find_prefix") Tokenizer.find_prefix
+tag method
p
| Find the length of a prefix that should be segmented from the string, or
| #[code None] if no prefix rules match.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell returns
+cell int / #[code None]
+cell The length of the prefix if present, otherwise #[code None].
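p
| A small sketch; the result depends on the language's prefix rules:
+aside-code("Example").
length = nlp.tokenizer.find_prefix(u'"Hello')
# e.g. 1 if the quotation mark is a registered prefix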
+h(2, "find_suffix") Tokenizer.find_suffix
+tag method
p
| Find the length of a suffix that should be segmented from the string, or
| #[code None] if no suffix rules match.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell returns
+cell int / #[code None]
+cell The length of the suffix if present, otherwise #[code None].
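p
| A small sketch; the result depends on the language's suffix rules:
+aside-code("Example").
length = nlp.tokenizer.find_suffix(u'Hello!')
# e.g. 1 if the exclamation mark is a registered suffix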
+h(2, "add_special_case") Tokenizer.add_special_case
+tag method
p
| Add a special-case tokenization rule. This mechanism is also used to add
| custom tokenizer exceptions to the language data. See the usage workflow
| on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
| for more details and examples.
+aside-code("Example").
from spacy.attrs import ORTH, LEMMA
case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]
tokenizer.add_special_case(u"don't", case)
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to specially tokenize.
+row
+cell #[code token_attrs]
+cell iterable
+cell
| A sequence of dicts, where each dict describes a token and its
| attributes. The #[code ORTH] fields of the attributes must
| exactly match the string when they are concatenated.
+h(2, "to_disk") Tokenizer.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
tokenizer.to_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Tokenizer.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
tokenizer = tokenizer.from_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The modified #[code Tokenizer] object.
+h(2, "to_bytes") Tokenizer.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
tokenizer_bytes = tokenizer.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Tokenizer] object.
+h(2, "from_bytes") Tokenizer.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer_bytes = tokenizer.to_bytes()
new_tokenizer = Tokenizer(nlp.vocab)
new_tokenizer.from_bytes(tokenizer_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The #[code Tokenizer] object.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
@ -35,215 +312,3 @@ p
| A function to find internal segment separators, e.g. hyphens.
| Returns a (possibly empty) list of #[code re.MatchObject]
| objects.
+h(2, "load") Tokenizer.load
+tag classmethod
p Load a #[code Tokenizer], reading unsupplied components from the path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code rules]
+cell dict
+cell Exceptions and special-cases for the tokenizer.
+row
+cell #[code prefix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match prefixes.
+row
+cell #[code suffix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match suffixes.
+row
+cell #[code infix_finditer]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).finditer] to find infixes.
+footrow
+cell return
+cell #[code Tokenizer]
+cell The newly constructed object.
+h(2, "init") Tokenizer.__init__
+tag method
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code rules]
+cell dict
+cell Exceptions and special-cases for the tokenizer.
+row
+cell #[code prefix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match prefixes.
+row
+cell #[code suffix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match suffixes.
+row
+cell #[code infix_finditer]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).finditer] to find infixes.
+footrow
+cell return
+cell #[code Tokenizer]
+cell The newly constructed object.
+h(2, "call") Tokenizer.__call__
+tag method
p Tokenize a string.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to tokenize.
+footrow
+cell return
+cell #[code Doc]
+cell A container for linguistic annotations.
+h(2, "pipe") Tokenizer.pipe
+tag method
p Tokenize a stream of texts.
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode texts.
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to accumulate in an internal buffer.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of threads to use, if the implementation supports
| multi-threading. The default tokenizer is single-threaded.
+footrow
+cell yield
+cell #[code Doc]
+cell A sequence of Doc objects, in order.
+h(2, "find_infix") Tokenizer.find_infix
+tag method
p Find internal split points of the string.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to split.
+footrow
+cell return
+cell #[code List[re.MatchObject]]
+cell
| A list of objects that have #[code .start()] and #[code .end()]
| methods, denoting the placement of internal segment separators,
| e.g. hyphens.
+h(2, "find_prefix") Tokenizer.find_prefix
+tag method
p
| Find the length of a prefix that should be segmented from the string, or
| #[code None] if no prefix rules match.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell return
+cell int / #[code None]
+cell The length of the prefix if present, otherwise #[code None].
+h(2, "find_suffix") Tokenizer.find_suffix
+tag method
p
| Find the length of a suffix that should be segmented from the string, or
| #[code None] if no suffix rules match.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell return
+cell int / #[code None]
+cell The length of the suffix if present, otherwise #[code None].
+h(2, "add_special_case") Tokenizer.add_special_case
+tag method
p Add a special-case tokenization rule.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to specially tokenize.
+row
+cell #[code token_attrs]
+cell -
+cell
| A sequence of dicts, where each dict describes a token and its
| attributes. The #[code ORTH] fields of the attributes must
| exactly match the string when they are concatenated.
+footrow
+cell return
+cell #[code None]
+cell -
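p
| A sketch of a special case for the contraction "don't", using the
| #[code ORTH] and #[code LEMMA] attribute IDs from #[code spacy.attrs]. Note
| how the #[code ORTH] values concatenate back to the original string:
+code.
from spacy.attrs import ORTH, LEMMA
tokenizer.add_special_case(u"don't",
    [{ORTH: u"do"}, {ORTH: u"n't", LEMMA: u"not"}])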

View File

@ -14,7 +14,7 @@ p
| recommend having additional tests in place if your application depends on
| any of spaCy's utilities.
+h(2, "get_data_path") get_data_path
+h(2, "get_data_path") util.get_data_path
+tag function
p
@ -28,11 +28,11 @@ p
+cell Only return path if it exists, otherwise return #[code None].
+footrow
+cell return
+cell returns
+cell #[code Path] / #[code None]
+cell Data path or #[code None].
+h(2, "set_data_path") set_data_path
+h(2, "set_data_path") util.set_data_path
+tag function
p
@ -49,7 +49,7 @@ p
+cell unicode or #[code Path]
+cell Path to new data directory.
+h(2, "get_lang_class") get_lang_class
+h(2, "get_lang_class") util.get_lang_class
+tag function
p
@ -70,11 +70,11 @@ p
+cell Two-letter language code, e.g. #[code 'en'].
+footrow
+cell return
+cell returns
+cell #[code Language]
+cell Language class.
+h(2, "resolve_model_path") resolve_model_path
+h(2, "resolve_model_path") util.resolve_model_path
+tag function
p Resolve a model name or string to a model path.
@ -90,11 +90,11 @@ p Resolve a model name or string to a model path.
+cell Package name, shortcut link or model path.
+footrow
+cell return
+cell returns
+cell #[code Path]
+cell Path to model data directory.
+h(2, "is_package") is_package
+h(2, "is_package") util.is_package
+tag function
p
@ -112,11 +112,11 @@ p
+cell Name of package.
+footrow
+cell return
+cell returns
+cell #[code bool]
+cell #[code True] if installed package, #[code False] if not.
+h(2, "get_model_package_path") get_model_package_path
+h(2, "get_model_package_path") util.get_model_package_path
+tag function
p
@ -134,11 +134,11 @@ p
+cell Name of installed package.
+footrow
+cell return
+cell returns
+cell #[code Path]
+cell Path to model data directory.
+h(2, "parse_package_meta") parse_package_meta
+h(2, "parse_package_meta") util.parse_package_meta
+tag function
p
@ -163,11 +163,31 @@ p
+cell If #[code True], raise error if no #[code meta.json] is found.
+footrow
+cell return
+cell returns
+cell dict / #[code None]
+cell Model meta data or #[code None].
+h(2, "update_exc") update_exc
+h(2, "is_in_jupyter") util.is_in_jupyter
+tag function
p
| Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter]
| notebook by detecting the IPython kernel. Mainly used for the
| #[+api("displacy") #[code displacy]] visualizer.
+aside-code("Example").
html = '&lt;h1&gt;Hello world!&lt;/h1&gt;'
if util.is_in_jupyter():
from IPython.core.display import display, HTML
return display(HTML(html))
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell #[code True] if in Jupyter, #[code False] if not.
+h(2, "update_exc") util.update_exc
+tag function
p
@ -194,12 +214,12 @@ p
+cell Exception dictionaries to add to the base exceptions, in order.
+footrow
+cell return
+cell returns
+cell dict
+cell Combined tokenizer exceptions.
+h(2, "prints") prints
+h(2, "prints") util.prints
+tag function
p

View File

@ -7,59 +7,6 @@ p
| #[code Vocab] instance also provides access to the #[code StringStore],
| and owns underlying C-data that is shared between #[code Doc] objects.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code strings]
+cell #[code StringStore]
+cell A table managing the string-to-int mapping.
+row
+cell #[code vectors_length]
+cell int
+cell The dimensionality of the word vectors, if present.
+h(2, "load") Vocab.load
+tag classmethod
p Load the vocabulary from a path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code lex_attr_getters]
+cell dict
+cell
| A dictionary mapping attribute IDs to functions to compute them.
| Defaults to #[code None].
+row
+cell #[code lemmatizer]
+cell -
+cell A lemmatizer. Defaults to #[code None].
+row
+cell #[code tag_map]
+cell dict
+cell
| A dictionary mapping fine-grained tags to coarse-grained
| parts-of-speech, and optionally morphological attributes.
+row
+cell #[code oov_prob]
+cell float
+cell The default probability for out-of-vocabulary words.
+footrow
+cell return
+cell #[code Vocab]
+cell The newly constructed object.
+h(2, "init") Vocab.__init__
+tag method
@ -73,11 +20,6 @@ p Create the vocabulary.
| A dictionary mapping attribute IDs to functions to compute them.
| Defaults to #[code None].
+row
+cell #[code lemmatizer]
+cell -
+cell A lemmatizer. Defaults to #[code None].
+row
+cell #[code tag_map]
+cell dict
@ -86,23 +28,34 @@ p Create the vocabulary.
| parts-of-speech, and optionally morphological attributes.
+row
+cell #[code oov_prob]
+cell float
+cell The default probability for out-of-vocabulary words.
+cell #[code lemmatizer]
+cell object
+cell A lemmatizer. Defaults to #[code None].
+row
+cell #[code strings]
+cell #[code StringStore]
+cell
| A #[code StringStore] that maps strings to integers, and vice
| versa.
+footrow
+cell return
+cell returns
+cell #[code Vocab]
+cell The newly constructed object.
+h(2, "len") Vocab.__len__
+tag method
p Get the number of lexemes in the vocabulary.
p Get the current number of lexemes in the vocabulary.
+aside-code("Example").
doc = nlp(u'This is a sentence.')
assert len(nlp.vocab) > 0
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell returns
+cell int
+cell The number of lexemes in the vocabulary.
@ -113,6 +66,10 @@ p
| Retrieve a lexeme, given an int ID or a unicode string. If a previously
| unseen unicode string is given, a new lexeme is created and stored.
+aside-code("Example").
apple = nlp.vocab.strings['apple']
assert nlp.vocab[apple] == nlp.vocab[u'apple']
+table(["Name", "Type", "Description"])
+row
+cell #[code id_or_string]
@ -120,25 +77,37 @@ p
+cell The integer ID of a word, or its unicode string.
+footrow
+cell return
+cell returns
+cell #[code Lexeme]
+cell The lexeme indicated by the given ID.
+h(2, "iter") Span.__iter__
+h(2, "iter") Vocab.__iter__
+tag method
p Iterate over the lexemes in the vocabulary.
+aside-code("Example").
stop_words = (lex for lex in nlp.vocab if lex.is_stop)
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell yields
+cell #[code Lexeme]
+cell An entry in the vocabulary.
+h(2, "contains") Vocab.__contains__
+tag method
p Check whether the string has an entry in the vocabulary.
p
| Check whether the string has an entry in the vocabulary. To get the ID
| for a given string, you need to look it up in
| #[+api("vocab#attributes") #[code vocab.strings]].
+aside-code("Example").
apple = nlp.vocab.strings['apple']
oov = nlp.vocab.strings['dskfodkfos']
assert apple in nlp.vocab
assert oov not in nlp.vocab
+table(["Name", "Type", "Description"])
+row
@ -147,32 +116,27 @@ p Check whether the string has an entry in the vocabulary.
+cell The ID string.
+footrow
+cell return
+cell returns
+cell bool
+cell Whether the string has an entry in the vocabulary.
+h(2, "resize_vectors") Vocab.resize_vectors
+tag method
p
| Set #[code vectors_length] to a new size, and allocate more memory for
| the #[code Lexeme] vectors if necessary. The memory will be zeroed.
+table(["Name", "Type", "Description"])
+row
+cell #[code new_size]
+cell int
+cell The new size of the vectors.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "add_flag") Vocab.add_flag
+tag method
p Set a new boolean flag to words in the vocabulary.
p
| Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
| function will be called over the words currently in the vocab, and then
| applied to new words as they occur. You'll then be able to access the flag
| value on each token, using #[code token.check_flag(flag_id)].
+aside-code("Example").
def is_my_product(text):
products = [u'spaCy', u'Thinc', u'displaCy']
return text in products
MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
doc = nlp(u'I like spaCy')
assert doc[2].check_flag(MY_PRODUCT) == True
+table(["Name", "Type", "Description"])
+row
@ -189,90 +153,104 @@ p Set a new boolean flag to words in the vocabulary.
| available bit will be chosen.
+footrow
+cell return
+cell returns
+cell int
+cell The integer ID by which the flag value can be checked.
+h(2, "dump") Vocab.dump
+h(2, "to_disk") Vocab.to_disk
+tag method
p Save the lexemes binary data to the given location.
p Save the current state to a directory.
+aside-code("Example").
nlp.vocab.to_disk('/path/to/vocab')
+table(["Name", "Type", "Description"])
+row
+cell #[code loc]
+cell #[code Path]
+cell The path to load from.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "load_lexemes") Vocab.load_lexemes
+tag method
p
+table(["Name", "Type", "Description"])
+row
+cell #[code loc]
+cell unicode
+cell Path to load the lexemes.bin file from.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "dump_vectors") Vocab.dump_vectors
+tag method
p Save the word vectors to a binary file.
+table(["Name", "Type", "Description"])
+row
+cell #[code loc]
+cell #[code Path]
+cell The path to save to.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "load_vectors") Vocab.load_vectors
+tag method
p Load vectors from a text-based file.
+table(["Name", "Type", "Description"])
+row
+cell #[code file_]
+cell buffer
+cell #[code path]
+cell unicode or #[code Path]
+cell
| The file to read from. Entries should be separated by newlines,
| and each entry should be whitespace delimited. The first value
| of the entry should be the word string, and subsequent entries
| should be the values of the vector.
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+footrow
+cell return
+cell int
+cell The length of the vectors loaded.
+h(2, "load_vectors_from_bin_loc") Vocab.load_vectors_from_bin_loc
+h(2, "from_disk") Vocab.from_disk
+tag method
p Load vectors from the location of a binary file.
p Load state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.vocab import Vocab
vocab = Vocab().from_disk('/path/to/vocab')
+table(["Name", "Type", "Description"])
+row
+cell #[code loc]
+cell unicode
+cell The path of the binary file to load from.
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell return
+cell int
+cell The length of the vectors loaded.
+cell returns
+cell #[code Vocab]
+cell The modified #[code Vocab] object.
+h(2, "to_bytes") Vocab.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
vocab_bytes = nlp.vocab.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Vocab] object.
+h(2, "from_bytes") Vocab.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.vocab import Vocab
vocab_bytes = nlp.vocab.to_bytes()
vocab = Vocab()
vocab.from_bytes(vocab_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Vocab]
+cell The #[code Vocab] object.
+h(2, "attributes") Attributes
+aside-code("Example").
apple_id = nlp.vocab.strings['apple']
assert type(apple_id) == int
PERSON = nlp.vocab.strings['PERSON']
assert type(PERSON) == int
+table(["Name", "Type", "Description"])
+row
+cell #[code strings]
+cell #[code StringStore]
+cell A table managing the string-to-int mapping.

View File

@ -56,20 +56,22 @@ p
from ...attrs import LANG
from ...util import update_exc
# create Defaults class in the module scope (necessary for pickling!)
class XxxxxDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
# optional: replace flags with custom functions, e.g. like_num()
lex_attr_getters.update(LEX_ATTRS)
# merge base exceptions and custom tokenizer exceptions
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
# create actual Language class
class Xxxxx(Language):
lang = 'xx' # language ISO code
# override defaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
# optional: replace flags with custom functions, e.g. like_num()
lex_attr_getters.update(LEX_ATTRS)
# merge base exceptions and custom tokenizer exceptions
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
Defaults = XxxxxDefaults # override defaults
# set default export; this allows the language class to be lazy-loaded
__all__ = ['Xxxxx']

View File

@ -141,11 +141,11 @@ p
include ../api/_annotation/_named-entities
+aside("Install")
| The #[+api("load") spacy.load()] function configures a pipeline that
| The #[+api("load") #[code spacy.load()]] function configures a pipeline that
| includes all of the available annotators for the given ID. In the example
| above, the #[code 'en'] ID tells spaCy to load the default English
| pipeline. If you have installed the data with
| #[code python -m spacy.en.download] this will include the entity
| #[code python -m spacy download en], this will include the entity
| recognition model.
+h(2, "updating") Training and updating

View File

@ -4,58 +4,190 @@ include ../../_includes/_mixins
p
| spaCy features a rule-matching engine that operates over tokens, similar
| to regular expressions. The rules can refer to token annotations and
| flags, and matches support callbacks to accept, modify and/or act on the
| match. The rule matcher also allows you to associate patterns with
| entity IDs, to allow some basic entity linking or disambiguation.
| to regular expressions. The rules can refer to token annotations (e.g.
| the token #[code text] or #[code tag_]), and flags (e.g. #[code IS_PUNCT]).
| The rule matcher also lets you pass in a custom callback
| to act on matches; for example, to merge entities and apply custom labels.
| You can also associate patterns with entity IDs, to allow some basic
| entity linking or disambiguation.
p Here's a minimal example. We first add a pattern that specifies three tokens:
+aside("What about \"real\" regular expressions?")
+list("numbers")
+item A token whose lower-case form matches "hello"
+item A token whose #[code is_punct] flag is set to #[code True]
+item A token whose lower-case form matches "world"
+h(2, "adding-patterns") Adding patterns
p
| Once we've added the pattern, we can use the #[code matcher] as a
| callable, to receive a list of #[code (ent_id, start, end)] tuples.
| Note that #[code LOWER] and #[code IS_PUNCT] are data attributes
| of #[code spacy.attrs].
| Let's say we want to enable spaCy to find a combination of three tokens:
+list("numbers")
+item
| A token whose #[strong lower-case form matches "hello"], e.g. "Hello"
| or "HELLO".
+item
| A token whose #[strong #[code is_punct] flag is set to #[code True]],
| i.e. any punctuation.
+item
| A token whose #[strong lower-case form matches "world"], e.g. "World"
| or "WORLD".
+code.
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}])
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}]
doc = nlp(u'Hello, world!')
p
| First, we initialise the #[code Matcher] with a vocab. The matcher must
| always share the same vocab with the documents it will operate on. We
| can now call #[+api("matcher#add") #[code matcher.add()]] with an ID and
| our custom pattern. The second argument lets you pass in an optional
| callback function to invoke on a successful match. For now, we set it
| to #[code None].
+code.
import spacy
from spacy.matcher import Matcher
from spacy.attrs import LOWER, IS_PUNCT # don't forget to import the attrs!
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
# add match ID "HelloWorld" with no callback and one pattern
matcher.add('HelloWorld', None,
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}])
doc = nlp(u'Hello, world! Hello world!')
matches = matcher(doc)
p
| The returned matches include the ID, to let you associate the matches
| with the patterns. You can also group multiple patterns together, which
| is useful when you have a knowledge base of entities you want to match,
| and you want to write multiple patterns for each entity.
+h(2, "entities-patterns") Entities and patterns
| The matcher returns a list of #[code (match_id, start, end)] tuples; in
| this case, #[code [('HelloWorld', 0, 3)]], which maps to the span
| #[code doc[0:3]] of our original document. Optionally, we could also
| choose to add more than one pattern, for example to also match sequences
| without punctuation between "hello" and "world":
+code.
matcher.add_entity(
"GoogleNow", # Entity ID -- Helps you act on the match.
{"ent_type": "PRODUCT", "wiki_en": "Google_Now"}, # Arbitrary attributes (optional)
)
matcher.add('HelloWorld', None,
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
[{LOWER: 'hello'}, {LOWER: 'world'}])
matcher.add_pattern(
"GoogleNow", # Entity ID -- Created if doesn't exist.
[ # The pattern is a list of *Token Specifiers*.
{ # This Token Specifier matches tokens whose orth field is "Google"
ORTH: "Google"
},
{ # This Token Specifier matches tokens whose orth field is "Now"
ORTH: "Now"
}
],
label=None # Can associate a label to the pattern-match, to handle it better.
)
p
| By default, the matcher will only return the matches and
| #[strong not do anything else], like merge entities or assign labels.
| This is all up to you and can be defined individually for each pattern,
| by passing in a callback function as the #[code on_match] argument on
| #[code add()]. This is useful, because it lets you write entirely custom
| and #[strong pattern-specific logic]. For example, you might want to
| merge #[em some] patterns into one token, while adding entity labels for
| other pattern types. You shouldn't have to create different matchers for
| each of those processes.
+h(2, "on_match") Adding #[code on_match] rules
p
| To move on to a more realistic example, let's say you're working with a
| large corpus of blog articles, and you want to match all mentions of
| "Google I/O" (which spaCy tokenizes as #[code ['Google', 'I', '/', 'O']]).
| To be safe, you only match on the uppercase versions, in case someone has
| written it as "Google i/o". You also add a second pattern with an added
| #[code {IS_DIGIT: True}] token; this will make sure you also match on
| "Google I/O 2017". If your pattern matches, spaCy should execute your
| custom callback function #[code add_event_ent].
+code.
import spacy
from spacy.matcher import Matcher
from spacy.attrs import ORTH, UPPER, LOWER, IS_DIGIT
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
# Get the ID of the 'EVENT' entity type. This is required to set an entity.
EVENT = nlp.vocab.strings['EVENT']
def add_event_ent(matcher, doc, i, matches):
    # Get the current match and create tuple of entity label, start and end.
    # Append entity to the doc's entities. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    doc.ents += ((EVENT, start, end),)
matcher.add('GoogleIO', add_event_ent,
    [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}],
    [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}, {IS_DIGIT: True}])
p
| In addition to mentions of "Google I/O", your data also contains some
| annoying pre-processing artefacts, like leftover HTML line breaks
| (e.g. #[code &lt;br&gt;] or #[code &lt;BR/&gt;]). While you're at it,
| you want to merge those into one token and flag them, to make sure you
| can easily ignore them later. So you add a second pattern and pass in a
| function #[code merge_and_flag]:
+code.
# Add a new custom flag to the vocab, which is always False by default.
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
def merge_and_flag(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start : end]
    span.merge(is_stop=True) # merge (and mark it as a stop word, just in case)
    span.set_flag(BAD_HTML_FLAG, True) # set BAD_HTML_FLAG
matcher.add('BAD_HTML', merge_and_flag,
    [{ORTH: '&lt;'}, {LOWER: 'br'}, {ORTH: '&gt;'}],
    [{ORTH: '&lt;'}, {LOWER: 'br/'}, {ORTH: '&gt;'}])
+aside("Tip: Visualizing matches")
| When working with entities, you can use #[+api("displacy") displaCy]
| to quickly generate a NER visualization from your updated #[code Doc],
| which can be exported as an HTML file:
+code.o-no-block.
from spacy import displacy
html = displacy.render(doc, style='ent', page=True,
options={'ents': ['EVENT']})
| For more info and examples, see the usage workflow on
| #[+a("/docs/usage/visualizers") visualizing spaCy].
p
| We can now call the matcher on our documents. The patterns will be
| matched in the order they occur in the text.
+code.
doc = nlp(LOTS_OF_TEXT)
matcher(doc)
+h(3, "on_match-callback") The callback function
p
| The matcher will first collect all matches over the document. It will
| then iterate over the matches, look up the callback for the match ID
| that was matched, and invoke it. When the callback is invoked, it is
| passed four arguments: the matcher itself, the document, the position of
| the current match, and the total list of matches. This allows you to
| write callbacks that consider the entire set of matched phrases, so that
| you can resolve overlaps and other conflicts in whatever way you prefer.
+table(["Argument", "Type", "Description"])
+row
+cell #[code matcher]
+cell #[code Matcher]
+cell The matcher instance.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The document the matcher was used on.
+row
+cell #[code i]
+cell int
+cell Index of the current match (#[code matches[i]]).
+row
+cell #[code matches]
+cell list
+cell
| A list of #[code (match_id, start, end)] tuples, describing the
| matches. A match tuple describes a span #[code doc[start:end]].
| The #[code match_id] is the ID of the added match pattern.
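p
| A minimal callback sketch using this signature is shown below. The names
| #[code collect_match] and #[code matched_texts] are just placeholders; the
| function could then be registered by passing it as the second argument to
| #[code matcher.add()].
+code.
matched_texts = []
def collect_match(matcher, doc, i, matches):
    # matches[i] is the (match_id, start, end) tuple for the current match
    match_id, start, end = matches[i]
    matched_texts.append(doc[start:end].text)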
+h(2, "quantifiers") Using quantifiers
@ -82,78 +214,4 @@ p
p
| There are no nested or scoped quantifiers. You can build those
| behaviours with acceptors and
| #[+api("matcher#add_entity") #[code on_match]] callbacks.
+h(2, "acceptor-functions") Acceptor functions
p
| The #[code acceptor] keyword of #[code matcher.add_entity()] allows you to
| pass a function to reject or modify matches. The function you pass should
| take five arguments: #[code doc], #[code ent_id], #[code label], #[code start],
| and #[code end]. You can return a falsey value to reject the match, or
| return a 4-tuple #[code (ent_id, label, start, end)].
+code.
from spacy.tokens.doc import Doc
def trim_title(doc, ent_id, label, start, end):
if doc[start].check_flag(IS_TITLE_TERM):
return (ent_id, label, start+1, end)
else:
return (ent_id, label, start, end)
titles = set(title.lower() for title in [u'Mr.', 'Dr.', 'Ms.', u'Admiral'])
IS_TITLE_TERM = matcher.vocab.add_flag(lambda string: string.lower() in titles)
matcher.add_entity('PersonName', acceptor=trim_title)
matcher.add_pattern('PersonName', [{LOWER: 'mr.'}, {LOWER: 'cruise'}])
matcher.add_pattern('PersonName', [{LOWER: 'dr.'}, {LOWER: 'seuss'}])
doc = Doc(matcher.vocab, words=[u'Mr.', u'Cruise', u'likes', 'Dr.', u'Seuss'])
for ent_id, label, start, end in matcher(doc):
print(doc[start:end].text)
# Cruise
# Seuss
p
| Passing an #[code acceptor] function allows you to match patterns with
| arbitrary logic that can't easily be expressed by a finite-state machine.
| You can look at the entirety of the
| matched phrase, and its context in the document, and decide to move
| the boundaries or reject the match entirely.
+h(2, "callback-functions") Callback functions
p
| In spaCy &lt;1.0, the #[code Matcher] automatically tagged matched phrases
| with entity types. Since spaCy 1.0, the matcher no longer acts on matches
| automatically. By default, the match list is returned for the user to action.
| However, it's often more convenient to register the required actions as a
| callback. You can do this by passing a function to the #[code on_match]
| keyword argument of #[code matcher.add_entity].
+aside-code("Example").
def merge_phrases(matcher, doc, i, matches):
'''
Merge a phrase. We have to be careful here because we'll change the token indices.
To avoid problems, merge all the phrases once we're called on the last match.
'''
if i != len(matches)-1:
return None
# Get Span objects
spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
for ent_id, label, span in spans:
span.merge(label=label, tag='NNP' if label else span.root.tag_)
matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded'])
matcher(doc)
print([w.text for w in doc])
# [u'Google Now', u'is', u'being', u'rebranded']
p
| The matcher will first collect all matches over the document. It will
| then iterate over the matches, look-up the callback for the entity ID
| that was matched, and invoke it. When the callback is invoked, it is
| passed four arguments: the matcher itself, the document, the position of
| the current match, and the total list of matches. This allows you to
| write callbacks that consider the entire set of matched phrases, so that
| you can resolve overlaps and other conflicts in whatever way you prefer.
| behaviours with #[code on_match] callbacks.

View File

@ -2,9 +2,218 @@
include ../../_includes/_mixins
p
| We also re-wrote a large part of the documentation and usage workflows,
| and added more examples.
+h(2, "features") New features
+h(3, "features-displacy") displaCy visualizer with Jupyter support
+aside-code("Example").
from spacy import displacy
doc = nlp(u'This is a sentence about Facebook.')
displacy.serve(doc, style='dep') # run the web server
html = displacy.render(doc, style='ent') # generate HTML
p
| Our popular dependency and named entity visualizers are now an official
| part of the spaCy library! displaCy can run a simple web server, or
| generate raw HTML markup or SVG files to be exported. You can pass in one
| or more docs, and customise the style. displaCy also auto-detects whether
| you're running #[+a("https://jupyter.org") Jupyter] and will render the
| visualizations in your notebook.
+infobox
| #[strong API:] #[+api("displacy") #[code displacy]]
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy]
+h(3, "features-loading") Loading
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
nlp = spacy.load('en_core_web_sm') # package
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
p
| The improved #[code spacy.load] makes loading models easier and more
| transparent. You can load a model by supplying its
| #[+a("/docs/usage/models#usage") shortcut link], the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. The #[code path] keyword argument is now deprecated.
p
| The #[code Language] class to initialise will be determined based on the
| model's settings. If no model is found, spaCy will let you know and won't
| just return an empty #[code Language] object anymore. If you want a blank
| language, you can always import the class directly, e.g.
| #[code from spacy.lang.en import English].
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-language") Improved language data and processing pipelines
+aside-code("Example").
from spacy.language import Language
nlp = Language(pipeline=['token_vectors', 'tags',
'dependencies'])
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
+h(3, "features-lemmatizer") Simple lookup-based lemmatization
+aside-code("Example").
LOOKUP = {
"aba": "abar",
"ababa": "abar",
"ababais": "abar",
"ababan": "abar",
"ababanes": "ababán"
}
p
| spaCy now supports simple lookup-based lemmatization. The data is stored
| in a dictionary mapping a string to its lemma. To determine a token's
| lemma, spaCy simply looks it up in the table. The lookup lemmatizer can
| be imported from #[code spacy.lemmatizerlookup]. It's initialised with
| the lookup table, and should be returned by the #[code create_lemmatizer]
| classmethod of the language's defaults.
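p
| The following sketch shows how a language's defaults might return the
| lookup lemmatizer. The class name #[code Lemmatizer] and the exact
| #[code create_lemmatizer] signature are assumptions here, not confirmed API.
+code.
from spacy.language import Language
from spacy.lemmatizerlookup import Lemmatizer   # class name assumed
LOOKUP = {"ababa": "abar"}                      # as in the example above
class CustomDefaults(Language.Defaults):
    @classmethod
    def create_lemmatizer(cls, nlp=None):
        # hand the lookup table to the lookup-based lemmatizer
        return Lemmatizer(LOOKUP)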
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
+h(3, "features-matcher") Revised matcher API
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER, IS_PUNCT
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', None,
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
[{LOWER: 'hello'}, {LOWER: 'world'}])
assert len(matcher) == 1
assert 'HelloWorld' in matcher
p
| Patterns can now be added to the matcher by calling
| #[+api("matcher-add") #[code matcher.add()]] with a match ID, an optional
| callback function to be invoked on each match, and one or more patterns.
| This allows you to write powerful, pattern-specific logic using only one
| matcher. For example, you might only want to merge some entity types,
| and set custom flags for other matched patterns.
+infobox
| #[strong API:] #[+api("matcher") #[code Matcher]]
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
+h(3, "features-serializer") Serialization
+infobox
| #[strong API:] #[+api("serializer") #[code Serializer]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-models") Neural network models for English, German, French and Spanish
+infobox
| #[strong Details:] #[+src(gh("spacy-models")) spacy-models]
| #[strong Usage:] #[+a("/docs/usage/models") Models]
+h(2, "incompat") Backwards incompatibilities
+table(["Old", "New"])
+row
+cell #[code Language.save_to_directory]
+cell #[+api("language#to_disk") #[code Language.to_disk]]
+row
+cell #[code Tokenizer.load]
+cell
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+row
+cell
| #[code Vocab.load]
| #[code Vocab.load_lexemes]
| #[code Vocab.load_vectors]
| #[code Vocab.load_vectors_from_bin_loc]
+cell
| #[+api("vocab#from_disk") #[code Vocab.from_disk]]
| #[+api("vocab#from_bytes") #[code Vocab.from_bytes]]
+row
+cell
| #[code Vocab.dump]
| #[code Vocab.dump_vectors]
+cell
| #[+api("vocab#to_disk") #[code Vocab.to_disk]]
| #[+api("vocab#to_bytes") #[code Vocab.to_bytes]]
+row
+cell
| #[code StringStore.load]
+cell
| #[+api("stringstore#from_disk") #[code StringStore.from_disk]]
| #[+api("stringstore#from_bytes") #[code StringStore.from_bytes]]
+row
+cell
| #[code StringStore.dump]
+cell
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
+row
+cell #[code Matcher.load]
+cell -
+row
+cell
| #[code Matcher.add_pattern]
| #[code Matcher.add_entity]
+cell #[+api("matcher#add") #[code Matcher.add]]
+row
+cell #[code Matcher.get_entity]
+cell #[+api("matcher#get") #[code Matcher.get]]
+row
+cell #[code Matcher.has_entity]
+cell #[+api("matcher#contains") #[code Matcher.__contains__]]
+row
+cell #[code Doc.read_bytes]
+cell
+row
+cell #[code Token.is_ancestor_of]
+cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
+h(2, "migrating") Migrating from spaCy 1.x