Merge docstrings

Matthew Honnibal 2017-05-21 13:46:23 -05:00
commit 5db89053aa
68 changed files with 4137 additions and 3113 deletions

View File

@@ -14,3 +14,4 @@ regex==2017.4.5
 ftfy>=4.4.2,<5.0.0
 pytest>=3.0.6,<4.0.0
 pip>=9.0.0,<10.0.0
+mock>=2.0.0,<3.0.0

View File

@@ -20,7 +20,17 @@ def download(model, direct=False):
     compatibility = get_compatibility()
     version = get_version(model_name, compatibility)
     download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
-    link(model_name, model, force=True)
+    try:
+        link(model_name, model, force=True)
+    except:
+        # Dirty, but since spacy.download and the auto-linking is mostly
+        # a convenience wrapper, it's best to show a success message and
+        # loading instructions, even if linking fails.
+        prints("Creating a shortcut link for 'en' didn't work (maybe you "
+               "don't have admin permissions?), but you can still load "
+               "the model via its full package name:",
+               "nlp = spacy.load('%s')" % model_name,
+               title="Download successful")


 def get_json(url, desc):

View File

@@ -11,15 +11,14 @@ from .. import util

 def info(model=None, markdown=False):
     if model:
-        data_path = util.get_data_path()
-        data = util.parse_package_meta(data_path / model, require=True)
-        model_path = Path(__file__).parent / data_path / model
+        model_path = util.resolve_model_path(model)
+        meta = util.parse_package_meta(model_path)
         if model_path.resolve() != model_path:
-            data['link'] = path2str(model_path)
-            data['source'] = path2str(model_path.resolve())
+            meta['link'] = path2str(model_path)
+            meta['source'] = path2str(model_path.resolve())
         else:
-            data['source'] = path2str(model_path)
-        print_info(data, 'model %s' % model, markdown)
+            meta['source'] = path2str(model_path)
+        print_info(meta, 'model %s' % model, markdown)
     else:
         data = {'spaCy version': about.__version__,
                 'Location': path2str(Path(__file__).parent.parent),

View File

@@ -306,25 +306,17 @@ cdef class GoldParse:
     def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                  deps=None, entities=None, make_projective=False):
-        """
-        Create a GoldParse.
-
-        Arguments:
-            doc (Doc):
-                The document the annotations refer to.
-            words:
-                A sequence of unicode word strings.
-            tags:
-                A sequence of strings, representing tag annotations.
-            heads:
-                A sequence of integers, representing syntactic head offsets.
-            deps:
-                A sequence of strings, representing the syntactic relation types.
-            entities:
-                A sequence of named entity annotations, either as BILUO tag strings,
-                or as (start_char, end_char, label) tuples, representing the entity
-                positions.
-        Returns (GoldParse): The newly constructed object.
+        """Create a GoldParse.
+
+        doc (Doc): The document the annotations refer to.
+        words (iterable): A sequence of unicode word strings.
+        tags (iterable): A sequence of strings, representing tag annotations.
+        heads (iterable): A sequence of integers, representing syntactic head offsets.
+        deps (iterable): A sequence of strings, representing the syntactic relation types.
+        entities (iterable): A sequence of named entity annotations, either as
+            BILUO tag strings, or as `(start_char, end_char, label)` tuples,
+            representing the entity positions.
+        RETURNS (GoldParse): The newly constructed object.
         """
         if words is None:
             words = [token.text for token in doc]
@@ -389,55 +381,45 @@ cdef class GoldParse:
             self.heads = proj_heads

     def __len__(self):
-        """
-        Get the number of gold-standard tokens.
+        """Get the number of gold-standard tokens.

-        Returns (int): The number of gold-standard tokens.
+        RETURNS (int): The number of gold-standard tokens.
         """
         return self.length

     @property
     def is_projective(self):
-        """
-        Whether the provided syntactic annotations form a projective dependency
-        tree.
+        """Whether the provided syntactic annotations form a projective
+        dependency tree.
         """
         return not nonproj.is_nonproj_tree(self.heads)


 def biluo_tags_from_offsets(doc, entities):
-    """
-    Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
-    scheme (biluo).
-
-    Arguments:
-        doc (Doc):
-            The document that the entity offsets refer to. The output tags will
-            refer to the token boundaries within the document.
-
-        entities (sequence):
-            A sequence of (start, end, label) triples. start and end should be
-            character-offset integers denoting the slice into the original string.
-
-    Returns:
-        tags (list):
-            A list of unicode strings, describing the tags. Each tag string will
-            be of the form either "", "O" or "{action}-{label}", where action is one
-            of "B", "I", "L", "U". The string "-" is used where the entity
-            offsets don't align with the tokenization in the Doc object. The
-            training algorithm will view these as missing values. "O" denotes
-            a non-entity token. "B" denotes the beginning of a multi-token entity,
-            "I" the inside of an entity of three or more tokens, and "L" the end
-            of an entity of two or more tokens. "U" denotes a single-token entity.
-
-    Example:
-        text = 'I like London.'
-        entities = [(len('I like '), len('I like London'), 'LOC')]
-        doc = nlp.tokenizer(text)
-        tags = biluo_tags_from_offsets(doc, entities)
-        assert tags == ['O', 'O', 'U-LOC', 'O']
+    """Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
+    scheme (BILUO).
+
+    doc (Doc): The document that the entity offsets refer to. The output tags
+        will refer to the token boundaries within the document.
+    entities (iterable): A sequence of `(start, end, label)` triples. `start` and
+        `end` should be character-offset integers denoting the slice into the
+        original string.
+    RETURNS (list): A list of unicode strings, describing the tags. Each tag
+        string will be of the form either "", "O" or "{action}-{label}", where
+        action is one of "B", "I", "L", "U". The string "-" is used where the
+        entity offsets don't align with the tokenization in the `Doc` object. The
+        training algorithm will view these as missing values. "O" denotes a
+        non-entity token. "B" denotes the beginning of a multi-token entity,
+        "I" the inside of an entity of three or more tokens, and "L" the end
+        of an entity of two or more tokens. "U" denotes a single-token entity.
+
+    EXAMPLE:
+        >>> text = 'I like London.'
+        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
+        >>> doc = nlp.tokenizer(text)
+        >>> tags = biluo_tags_from_offsets(doc, entities)
+        >>> assert tags == ['O', 'O', 'U-LOC', 'O']
     """
     starts = {token.idx: token.i for token in doc}
     ends = {token.idx+len(token): token.i for token in doc}
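As a brief aside (not part of the commit), the rewritten GoldParse and biluo_tags_from_offsets docstrings above translate into usage roughly like the following sketch. It assumes an `nlp` pipeline has already been loaded; the example text mirrors the docstring.

from spacy.gold import GoldParse, biluo_tags_from_offsets

doc = nlp(u'I like London.')
entities = [(len('I like '), len('I like London'), 'LOC')]

# Entities may be given as (start_char, end_char, label) offsets or BILUO strings.
gold = GoldParse(doc, entities=entities)
assert len(gold) == len(doc)

# The same offsets can also be converted to per-token BILUO tags.
assert biluo_tags_from_offsets(doc, entities) == ['O', 'O', 'U-LOC', 'O']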

View File

@@ -13,21 +13,23 @@ from ...attrs import LANG
 from ...util import update_exc


+class BengaliDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'bn'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tag_map = TAG_MAP
+    stop_words = STOP_WORDS
+    lemma_rules = LEMMA_RULES
+    prefixes = tuple(TOKENIZER_PREFIXES)
+    suffixes = tuple(TOKENIZER_SUFFIXES)
+    infixes = tuple(TOKENIZER_INFIXES)
+
+
 class Bengali(Language):
     lang = 'bn'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'bn'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        tag_map = TAG_MAP
-        stop_words = STOP_WORDS
-        lemma_rules = LEMMA_RULES
-        prefixes = tuple(TOKENIZER_PREFIXES)
-        suffixes = tuple(TOKENIZER_SUFFIXES)
-        infixes = tuple(TOKENIZER_INFIXES)
+    Defaults = BengaliDefaults


 __all__ = ['Bengali']
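The same refactor repeats for the other languages below: the nested `class Defaults(Language.Defaults)` moves to a module-level `*Defaults` class that the language class then points to. A minimal, hypothetical sketch of the resulting pattern (the `Example`/`ExampleDefaults` names and the 'zz' language code are invented; only `Language` and `LANG` are real spaCy names):

from spacy.language import Language
from spacy.attrs import LANG


class ExampleDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'zz'
    stop_words = set(['a', 'an', 'the'])


class Example(Language):
    lang = 'zz'
    Defaults = ExampleDefaults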

View File

@@ -10,15 +10,17 @@ from ...attrs import LANG
 from ...util import update_exc


+class DanishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'da'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Danish(Language):
     lang = 'da'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'da'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = DanishDefaults


 __all__ = ['Danish']

View File

@@ -14,21 +14,23 @@ from ...attrs import LANG
 from ...util import update_exc


+class GermanDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'de'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    tag_map = dict(TAG_MAP)
+    stop_words = set(STOP_WORDS)
+    syntax_iterators = dict(SYNTAX_ITERATORS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class German(Language):
     lang = 'de'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'de'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        tag_map = dict(TAG_MAP)
-        stop_words = set(STOP_WORDS)
-        syntax_iterators = dict(SYNTAX_ITERATORS)
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = GermanDefaults


 __all__ = ['German']

View File

@@ -32,7 +32,6 @@ class EnglishDefaults(Language.Defaults):

 class English(Language):
     lang = 'en'
     Defaults = EnglishDefaults

View File

@@ -28,7 +28,7 @@ class SpanishDefaults(Language.Defaults):

 class Spanish(Language):
     lang = 'es'
     Defaults = SpanishDefaults


 __all__ = ['Spanish']

View File

@@ -10,15 +10,17 @@ from ...attrs import LANG
 from ...util import update_exc


+class FinnishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'fi'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Finnish(Language):
     lang = 'fi'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'fi'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = FinnishDefaults


 __all__ = ['Finnish']

View File

@@ -13,22 +13,24 @@ from ...attrs import LANG
 from ...util import update_exc


+class FrenchDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'fr'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+    infixes = tuple(TOKENIZER_INFIXES)
+    suffixes = tuple(TOKENIZER_SUFFIXES)
+    token_match = TOKEN_MATCH
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class French(Language):
     lang = 'fr'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'fr'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-        infixes = tuple(TOKENIZER_INFIXES)
-        suffixes = tuple(TOKENIZER_SUFFIXES)
-        token_match = TOKEN_MATCH
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = FrenchDefaults


 __all__ = ['French']

View File

@@ -9,15 +9,17 @@ from ...attrs import LANG
 from ...util import update_exc


+class HebrewDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'he'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Hebrew(Language):
     lang = 'he'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'he'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = HebrewDefaults


 __all__ = ['Hebrew']

View File

@@ -13,23 +13,25 @@ from ...attrs import LANG
 from ...util import update_exc


+class HungarianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'hu'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+    prefixes = tuple(TOKENIZER_PREFIXES)
+    suffixes = tuple(TOKENIZER_SUFFIXES)
+    infixes = tuple(TOKENIZER_INFIXES)
+    token_match = TOKEN_MATCH
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class Hungarian(Language):
     lang = 'hu'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'hu'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-        prefixes = tuple(TOKENIZER_PREFIXES)
-        suffixes = tuple(TOKENIZER_SUFFIXES)
-        infixes = tuple(TOKENIZER_INFIXES)
-        token_match = TOKEN_MATCH
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = HungarianDefaults


 __all__ = ['Hungarian']

View File

@@ -11,19 +11,21 @@ from ...attrs import LANG
 from ...util import update_exc


+class ItalianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'it'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class Italian(Language):
     lang = 'it'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'it'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = ItalianDefaults


 __all__ = ['Italian']

View File

@@ -11,15 +11,17 @@ from ...attrs import LANG
 from ...util import update_exc


+class NorwegianDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'nb'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Norwegian(Language):
     lang = 'nb'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'nb'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = NorwegianDefaults


 __all__ = ['Norwegian']

View File

@@ -9,16 +9,17 @@ from ...attrs import LANG
 from ...util import update_exc


+class DutchDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'nl'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Dutch(Language):
     lang = 'nl'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'nl'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = DutchDefaults


 __all__ = ['Dutch']

View File

@@ -9,15 +9,17 @@ from ...attrs import LANG
 from ...util import update_exc


+class PolishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'pl'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+
 class Polish(Language):
     lang = 'pl'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'pl'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
+    Defaults = PolishDefaults


 __all__ = ['Polish']

View File

@@ -13,20 +13,22 @@ from ...attrs import LANG
 from ...util import update_exc


+class PortugueseDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'pt'
+    lex_attr_getters.update(LEX_ATTRS)
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class Portuguese(Language):
     lang = 'pt'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'pt'
-        lex_attr_getters.update(LEX_ATTRS)
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = PortugueseDefaults


 __all__ = ['Portuguese']

View File

@@ -13,19 +13,21 @@ from ...attrs import LANG
 from ...util import update_exc


+class SwedishDefaults(Language.Defaults):
+    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+    lex_attr_getters[LANG] = lambda text: 'sv'
+    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+    stop_words = set(STOP_WORDS)
+
+    @classmethod
+    def create_lemmatizer(cls, nlp=None):
+        return Lemmatizer(LOOKUP)
+
+
 class Swedish(Language):
     lang = 'sv'
-    class Defaults(Language.Defaults):
-        lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
-        lex_attr_getters[LANG] = lambda text: 'sv'
-        tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
-        stop_words = set(STOP_WORDS)
-
-        @classmethod
-        def create_lemmatizer(cls, nlp=None):
-            return Lemmatizer(LOOKUP)
+    Defaults = SwedishDefaults


 __all__ = ['Swedish']

View File

@@ -116,14 +116,30 @@ class BaseDefaults(object):

 class Language(object):
-    """
-    A text-processing pipeline. Usually you'll load this once per process, and
-    pass the instance around your program.
+    """A text-processing pipeline. Usually you'll load this once per process,
+    and pass the instance around your application.
+
+    Defaults (class): Settings, data and factory methods for creating the `nlp`
+        object and processing pipeline.
+    lang (unicode): Two-letter language ID, i.e. ISO code.
     """
     Defaults = BaseDefaults
     lang = None

     def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
+        """Initialise a Language object.
+
+        vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
+            `Language.Defaults.create_vocab`.
+        make_doc (callable): A function that takes text and returns a `Doc`
+            object. Usually a `Tokenizer`.
+        pipeline (list): A list of annotation processes or IDs of annotation,
+            processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
+            up in `Language.Defaults.factories`.
+        meta (dict): Custom meta data for the Language class. Is written to by
+            models to add model meta data.
+        RETURNS (Language): The newly constructed object.
+        """
         self.meta = dict(meta)

         if vocab is True:
@@ -147,22 +163,17 @@ class Language(object):
         self.pipeline = []

     def __call__(self, text, **disabled):
-        """
-        Apply the pipeline to some text. The text can span multiple sentences,
-        and can contain arbtrary whitespace. Alignment into the original string
+        """'Apply the pipeline to some text. The text can span multiple sentences,
+        and can contain arbtrary whitespace. Alignment into the original string
         is preserved.

-        Args:
-            text (unicode): The text to be processed.
-
-        Returns:
-            doc (Doc): A container for accessing the annotations.
-
-        Example:
-            >>> from spacy.en import English
-            >>> nlp = English()
+        text (unicode): The text to be processed.
+        **disabled: Elements of the pipeline that should not be run.
+        RETURNS (Doc): A container for accessing the annotations.
+
+        EXAMPLE:
             >>> tokens = nlp('An example sentence. Another example sentence.')
-            >>> tokens[0].orth_, tokens[0].head.tag_
+            >>> tokens[0].text, tokens[0].head.tag_
             ('An', 'NN')
         """
         doc = self.make_doc(text)
@@ -174,6 +185,21 @@ class Language(object):
         return doc

     def update(self, docs, golds, drop=0., sgd=None):
+        """Update the models in the pipeline.
+
+        docs (iterable): A batch of `Doc` objects.
+        golds (iterable): A batch of `GoldParse` objects.
+        drop (float): The droput rate.
+        sgd (callable): An optimizer.
+        RETURNS (dict): Results from the update.
+
+        EXAMPLE:
+            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
+            >>>    for epoch in trainer.epochs(gold):
+            >>>        for docs, golds in epoch:
+            >>>            state = nlp.update(docs, golds, sgd=optimizer)
+        """
         grads = {}
         def get_grads(W, dW, key=None):
             grads[key] = (W, dW)
@@ -204,7 +230,20 @@ class Language(object):
         for doc, gold in docs_golds:
             yield doc, gold

-    def begin_training(self, get_gold_tuples, **cfg):
+    def begin_training(self, gold_tuples, **cfg):
+        """Allocate models, pre-process training data and acquire a trainer and
+        optimizer. Used as a contextmanager.
+
+        gold_tuples (iterable): Gold-standard training data.
+        **cfg: Config parameters.
+        YIELDS (tuple): A trainer and an optimizer.
+
+        EXAMPLE:
+            >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
+            >>>    for epoch in trainer.epochs(gold):
+            >>>        for docs, golds in epoch:
+            >>>            state = nlp.update(docs, golds, sgd=optimizer)
+        """
         # Populate vocab
         for _, annots_brackets in get_gold_tuples():
             for annots, _ in annots_brackets:
@@ -233,6 +272,17 @@ class Language(object):

     @contextmanager
     def use_params(self, params, **cfg):
+        """Replace weights of models in the pipeline with those provided in the
+        params dictionary. Can be used as a contextmanager, in which case,
+        models go back to their original weights after the block.
+
+        params (dict): A dictionary of parameters keyed by model ID.
+        **cfg: Config parameters.
+
+        EXAMPLE:
+            >>> with nlp.use_params(optimizer.averages):
+            >>>     nlp.to_disk('/tmp/checkpoint')
+        """
         contexts = [pipe.use_params(params) for pipe
                     in self.pipeline if hasattr(pipe, 'use_params')]
         # TODO: Having trouble with contextlib
@@ -250,16 +300,20 @@ class Language(object):
             pass

     def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
-        """
-        Process texts as a stream, and yield Doc objects in order.
-        Supports GIL-free multi-threading.
-
-        Arguments:
-            texts (iterator)
-            tag (bool)
-            parse (bool)
-            entity (bool)
+        """Process texts as a stream, and yield `Doc` objects in order. Supports
+        GIL-free multi-threading.
+
+        texts (iterator): A sequence of texts to process.
+        n_threads (int): The number of worker threads to use. If -1, OpenMP will
+            decide how many to use at run time. Default is 2.
+        batch_size (int): The number of texts to buffer.
+        **disabled: Pipeline components to exclude.
+        YIELDS (Doc): Documents in the order of the original text.
+
+        EXAMPLE:
+            >>> texts = [u'One document.', u'...', u'Lots of documents']
+            >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
+            >>>     assert doc.is_parsed
         """
         #docs = (self.make_doc(text) for text in texts)
         docs = texts
@@ -267,7 +321,6 @@ class Language(object):
             name = getattr(proc, 'name', None)
             if name in disabled and not disabled[name]:
                 continue
             if hasattr(proc, 'pipe'):
                 docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
             else:
@@ -278,11 +331,12 @@ class Language(object):
     def to_disk(self, path, **exclude):
         """Save the current state to a directory.

-        Args:
-            path: A path to a directory, which will be created if it doesn't
-                exist. Paths may be either strings or pathlib.Path-like
-                objects.
-            **exclude: Prevent named attributes from being saved.
+        path (unicode or Path): A path to a directory, which will be created if
+            it doesn't exist. Paths may be either strings or `Path`-like objects.
+        **exclude: Named attributes to prevent from being saved.
+
+        EXAMPLE:
+            >>> nlp.to_disk('/path/to/models')
         """
         path = util.ensure_path(path)
         if not path.exists():
@@ -301,12 +355,17 @@ class Language(object):
                 dill.dump(props, file_)

     def from_disk(self, path, **exclude):
-        """Load the current state from a directory.
+        """Loads state from a directory. Modifies the object in place and
+        returns it.

-        Args:
-            path: A path to a directory. Paths may be either strings or
-                pathlib.Path-like objects.
-            **exclude: Prevent named attributes from being saved.
+        path (unicode or Path): A path to a directory. Paths may be either
+            strings or `Path`-like objects.
+        **exclude: Named attributes to prevent from being loaded.
+        RETURNS (Language): The modified `Language` object.
+
+        EXAMPLE:
+            >>> from spacy.language import Language
+            >>> nlp = Language().from_disk('/path/to/models')
         """
         path = util.ensure_path(path)
         for name in path.iterdir():
@@ -320,10 +379,8 @@ class Language(object):
     def to_bytes(self, **exclude):
         """Serialize the current state to a binary string.

-        Args:
-            path: A path to a directory. Paths may be either strings or
-                pathlib.Path-like objects.
-            **exclude: Prevent named attributes from being serialized.
+        **exclude: Named attributes to prevent from being serialized.
+        RETURNS (bytes): The serialized form of the `Language` object.
         """
         props = dict(self.__dict__)
         for key in exclude:
@@ -334,13 +391,12 @@ class Language(object):
     def from_bytes(self, bytes_data, **exclude):
         """Load state from a binary string.

-        Args:
-            bytes_data (bytes): The data to load from.
-            **exclude: Prevent named attributes from being loaded.
+        bytes_data (bytes): The data to load from.
+        **exclude: Named attributes to prevent from being loaded.
+        RETURNS (Language): The `Language` object.
         """
         props = dill.loads(bytes_data)
         for key, value in props.items():
             if key not in exclude:
                 setattr(self, key, value)
         return self

View File

@@ -30,19 +30,16 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))

 cdef class Lexeme:
-    """
-    An entry in the vocabulary. A Lexeme has no string context --- it's a
+    """An entry in the vocabulary. A `Lexeme` has no string context - it's a
     word-type, as opposed to a word token. It therefore has no part-of-speech
     tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
     tag).
     """
     def __init__(self, Vocab vocab, int orth):
-        """
-        Create a Lexeme object.
+        """Create a Lexeme object.

-        Arguments:
-            vocab (Vocab): The parent vocabulary
-            orth (int): The orth id of the lexeme.
+        vocab (Vocab): The parent vocabulary
+        orth (int): The orth id of the lexeme.
         Returns (Lexeme): The newly constructd object.
         """
         self.vocab = vocab
@@ -82,35 +79,28 @@ cdef class Lexeme:
         return self.c.orth

     def set_flag(self, attr_id_t flag_id, bint value):
-        """
-        Change the value of a boolean flag.
+        """Change the value of a boolean flag.

-        Arguments:
-            flag_id (int): The attribute ID of the flag to set.
-            value (bool): The new value of the flag.
+        flag_id (int): The attribute ID of the flag to set.
+        value (bool): The new value of the flag.
         """
         Lexeme.c_set_flag(self.c, flag_id, value)

     def check_flag(self, attr_id_t flag_id):
-        """
-        Check the value of a boolean flag.
+        """Check the value of a boolean flag.

-        Arguments:
-            flag_id (int): The attribute ID of the flag to query.
-        Returns (bool): The value of the flag.
+        flag_id (int): The attribute ID of the flag to query.
+        RETURNS (bool): The value of the flag.
         """
         return True if Lexeme.c_check_flag(self.c, flag_id) else False

     def similarity(self, other):
-        """
-        Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """Compute a semantic similarity estimate. Defaults to cosine over
+        vectors.

-        Arguments:
-            other:
-                The object to compare with. By default, accepts Doc, Span,
-                Token and Lexeme objects.
-        Returns:
-            score (float): A scalar similarity score. Higher is more similar.
+        other (object): The object to compare with. By default, accepts `Doc`,
+            `Span`, `Token` and `Lexeme` objects.
+        RETURNS (float): A scalar similarity score. Higher is more similar.
         """
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
@@ -140,6 +130,11 @@ cdef class Lexeme:
         self.orth = self.c.orth

     property has_vector:
+        """A boolean value indicating whether a word vector is associated with
+        the object.
+
+        RETURNS (bool): Whether a word vector is associated with the object.
+        """
         def __get__(self):
             cdef int i
             for i in range(self.vocab.vectors_length):
@@ -149,6 +144,10 @@ cdef class Lexeme:
             return False

     property vector_norm:
+        """The L2 norm of the lexeme's vector representation.
+
+        RETURNS (float): The L2 norm of the vector representation.
+        """
         def __get__(self):
             return self.c.l2_norm
@@ -156,6 +155,11 @@ cdef class Lexeme:
             self.c.l2_norm = value

     property vector:
+        """A real-valued meaning representation.
+
+        RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
+            representing the lexeme's semantics.
+        """
         def __get__(self):
             cdef int length = self.vocab.vectors_length
             if length == 0:
@@ -196,6 +200,14 @@ cdef class Lexeme:
         def __get__(self):
             return self.vocab.strings[self.c.orth]

+    property text:
+        """A unicode representation of the token text.
+
+        RETURNS (unicode): The original verbatim text of the token.
+        """
+        def __get__(self):
+            return self.orth_
+
     property lower:
         def __get__(self): return self.c.lower
         def __set__(self, int x): self.c.lower = x
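A short usage sketch for the Lexeme API documented above (not part of the commit; it assumes a loaded `nlp` object whose vocabulary carries word vectors):

from spacy.attrs import IS_STOP

apple = nlp.vocab[u'apple']              # look up (or create) the Lexeme for "apple"
print(apple.text)                        # the new `text` property: u'apple'
print(apple.check_flag(IS_STOP))         # boolean flag lookup
if apple.has_vector:
    print(apple.vector_norm)             # L2 norm of the word vector
    print(apple.similarity(nlp.vocab[u'orange']))  # cosine similarity over vectors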

View File

@@ -87,7 +87,7 @@ ctypedef TokenPatternC* TokenPatternC_ptr
 ctypedef pair[int, TokenPatternC_ptr] StateC

-cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
+cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
                                  object token_specs) except NULL:
     pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
     cdef int i
@@ -99,15 +99,21 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
             pattern[i].attrs[j].attr = attr
             pattern[i].attrs[j].value = value
     i = len(token_specs)
-    pattern[i].attrs = <AttrValueC*>mem.alloc(3, sizeof(AttrValueC))
+    pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
     pattern[i].attrs[0].attr = ID
     pattern[i].attrs[0].value = entity_id
-    pattern[i].attrs[1].attr = ENT_TYPE
-    pattern[i].attrs[1].value = label
     pattern[i].nr_attr = 0
     return pattern


+cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
+    while pattern.nr_attr != 0:
+        pattern += 1
+    id_attr = pattern[0].attrs[0]
+    assert id_attr.attr == ID
+    return id_attr.value
+
+
 cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
     for attr in pattern.attrs[:pattern.nr_attr]:
         if get_token_attr(token, attr.attr) != attr.value:
@@ -159,14 +165,14 @@ def _convert_strings(token_specs, string_store):

 def merge_phrase(matcher, doc, i, matches):
-    '''Callback to merge a phrase on match'''
+    """Callback to merge a phrase on match."""
     ent_id, label, start, end = matches[i]
     span = doc[start : end]
     span.merge(ent_type=label, ent_id=ent_id)


 cdef class Matcher:
-    '''Match sequences of tokens, based on pattern rules.'''
+    """Match sequences of tokens, based on pattern rules."""
     cdef Pool mem
     cdef vector[TokenPatternC*] patterns
     cdef readonly Vocab vocab
@@ -175,37 +181,12 @@ cdef class Matcher:
     cdef public object _callbacks
     cdef public object _acceptors

-    @classmethod
-    def load(cls, path, vocab):
-        """
-        Load the matcher and patterns from a file path.
-
-        Arguments:
-            path (Path):
-                Path to a JSON-formatted patterns file.
-            vocab (Vocab):
-                The vocabulary that the documents to match over will refer to.
-        Returns:
-            Matcher: The newly constructed object.
-        """
-        if (path / 'gazetteer.json').exists():
-            with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
-                patterns = ujson.load(file_)
-        else:
-            patterns = {}
-        return cls(vocab, patterns)
-
-    def __init__(self, vocab, patterns={}):
-        """
-        Create the Matcher.
-
-        Arguments:
-            vocab (Vocab):
-                The vocabulary object, which must be shared with the documents
-                the matcher will operate on.
-            patterns (dict): Patterns to add to the matcher.
-        Returns:
-            The newly constructed object.
+    def __init__(self, vocab):
+        """Create the Matcher.
+
+        vocab (Vocab): The vocabulary object, which must be shared with the
+            documents the matcher will operate on.
+        RETURNS (Matcher): The newly constructed object.
         """
         self._patterns = {}
         self._entities = {}
@@ -213,144 +194,111 @@ cdef class Matcher:
         self._callbacks = {}
         self.vocab = vocab
         self.mem = Pool()
-        for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
-            self.add_entity(entity_key, attrs)
-            for spec in specs:
-                self.add_pattern(entity_key, spec, label=etype)

     def __reduce__(self):
         return (self.__class__, (self.vocab, self._patterns), None, None)

-    property n_patterns:
-        def __get__(self): return self.patterns.size()
-
-    def add_entity(self, entity_key, attrs=None, if_exists='raise',
-                   acceptor=None, on_match=None):
-        """
-        Add an entity to the matcher.
-
-        Arguments:
-            entity_key (unicode or int):
-                An ID for the entity.
-            attrs:
-                Attributes to associate with the Matcher.
-            if_exists ('raise', 'ignore' or 'update'):
-                Controls what happens if the entity ID already exists. Defaults to 'raise'.
-            acceptor:
-                Callback function to filter matches of the entity.
-            on_match:
-                Callback function to act on matches of the entity.
-        Returns:
-            None
-        """
-        if if_exists not in ('raise', 'ignore', 'update'):
-            raise ValueError(
-                "Unexpected value for if_exists: %s.\n"
-                "Expected one of: ['raise', 'ignore', 'update']" % if_exists)
-        if attrs is None:
-            attrs = {}
-        entity_key = self.normalize_entity_key(entity_key)
-        if self.has_entity(entity_key):
-            if if_exists == 'raise':
-                raise KeyError(
-                    "Tried to add entity %s. Entity exists, and if_exists='raise'.\n"
-                    "Set if_exists='ignore' or if_exists='update', or check with "
-                    "matcher.has_entity()")
-            elif if_exists == 'ignore':
-                return
-        self._entities[entity_key] = dict(attrs)
-        self._patterns.setdefault(entity_key, [])
-        self._acceptors[entity_key] = acceptor
-        self._callbacks[entity_key] = on_match
-
-    def add_pattern(self, entity_key, token_specs, label=""):
-        """
-        Add a pattern to the matcher.
-
-        Arguments:
-            entity_key (unicode or int):
-                An ID for the entity.
-            token_specs:
-                Description of the pattern to be matched.
-            label:
-                Label to assign to the matched pattern. Defaults to "".
-        Returns:
-            None
-        """
-        token_specs = list(token_specs)
-        if len(token_specs) == 0:
-            msg = ("Cannot add pattern for zero tokens to matcher.\n"
-                   "entity_key: {entity_key}\n"
-                   "label: {label}")
-            raise ValueError(msg.format(entity_key=entity_key, label=label))
-        entity_key = self.normalize_entity_key(entity_key)
-        if not self.has_entity(entity_key):
-            self.add_entity(entity_key)
-        if isinstance(label, basestring):
-            label = self.vocab.strings[label]
-        elif label is None:
-            label = 0
-        spec = _convert_strings(token_specs, self.vocab.strings)
-        self.patterns.push_back(init_pattern(self.mem, entity_key, label, spec))
-        self._patterns[entity_key].append((label, token_specs))
-
-    def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None):
-        self.add_entity(entity_key, attrs=attrs, if_exists='update',
-                        acceptor=acceptor, on_match=on_match)
-        for spec in specs:
-            self.add_pattern(entity_key, spec, label=label)
-
-    def normalize_entity_key(self, entity_key):
-        if isinstance(entity_key, basestring):
-            return self.vocab.strings[entity_key]
-        else:
-            return entity_key
-
-    def has_entity(self, entity_key):
-        """
-        Check whether the matcher has an entity.
-
-        Arguments:
-            entity_key (string or int): The entity key to check.
-        Returns:
-            bool: Whether the matcher has the entity.
-        """
-        entity_key = self.normalize_entity_key(entity_key)
-        return entity_key in self._entities
-
-    def get_entity(self, entity_key):
-        """
-        Retrieve the attributes stored for an entity.
-
-        Arguments:
-            entity_key (unicode or int): The entity to retrieve.
-        Returns:
-            The entity attributes if present, otherwise None.
-        """
-        entity_key = self.normalize_entity_key(entity_key)
-        if entity_key in self._entities:
-            return self._entities[entity_key]
-        else:
-            return None
-
-    def __call__(self, Doc doc, acceptor=None):
-        """
-        Find all token sequences matching the supplied patterns on the Doc.
-
-        Arguments:
-            doc (Doc):
-                The document to match over.
-        Returns:
-            list
-                A list of (entity_key, label_id, start, end) tuples,
-                describing the matches. A match tuple describes a span doc[start:end].
-                The label_id and entity_key are both integers.
+    def __len__(self):
+        """Get the number of rules added to the matcher. Note that this only
+        returns the number of rules (identical with the number of IDs), not the
+        number of individual patterns.
+
+        RETURNS (int): The number of rules.
+        """
+        return len(self._patterns)
+
+    def __contains__(self, key):
+        """Check whether the matcher contains rules for a match ID.
+
+        key (unicode): The match ID.
+        RETURNS (bool): Whether the matcher contains rules for this match ID.
+        """
+        return len(self._patterns)
+
+    def add(self, key, on_match, *patterns):
+        """Add a match-rule to the matcher.
+
+        A match-rule consists of: an ID key, an on_match callback, and one or
+        more patterns. If the key exists, the patterns are appended to the
+        previous ones, and the previous on_match callback is replaced. The
+        `on_match` callback will receive the arguments `(matcher, doc, i,
+        matches)`. You can also set `on_match` to `None` to not perform any
+        actions. A pattern consists of one or more `token_specs`, where a
+        `token_spec` is a dictionary mapping attribute IDs to values. Token
+        descriptors can also include quantifiers. There are currently important
+        known problems with the quantifiers - see the docs.
+        """
+        for pattern in patterns:
+            if len(pattern) == 0:
+                msg = ("Cannot add pattern for zero tokens to matcher.\n"
+                       "key: {key}\n")
+                raise ValueError(msg.format(key=key))
+        key = self._normalize_key(key)
+        self._patterns.setdefault(key, [])
+        self._callbacks[key] = on_match
+
+        for pattern in patterns:
+            specs = _convert_strings(pattern, self.vocab.strings)
+            self.patterns.push_back(init_pattern(self.mem, key, specs))
+            self._patterns[key].append(specs)
+
+    def remove(self, key):
+        """Remove a rule from the matcher. A KeyError is raised if the key does
+        not exist.
+
+        key (unicode): The ID of the match rule.
+        """
+        key = self._normalize_key(key)
+        self._patterns.pop(key)
+        self._callbacks.pop(key)
+        cdef int i = 0
+        while i < self.patterns.size():
+            pattern_key = get_pattern_key(self.patterns.at(i))
+            if pattern_key == key:
+                self.patterns.erase(self.patterns.begin()+i)
+            else:
+                i += 1
+
+    def has_key(self, key):
+        """Check whether the matcher has a rule with a given key.
+
+        key (string or int): The key to check.
+        RETURNS (bool): Whether the matcher has the rule.
+        """
+        key = self._normalize_key(key)
+        return key in self._patterns
+
+    def get(self, key, default=None):
+        """Retrieve the pattern stored for a key.
+
+        key (unicode or int): The key to retrieve.
+        RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
+        """
+        key = self._normalize_key(key)
+        if key not in self._patterns:
+            return default
+        return (self._callbacks[key], self._patterns[key])
+
+    def pipe(self, docs, batch_size=1000, n_threads=2):
+        """Match a stream of documents, yielding them in turn.
+
+        docs (iterable): A stream of documents.
+        batch_size (int): The number of documents to accumulate into a working set.
+        n_threads (int): The number of threads with which to work on the buffer
+            in parallel, if the `Matcher` implementation supports multi-threading.
+        YIELDS (Doc): Documents, in order.
+        """
+        for doc in docs:
+            self(doc)
+            yield doc
+
+    def __call__(self, Doc doc):
+        """Find all token sequences matching the supplied patterns on the `Doc`.
+
+        doc (Doc): The document to match over.
+        RETURNS (list): A list of `(key, label_id, start, end)` tuples,
+            describing the matches. A match tuple describes a span
+            `doc[start:end]`. The `label_id` and `key` are both integers.
         """
-        if acceptor is not None:
-            raise ValueError(
-                "acceptor keyword argument to Matcher deprecated. Specify acceptor "
-                "functions when you add patterns instead.")
         cdef vector[StateC] partials
         cdef int n_partials = 0
         cdef int q = 0
@@ -388,13 +336,7 @@ cdef class Matcher:
                     end = token_i+1
                     ent_id = state.second[1].attrs[0].value
                     label = state.second[1].attrs[1].value
-                    acceptor = self._acceptors.get(ent_id)
-                    if acceptor is None:
-                        matches.append((ent_id, label, start, end))
-                    else:
-                        match = acceptor(doc, ent_id, label, start, end)
-                        if match:
-                            matches.append(match)
+                    matches.append((ent_id, start, end))
             partials.resize(q)
             # Check whether we open any new patterns on this token
             for pattern in self.patterns:
@@ -419,13 +361,7 @@ cdef class Matcher:
                     end = token_i+1
                     ent_id = pattern[1].attrs[0].value
                     label = pattern[1].attrs[1].value
-                    acceptor = self._acceptors.get(ent_id)
-                    if acceptor is None:
-                        matches.append((ent_id, label, start, end))
-                    else:
-                        match = acceptor(doc, ent_id, label, start, end)
-                        if match:
-                            matches.append(match)
+                    matches.append((ent_id, start, end))
         # Look for open patterns that are actually satisfied
         for state in partials:
             while state.second.quantifier in (ZERO, ZERO_PLUS):
@@ -435,36 +371,19 @@ cdef class Matcher:
                 end = len(doc)
                 ent_id = state.second.attrs[0].value
                 label = state.second.attrs[0].value
-                acceptor = self._acceptors.get(ent_id)
-                if acceptor is None:
-                    matches.append((ent_id, label, start, end))
-                else:
-                    match = acceptor(doc, ent_id, label, start, end)
-                    if match:
-                        matches.append(match)
+                matches.append((ent_id, start, end))
         for i, (ent_id, label, start, end) in enumerate(matches):
             on_match = self._callbacks.get(ent_id)
             if on_match is not None:
                 on_match(self, doc, i, matches)
+        # TODO: only return (match_id, start, end)
         return matches

-    def pipe(self, docs, batch_size=1000, n_threads=2):
-        """
-        Match a stream of documents, yielding them in turn.
-
-        Arguments:
-            docs: A stream of documents.
-            batch_size (int):
-                The number of documents to accumulate into a working set.
-            n_threads (int):
-                The number of threads with which to work on the buffer in parallel,
-                if the Matcher implementation supports multi-threading.
-        Yields:
-            Doc Documents, in order.
-        """
-        for doc in docs:
-            self(doc)
-            yield doc
+    def _normalize_key(self, key):
+        if isinstance(key, basestring):
+            return self.vocab.strings[key]
+        else:
+            return key


 def get_bilou(length):
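For orientation (not part of the commit), the reworked rule API described in the docstrings above is used roughly like the sketch below. The 'HELLO_WORLD' key and the pattern are invented, and the shape of the returned match tuples is the one given in the `__call__` docstring; the implementation at this revision was still in flux.

from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)                        # must share the pipeline's vocab
pattern = [{'LOWER': 'hello'}, {'LOWER': 'world'}]
matcher.add('HELLO_WORLD', None, pattern)           # key, on_match callback, *patterns
assert len(matcher) == 1 and matcher.has_key('HELLO_WORLD')

doc = nlp(u'Hello world!')
matches = matcher(doc)                              # match tuples, see __call__ above
on_match, patterns = matcher.get('HELLO_WORLD')
matcher.remove('HELLO_WORLD')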

View File

@@ -38,33 +38,71 @@ from .parts_of_speech import X

 class TokenVectorEncoder(object):
-    '''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
+    """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
     name = 'tok2vec'

     @classmethod
     def Model(cls, width=128, embed_size=5000, **cfg):
+        """Create a new statistical model for the class.
+
+        width (int): Output size of the model.
+        embed_size (int): Number of vectors in the embedding table.
+        **cfg: Config parameters.
+        RETURNS (Model): A `thinc.neural.Model` or similar instance.
+        """
         width = util.env_opt('token_vector_width', width)
         embed_size = util.env_opt('embed_size', embed_size)
         return Tok2Vec(width, embed_size, preprocess=None)

     def __init__(self, vocab, model=True, **cfg):
+        """Construct a new statistical model. Weights are not allocated on
+        initialisation.
+
+        vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
+            instance with the `Doc` objects it will process.
+        model (Model): A `Model` instance or `True` allocate one later.
+        **cfg: Config parameters.
+
+        EXAMPLE:
+            >>> from spacy.pipeline import TokenVectorEncoder
+            >>> tok2vec = TokenVectorEncoder(nlp.vocab)
+            >>> tok2vec.model = tok2vec.Model(128, 5000)
+        """
         self.vocab = vocab
         self.doc2feats = doc2feats()
         self.model = model

     def __call__(self, docs):
+        """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
+        model. Vectors are set to the `Doc.tensor` attribute.
+
+        docs (Doc or iterable): One or more documents to add vectors to.
+        RETURNS (dict or None): Intermediate computations.
+        """
         if isinstance(docs, Doc):
             docs = [docs]
         tokvecses = self.predict(docs)
         self.set_annotations(docs, tokvecses)

     def pipe(self, stream, batch_size=128, n_threads=-1):
+        """Process `Doc` objects as a stream.
+
+        stream (iterator): A sequence of `Doc` objects to process.
+        batch_size (int): Number of `Doc` objects to group.
+        n_threads (int): Number of threads.
+        YIELDS (iterator): A sequence of `Doc` objects, in order of input.
+        """
         for docs in cytoolz.partition_all(batch_size, stream):
             tokvecses = self.predict(docs)
             self.set_annotations(docs, tokvecses)
             yield from docs

     def predict(self, docs):
+        """Return a single tensor for a batch of documents.
+
+        docs (iterable): A sequence of `Doc` objects.
+        RETURNS (object): Vector representations for each token in the documents.
+        """
         feats = self.doc2feats(docs)
         tokvecs = self.model(feats)
         return tokvecs
@@ -73,7 +111,26 @@ class TokenVectorEncoder(object):
         for doc, tokvecs in zip(docs, tokvecses):
             doc.tensor = tokvecs

-    def begin_update(self, docs, drop=0.):
+    def set_annotations(self, docs, tokvecs):
+        """Set the tensor attribute for a batch of documents.
+
+        docs (iterable): A sequence of `Doc` objects.
+        tokvecs (object): Vector representation for each token in the documents.
+        """
+        start = 0
+        for doc in docs:
+            doc.tensor = tokvecs[start : start + len(doc)]
+            start += len(doc)
+
+    def update(self, docs, golds, state=None, drop=0., sgd=None):
+        """Update the model.
+
+        docs (iterable): A batch of `Doc` objects.
+        golds (iterable): A batch of `GoldParse` objects.
+        drop (float): The droput rate.
+        sgd (callable): An optimizer.
+        RETURNS (dict): Results from the update.
+        """
         if isinstance(docs, Doc):
             docs = [docs]
         feats = self.doc2feats(docs)
@@ -81,14 +138,26 @@ class TokenVectorEncoder(object):
         return tokvecs, bp_tokvecs

     def get_loss(self, docs, golds, scores):
+        # TODO: implement
         raise NotImplementedError

     def begin_training(self, gold_tuples, pipeline=None):
+        """Allocate models, pre-process training data and acquire a trainer and
+        optimizer.
+
+        gold_tuples (iterable): Gold-standard training data.
+        pipeline (list): The pipeline the model is part of.
+        """
         self.doc2feats = doc2feats()
         if self.model is True:
             self.model = self.Model()

     def use_params(self, params):
+        """Replace weights of models in the pipeline with those provided in the
+        params dictionary.
+
+        params (dict): A dictionary of parameters keyed by model ID.
+        """
         with self.model.use_params(params):
             yield
@@ -189,9 +258,7 @@ class NeuralTagger(object):

 cdef class EntityRecognizer(LinearParser):
-    """
-    Annotate named entities on Doc objects.
-    """
+    """Annotate named entities on Doc objects."""
     TransitionSystem = BiluoPushDown

     feature_templates = get_feature_templates('ner')
@@ -203,9 +270,7 @@ cdef class EntityRecognizer(LinearParser):

 cdef class BeamEntityRecognizer(BeamParser):
-    """
-    Annotate named entities on Doc objects.
-    """
+    """Annotate named entities on Doc objects."""
     TransitionSystem = BiluoPushDown

     feature_templates = get_feature_templates('ner')

View File

@@ -11,8 +11,6 @@ from preshed.maps cimport map_iter, key_t
 from .typedefs cimport hash_t
 from libc.stdint cimport uint32_t

-import ujson
-

 cpdef hash_t hash_string(unicode string) except 0:
     chars = string.encode('utf8')
@@ -72,15 +70,12 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex

 cdef class StringStore:
-    """
-    Map strings to and from integer IDs.
-    """
+    """Map strings to and from integer IDs."""
     def __init__(self, strings=None, freeze=False):
-        """
-        Create the StringStore.
+        """Create the StringStore.

-        Arguments:
-            strings: A sequence of unicode strings to add to the store.
+        strings (iterable): A sequence of unicode strings to add to the store.
+        RETURNS (StringStore): The newly constructed object.
         """
         self.mem = Pool()
         self._map = PreshMap()
@@ -106,23 +101,17 @@ cdef class StringStore:
         return (StringStore, (list(self),))

     def __len__(self):
-        """
-        The number of strings in the store.
+        """The number of strings in the store.

-        Returns:
-            int The number of strings in the store.
+        RETURNS (int): The number of strings in the store.
         """
         return self.size-1

     def __getitem__(self, object string_or_id):
-        """
-        Retrieve a string from a given integer ID, or vice versa.
+        """Retrieve a string from a given integer ID, or vice versa.

-        Arguments:
-            string_or_id (bytes or unicode or int):
-                The value to encode.
-        Returns:
-            unicode or int: The value to retrieved.
+        string_or_id (bytes or unicode or int): The value to encode.
+        Returns (unicode or int): The value to be retrieved.
         """
         if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
             return 0
@@ -163,13 +152,10 @@ cdef class StringStore:
         return utf8str - self.c

     def __contains__(self, unicode string not None):
-        """
-        Check whether a string is in the store.
+        """Check whether a string is in the store.

-        Arguments:
-            string (unicode): The string to check.
-        Returns bool:
-            Whether the store contains the string.
+        string (unicode): The string to check.
+        RETURNS (bool): Whether the store contains the string.
         """
         if len(string) == 0:
             return True
@@ -177,10 +163,9 @@ cdef class StringStore:
         return self._map.get(key) is not NULL

     def __iter__(self):
-        """
-        Iterate over the strings in the store, in order.
+        """Iterate over the strings in the store, in order.

-        Yields: unicode A string in the store.
+        YIELDS (unicode): A string in the store.
         """
         cdef int i
         for i in range(self.size):
@@ -195,6 +180,41 @@ cdef class StringStore:
             strings.append(py_string)
         return (StringStore, (strings,), None, None, None)

+    def to_disk(self, path):
+        """Save the current state to a directory.
+
+        path (unicode or Path): A path to a directory, which will be created if
+            it doesn't exist. Paths may be either strings or `Path`-like objects.
+        """
+        raise NotImplementedError()
+
+    def from_disk(self, path):
+        """Loads state from a directory. Modifies the object in place and
+        returns it.
+
+        path (unicode or Path): A path to a directory. Paths may be either
+            strings or `Path`-like objects.
+        RETURNS (StringStore): The modified `StringStore` object.
+        """
+        raise NotImplementedError()
+
+    def to_bytes(self, **exclude):
+        """Serialize the current state to a binary string.
+
+        **exclude: Named attributes to prevent from being serialized.
+        RETURNS (bytes): The serialized form of the `StringStore` object.
+        """
+        raise NotImplementedError()
+
+    def from_bytes(self, bytes_data, **exclude):
+        """Load state from a binary string.
+
+        bytes_data (bytes): The data to load from.
+        **exclude: Named attributes to prevent from being loaded.
+        RETURNS (StringStore): The `StringStore` object.
+        """
+        raise NotImplementedError()
+
     def set_frozen(self, bint is_frozen):
         # TODO
         self.is_frozen = is_frozen
@ -235,40 +255,6 @@ cdef class StringStore:
self.size += 1 self.size += 1
return &self.c[self.size-1] return &self.c[self.size-1]
def dump(self, file_):
"""
Save the strings to a JSON file.
Arguments:
file_ (buffer): The file to save the strings.
Returns:
None
"""
string_data = ujson.dumps(list(self))
if not isinstance(string_data, unicode):
string_data = string_data.decode('utf8')
# TODO: OOV?
file_.write(string_data)
def load(self, file_):
"""
Load the strings from a JSON file.
Arguments:
file_ (buffer): The file from which to load the strings.
Returns:
None
"""
strings = ujson.load(file_)
if strings == ['']:
return None
cdef unicode string
for string in strings:
# explicit None/len check instead of simple truth testing
# (bug in Cython <= 0.23.4)
if string is not None and len(string):
self.intern_unicode(string)
def _realloc(self): def _realloc(self):
# We want to map straight to pointers, but they'll be invalidated if # We want to map straight to pointers, but they'll be invalidated if
# we resize our array. So, first we remap to indices, then we resize, # we resize our array. So, first we remap to indices, then we resize,

View File

@ -1,7 +1,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import ujson
from collections import defaultdict from collections import defaultdict
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
@ -15,7 +14,6 @@ from .tokens.doc cimport Doc
from .attrs cimport TAG from .attrs cimport TAG
from .gold cimport GoldParse from .gold cimport GoldParse
from .attrs cimport * from .attrs cimport *
from . import util
cpdef enum: cpdef enum:
@ -108,55 +106,15 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
cdef class Tagger: cdef class Tagger:
""" """Annotate part-of-speech tags on Doc objects."""
Annotate part-of-speech tags on Doc objects.
"""
@classmethod
def load(cls, path, vocab, require=False):
"""
Load the statistical model from the supplied path.
Arguments:
path (Path):
The path to load from.
vocab (Vocab):
The vocabulary. Must be shared by the documents to be processed.
require (bool):
Whether to raise an error if the files are not found.
Returns (Tagger):
The newly created object.
"""
# TODO: Change this to expect config.json when we don't have to
# support old data.
path = util.ensure_path(path)
if (path / 'templates.json').exists():
with (path / 'templates.json').open('r', encoding='utf8') as file_:
templates = ujson.load(file_)
elif require:
raise IOError(
"Required file %s/templates.json not found when loading Tagger" % str(path))
else:
templates = cls.feature_templates
self = cls(vocab, model=None, feature_templates=templates)
if (path / 'model').exists():
self.model.load(str(path / 'model'))
elif require:
raise IOError(
"Required file %s/model not found when loading Tagger" % str(path))
return self
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg): def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
""" """Create a Tagger.
Create a Tagger.
Arguments: vocab (Vocab): The vocabulary object. Must be shared with documents to
vocab (Vocab): be processed.
The vocabulary object. Must be shared with documents to be processed. model (thinc.linear.AveragedPerceptron): The statistical model.
model (thinc.linear.AveragedPerceptron): RETURNS (Tagger): The newly constructed object.
The statistical model.
Returns (Tagger):
The newly constructed object.
""" """
if model is None: if model is None:
model = TaggerModel(cfg.get('features', self.feature_templates), model = TaggerModel(cfg.get('features', self.feature_templates),
@ -186,13 +144,9 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length tokens._py_tokens = [None] * tokens.length
def __call__(self, Doc tokens): def __call__(self, Doc tokens):
""" """Apply the tagger, setting the POS tags onto the Doc object.
Apply the tagger, setting the POS tags onto the Doc object.
Arguments: doc (Doc): The tokens to be tagged.
doc (Doc): The tokens to be tagged.
Returns:
None
""" """
if tokens.length == 0: if tokens.length == 0:
return 0 return 0
@ -215,34 +169,25 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length tokens._py_tokens = [None] * tokens.length
def pipe(self, stream, batch_size=1000, n_threads=2): def pipe(self, stream, batch_size=1000, n_threads=2):
""" """Tag a stream of documents.
Tag a stream of documents.
Arguments: Arguments:
stream: The sequence of documents to tag. stream: The sequence of documents to tag.
batch_size (int): batch_size (int): The number of documents to accumulate into a working set.
The number of documents to accumulate into a working set. n_threads (int): The number of threads with which to work on the buffer
n_threads (int): in parallel, if the tagger implementation supports multi-threading.
The number of threads with which to work on the buffer in parallel, YIELDS (Doc): Documents, in order.
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
""" """
for doc in stream: for doc in stream:
self(doc) self(doc)
yield doc yield doc
def update(self, Doc tokens, GoldParse gold, itn=0): def update(self, Doc tokens, GoldParse gold, itn=0):
""" """Update the statistical model, with tags supplied for the given document.
Update the statistical model, with tags supplied for the given document.
Arguments: doc (Doc): The document to update on.
doc (Doc): gold (GoldParse): Manager for the gold-standard tags.
The document to update on. RETURNS (int): Number of tags predicted correctly.
gold (GoldParse):
Manager for the gold-standard tags.
Returns (int):
Number of tags correct.
""" """
gold_tag_strs = gold.tags gold_tag_strs = gold.tags
assert len(tokens) == len(gold_tag_strs) assert len(tokens) == len(gold_tag_strs)
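As a rough illustration of how `update` and `pipe` fit together, here is a hypothetical training sketch. The `tagger` instance, the `train_data` iterable of `(Doc, GoldParse)` pairs and the `docs` sequence are assumptions, not part of this diff; `update`'s return value is treated as the per-document count of correctly predicted tags, as the docstring states.

# Hypothetical training loop; `tagger`, `train_data` and `docs` are assumed to exist.
n_correct = 0
n_tokens = 0
for itn in range(10):                          # several passes over the data
    for doc, gold in train_data:               # (Doc, GoldParse) pairs
        n_correct += tagger.update(doc, gold, itn=itn)
        n_tokens += len(doc)
print('tag accuracy: %.3f' % (n_correct / float(n_tokens)))

# At prediction time, tag a stream of documents, in order:
for doc in tagger.pipe(docs, batch_size=1000, n_threads=2):
    print([t.tag_ for t in doc])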

View File

@ -99,8 +99,8 @@ def test_doc_token_api_ancestors(en_tokenizer):
assert [t.text for t in doc[1].ancestors] == ["saw"] assert [t.text for t in doc[1].ancestors] == ["saw"]
assert [t.text for t in doc[2].ancestors] == [] assert [t.text for t in doc[2].ancestors] == []
assert doc[2].is_ancestor_of(doc[7]) assert doc[2].is_ancestor(doc[7])
assert not doc[6].is_ancestor_of(doc[2]) assert not doc[6].is_ancestor(doc[2])
def test_doc_token_api_head_setter(en_tokenizer): def test_doc_token_api_head_setter(en_tokenizer):

View File

@ -2,8 +2,6 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import ujson
from cython.operator cimport dereference as deref from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc from cython.operator cimport preincrement as preinc
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
@ -12,75 +10,31 @@ from preshed.maps cimport PreshMap
from .strings cimport hash_string from .strings cimport hash_string
cimport cython cimport cython
from . import util
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
cdef class Tokenizer: cdef class Tokenizer:
"""Segment text, and create Doc objects with the discovered segment
boundaries.
""" """
Segment text, and create Doc objects with the discovered segment boundaries.
"""
@classmethod
def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
infix_finditer=None, token_match=None):
"""
Load a Tokenizer, reading unsupplied components from the path.
Arguments:
path (Path):
The path to load from.
vocab (Vocab):
A storage container for lexical types.
rules (dict):
Exceptions and special-cases for the tokenizer.
token_match:
A boolean function matching strings that becomes tokens.
prefix_search:
Signature of re.compile(string).search
suffix_search:
Signature of re.compile(string).search
infix_finditer:
Signature of re.compile(string).finditer
Returns Tokenizer
"""
path = util.ensure_path(path)
if rules is None:
with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
rules = ujson.load(file_)
if prefix_search in (None, True):
with (path / 'tokenizer' / 'prefix.txt').open() as file_:
entries = file_.read().split('\n')
prefix_search = util.compile_prefix_regex(entries).search
if suffix_search in (None, True):
with (path / 'tokenizer' / 'suffix.txt').open() as file_:
entries = file_.read().split('\n')
suffix_search = util.compile_suffix_regex(entries).search
if infix_finditer in (None, True):
with (path / 'tokenizer' / 'infix.txt').open() as file_:
entries = file_.read().split('\n')
infix_finditer = util.compile_infix_regex(entries).finditer
return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
""" """Create a `Tokenizer`, to create `Doc` objects given unicode text.
Create a Tokenizer, to create Doc objects given unicode text.
Arguments: vocab (Vocab): A storage container for lexical types.
vocab (Vocab): rules (dict): Exceptions and special-cases for the tokenizer.
A storage container for lexical types. prefix_search (callable): A function matching the signature of
rules (dict): `re.compile(string).search` to match prefixes.
Exceptions and special-cases for the tokenizer. suffix_search (callable): A function matching the signature of
prefix_search: `re.compile(string).search` to match suffixes.
A function matching the signature of re.compile(string).search `infix_finditer` (callable): A function matching the signature of
to match prefixes. `re.compile(string).finditer` to find infixes.
suffix_search: token_match (callable): A boolean function matching strings to be
A function matching the signature of re.compile(string).search recognised as tokens.
to match suffixes. RETURNS (Tokenizer): The newly constructed object.
infix_finditer:
A function matching the signature of re.compile(string).finditer EXAMPLE:
to find infixes. >>> tokenizer = Tokenizer(nlp.vocab)
token_match: >>> tokenizer = English().Defaults.create_tokenizer(nlp)
A boolean function matching strings that becomes tokens.
""" """
self.mem = Pool() self.mem = Pool()
self._cache = PreshMap() self._cache = PreshMap()
@ -112,13 +66,10 @@ cdef class Tokenizer:
@cython.boundscheck(False) @cython.boundscheck(False)
def __call__(self, unicode string): def __call__(self, unicode string):
""" """Tokenize a string.
Tokenize a string.
Arguments: string (unicode): The string to tokenize.
string (unicode): The string to tokenize. RETURNS (Doc): A container for linguistic annotations.
Returns:
Doc A container for linguistic annotations.
""" """
if len(string) >= (2 ** 30): if len(string) >= (2 ** 30):
raise ValueError( raise ValueError(
@ -166,18 +117,13 @@ cdef class Tokenizer:
return tokens return tokens
def pipe(self, texts, batch_size=1000, n_threads=2): def pipe(self, texts, batch_size=1000, n_threads=2):
""" """Tokenize a stream of texts.
Tokenize a stream of texts.
Arguments: texts: A sequence of unicode texts.
texts: A sequence of unicode texts. batch_size (int): The number of texts to accumulate in an internal buffer.
batch_size (int): n_threads (int): The number of threads to use, if the implementation
The number of texts to accumulate in an internal buffer. supports multi-threading. The default tokenizer is single-threaded.
n_threads (int): YIELDS (Doc): A sequence of Doc objects, in order.
The number of threads to use, if the implementation supports
multi-threading. The default tokenizer is single-threaded.
Yields:
Doc A sequence of Doc objects, in order.
""" """
for text in texts: for text in texts:
yield self(text) yield self(text)
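A short usage sketch for `__call__` and `pipe`, assuming a pipeline loaded as `nlp` with its tokenizer exposed as `nlp.tokenizer` (the `spacy.load('en')` call requires an installed English model).

import spacy

nlp = spacy.load('en')                        # assumes an installed English model
doc = nlp.tokenizer(u'Hello, world!')         # __call__: one unicode string -> Doc
print([t.text for t in doc])

texts = [u'First text.', u'Second text.', u'Third text.']
for doc in nlp.tokenizer.pipe(texts, batch_size=50):   # yields Doc objects, in order
    print(len(doc))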
@ -321,27 +267,23 @@ cdef class Tokenizer:
self._cache.set(key, cached) self._cache.set(key, cached)
def find_infix(self, unicode string): def find_infix(self, unicode string):
""" """Find internal split points of the string, such as hyphens.
Find internal split points of the string, such as hyphens.
string (unicode): The string to segment. string (unicode): The string to segment.
RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
Returns List[re.MatchObject] and `.end()` methods, denoting the placement of internal segment
A list of objects that have .start() and .end() methods, denoting the separators, e.g. hyphens.
placement of internal segment separators, e.g. hyphens.
""" """
if self.infix_finditer is None: if self.infix_finditer is None:
return 0 return 0
return list(self.infix_finditer(string)) return list(self.infix_finditer(string))
def find_prefix(self, unicode string): def find_prefix(self, unicode string):
""" """Find the length of a prefix that should be segmented from the string,
Find the length of a prefix that should be segmented from the string,
or None if no prefix rules match. or None if no prefix rules match.
Arguments: string (unicode): The string to segment.
string (unicode): The string to segment. RETURNS (int): The length of the prefix if present, otherwise `None`.
Returns (int or None): The length of the prefix if present, otherwise None.
""" """
if self.prefix_search is None: if self.prefix_search is None:
return 0 return 0
@ -349,13 +291,11 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0 return (match.end() - match.start()) if match is not None else 0
def find_suffix(self, unicode string): def find_suffix(self, unicode string):
""" """Find the length of a suffix that should be segmented from the string,
Find the length of a suffix that should be segmented from the string,
or None if no suffix rules match. or None if no suffix rules match.
Arguments: string (unicode): The string to segment.
string (unicode): The string to segment. RETURNS (int): The length of the suffix if present, otherwise `None`.
Returns (int or None): The length of the suffix if present, otherwise None.
""" """
if self.suffix_search is None: if self.suffix_search is None:
return 0 return 0
@ -363,23 +303,17 @@ cdef class Tokenizer:
return (match.end() - match.start()) if match is not None else 0 return (match.end() - match.start()) if match is not None else 0
def _load_special_tokenization(self, special_cases): def _load_special_tokenization(self, special_cases):
""" """Add special-case tokenization rules."""
Add special-case tokenization rules.
"""
for chunk, substrings in sorted(special_cases.items()): for chunk, substrings in sorted(special_cases.items()):
self.add_special_case(chunk, substrings) self.add_special_case(chunk, substrings)
def add_special_case(self, unicode string, substrings): def add_special_case(self, unicode string, substrings):
""" """Add a special-case tokenization rule.
Add a special-case tokenization rule.
Arguments: string (unicode): The string to specially tokenize.
string (unicode): The string to specially tokenize. token_attrs (iterable): A sequence of dicts, where each dict describes
token_attrs: a token and its attributes. The `ORTH` fields of the attributes must
A sequence of dicts, where each dict describes a token and its exactly match the string when they are concatenated.
attributes. The ORTH fields of the attributes must exactly match
the string when they are concatenated.
Returns None
""" """
substrings = list(substrings) substrings = list(substrings)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
@ -390,3 +324,38 @@ cdef class Tokenizer:
self._specials.set(key, cached) self._specials.set(key, cached)
self._cache.set(key, cached) self._cache.set(key, cached)
self._rules[string] = substrings self._rules[string] = substrings
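To illustrate `add_special_case`, here is a hedged sketch of a rule for a hypothetical contraction. The `ORTH` values of the attribute dicts concatenate exactly to the string, as the docstring requires; `nlp` is assumed to be a loaded pipeline.

from spacy.attrs import ORTH

# Hypothetical rule: split "gimme" into two tokens whose ORTH values
# concatenate back to the original string.
nlp.tokenizer.add_special_case(u'gimme', [{ORTH: u'gim'}, {ORTH: u'me'}])
doc = nlp.tokenizer(u'gimme that')
assert [t.text for t in doc] == [u'gim', u'me', u'that']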
def to_disk(self, path):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
"""
raise NotImplementedError()
def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (Tokenizer): The modified `Tokenizer` object.
"""
raise NotImplementedError()
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Tokenizer` object.
"""
raise NotImplementedError()
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Tokenizer): The `Tokenizer` object.
"""
raise NotImplementedError()

View File

@ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
cdef class Doc: cdef class Doc:
""" """A sequence of Token objects. Access sentences and named entities, export
A sequence of `Token` objects. Access sentences and named entities, annotations to numpy arrays, losslessly serialize to compressed binary strings.
export annotations to numpy arrays, losslessly serialize to compressed The `Doc` object holds an array of `TokenC` structs. The Python-level
binary strings. `Token` and `Span` objects are views of this array, i.e. they don't own
the data themselves.
Aside: Internals EXAMPLE: Construction 1
The `Doc` object holds an array of `TokenC` structs. >>> doc = nlp(u'Some text')
The Python-level `Token` and `Span` objects are views of this
array, i.e. they don't own the data themselves.
Code: Construction 1
doc = nlp.tokenizer(u'Some text')
Code: Construction 2
doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)])
Construction 2
>>> from spacy.tokens import Doc
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
""" """
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
""" """Create a Doc object.
Create a Doc object.
Arguments: vocab (Vocab): A vocabulary object, which must match any models you want
vocab: to use (e.g. tokenizer, parser, entity recognizer).
A Vocabulary object, which must match any models you want to words (list or None): A list of unicode strings to add to the document
use (e.g. tokenizer, parser, entity recognizer). as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as
words: words. True means that the word is followed by a space, False means
A list of unicode strings to add to the document as words. If None, it is not. If `None`, defaults to `[True]*len(words)`
defaults to empty list. RETURNS (Doc): The newly constructed object.
spaces:
A list of boolean values, of the same length as words. True
means that the word is followed by a space, False means it is not.
If None, defaults to [True]*len(words)
""" """
self.vocab = vocab self.vocab = vocab
size = 20 size = 20
@ -158,20 +148,26 @@ cdef class Doc:
self.is_parsed = True self.is_parsed = True
def __getitem__(self, object i): def __getitem__(self, object i):
""" """Get a `Token` or `Span` object.
doc[i]
Get the Token object at position i, where i is an integer. i (int or tuple) The index of the token, or the slice of the document to get.
RETURNS (Token or Span): The token at `doc[i]`, or the span at
`doc[start : end]`.
EXAMPLE:
>>> doc[i]
Get the `Token` object at position `i`, where `i` is an integer.
Negative indexing is supported, and follows the usual Python Negative indexing is supported, and follows the usual Python
semantics, i.e. doc[-2] is doc[len(doc) - 2]. semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.
doc[start : end]]
Get a `Span` object, starting at position `start` >>> doc[start : end]
and ending at position `end`, where `start` and Get a `Span` object, starting at position `start` and ending at
`end` are token indices. For instance, position `end`, where `start` and `end` are token indices. For
`doc[2:5]` produces a span consisting of instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`) Stepped slices (e.g. `doc[start : end : step]`) are not supported,
are not supported, as `Span` objects must be contiguous (cannot have gaps). as `Span` objects must be contiguous (cannot have gaps). You can use
You can use negative indices and open-ended ranges, which have their negative indices and open-ended ranges, which have their normal
normal Python semantics. Python semantics.
""" """
if isinstance(i, slice): if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step) start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
@ -186,14 +182,14 @@ cdef class Doc:
return Token.cinit(self.vocab, &self.c[i], i, self) return Token.cinit(self.vocab, &self.c[i], i, self)
def __iter__(self): def __iter__(self):
""" """Iterate over `Token` objects, from which the annotations can be
for token in doc easily accessed. This is the main way of accessing `Token` objects,
Iterate over `Token` objects, from which the annotations can which are the main way annotations are accessed from Python. If faster-
be easily accessed. This is the main way of accessing Token than-Python speeds are required, you can instead access the annotations
objects, which are the main way annotations are accessed from as a numpy array, or access the underlying C data directly from Cython.
Python. If faster-than-Python speeds are required, you can
instead access the annotations as a numpy array, or access the EXAMPLE:
underlying C data directly from Cython. >>> for token in doc
""" """
cdef int i cdef int i
for i in range(self.length): for i in range(self.length):
@ -203,9 +199,12 @@ cdef class Doc:
yield Token.cinit(self.vocab, &self.c[i], i, self) yield Token.cinit(self.vocab, &self.c[i], i, self)
def __len__(self): def __len__(self):
""" """The number of tokens in the document.
len(doc)
The number of tokens in the document. RETURNS (int): The number of tokens in the document.
EXAMPLE:
>>> len(doc)
""" """
return self.length return self.length
@ -228,16 +227,12 @@ cdef class Doc:
return self return self
def similarity(self, other): def similarity(self, other):
""" """Make a semantic similarity estimate. The default estimate is cosine
Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors. similarity using an average of word vectors.
Arguments: other (object): The object to compare with. By default, accepts `Doc`,
other (object): The object to compare with. By default, accepts Doc, `Span`, `Token` and `Lexeme` objects.
Span, Token and Lexeme objects. RETURNS (float): A scalar similarity score. Higher is more similar.
Return:
score (float): A scalar similarity score. Higher is more similar.
""" """
if 'similarity' in self.user_hooks: if 'similarity' in self.user_hooks:
return self.user_hooks['similarity'](self, other) return self.user_hooks['similarity'](self, other)
@ -246,8 +241,10 @@ cdef class Doc:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
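A small sketch of the similarity estimate described above. It assumes a model with word vectors is installed and loaded as `nlp`; without vectors the scores are not meaningful.

import spacy

nlp = spacy.load('en')                   # assumes a model with word vectors
doc1 = nlp(u'I like apples.')
doc2 = nlp(u'I like oranges.')
print(doc1.similarity(doc2))             # scalar score; higher means more similar
print(doc1[2].similarity(doc2[2]))       # also works between Token objects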
property has_vector: property has_vector:
""" """A boolean value indicating whether a word vector is associated with
A boolean value indicating whether a word vector is associated with the object. the object.
RETURNS (bool): Whether a word vector is associated with the object.
""" """
def __get__(self): def __get__(self):
if 'has_vector' in self.user_hooks: if 'has_vector' in self.user_hooks:
@ -256,10 +253,11 @@ cdef class Doc:
return any(token.has_vector for token in self) return any(token.has_vector for token in self)
property vector: property vector:
""" """A real-valued meaning representation. Defaults to an average of the
A real-valued meaning representation. Defaults to an average of the token vectors. token vectors.
Type: numpy.ndarray[ndim=1, dtype='float32'] RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the document's semantics.
""" """
def __get__(self): def __get__(self):
if 'vector' in self.user_hooks: if 'vector' in self.user_hooks:
@ -275,6 +273,10 @@ cdef class Doc:
self._vector = value self._vector = value
property vector_norm: property vector_norm:
"""The L2 norm of the document's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self): def __get__(self):
if 'vector_norm' in self.user_hooks: if 'vector_norm' in self.user_hooks:
return self.user_hooks['vector_norm'](self) return self.user_hooks['vector_norm'](self)
@ -295,34 +297,37 @@ cdef class Doc:
return self.text return self.text
property text: property text:
""" """A unicode representation of the document text.
A unicode representation of the document text.
RETURNS (unicode): The original verbatim text of the document.
""" """
def __get__(self): def __get__(self):
return u''.join(t.text_with_ws for t in self) return u''.join(t.text_with_ws for t in self)
property text_with_ws: property text_with_ws:
""" """An alias of `Doc.text`, provided for duck-type compatibility with
An alias of Doc.text, provided for duck-type compatibility with Span and Token. `Span` and `Token`.
RETURNS (unicode): The original verbatim text of the document.
""" """
def __get__(self): def __get__(self):
return self.text return self.text
property ents: property ents:
""" """Iterate over the entities in the document. Yields named-entity `Span`
Yields named-entity `Span` objects, if the entity recognizer objects, if the entity recognizer has been applied to the document.
has been applied to the document. Iterate over the span to get
individual Token objects, or access the label:
Example: YIELDS (Span): Entities in the document.
from spacy.en import English
nlp = English() EXAMPLE: Iterate over the span to get individual Token objects, or access
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') the label:
ents = list(tokens.ents)
assert ents[0].label == 346 >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
assert ents[0].label_ == 'PERSON' >>> ents = list(tokens.ents)
assert ents[0].orth_ == 'Best' >>> assert ents[0].label == 346
assert ents[0].text == 'Mr. Best' >>> assert ents[0].label_ == 'PERSON'
>>> assert ents[0].orth_ == 'Best'
>>> assert ents[0].text == 'Mr. Best'
""" """
def __get__(self): def __get__(self):
cdef int i cdef int i
@ -387,12 +392,13 @@ cdef class Doc:
self.c[start].ent_iob = 3 self.c[start].ent_iob = 3
property noun_chunks: property noun_chunks:
""" """Iterate over the base noun phrases in the document. Yields base
Yields base noun-phrase #[code Span] objects, if the document noun-phrase `Span` objects, if the document has been syntactically
has been syntactically parsed. A base noun phrase, or parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
'NP chunk', is a noun phrase that does not permit other NPs to not permit other NPs to be nested within it so no NP-level
be nested within it so no NP-level coordination, no prepositional coordination, no prepositional phrases, and no relative clauses.
phrases, and no relative clauses.
YIELDS (Span): Noun chunks in the document.
""" """
def __get__(self): def __get__(self):
if not self.is_parsed: if not self.is_parsed:
@ -411,17 +417,15 @@ cdef class Doc:
yield span yield span
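A brief sketch of iterating the entity and chunk views, assuming a parsing pipeline with an entity recognizer loaded as `nlp`.

doc = nlp(u'Mr. Best flew to New York on Saturday morning.')
print([(ent.text, ent.label_) for ent in doc.ents])   # named-entity spans
print([chunk.text for chunk in doc.noun_chunks])      # base noun phrases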
property sents: property sents:
""" """Iterate over the sentences in the document. Yields sentence `Span`
Yields sentence `Span` objects. Sentence spans have no label. objects. Sentence spans have no label. To improve accuracy on informal
To improve accuracy on informal texts, spaCy calculates sentence texts, spaCy calculates sentence boundaries from the syntactic
boundaries from the syntactic dependency parse. If the parser is disabled, dependency parse. If the parser is disabled, the `sents` iterator will
`sents` iterator will be unavailable. be unavailable.
Example: EXAMPLE:
from spacy.en import English >>> doc = nlp("This is a sentence. Here's another...")
nlp = English() >>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
doc = nlp("This is a sentence. Here's another...")
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
""" """
def __get__(self): def __get__(self):
if 'sents' in self.user_hooks: if 'sents' in self.user_hooks:
@ -467,24 +471,20 @@ cdef class Doc:
@cython.boundscheck(False) @cython.boundscheck(False)
cpdef np.ndarray to_array(self, object py_attr_ids): cpdef np.ndarray to_array(self, object py_attr_ids):
""" """Given a list of M attribute IDs, export the tokens to a numpy
Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document.
`ndarray` of shape (N, M), where `N` is the length The values will be 32-bit integers.
of the document. The values will be 32-bit integers.
Example: attr_ids (list[int]): A list of attribute ID ints.
from spacy import attrs RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
doc = nlp(text) per word, and one column per attribute indicated in the input
# All strings mapped to integers, for easy export to numpy `attr_ids`.
np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
Arguments: EXAMPLE:
attr_ids (list[int]): A list of attribute ID ints. >>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
>>> doc = nlp(text)
Returns: >>> # All strings mapped to integers, for easy export to numpy
feat_array (numpy.ndarray[long, ndim=2]): >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
A feature matrix, with one row per word, and one column per attribute
indicated in the input attr_ids.
""" """
cdef int i, j cdef int i, j
cdef attr_id_t feature cdef attr_id_t feature
@ -499,27 +499,20 @@ cdef class Doc:
return output return output
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
""" """Count the frequencies of a given attribute. Produces a dict of
Produce a dict of {attribute (int): count (ints)} frequencies, keyed `{attribute (int): count (ints)}` frequencies, keyed by the values of
by the values of the given attribute ID. the given attribute ID.
Example: attr_id (int): The attribute ID to key the counts.
from spacy.en import English RETURNS (dict): A dictionary mapping attributes to integer counts.
from spacy import attrs
nlp = English()
tokens = nlp(u'apple apple orange banana')
tokens.count_by(attrs.ORTH)
# {12800L: 1, 11880L: 2, 7561L: 1}
tokens.to_array([attrs.ORTH])
# array([[11880],
# [11880],
# [ 7561],
# [12800]])
Arguments: EXAMPLE:
attr_id >>> from spacy import attrs
int >>> doc = nlp(u'apple apple orange banana')
The attribute ID to key the counts. >>> tokens.count_by(attrs.ORTH)
{12800L: 1, 11880L: 2, 7561L: 1}
>>> tokens.to_array([attrs.ORTH])
array([[11880], [11880], [7561], [12800]])
""" """
cdef int i cdef int i
cdef attr_t attr cdef attr_t attr
@ -567,8 +560,12 @@ cdef class Doc:
self.c[i] = parsed[i] self.c[i] = parsed[i]
def from_array(self, attrs, int[:, :] array): def from_array(self, attrs, int[:, :] array):
""" """Load attributes from a numpy array. Write to a `Doc` object, from an
Write to a `Doc` object, from an `(M, N)` array of attributes. `(M, N)` array of attributes.
attrs (ints): A list of attribute ID ints.
array (numpy.ndarray[ndim=2, dtype='int32']) The attribute values to load.
RETURNS (Doc): Itself.
""" """
cdef int i, col cdef int i, col
cdef attr_id_t attr_id cdef attr_id_t attr_id
@ -597,8 +594,10 @@ cdef class Doc:
return self return self
def to_bytes(self): def to_bytes(self):
""" """Serialize, i.e. export the document contents to a binary string.
Serialize, producing a byte string.
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
all annotations.
""" """
return dill.dumps( return dill.dumps(
(self.text, (self.text,
@ -611,8 +610,10 @@ cdef class Doc:
protocol=-1) protocol=-1)
def from_bytes(self, data): def from_bytes(self, data):
""" """Deserialize, i.e. import the document contents from a binary string.
Deserialize, loading from bytes.
data (bytes): The string to load from.
RETURNS (Doc): Itself.
""" """
if self.length != 0: if self.length != 0:
raise ValueError("Cannot load into non-empty Doc") raise ValueError("Cannot load into non-empty Doc")
@ -640,21 +641,16 @@ cdef class Doc:
return self return self
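A hedged sketch of the serialization round trip: the byte string from `to_bytes` is loaded into a fresh, empty `Doc` sharing the same vocabulary, since loading into a non-empty `Doc` raises a `ValueError`; the `doc` variable is assumed to come from one of the examples above.

from spacy.tokens import Doc

data = doc.to_bytes()                    # losslessly serialized copy of `doc`
new_doc = Doc(doc.vocab)                 # an empty Doc over the same vocab
new_doc.from_bytes(data)                 # returns the modified Doc itself
assert new_doc.text == doc.text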
def merge(self, int start_idx, int end_idx, *args, **attributes): def merge(self, int start_idx, int end_idx, *args, **attributes):
""" """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
Retokenize the document, such that the span at doc.text[start_idx : end_idx] is merged into a single token. If `start_idx` and `end_idx` do not mark
is merged into a single token. If start_idx and end_idx do not mark start start and end token boundaries, the document remains unchanged.
and end token boundaries, the document remains unchanged.
Arguments: start_idx (int): The character index of the start of the slice to merge.
start_idx (int): The character index of the start of the slice to merge. end_idx (int): The character index after the end of the slice to merge.
end_idx (int): The character index after the end of the slice to merge. **attributes: Attributes to assign to the merged token. By default,
**attributes: attributes are inherited from the syntactic root token of the span.
Attributes to assign to the merged token. By default, attributes RETURNS (Token): The newly merged token, or `None` if the start and end
are inherited from the syntactic root token of the span. indices did not fall at token boundaries.
Returns:
token (Token):
The newly merged token, or None if the start and end indices did
not fall at token boundaries.
""" """
cdef unicode tag, lemma, ent_type cdef unicode tag, lemma, ent_type
if len(args) == 3: if len(args) == 3:
@ -758,7 +754,29 @@ cdef class Doc:
return self[start] return self[start]
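To make the character-offset arguments concrete, here is a sketch that merges "New York" into a single token. `span.start_char` and `span.end_char` supply the offsets, and attributes are inherited from the span's syntactic root, per the docstring; `nlp` and the exact tokenization are assumptions about an installed English model.

doc = nlp(u'I like New York in Autumn.')
span = doc[2:4]                               # the tokens "New" and "York" (assumed tokenization)
doc.merge(span.start_char, span.end_char)     # character offsets of the slice
assert doc[2].text == u'New York'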
def print_tree(self, light=False, flat=False): def print_tree(self, light=False, flat=False):
"""Returns the parse trees in the JSON (Dict) format.""" """Returns the parse trees in JSON (dict) format.
light (bool): Don't include lemmas or entities.
flat (bool): Don't include arcs or modifiers.
RETURNS (dict): Parse tree as dict.
EXAMPLE:
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> trees[1]
{'modifiers': [
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
{'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
"""
return parse_tree(self, light=light, flat=flat) return parse_tree(self, light=light, flat=flat)

View File

@ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
def merge_ents(doc): def merge_ents(doc):
""" """Helper: merge adjacent entities into single tokens; modifies the doc."""
Helper: merge adjacent entities into single tokens; modifies the doc.
"""
for ent in doc.ents: for ent in doc.ents:
ent.merge(ent.root.tag_, ent.text, ent.label_) ent.merge(ent.root.tag_, ent.text, ent.label_)
return doc return doc
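A hedged usage sketch for `merge_ents`, assuming `nlp` is a loaded pipeline with an entity recognizer and that the helper is importable from this module (the path below is an assumption): after the call, each multi-token entity is collapsed into a single token.

from spacy.tokens.printers import merge_ents  # assumed module path

doc = nlp(u'Mr. Best flew to New York on Saturday morning.')
merge_ents(doc)                               # e.g. "New York" becomes one token
print([t.text for t in doc])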
def format_POS(token, light, flat): def format_POS(token, light, flat):
""" """Helper: form the POS output for a token."""
Helper: form the POS output for a token.
"""
subtree = dict([ subtree = dict([
("word", token.text), ("word", token.text),
("lemma", token.lemma_), # trigger ("lemma", token.lemma_), # trigger
@ -37,9 +33,8 @@ def format_POS(token, light, flat):
def POS_tree(root, light=False, flat=False): def POS_tree(root, light=False, flat=False):
""" """Helper: generate a POS tree for a root token. The doc must have
Helper: generate a POS tree for a root token. The doc must have `merge_ents(doc)` ran on it.
merge_ents(doc) ran on it.
""" """
subtree = format_POS(root, light=light, flat=flat) subtree = format_POS(root, light=light, flat=flat)
for c in root.children: for c in root.children:
@ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False):
def parse_tree(doc, light=False, flat=False): def parse_tree(doc, light=False, flat=False):
""" """Makes a copy of the doc, then construct a syntactic parse tree, similar to
Makes a copy of the doc, then construct a syntactic parse tree, similar to
the one used in displaCy. Generates the POS tree for all sentences in a doc. the one used in displaCy. Generates the POS tree for all sentences in a doc.
Args: doc (Doc): The doc for parsing.
doc: The doc for parsing. RETURNS (dict): The parse tree.
Returns: EXAMPLE:
[parse_trees (Dict)]: >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
>>> trees = doc.print_tree()
>>> from spacy.en import English >>> trees[1]
>>> nlp = English() {'modifiers': [
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.') {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
>>> trees = doc.print_tree() 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
[{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] {'modifiers': [
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
'POS_fine': 'NN', 'lemma': 'pizza'},
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
'POS_fine': 'VBD', 'lemma': 'eat'}
""" """
doc_clone = Doc(doc.vocab, words=[w.text for w in doc]) doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],

View File

@ -20,22 +20,17 @@ from .. import about
cdef class Span: cdef class Span:
""" """A slice from a Doc object."""
A slice from a Doc object.
"""
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None, def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
vector_norm=None): vector_norm=None):
""" """Create a `Span` object from the slice `doc[start : end]`.
Create a Span object from the slice doc[start : end]
Arguments: doc (Doc): The parent document.
doc (Doc): The parent document. start (int): The index of the first token of the span.
start (int): The index of the first token of the span. end (int): The index of the first token after the span.
end (int): The index of the first token after the span. label (int): A label to attach to the Span, e.g. for named entities.
label (int): A label to attach to the Span, e.g. for named entities. vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. RETURNS (Span): The newly constructed object.
Returns:
Span The newly constructed object.
""" """
if not (0 <= start <= end <= len(doc)): if not (0 <= start <= end <= len(doc)):
raise IndexError raise IndexError
@ -70,8 +65,11 @@ cdef class Span:
def __hash__(self): def __hash__(self):
return hash((self.doc, self.label, self.start_char, self.end_char)) return hash((self.doc, self.label, self.start_char, self.end_char))
def __len__(self): def __len__(self):
"""Get the number of tokens in the span.
RETURNS (int): The number of tokens in the span.
"""
self._recalculate_indices() self._recalculate_indices()
if self.end < self.start: if self.end < self.start:
return 0 return 0
@ -83,6 +81,16 @@ cdef class Span:
return self.text.encode('utf-8') return self.text.encode('utf-8')
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a `Token` or a `Span` object
i (int or tuple): The index of the token within the span, or slice of
the span to get.
RETURNS (Token or Span): The token at `span[i]`.
EXAMPLE:
>>> span[0]
>>> span[1:3]
"""
self._recalculate_indices() self._recalculate_indices()
if isinstance(i, slice): if isinstance(i, slice):
start, end = normalize_slice(len(self), i.start, i.stop, i.step) start, end = normalize_slice(len(self), i.start, i.stop, i.step)
@ -94,35 +102,31 @@ cdef class Span:
return self.doc[self.start + i] return self.doc[self.start + i]
def __iter__(self): def __iter__(self):
"""Iterate over `Token` objects.
YIELDS (Token): A `Token` object.
"""
self._recalculate_indices() self._recalculate_indices()
for i in range(self.start, self.end): for i in range(self.start, self.end):
yield self.doc[i] yield self.doc[i]
def merge(self, *args, **attributes): def merge(self, *args, **attributes):
""" """Retokenize the document, such that the span is merged into a single
Retokenize the document, such that the span is merged into a single token. token.
Arguments: **attributes: Attributes to assign to the merged token. By default,
**attributes: attributes are inherited from the syntactic root token of the span.
Attributes to assign to the merged token. By default, attributes RETURNS (Token): The newly merged token.
are inherited from the syntactic root token of the span.
Returns:
token (Token):
The newly merged token.
""" """
return self.doc.merge(self.start_char, self.end_char, *args, **attributes) return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
def similarity(self, other): def similarity(self, other):
""" """Make a semantic similarity estimate. The default estimate is cosine
Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors. similarity using an average of word vectors.
Arguments: other (object): The object to compare with. By default, accepts `Doc`,
other (object): The object to compare with. By default, accepts Doc, `Span`, `Token` and `Lexeme` objects.
Span, Token and Lexeme objects. RETURNS (float): A scalar similarity score. Higher is more similar.
Return:
score (float): A scalar similarity score. Higher is more similar.
""" """
if 'similarity' in self.doc.user_span_hooks: if 'similarity' in self.doc.user_span_hooks:
self.doc.user_span_hooks['similarity'](self, other) self.doc.user_span_hooks['similarity'](self, other)
@ -145,11 +149,9 @@ cdef class Span:
self.end = end + 1 self.end = end + 1
property sent: property sent:
""" """The sentence span that this span is a part of.
The sentence span that this span is a part of.
Returns: RETURNS (Span): The sentence span that the span is a part of.
Span The sentence this is part of.
""" """
def __get__(self): def __get__(self):
if 'sent' in self.doc.user_span_hooks: if 'sent' in self.doc.user_span_hooks:
@ -166,12 +168,23 @@ cdef class Span:
return self.doc[root.l_edge : root.r_edge + 1] return self.doc[root.l_edge : root.r_edge + 1]
property has_vector: property has_vector:
"""A boolean value indicating whether a word vector is associated with
the object.
RETURNS (bool): Whether a word vector is associated with the object.
"""
def __get__(self): def __get__(self):
if 'has_vector' in self.doc.user_span_hooks: if 'has_vector' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['has_vector'](self) return self.doc.user_span_hooks['has_vector'](self)
return any(token.has_vector for token in self) return any(token.has_vector for token in self)
property vector: property vector:
"""A real-valued meaning representation. Defaults to an average of the
token vectors.
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the span's semantics.
"""
def __get__(self): def __get__(self):
if 'vector' in self.doc.user_span_hooks: if 'vector' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['vector'](self) return self.doc.user_span_hooks['vector'](self)
@ -180,6 +193,10 @@ cdef class Span:
return self._vector return self._vector
property vector_norm: property vector_norm:
"""The L2 norm of the document's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self): def __get__(self):
if 'vector_norm' in self.doc.user_span_hooks: if 'vector_norm' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['vector'](self) return self.doc.user_span_hooks['vector'](self)
@ -193,6 +210,7 @@ cdef class Span:
return self._vector_norm return self._vector_norm
property sentiment: property sentiment:
# TODO: docstring
def __get__(self): def __get__(self):
if 'sentiment' in self.doc.user_span_hooks: if 'sentiment' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sentiment'](self) return self.doc.user_span_hooks['sentiment'](self)
@ -200,6 +218,10 @@ cdef class Span:
return sum([token.sentiment for token in self]) / len(self) return sum([token.sentiment for token in self]) / len(self)
property text: property text:
"""A unicode representation of the span text.
RETURNS (unicode): The original verbatim text of the span.
"""
def __get__(self): def __get__(self):
text = self.text_with_ws text = self.text_with_ws
if self[-1].whitespace_: if self[-1].whitespace_:
@ -207,16 +229,21 @@ cdef class Span:
return text return text
property text_with_ws: property text_with_ws:
"""The text content of the span with a trailing whitespace character if
the last token has one.
RETURNS (unicode): The text content of the span (with trailing whitespace).
"""
def __get__(self): def __get__(self):
return u''.join([t.text_with_ws for t in self]) return u''.join([t.text_with_ws for t in self])
property noun_chunks: property noun_chunks:
""" """Yields base noun-phrase `Span` objects, if the document has been
Yields base noun-phrase #[code Span] objects, if the document syntactically parsed. A base noun phrase, or "NP chunk", is a noun
has been syntactically parsed. A base noun phrase, or phrase that does not permit other NPs to be nested within it so no
'NP chunk', is a noun phrase that does not permit other NPs to NP-level coordination, no prepositional phrases, and no relative clauses.
be nested within it so no NP-level coordination, no prepositional
phrases, and no relative clauses. For example: YIELDS (Span): Base noun-phrase `Span` objects
""" """
def __get__(self): def __get__(self):
if not self.doc.is_parsed: if not self.doc.is_parsed:
@ -235,49 +262,47 @@ cdef class Span:
yield span yield span
property root: property root:
""" """The token within the span that's highest in the parse tree.
The token within the span that's highest in the parse tree. If there's a If there's a tie, the earliest is preferred.
tie, the earlist is prefered.
Returns: RETURNS (Token): The root token.
Token: The root token.
i.e. has the shortest path to the root of the sentence (or is the root EXAMPLE: The root token has the shortest path to the root of the sentence
itself). If multiple words are equally high in the tree, the first word (or is the root itself). If multiple words are equally high in the
is taken. For example: tree, the first word is taken. For example:
>>> toks = nlp(u'I like New York in Autumn.') >>> toks = nlp(u'I like New York in Autumn.')
Let's name the indices --- easier than writing "toks[4]" etc. Let's name the indices, which is easier than writing `toks[4]` etc.
>>> i, like, new, york, in_, autumn, dot = range(len(toks)) >>> i, like, new, york, in_, autumn, dot = range(len(toks))
The head of 'new' is 'York', and the head of 'York' is 'like' The head of 'new' is 'York', and the head of "York" is "like"
>>> toks[new].head.orth_ >>> toks[new].head.text
'York' 'York'
>>> toks[york].head.orth_ >>> toks[york].head.text
'like' 'like'
Create a span for "New York". Its root is "York". Create a span for "New York". Its root is "York".
>>> new_york = toks[new:york+1] >>> new_york = toks[new:york+1]
>>> new_york.root.orth_ >>> new_york.root.text
'York' 'York'
Here's a more complicated case, raise by Issue #214 Here's a more complicated case, raised by issue #214:
>>> toks = nlp(u'to, north and south carolina') >>> toks = nlp(u'to, north and south carolina')
>>> to, north, and_, south, carolina = toks >>> to, north, and_, south, carolina = toks
>>> south.head.text, carolina.head.text >>> south.head.text, carolina.head.text
('north', 'to') ('north', 'to')
Here 'south' is a child of 'north', which is a child of 'carolina'. Here "south" is a child of "north", which is a child of "carolina".
Carolina is the root of the span: Carolina is the root of the span:
>>> south_carolina = toks[-2:] >>> south_carolina = toks[-2:]
>>> south_carolina.root.text >>> south_carolina.root.text
'carolina' 'carolina'
""" """
def __get__(self): def __get__(self):
self._recalculate_indices() self._recalculate_indices()
@ -314,10 +339,10 @@ cdef class Span:
return self.doc[root] return self.doc[root]
property lefts: property lefts:
""" """ Tokens that are to the left of the span, whose head is within the
Tokens that are to the left of the span, whose head is within the Span. `Span`.
Yields: Token A left-child of a token of the span. YIELDS (Token): A left-child of a token of the span.
""" """
def __get__(self): def __get__(self):
for token in reversed(self): # Reverse, so we get the tokens in order for token in reversed(self): # Reverse, so we get the tokens in order
@ -326,10 +351,10 @@ cdef class Span:
yield left yield left
property rights: property rights:
""" """Tokens that are to the right of the Span, whose head is within the
Tokens that are to the right of the Span, whose head is within the Span. `Span`.
Yields: Token A right-child of a token of the span. YIELDS (Token): A right-child of a token of the span.
""" """
def __get__(self): def __get__(self):
for token in self: for token in self:
@ -338,10 +363,9 @@ cdef class Span:
yield right yield right
property subtree: property subtree:
""" """Tokens that descend from tokens in the span, but fall outside it.
Tokens that descend from tokens in the span, but fall outside it.
Yields: Token A descendant of a token within the span. YIELDS (Token): A descendant of a token within the span.
""" """
def __get__(self): def __get__(self):
for word in self.lefts: for word in self.lefts:
@ -351,8 +375,9 @@ cdef class Span:
yield from word.subtree yield from word.subtree
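A quick sketch of the span navigation properties, assuming a parsing pipeline loaded as `nlp`. The exact output depends on the parse, so the values are printed rather than asserted.

doc = nlp(u'I like New York in Autumn.')
span = doc[2:4]                               # "New York"
print([t.text for t in span.lefts])           # tokens left of the span, headed inside it
print([t.text for t in span.rights])          # tokens right of the span, headed inside it
print([t.text for t in span.subtree])         # tokens descending from the span's tokens
print(span.root.text)                         # the span's highest token in the parse tree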
property ent_id: property ent_id:
""" """An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
An (integer) entity ID. Usually assigned by patterns in the Matcher.
RETURNS (int): The entity ID.
""" """
def __get__(self): def __get__(self):
return self.root.ent_id return self.root.ent_id
@ -362,9 +387,11 @@ cdef class Span:
raise NotImplementedError( raise NotImplementedError(
"Can't yet set ent_id from Span. Vote for this feature on the issue " "Can't yet set ent_id from Span. Vote for this feature on the issue "
"tracker: http://github.com/explosion/spaCy/issues") "tracker: http://github.com/explosion/spaCy/issues")
property ent_id_: property ent_id_:
""" """A (string) entity ID. Usually assigned by patterns in the `Matcher`.
A (string) entity ID. Usually assigned by patterns in the Matcher.
RETURNS (unicode): The entity ID.
""" """
def __get__(self): def __get__(self):
return self.root.ent_id_ return self.root.ent_id_
@ -376,26 +403,38 @@ cdef class Span:
"tracker: http://github.com/explosion/spaCy/issues") "tracker: http://github.com/explosion/spaCy/issues")
property orth_: property orth_:
# TODO: docstring
def __get__(self): def __get__(self):
return ''.join([t.string for t in self]).strip() return ''.join([t.string for t in self]).strip()
property lemma_: property lemma_:
"""The span's lemma.
RETURNS (unicode): The span's lemma.
"""
def __get__(self): def __get__(self):
return ' '.join([t.lemma_ for t in self]).strip() return ' '.join([t.lemma_ for t in self]).strip()
property upper_: property upper_:
# TODO: docstring
def __get__(self): def __get__(self):
return ''.join([t.string.upper() for t in self]).strip() return ''.join([t.string.upper() for t in self]).strip()
property lower_: property lower_:
# TODO: docstring
def __get__(self): def __get__(self):
return ''.join([t.string.lower() for t in self]).strip() return ''.join([t.string.lower() for t in self]).strip()
property string: property string:
# TODO: docstring
def __get__(self): def __get__(self):
return ''.join([t.string for t in self]) return ''.join([t.string for t in self])
property label_: property label_:
"""The span's label.
RETURNS (unicode): The span's label.
"""
def __get__(self): def __get__(self):
return self.doc.vocab.strings[self.label] return self.doc.vocab.strings[self.label]

View File

@ -23,10 +23,14 @@ from .. import about
cdef class Token: cdef class Token:
""" """An individual token i.e. a word, punctuation symbol, whitespace, etc."""
An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
"""
def __cinit__(self, Vocab vocab, Doc doc, int offset): def __cinit__(self, Vocab vocab, Doc doc, int offset):
"""Construct a `Token` object.
vocab (Vocab): A storage container for lexical types.
doc (Doc): The parent document.
offset (int): The index of the token within the document.
"""
self.vocab = vocab self.vocab = vocab
self.doc = doc self.doc = doc
self.c = &self.doc.c[offset] self.c = &self.doc.c[offset]
@ -36,8 +40,9 @@ cdef class Token:
return hash((self.doc, self.i)) return hash((self.doc, self.i))
def __len__(self): def __len__(self):
""" """The number of unicode characters in the token, i.e. `token.text`.
Number of unicode characters in token.text.
RETURNS (int): The number of unicode characters in the token.
""" """
return self.c.lex.length return self.c.lex.length
@ -75,37 +80,35 @@ cdef class Token:
raise ValueError(op) raise ValueError(op)
cpdef bint check_flag(self, attr_id_t flag_id) except -1: cpdef bint check_flag(self, attr_id_t flag_id) except -1:
""" """Check the value of a boolean flag.
Check the value of a boolean flag.
Arguments: flag_id (int): The ID of the flag attribute.
flag_id (int): The ID of the flag attribute. RETURNS (bool): Whether the flag is set.
Returns:
is_set (bool): Whether the flag is set. EXAMPLE:
>>> from spacy.attrs import IS_TITLE
>>> doc = nlp(u'Give it back! He pleaded.')
>>> token = doc[0]
>>> token.check_flag(IS_TITLE)
True
""" """
return Lexeme.c_check_flag(self.c.lex, flag_id) return Lexeme.c_check_flag(self.c.lex, flag_id)
def nbor(self, int i=1): def nbor(self, int i=1):
""" """Get a neighboring token.
Get a neighboring token.
Arguments: i (int): The relative position of the token to get. Defaults to 1.
i (int): The relative position of the token to get. Defaults to 1. RETURNS (Token): The token at position `self.doc[self.i+i]`.
Returns:
neighbor (Token): The token at position self.doc[self.i+i]
""" """
return self.doc[self.i+i] return self.doc[self.i+i]
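A short sketch of `nbor`, reusing the example sentence from `check_flag` above; `nlp` is assumed to be a loaded pipeline.

doc = nlp(u'Give it back! He pleaded.')
token = doc[1]                                # "it"
assert token.nbor().text == doc[2].text       # i defaults to 1: the next token
assert token.nbor(-1).text == doc[0].text     # negative offsets look backwards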
def similarity(self, other): def similarity(self, other):
""" """Make a semantic similarity estimate. The default estimate is cosine
Compute a semantic similarity estimate. Defaults to cosine over vectors. similarity using an average of word vectors.
Arguments: other (object): The object to compare with. By default, accepts `Doc`,
other: `Span`, `Token` and `Lexeme` objects.
The object to compare with. By default, accepts Doc, Span, RETURNS (float): A scalar similarity score. Higher is more similar.
Token and Lexeme objects.
Returns:
score (float): A scalar similarity score. Higher is more similar.
""" """
if 'similarity' in self.doc.user_token_hooks: if 'similarity' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['similarity'](self) return self.doc.user_token_hooks['similarity'](self)
@ -114,10 +117,14 @@ cdef class Token:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
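        # Usage sketch, assuming an `nlp` pipeline whose vocabulary includes word vectors:
        #     doc = nlp(u'dog cat')
        #     score = doc[0].similarity(doc[1])   # scalar; higher means more similar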
property lex_id: property lex_id:
"""ID of the token's lexical type.
RETURNS (int): ID of the token's lexical type."""
def __get__(self): def __get__(self):
return self.c.lex.id return self.c.lex.id
property rank: property rank:
# TODO: add docstring
def __get__(self): def __get__(self):
return self.c.lex.id return self.c.lex.id
@ -126,10 +133,19 @@ cdef class Token:
return self.text_with_ws return self.text_with_ws
property text: property text:
"""A unicode representation of the token text.
RETURNS (unicode): The original verbatim text of the token.
"""
def __get__(self): def __get__(self):
return self.orth_ return self.orth_
property text_with_ws: property text_with_ws:
"""The text content of the token with a trailing whitespace character if
it has one.
        RETURNS (unicode): The text content of the token (with trailing whitespace).
"""
def __get__(self): def __get__(self):
cdef unicode orth = self.vocab.strings[self.c.lex.orth] cdef unicode orth = self.vocab.strings[self.c.lex.orth]
if self.c.spacy: if self.c.spacy:
@ -184,6 +200,10 @@ cdef class Token:
return self.c.lex.suffix return self.c.lex.suffix
property lemma: property lemma:
"""Base form of the word, with no inflectional suffixes.
RETURNS (int): Token lemma.
"""
def __get__(self): def __get__(self):
return self.c.lemma return self.c.lemma
def __set__(self, int lemma): def __set__(self, int lemma):
@ -206,8 +226,10 @@ cdef class Token:
self.c.dep = label self.c.dep = label
property has_vector: property has_vector:
""" """A boolean value indicating whether a word vector is associated with
A boolean value indicating whether a word vector is associated with the object. the object.
RETURNS (bool): Whether a word vector is associated with the object.
""" """
def __get__(self): def __get__(self):
if 'has_vector' in self.doc.user_token_hooks: if 'has_vector' in self.doc.user_token_hooks:
@ -220,10 +242,10 @@ cdef class Token:
return False return False
property vector: property vector:
""" """A real-valued meaning representation.
A real-valued meaning representation.
Type: numpy.ndarray[ndim=1, dtype='float32'] RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
representing the token's semantics.
""" """
def __get__(self): def __get__(self):
if 'vector' in self.doc.user_token_hooks: if 'vector' in self.doc.user_token_hooks:
@ -239,15 +261,11 @@ cdef class Token:
vector_view = <float[:length,]>self.c.lex.vector vector_view = <float[:length,]>self.c.lex.vector
return numpy.asarray(vector_view) return numpy.asarray(vector_view)
property repvec:
def __get__(self):
raise AttributeError("repvec was renamed to vector in v0.100")
property has_repvec:
def __get__(self):
raise AttributeError("has_repvec was renamed to has_vector in v0.100")
property vector_norm: property vector_norm:
"""The L2 norm of the token's vector representation.
RETURNS (float): The L2 norm of the vector representation.
"""
def __get__(self): def __get__(self):
if 'vector_norm' in self.doc.user_token_hooks: if 'vector_norm' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector_norm'](self) return self.doc.user_token_hooks['vector_norm'](self)
@ -324,28 +342,26 @@ cdef class Token:
yield from word.subtree yield from word.subtree
property left_edge: property left_edge:
""" """The leftmost token of this token's syntactic descendents.
The leftmost token of this token's syntactic descendents.
Returns: Token The first token such that self.is_ancestor(token) RETURNS (Token): The first token such that `self.is_ancestor(token)`.
""" """
def __get__(self): def __get__(self):
return self.doc[self.c.l_edge] return self.doc[self.c.l_edge]
property right_edge: property right_edge:
""" """The rightmost token of this token's syntactic descendents.
The rightmost token of this token's syntactic descendents.
Returns: Token The last token such that self.is_ancestor(token) RETURNS (Token): The last token such that `self.is_ancestor(token)`.
""" """
def __get__(self): def __get__(self):
return self.doc[self.c.r_edge] return self.doc[self.c.r_edge]
property ancestors: property ancestors:
""" """A sequence of this token's syntactic ancestors.
A sequence of this token's syntactic ancestors.
Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self) YIELDS (Token): A sequence of ancestor tokens such that
`ancestor.is_ancestor(self)`.
""" """
def __get__(self): def __get__(self):
cdef const TokenC* head_ptr = self.c cdef const TokenC* head_ptr = self.c
@ -357,33 +373,25 @@ cdef class Token:
yield self.doc[head_ptr - (self.c - self.i)] yield self.doc[head_ptr - (self.c - self.i)]
i += 1 i += 1
def is_ancestor_of(self, descendant):
# TODO: Remove after backward compatibility check.
return self.is_ancestor(descendant)
def is_ancestor(self, descendant): def is_ancestor(self, descendant):
""" """Check whether this token is a parent, grandparent, etc. of another
Check whether this token is a parent, grandparent, etc. of another
in the dependency tree. in the dependency tree.
Arguments: descendant (Token): Another token.
descendant (Token): Another token. RETURNS (bool): Whether this token is the ancestor of the descendant.
Returns:
is_ancestor (bool): Whether this token is the ancestor of the descendant.
""" """
if self.doc is not descendant.doc: if self.doc is not descendant.doc:
return False return False
return any( ancestor.i == self.i for ancestor in descendant.ancestors ) return any( ancestor.i == self.i for ancestor in descendant.ancestors )
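        # Usage sketch, assuming an `nlp` pipeline with a dependency parser (the
        # exact tree depends on the model):
        #     doc = nlp(u'He ate pizza')
        #     doc[1].is_ancestor(doc[2])    # True if 'ate' governs 'pizza'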
property head: property head:
""" """The syntactic parent, or "governor", of this token.
The syntactic parent, or "governor", of this token.
Returns: Token RETURNS (Token): The token head.
""" """
def __get__(self): def __get__(self):
""" """The token predicted by the parser to be the head of the current
The token predicted by the parser to be the head of the current token. token.
""" """
return self.doc[self.i + self.c.head] return self.doc[self.i + self.c.head]
def __set__(self, Token new_head): def __set__(self, Token new_head):
@ -399,7 +407,7 @@ cdef class Token:
cdef int rel_newhead_i = new_head.i - self.i cdef int rel_newhead_i = new_head.i - self.i
# is the new head a descendant of the old head # is the new head a descendant of the old head
cdef bint is_desc = old_head.is_ancestor_of(new_head) cdef bint is_desc = old_head.is_ancestor(new_head)
cdef int new_edge cdef int new_edge
cdef Token anc, child cdef Token anc, child
@ -477,10 +485,9 @@ cdef class Token:
self.c.head = rel_newhead_i self.c.head = rel_newhead_i
property conjuncts: property conjuncts:
""" """A sequence of coordinated tokens, including the token itself.
A sequence of coordinated tokens, including the token itself.
Yields: Token A coordinated token YIELDS (Token): A coordinated token.
""" """
def __get__(self): def __get__(self):
"""Get a list of conjoined words.""" """Get a list of conjoined words."""
@ -495,25 +502,46 @@ cdef class Token:
yield from word.conjuncts yield from word.conjuncts
property ent_type: property ent_type:
"""Named entity type.
RETURNS (int): Named entity type.
"""
def __get__(self): def __get__(self):
return self.c.ent_type return self.c.ent_type
property ent_iob: property ent_iob:
"""IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
is assigned.
RETURNS (int): IOB code of named entity tag.
"""
def __get__(self): def __get__(self):
return self.c.ent_iob return self.c.ent_iob
property ent_type_: property ent_type_:
"""Named entity type.
RETURNS (unicode): Named entity type.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.ent_type] return self.vocab.strings[self.c.ent_type]
property ent_iob_: property ent_iob_:
"""IOB code of named entity tag. "B" means the token begins an entity,
"I" means it is inside an entity, "O" means it is outside an entity, and
"" means no entity tag is set.
RETURNS (unicode): IOB code of named entity tag.
"""
def __get__(self): def __get__(self):
iob_strings = ('', 'I', 'O', 'B') iob_strings = ('', 'I', 'O', 'B')
return iob_strings[self.c.ent_iob] return iob_strings[self.c.ent_iob]
property ent_id: property ent_id:
""" """ID of the entity the token is an instance of, if any. Usually
An (integer) entity ID. Usually assigned by patterns in the Matcher. assigned by patterns in the Matcher.
RETURNS (int): ID of the entity.
""" """
def __get__(self): def __get__(self):
return self.c.ent_id return self.c.ent_id
@ -522,8 +550,10 @@ cdef class Token:
self.c.ent_id = key self.c.ent_id = key
property ent_id_: property ent_id_:
""" """ID of the entity the token is an instance of, if any. Usually
A (string) entity ID. Usually assigned by patterns in the Matcher. assigned by patterns in the Matcher.
RETURNS (unicode): ID of the entity.
""" """
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.ent_id] return self.vocab.strings[self.c.ent_id]
@ -564,6 +594,10 @@ cdef class Token:
return self.vocab.strings[self.c.lex.lang] return self.vocab.strings[self.c.lex.lang]
property lemma_: property lemma_:
"""Base form of the word, with no inflectional suffixes.
RETURNS (unicode): Token lemma.
"""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lemma] return self.vocab.strings[self.c.lemma]
def __set__(self, unicode lemma_): def __set__(self, unicode lemma_):

View File

@ -145,7 +145,8 @@ def parse_package_meta(package_path, require=True):
def is_in_jupyter(): def is_in_jupyter():
"""Check if user is in a Jupyter notebook. Mainly used for displaCy. """Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.
RETURNS (bool): True if in Jupyter, False if not. RETURNS (bool): True if in Jupyter, False if not.
""" """

View File

@ -36,79 +36,22 @@ EMPTY_LEXEME.vector = EMPTY_VEC
cdef class Vocab: cdef class Vocab:
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
instance also provides access to the `StringStore`, and owns underlying
C-data that is shared between `Doc` objects.
""" """
A map container for a language's LexemeC structs.
"""
@classmethod
def load(cls, path, lex_attr_getters=None, lemmatizer=True,
tag_map=True, oov_prob=True, **deprecated_kwargs):
"""
Deprecated --- replace in spaCy 2
Load the vocabulary from a path.
Arguments:
path (Path):
The path to load from.
lex_attr_getters (dict):
A dictionary mapping attribute IDs to functions to compute them.
Defaults to None.
lemmatizer (object):
A lemmatizer. Defaults to None.
tag_map (dict):
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
and optionally morphological attributes.
oov_prob (float):
The default probability for out-of-vocabulary words.
Returns:
Vocab: The newly constructed vocab object.
"""
path = util.ensure_path(path)
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
if 'vectors' in deprecated_kwargs:
raise AttributeError(
"vectors argument to Vocab.load() deprecated. "
"Install vectors after loading.")
if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
tag_map = ujson.load(file_)
elif tag_map is True:
tag_map = None
if lex_attr_getters is not None \
and oov_prob is True \
and (path / 'vocab' / 'oov_prob').exists():
with (path / 'vocab' / 'oov_prob').open('r', encoding='utf8') as file_:
oov_prob = float(file_.read())
lex_attr_getters[PROB] = lambda text: oov_prob
if lemmatizer is True:
lemmatizer = Lemmatizer.load(path)
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
strings_list = ujson.load(file_)
cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
lemmatizer=lemmatizer,
strings=strings_list)
self.load_lexemes(path / 'vocab' / 'lexemes.bin')
return self
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
strings=tuple(), **deprecated_kwargs): strings=tuple(), **deprecated_kwargs):
""" """Create the vocabulary.
Create the vocabulary.
lex_attr_getters (dict): lex_attr_getters (dict): A dictionary mapping attribute IDs to functions
A dictionary mapping attribute IDs to functions to compute them. to compute them. Defaults to `None`.
Defaults to None. tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained
lemmatizer (object): parts-of-speech, and optionally morphological attributes.
A lemmatizer. Defaults to None. lemmatizer (object): A lemmatizer. Defaults to `None`.
tag_map (dict): strings (StringStore): StringStore that maps strings to integers, and
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, vice versa.
and optionally morphological attributes. RETURNS (Vocab): The newly constructed vocab object.
oov_prob (float):
The default probability for out-of-vocabulary words.
Returns:
Vocab: The newly constructed vocab object.
""" """
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs) util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
@ -148,33 +91,32 @@ cdef class Vocab:
return langfunc('_') if langfunc else '' return langfunc('_') if langfunc else ''
def __len__(self): def __len__(self):
""" """The current number of lexemes stored.
The current number of lexemes stored.
RETURNS (int): The current number of lexemes stored.
""" """
return self.length return self.length
def add_flag(self, flag_getter, int flag_id=-1): def add_flag(self, flag_getter, int flag_id=-1):
""" """Set a new boolean flag to words in the vocabulary.
Set a new boolean flag to words in the vocabulary.
The flag_setter function will be called over the words currently in the The flag_getter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able vocab, and then applied to new words as they occur. You'll then be able
to access the flag value on each token, using token.check_flag(flag_id). to access the flag value on each token, using token.check_flag(flag_id).
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
`Token.check_flag`.
See also: flag_getter (callable): A function `f(unicode) -> bool`, to get the flag
Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag. value.
flag_id (int): An integer between 1 and 63 (inclusive), specifying
the bit at which the flag will be stored. If -1, the lowest
available bit will be chosen.
RETURNS (int): The integer ID by which the flag value can be checked.
Arguments: EXAMPLE:
flag_getter: >>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy'])
A function f(unicode) -> bool, to get the flag value. >>> doc = nlp(u'I like spaCy')
>>> assert doc[2].check_flag(MY_PRODUCT) == True
flag_id (int):
An integer between 1 and 63 (inclusive), specifying the bit at which the
flag will be stored. If -1, the lowest available bit will be
chosen.
Returns:
flag_id (int): The integer ID by which the flag value can be checked.
""" """
if flag_id == -1: if flag_id == -1:
for bit in range(1, 64): for bit in range(1, 64):
@ -196,9 +138,8 @@ cdef class Vocab:
return flag_id return flag_id
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
""" """Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. If the pool
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon. is the lexicon's own memory, the lexeme is saved in the lexicon.
""" """
if string == u'': if string == u'':
@ -216,9 +157,8 @@ cdef class Vocab:
return self._new_lexeme(mem, string) return self._new_lexeme(mem, string)
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
""" """Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. If the pool
if necessary, using memory acquired from the given pool. If the pool
is the lexicon's own memory, the lexeme is saved in the lexicon. is the lexicon's own memory, the lexeme is saved in the lexicon.
""" """
if orth == 0: if orth == 0:
@ -263,24 +203,19 @@ cdef class Vocab:
self.length += 1 self.length += 1
def __contains__(self, unicode string): def __contains__(self, unicode string):
""" """Check whether the string has an entry in the vocabulary.
Check whether the string has an entry in the vocabulary.
Arguments: string (unicode): The ID string.
            string (unicode): The ID string. RETURNS (bool): Whether the string has an entry in the vocabulary.
Returns:
bool Whether the string has an entry in the vocabulary.
""" """
key = hash_string(string) key = hash_string(string)
lex = self._by_hash.get(key) lex = self._by_hash.get(key)
return lex is not NULL return lex is not NULL
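        # Usage sketch, assuming an `nlp` object with a populated vocabulary:
        #     u'apple' in nlp.vocab     # True once 'apple' has a lexeme entry
        #     u'blargh' in nlp.vocab    # False until that string has been seen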
def __iter__(self): def __iter__(self):
""" """Iterate over the lexemes in the vocabulary.
Iterate over the lexemes in the vocabulary.
Yields: Lexeme An entry in the vocabulary. YIELDS (Lexeme): An entry in the vocabulary.
""" """
cdef attr_t orth cdef attr_t orth
cdef size_t addr cdef size_t addr
@ -288,19 +223,19 @@ cdef class Vocab:
yield Lexeme(self, orth) yield Lexeme(self, orth)
def __getitem__(self, id_or_string): def __getitem__(self, id_or_string):
""" """Retrieve a lexeme, given an int ID or a unicode string. If a
Retrieve a lexeme, given an int ID or a unicode string. If a previously previously unseen unicode string is given, a new lexeme is created and
unseen unicode string is given, a new lexeme is created and stored. stored.
Arguments: id_or_string (int or unicode): The integer ID of a word, or its unicode
id_or_string (int or unicode): string. If `int >= Lexicon.size`, `IndexError` is raised. If
The integer ID of a word, or its unicode string. `id_or_string` is neither an int nor a unicode string, `ValueError`
is raised.
RETURNS (Lexeme): The lexeme indicated by the given ID.
If an int >= Lexicon.size, IndexError is raised. If id_or_string EXAMPLE:
is neither an int nor a unicode string, ValueError is raised. >>> apple = nlp.vocab.strings['apple']
>>> assert nlp.vocab[apple] == nlp.vocab[u'apple']
Returns:
lexeme (Lexeme): The lexeme indicated by the given ID.
""" """
cdef attr_t orth cdef attr_t orth
if type(id_or_string) == unicode: if type(id_or_string) == unicode:
@ -324,15 +259,29 @@ cdef class Vocab:
return tokens return tokens
def to_disk(self, path): def to_disk(self, path):
"""Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or `Path`-like objects.
"""
path = util.ensure_path(path) path = util.ensure_path(path)
if not path.exists(): if not path.exists():
path.mkdir() path.mkdir()
strings_loc = path / 'strings.json' strings_loc = path / 'strings.json'
with strings_loc.open('w', encoding='utf8') as file_: with strings_loc.open('w', encoding='utf8') as file_:
self.strings.dump(file_) self.strings.dump(file_)
self.dump(path / 'lexemes.bin')
# TODO: pickle
# self.dump(path / 'lexemes.bin')
def from_disk(self, path): def from_disk(self, path):
"""Loads state from a directory. Modifies the object in place and
returns it.
path (unicode or Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (Vocab): The modified `Vocab` object.
"""
path = util.ensure_path(path) path = util.ensure_path(path)
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_: with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
strings_list = ujson.load(file_) strings_list = ujson.load(file_)
@ -340,6 +289,23 @@ cdef class Vocab:
self.strings[string] self.strings[string]
self.load_lexemes(path / 'lexemes.bin') self.load_lexemes(path / 'lexemes.bin')
def to_bytes(self, **exclude):
"""Serialize the current state to a binary string.
**exclude: Named attributes to prevent from being serialized.
RETURNS (bytes): The serialized form of the `Vocab` object.
"""
raise NotImplementedError()
def from_bytes(self, bytes_data, **exclude):
"""Load state from a binary string.
bytes_data (bytes): The data to load from.
**exclude: Named attributes to prevent from being loaded.
RETURNS (Vocab): The `Vocab` object.
"""
raise NotImplementedError()
def lexemes_to_bytes(self, **exclude): def lexemes_to_bytes(self, **exclude):
cdef hash_t key cdef hash_t key
cdef size_t addr cdef size_t addr
@ -365,9 +331,7 @@ cdef class Vocab:
return byte_string return byte_string
def lexemes_from_bytes(self, bytes bytes_data): def lexemes_from_bytes(self, bytes bytes_data):
""" """Load the binary vocabulary data from the given string."""
Load the binary vocabulary data from the given string.
"""
cdef LexemeC* lexeme cdef LexemeC* lexeme
cdef hash_t key cdef hash_t key
cdef unicode py_str cdef unicode py_str
@ -393,14 +357,10 @@ cdef class Vocab:
# Deprecated --- delete these once stable # Deprecated --- delete these once stable
def dump_vectors(self, out_loc): def dump_vectors(self, out_loc):
""" """Save the word vectors to a binary file.
Save the word vectors to a binary file.
Arguments: loc (Path): The path to save to.
loc (Path): The path to save to. """
Returns:
None
#"""
cdef int32_t vec_len = self.vectors_length cdef int32_t vec_len = self.vectors_length
cdef int32_t word_len cdef int32_t word_len
cdef bytes word_str cdef bytes word_str
@ -424,17 +384,14 @@ cdef class Vocab:
def load_vectors(self, file_): def load_vectors(self, file_):
""" """Load vectors from a text-based file.
Load vectors from a text-based file.
Arguments: file_ (buffer): The file to read from. Entries should be separated by
file_ (buffer): The file to read from. Entries should be separated by newlines, newlines, and each entry should be whitespace delimited. The first value of the entry
and each entry should be whitespace delimited. The first value of the entry should be the word string, and subsequent entries should be the values of the
should be the word string, and subsequent entries should be the values of the vector.
vector.
Returns: RETURNS (int): The length of the vectors loaded.
vec_len (int): The length of the vectors loaded.
""" """
cdef LexemeC* lexeme cdef LexemeC* lexeme
cdef attr_t orth cdef attr_t orth
@ -464,14 +421,11 @@ cdef class Vocab:
return vec_len return vec_len
def load_vectors_from_bin_loc(self, loc): def load_vectors_from_bin_loc(self, loc):
""" """Load vectors from the location of a binary file.
Load vectors from the location of a binary file.
Arguments: loc (unicode): The path of the binary file to load from.
loc (unicode): The path of the binary file to load from.
Returns: RETURNS (int): The length of the vectors loaded.
vec_len (int): The length of the vectors loaded.
""" """
cdef CFile file_ = CFile(loc, b'rb') cdef CFile file_ = CFile(loc, b'rb')
cdef int32_t word_len cdef int32_t word_len
@ -526,12 +480,10 @@ cdef class Vocab:
def resize_vectors(self, int new_size): def resize_vectors(self, int new_size):
""" """Set vectors_length to a new size, and allocate more memory for the
Set vectors_length to a new size, and allocate more memory for the Lexeme `Lexeme` vectors if necessary. The memory will be zeroed.
vectors if necessary. The memory will be zeroed.
Arguments: new_size (int): The new size of the vectors.
new_size (int): The new size of the vectors.
""" """
cdef hash_t key cdef hash_t key
cdef size_t addr cdef size_t addr
@ -633,237 +585,3 @@ class VectorReadError(Exception):
"Vector size: %d\n" "Vector size: %d\n"
"Max size: %d\n" "Max size: %d\n"
"Min size: 1\n" % (loc, size, MAX_VEC_SIZE)) "Min size: 1\n" % (loc, size, MAX_VEC_SIZE))
#
#Deprecated --- delete these once stable
#
# def dump_vectors(self, out_loc):
# """
# Save the word vectors to a binary file.
#
# Arguments:
# loc (Path): The path to save to.
# Returns:
# None
# #"""
# cdef int32_t vec_len = self.vectors_length
# cdef int32_t word_len
# cdef bytes word_str
# cdef char* chars
#
# cdef Lexeme lexeme
# cdef CFile out_file = CFile(out_loc, 'wb')
# for lexeme in self:
# word_str = lexeme.orth_.encode('utf8')
# vec = lexeme.c.vector
# word_len = len(word_str)
#
# out_file.write_from(&word_len, 1, sizeof(word_len))
# out_file.write_from(&vec_len, 1, sizeof(vec_len))
#
# chars = <char*>word_str
# out_file.write_from(chars, word_len, sizeof(char))
# out_file.write_from(vec, vec_len, sizeof(float))
# out_file.close()
#
#
#
# def load_vectors(self, file_):
# """
# Load vectors from a text-based file.
#
# Arguments:
# file_ (buffer): The file to read from. Entries should be separated by newlines,
# and each entry should be whitespace delimited. The first value of the entry
# should be the word string, and subsequent entries should be the values of the
# vector.
#
# Returns:
# vec_len (int): The length of the vectors loaded.
# """
# cdef LexemeC* lexeme
# cdef attr_t orth
# cdef int32_t vec_len = -1
# cdef double norm = 0.0
#
# whitespace_pattern = re.compile(r'\s', re.UNICODE)
#
# for line_num, line in enumerate(file_):
# pieces = line.split()
# word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
# if vec_len == -1:
# vec_len = len(pieces)
# elif vec_len != len(pieces):
# raise VectorReadError.mismatched_sizes(file_, line_num,
# vec_len, len(pieces))
# orth = self.strings[word_str]
# lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
# lexeme.vector = <float*>self.mem.alloc(vec_len, sizeof(float))
# for i, val_str in enumerate(pieces):
# lexeme.vector[i] = float(val_str)
# norm = 0.0
# for i in range(vec_len):
# norm += lexeme.vector[i] * lexeme.vector[i]
# lexeme.l2_norm = sqrt(norm)
# self.vectors_length = vec_len
# return vec_len
#
# def load_vectors_from_bin_loc(self, loc):
# """
# Load vectors from the location of a binary file.
#
# Arguments:
# loc (unicode): The path of the binary file to load from.
#
# Returns:
# vec_len (int): The length of the vectors loaded.
# """
# cdef CFile file_ = CFile(loc, b'rb')
# cdef int32_t word_len
# cdef int32_t vec_len = 0
# cdef int32_t prev_vec_len = 0
# cdef float* vec
# cdef Address mem
# cdef attr_t string_id
# cdef bytes py_word
# cdef vector[float*] vectors
# cdef int line_num = 0
# cdef Pool tmp_mem = Pool()
# while True:
# try:
# file_.read_into(&word_len, sizeof(word_len), 1)
# except IOError:
# break
# file_.read_into(&vec_len, sizeof(vec_len), 1)
# if prev_vec_len != 0 and vec_len != prev_vec_len:
# raise VectorReadError.mismatched_sizes(loc, line_num,
# vec_len, prev_vec_len)
# if 0 >= vec_len >= MAX_VEC_SIZE:
# raise VectorReadError.bad_size(loc, vec_len)
#
# chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
# vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
#
# string_id = self.strings[chars[:word_len]]
# # Insert words into vocab to add vector.
# self.get_by_orth(self.mem, string_id)
# while string_id >= vectors.size():
# vectors.push_back(EMPTY_VEC)
# assert vec != NULL
# vectors[string_id] = vec
# line_num += 1
# cdef LexemeC* lex
# cdef size_t lex_addr
# cdef double norm = 0.0
# cdef int i
# for orth, lex_addr in self._by_orth.items():
# lex = <LexemeC*>lex_addr
# if lex.lower < vectors.size():
# lex.vector = vectors[lex.lower]
# norm = 0.0
# for i in range(vec_len):
# norm += lex.vector[i] * lex.vector[i]
# lex.l2_norm = sqrt(norm)
# else:
# lex.vector = EMPTY_VEC
# self.vectors_length = vec_len
# return vec_len
#
#
#def write_binary_vectors(in_loc, out_loc):
# cdef CFile out_file = CFile(out_loc, 'wb')
# cdef Address mem
# cdef int32_t word_len
# cdef int32_t vec_len
# cdef char* chars
# with bz2.BZ2File(in_loc, 'r') as file_:
# for line in file_:
# pieces = line.split()
# word = pieces.pop(0)
# mem = Address(len(pieces), sizeof(float))
# vec = <float*>mem.ptr
# for i, val_str in enumerate(pieces):
# vec[i] = float(val_str)
#
# word_len = len(word)
# vec_len = len(pieces)
#
# out_file.write_from(&word_len, 1, sizeof(word_len))
# out_file.write_from(&vec_len, 1, sizeof(vec_len))
#
# chars = <char*>word
# out_file.write_from(chars, len(word), sizeof(char))
# out_file.write_from(vec, vec_len, sizeof(float))
#
#
# def resize_vectors(self, int new_size):
# """
# Set vectors_length to a new size, and allocate more memory for the Lexeme
# vectors if necessary. The memory will be zeroed.
#
# Arguments:
# new_size (int): The new size of the vectors.
# """
# cdef hash_t key
# cdef size_t addr
# if new_size > self.vectors_length:
# for key, addr in self._by_hash.items():
# lex = <LexemeC*>addr
# lex.vector = <float*>self.mem.realloc(lex.vector,
# new_size * sizeof(lex.vector[0]))
# self.vectors_length = new_size
#
#
#
# def dump(self, loc=None):
# """
# Save the lexemes binary data to the given location, or
# return a byte-string with the data if loc is None.
#
# Arguments:
# loc (Path or None): The path to save to, or None.
# """
# if loc is None:
# return self.to_bytes()
# else:
# return self.to_disk(loc)
#
# def load_lexemes(self, loc):
# """
# Load the binary vocabulary data from the given location.
#
# Arguments:
# loc (Path): The path to load from.
#
# Returns:
# None
# """
# fp = CFile(loc, 'rb',
# on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
# cdef LexemeC* lexeme = NULL
# cdef SerializedLexemeC lex_data
# cdef hash_t key
# cdef unicode py_str
# cdef attr_t orth = 0
# assert sizeof(orth) == sizeof(lexeme.orth)
# i = 0
# while True:
# try:
# fp.read_into(&orth, 1, sizeof(orth))
# except IOError:
# break
# lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
# # Copy data from the file into the lexeme
# fp.read_into(&lex_data.data, 1, sizeof(lex_data.data))
# Lexeme.c_from_bytes(lexeme, lex_data)
#
# lexeme.vector = EMPTY_VEC
# py_str = self.strings[lexeme.orth]
# key = hash_string(py_str)
# self._by_hash.set(key, lexeme)
# self._by_orth.set(lexeme.orth, lexeme)
# self.length += 1
# i += 1
# fp.close()

View File

@ -80,6 +80,7 @@
} }
], ],
"ALPHA": true,
"V_CSS": "1.6", "V_CSS": "1.6",
"V_JS": "1.2", "V_JS": "1.2",
"DEFAULT_SYNTAX": "python", "DEFAULT_SYNTAX": "python",

View File

@ -34,17 +34,17 @@ mixin src(url)
+a(url) +a(url)
block block
| #[+icon("code", 16).o-icon--inline.u-color-subtle] | #[+icon("code", 16).o-icon--inline.u-color-theme]
//- API link (with added tag and automatically generated path) //- API link (with added tag and automatically generated path)
path - [string] path to API docs page relative to /docs/api/ path - [string] path to API docs page relative to /docs/api/
mixin api(path) mixin api(path)
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block +a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
block block
| #[+icon("book", 18).o-icon--inline.u-color-subtle] | #[+icon("book", 18).o-icon--inline.u-color-theme]
//- Help icon with tooltip //- Help icon with tooltip
@ -104,15 +104,31 @@ mixin button(url, trusted, ...style)
language - [string] language for syntax highlighting (default: "python") language - [string] language for syntax highlighting (default: "python")
supports basic relevant languages available for PrismJS supports basic relevant languages available for PrismJS
mixin code(label, language) mixin code(label, language, icon)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}")&attributes(attributes) pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "")&attributes(attributes)
if label if label
h4.u-text-label.u-text-label--dark=label h4.u-text-label.u-text-label--dark=label
if icon
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
.c-code-block__icon(class=classes[icon] || "")
+icon(icon, 18)
code.c-code-block__content code.c-code-block__content
block block
//- Code blocks to display old/new versions
mixin code-old()
+code(false, false, "reject").o-block-small
block
mixin code-new()
+code(false, false, "accept").o-block-small
block
//- CodePen embed //- CodePen embed
slug - [string] ID of CodePen demo (taken from URL) slug - [string] ID of CodePen demo (taken from URL)
height - [integer] height of demo embed iframe height - [integer] height of demo embed iframe
@ -164,6 +180,16 @@ mixin tag()
block block
//- "Requires model" tag with tooltip and list of capabilities
...capabs - [string] Required model capabilities, e.g. "vectors".
mixin tag-model(...capabs)
- var intro = "To use this functionality, spaCy needs a model to be installed"
- var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : ""
+tag Requires model
+help(intro + ext + ".").u-color-theme
//- List //- List
type - [string] "numbers", "letters", "roman" (bulleted list if none set) type - [string] "numbers", "letters", "roman" (bulleted list if none set)
start - [integer] start number start - [integer] start number

View File

@ -9,6 +9,9 @@ nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null)
.u-text-label.u-padding-small.u-hidden-xs=SUBSECTION .u-text-label.u-padding-small.u-hidden-xs=SUBSECTION
ul.c-nav__menu ul.c-nav__menu
if ALPHA
- var NAVIGATION = { "Usage": "/docs/usage", "Reference": "/docs/api" }
each url, item in NAVIGATION each url, item in NAVIGATION
li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null) li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null)
+a(url)=item +a(url)=item

View File

@ -10,6 +10,14 @@ main.o-main.o-main--sidebar.o-main--aside
if tag if tag
+tag=tag +tag=tag
if ALPHA
+infobox("⚠️ You are viewing the spaCy v2.0 alpha docs")
| This page is part of the alpha documentation for spaCy v2.0
| and does not reflect the state of the latest stable release.
| #[+a("#") See here] for more information on how to install
| and test the new version. To read the official docs for
| v1.x, #[+a("https://spacy.io/docs") go here].
!=yield !=yield
+grid.o-content.u-text +grid.o-content.u-text

View File

@ -35,7 +35,10 @@ html(lang="en")
link(rel="shortcut icon" href="/assets/img/favicon.ico") link(rel="shortcut icon" href="/assets/img/favicon.ico")
link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico") link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico")
if SUBSECTION == "usage" if ALPHA && SECTION == "docs"
link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet")
else if SUBSECTION == "usage"
link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet") link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet")
else else

View File

@ -13,6 +13,17 @@
white-space: pre white-space: pre
direction: ltr direction: ltr
&.c-code-block--has-icon
padding: 0
display: flex
.c-code-block__icon
padding: 0 0 0 1rem
display: flex
justify-content: center
align-items: center
border-left: 6px solid
//- Code block content //- Code block content
@ -26,8 +37,8 @@
*:not(.c-code-block) > code *:not(.c-code-block) > code
font: normal 600 0.8em/#{1} $font-code font: normal 600 0.8em/#{1} $font-code
background: rgba($color-front, 0.05) background: darken($color-theme-light, 5)
box-shadow: 1px 1px 0 rgba($color-front, 0.1) box-shadow: 1px 1px 0 rgba($color-front, 0.05)
text-shadow: 1px 1px 0 rgba($color-back, 0.5) text-shadow: 1px 1px 0 rgba($color-back, 0.5)
color: $color-front color: $color-front
padding: 0.1em 0.5em padding: 0.1em 0.5em

View File

@ -13,7 +13,7 @@
background: rgba($color-subtle-light, 0.35) background: rgba($color-subtle-light, 0.35)
&.c-table__row--foot &.c-table__row--foot
background: rgba($color-theme, 0.025) background: $color-theme-light
border-top: 2px solid $color-theme border-top: 2px solid $color-theme
.c-table__cell:first-child .c-table__cell:first-child

View File

@ -11,9 +11,8 @@
background: $color-front background: $color-front
border-radius: 2px border-radius: 2px
color: $color-back color: $color-back
font-family: inherit font: normal 1.3rem/#{1.25} $font-primary
font-size: 1.3rem text-transform: none
line-height: 1.25
opacity: 0 opacity: 0
padding: 0.5em 0.75em padding: 0.5em 0.75em
transform: translateX(-50%) translateY(-2px) transform: translateX(-50%) translateY(-2px)

View File

@ -26,8 +26,7 @@ $font-code: 'Source Code Pro', Consolas, 'Andale Mono', Menlo, Monaco, Courier,
// Colors // Colors
$colors: ( blue: #09a3d5, red: #d9515d ) $colors: ( blue: #09a3d5, red: #d9515d, green: #08c35e )
$colors-light: (blue: #cceaf4, red: #f9d7da)
$color-back: #fff !default $color-back: #fff !default
$color-front: #1a1e23 !default $color-front: #1a1e23 !default
@ -35,7 +34,7 @@ $color-dark: lighten($color-front, 20) !default
$color-theme: map-get($colors, $theme) $color-theme: map-get($colors, $theme)
$color-theme-dark: darken(map-get($colors, $theme), 5) $color-theme-dark: darken(map-get($colors, $theme), 5)
$color-theme-light: map-get($colors-light, $theme) $color-theme-light: rgba($color-theme, 0.05)
$color-subtle: #ddd !default $color-subtle: #ddd !default
$color-subtle-light: #f6f6f6 !default $color-subtle-light: #f6f6f6 !default

View File

@ -0,0 +1,4 @@
//- 💫 STYLESHEET (GREEN)
$theme: green
@import style

View File

@ -30,5 +30,11 @@
<symbol id="help" viewBox="0 0 24 24"> <symbol id="help" viewBox="0 0 24 24">
<path d="M12 6c2.203 0 3.984 1.781 3.984 3.984 0 2.484-3 2.766-3 5.016h-1.969c0-3.234 3-3 3-5.016 0-1.078-0.938-1.969-2.016-1.969s-2.016 0.891-2.016 1.969h-1.969c0-2.203 1.781-3.984 3.984-3.984zM12 20.016c4.406 0 8.016-3.609 8.016-8.016s-3.609-8.016-8.016-8.016-8.016 3.609-8.016 8.016 3.609 8.016 8.016 8.016zM12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984zM11.016 18v-2.016h1.969v2.016h-1.969z"/> <path d="M12 6c2.203 0 3.984 1.781 3.984 3.984 0 2.484-3 2.766-3 5.016h-1.969c0-3.234 3-3 3-5.016 0-1.078-0.938-1.969-2.016-1.969s-2.016 0.891-2.016 1.969h-1.969c0-2.203 1.781-3.984 3.984-3.984zM12 20.016c4.406 0 8.016-3.609 8.016-8.016s-3.609-8.016-8.016-8.016-8.016 3.609-8.016 8.016 3.609 8.016 8.016 8.016zM12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984zM11.016 18v-2.016h1.969v2.016h-1.969z"/>
</symbol> </symbol>
<symbol id="reject" viewBox="0 0 24 24">
<path d="M18.984 6.422l-5.578 5.578 5.578 5.578-1.406 1.406-5.578-5.578-5.578 5.578-1.406-1.406 5.578-5.578-5.578-5.578 1.406-1.406 5.578 5.578 5.578-5.578z"/>
</symbol>
<symbol id="accept" viewBox="0 0 24 24">
<path d="M9 16.172l10.594-10.594 1.406 1.406-12 12-5.578-5.578 1.406-1.406z"/>
</symbol>
</defs> </defs>
</svg> </svg>

(Image diff not rendered: the SVG icon sprite grows from 5.4 KiB to 5.8 KiB, and a new 216 KiB binary image file is added.)

View File

@ -2,8 +2,13 @@
"sidebar": { "sidebar": {
"Introduction": { "Introduction": {
"Facts & Figures": "./", "Facts & Figures": "./",
"Languages": "language-models", "Languages": "language-models"
"Philosophy": "philosophy" },
"Top-level": {
"spacy": "spacy",
"displacy": "displacy",
"Utility Functions": "util",
"Command line": "cli"
}, },
"Classes": { "Classes": {
"Doc": "doc", "Doc": "doc",
@ -21,9 +26,6 @@
"GoldParse": "goldparse" "GoldParse": "goldparse"
}, },
"Other": { "Other": {
"Command line": "cli",
"displaCy": "displacy",
"Utility Functions": "util",
"Annotation Specs": "annotation", "Annotation Specs": "annotation",
"Feature Scheme": "features" "Feature Scheme": "features"
} }
@ -43,6 +45,26 @@
"title": "Philosophy" "title": "Philosophy"
}, },
"spacy": {
"title": "spaCy top-level functions",
"next": "displacy"
},
"displacy": {
"title": "displaCy",
"tag": "module",
"next": "util"
},
"util": {
"title": "Utility Functions",
"next": "cli"
},
"cli": {
"title": "Command Line Interface"
},
"language": { "language": {
"title": "Language", "title": "Language",
"tag": "class" "tag": "class"
@ -113,20 +135,6 @@
"tag": "class" "tag": "class"
}, },
"cli": {
"title": "Command Line Interface",
"next": "displacy"
},
"displacy": {
"title": "displaCy",
"tag": "module"
},
"util": {
"title": "Utility Functions"
},
"annotation": { "annotation": {
"title": "Annotation Specifications" "title": "Annotation Specifications"
}, },

View File

@ -71,6 +71,44 @@ include _annotation/_dep-labels
include _annotation/_named-entities include _annotation/_named-entities
+h(3, "biluo") BILUO Scheme
p
| spaCy translates character offsets into the BILUO scheme, in order to
| decide the cost of each action given the current state of the entity
| recognizer. The costs are then used to calculate the gradient of the
| loss, to train the model.
+aside("Why BILUO, not IOB?")
| There are several coding schemes for encoding entity annotations as
| token tags. These coding schemes are equally expressive, but not
| necessarily equally learnable.
| #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
| showed that the minimal #[strong Begin], #[strong In], #[strong Out]
| scheme was more difficult to learn than the #[strong BILUO] scheme that
| we use, which explicitly marks boundary tokens.
+table([ "Tag", "Description" ])
+row
+cell #[code #[span.u-color-theme B] EGIN]
+cell The first token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme I] N]
+cell An inner token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme L] AST]
+cell The final token of a multi-token entity.
+row
+cell #[code #[span.u-color-theme U] NIT]
+cell A single-token entity.
+row
+cell #[code #[span.u-color-theme O] UT]
+cell A non-entity token.
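p
| A rough sketch of the mapping described above, assuming the
| #[code biluo_tags_from_offsets] helper in #[code spacy.gold] and a
| tokenization that lines up with the entity boundaries:

+code("Example").
from spacy.gold import biluo_tags_from_offsets
doc = nlp(u'I like London and Berlin.')
entities = [(7, 13, 'LOC'), (18, 24, 'LOC')]
tags = biluo_tags_from_offsets(doc, entities)
# tags == ['O', 'O', 'U-LOC', 'O', 'U-LOC', 'O']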
+h(2, "json-input") JSON input format for training +h(2, "json-input") JSON input format for training
p p

View File

@ -10,11 +10,11 @@ p
+aside("Why python -m?") +aside("Why python -m?")
| The problem with a global entry point is that it's resolved by looking up | The problem with a global entry point is that it's resolved by looking up
| entries in your #[code PATH] environment variable. This can give you | entries in your #[code PATH] environment variable. This can give you
| unexpected results, especially when using #[code virtualenv]. For | unexpected results, like executing the wrong spaCy installation
| instance, you may have spaCy installed on your system but not in your | (especially when using #[code virtualenv]). #[code python -m] prevents
| current environment. The command will then execute the wrong | fallbacks to system modules and makes sure the correct spaCy version is
| spaCy installation. #[code python -m] prevents fallbacks to system modules | used. If you hate typing it every time, we recommend creating an
| and makes sure the correct version of spaCy is used. | #[code alias] instead.
+h(2, "download") Download +h(2, "download") Download
@ -45,13 +45,24 @@ p
+cell flag +cell flag
+cell Show help message and available arguments. +cell Show help message and available arguments.
+infobox("Important note")
| The #[code download] command is mostly intended as a convenient,
|  interactive wrapper: it performs compatibility checks and prints
| detailed messages in case things go wrong. It's #[strong not recommended]
| to use this command as part of an automated process. If you know which
| model your project needs, you should consider a
| #[+a("/docs/usage/models#download-pip") direct download via pip], or
|  uploading the model to a local PyPI installation and fetching it straight
| from there. This will also allow you to add it as a versioned package
| dependency to your project.
+h(2, "link") Link +h(2, "link") Link
p p
| Create a #[+a("/docs/usage/models#usage") shortcut link] for a model, | Create a #[+a("/docs/usage/models#usage") shortcut link] for a model,
| either a Python package or a local directory. This will let you load | either a Python package or a local directory. This will let you load
| models from any location via #[code spacy.load()]. | models from any location using a custom name via
| #[+api("spacy#load") #[code spacy.load()]].
+code(false, "bash"). +code(false, "bash").
python -m spacy link [origin] [link_name] [--force] python -m spacy link [origin] [link_name] [--force]
@ -92,7 +103,7 @@ p
+row +row
+cell #[code model] +cell #[code model]
+cell positional +cell positional
+cell Shortcut link of model (optional). +cell A model, i.e. shortcut link, package name or path (optional).
+row +row
+cell #[code --markdown], #[code -md] +cell #[code --markdown], #[code -md]
@ -114,7 +125,7 @@ p
| the input file. Currently only supports #[code .conllu]. | the input file. Currently only supports #[code .conllu].
+code(false, "bash"). +code(false, "bash").
python -m spacy convert [input_file] [output_dir] [--n_sents] [--morphology] python -m spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
+row +row
@ -128,7 +139,7 @@ p
+cell Output directory for converted JSON file. +cell Output directory for converted JSON file.
+row +row
+cell #[code --n_sents], #[code -n] +cell #[code --n-sents], #[code -n]
+cell option +cell option
+cell Number of sentences per document. +cell Number of sentences per document.
@ -191,7 +202,7 @@ p
| #[+a("/docs/api/annotation#json-input") JSON format]. | #[+a("/docs/api/annotation#json-input") JSON format].
+code(false, "bash"). +code(false, "bash").
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner]
+table(["Argument", "Type", "Description"]) +table(["Argument", "Type", "Description"])
+row +row
@ -215,27 +226,37 @@ p
+cell Location of JSON-formatted dev data (optional). +cell Location of JSON-formatted dev data (optional).
+row +row
+cell #[code --n_iter], #[code -n] +cell #[code --n-iter], #[code -n]
+cell option +cell option
+cell Number of iterations (default: #[code 15]). +cell Number of iterations (default: #[code 15]).
+row +row
+cell #[code --parser_L1], #[code -L] +cell #[code --nsents]
+cell option
+cell Number of sentences (default: #[code 0]).
+row
+cell #[code --parser-L1], #[code -L]
+cell option +cell option
+cell L1 regularization penalty for parser (default: #[code 0.0]). +cell L1 regularization penalty for parser (default: #[code 0.0]).
+row +row
+cell #[code --no_tagger], #[code -T] +cell #[code --use-gpu], #[code -g]
+cell flag
+cell Use GPU.
+row
+cell #[code --no-tagger], #[code -T]
+cell flag +cell flag
+cell Don't train tagger. +cell Don't train tagger.
+row +row
+cell #[code --no_parser], #[code -P] +cell #[code --no-parser], #[code -P]
+cell flag +cell flag
+cell Don't train parser. +cell Don't train parser.
+row +row
+cell #[code --no_ner], #[code -N] +cell #[code --no-ner], #[code -N]
+cell flag +cell flag
+cell Don't train NER. +cell Don't train NER.

View File

@ -4,32 +4,6 @@ include ../../_includes/_mixins
p Annotate syntactic dependencies on #[code Doc] objects. p Annotate syntactic dependencies on #[code Doc] objects.
+h(2, "load") DependencyParser.load
+tag classmethod
p Load the statistical model from the supplied path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary. Must be shared by the documents to be processed.
+row
+cell #[code require]
+cell bool
+cell Whether to raise an error if the files are not found.
+footrow
+cell return
+cell #[code DependencyParser]
+cell The newly constructed object.
+h(2, "init") DependencyParser.__init__ +h(2, "init") DependencyParser.__init__
+tag method +tag method
@ -47,7 +21,7 @@ p Create a #[code DependencyParser].
+cell The statistical model. +cell The statistical model.
+footrow +footrow
+cell return +cell returns
+cell #[code DependencyParser] +cell #[code DependencyParser]
+cell The newly constructed object. +cell The newly constructed object.
@ -65,7 +39,7 @@ p
+cell The document to be processed. +cell The document to be processed.
+footrow +footrow
+cell return +cell returns
+cell #[code None] +cell #[code None]
+cell - +cell -
@ -93,7 +67,7 @@ p Process a stream of documents.
| parallel. | parallel.
+footrow +footrow
+cell yield +cell yields
+cell #[code Doc] +cell #[code Doc]
+cell Documents, in order. +cell Documents, in order.
@ -114,7 +88,7 @@ p Update the statistical model.
+cell The gold-standard annotations, to calculate the loss. +cell The gold-standard annotations, to calculate the loss.
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell The loss on this example. +cell The loss on this example.
@ -130,6 +104,6 @@ p Set up a stepwise state, to introspect and control the transition sequence.
+cell The document to step through. +cell The document to step through.
+footrow +footrow
+cell return +cell returns
+cell #[code StepwiseState] +cell #[code StepwiseState]
+cell A state object, to step through the annotation process. +cell A state object, to step through the annotation process.

View File

@ -8,7 +8,7 @@ p
| #[+a("/docs/usage/visualizers") visualizing spaCy]. | #[+a("/docs/usage/visualizers") visualizing spaCy].
+h(2, "serve") serve +h(2, "serve") displacy.serve
+tag method +tag method
p p
@ -60,7 +60,7 @@ p
+cell Port to serve visualization. +cell Port to serve visualization.
+cell #[code 5000] +cell #[code 5000]
+h(2, "render") render +h(2, "render") displacy.render
+tag method +tag method
p Render a dependency parse tree or named entity visualization. p Render a dependency parse tree or named entity visualization.
@ -112,7 +112,7 @@ p Render a dependency parse tree or named entity visualization.
+cell #[code {}] +cell #[code {}]
+footrow +footrow
+cell return +cell returns
+cell unicode +cell unicode
+cell Rendered HTML markup. +cell Rendered HTML markup.
+cell +cell
@ -218,7 +218,7 @@ p
+cell #[code colors] +cell #[code colors]
+cell dict +cell dict
+cell +cell
| Color overrides. Entity types in lowercase should be mapped to | Color overrides. Entity types in uppercase should be mapped to
| color names or values. | color names or values.
+cell #[code {}] +cell #[code {}]

View File

@ -4,9 +4,508 @@ include ../../_includes/_mixins
p A container for accessing linguistic annotations. p A container for accessing linguistic annotations.
p
| A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects.
| Access sentences and named entities, export annotations to numpy arrays,
| losslessly serialize to compressed binary strings. The #[code Doc] object
| holds an array of #[code TokenC] structs. The Python-level #[code Token]
| and #[+api("span") #[code Span]] objects are views of this array, i.e.
| they don't own the data themselves.
+aside-code("Example").
# Construction 1
doc = nlp(u'Some text')
# Construction 2
from spacy.tokens import Doc
doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
spaces=[True, False, False])
+h(2, "init") Doc.__init__
+tag method
p
| Construct a #[code Doc] object. The most common way to get a #[code Doc]
| object is via the #[code nlp] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code words]
+cell -
+cell A list of strings to add to the container.
+row
+cell #[code spaces]
+cell -
+cell
| A list of boolean values indicating whether each word has a
| subsequent space. Must have the same length as #[code words], if
| specified. Defaults to a sequence of #[code True].
+footrow
+cell returns
+cell #[code Doc]
+cell The newly constructed object.
+h(2, "getitem") Doc.__getitem__
+tag method
p
| Get a #[+api("token") #[code Token]] object at position #[code i], where
| #[code i] is an integer. Negative indexing is supported, and follows the
| usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == 'Give'
assert doc[-1].text == '.'
span = doc[1:3]
assert span.text == 'it back'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The index of the token.
+footrow
+cell returns
+cell #[code Token]
+cell The token at #[code doc[i]].
p
| Get a #[+api("span") #[code Span]] object, starting at position
| #[code start] (token index) and ending at position #[code end] (token
| index).
p
| For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3
| and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not
| supported, as #[code Span] objects must be contiguous (cannot have gaps).
| You can use negative indices and open-ended ranges, which have their
| normal Python semantics.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
+cell tuple
+cell The slice of the document to get.
+footrow
+cell returns
+cell #[code Span]
+cell The span at #[code doc[start : end]].
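p
| A short sketch of the slice semantics described above; negative indices
| and open-ended ranges follow normal Python behaviour:

+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[2:5]
assert span.text == 'back! He'
assert doc[-3:].text == 'He pleaded.'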
+h(2, "iter") Doc.__iter__
+tag method
p
| Iterate over #[code Token] objects, from which the annotations can be
| easily accessed.
+aside-code("Example").
doc = nlp(u'Give it back')
assert [t.text for t in doc] == [u'Give', u'it', u'back']
p
| This is the main way of accessing #[+api("token") #[code Token]] objects,
| which are the main way annotations are accessed from Python. If
| faster-than-Python speeds are required, you can instead access the
| annotations as a numpy array, or access the underlying C data directly
| from Cython.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A #[code Token] object.
+h(2, "len") Doc.__len__
+tag method
p Get the number of tokens in the document.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert len(doc) == 7
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of tokens in the document.
+h(2, "similarity") Doc.similarity
+tag method
+tag-model("vectors")
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+aside-code("Example").
apples = nlp(u'I like apples')
oranges = nlp(u'I like oranges')
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "count_by") Doc.count_by
+tag method
p
| Count the frequencies of a given attribute. Produces a dict of
| #[code {attr (int): count (ints)}] frequencies, keyed by the values
| of the given attribute ID.
+aside-code("Example").
from spacy.attrs import ORTH
doc = nlp(u'apple apple orange banana')
assert doc.count_by(ORTH) == {2087L: 2, 7024L: 1, 119552L: 1}
doc.to_array([ORTH])
# array([[2087], [2087], [7024], [119552]])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_id]
+cell int
+cell The attribute ID
+footrow
+cell returns
+cell dict
+cell A dictionary mapping attributes to integer counts.
+h(2, "to_array") Doc.to_array
+tag method
p
| Export the document annotations to a numpy array of shape #[code N*M]
| where #[code N] is the length of the document and #[code M] is the number
| of attribute IDs to export. The values will be 32-bit integers.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell
| The exported attributes as a 2D numpy array, with one row per
| token and one column per attribute.
+h(2, "from_array") Doc.from_array
+tag method
p
| Load attributes from a numpy array. Write to a #[code Doc] object, from
| an #[code (M, N)] array of attributes.
+aside-code("Example").
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
doc = nlp(text)
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
doc2 = Doc(doc.vocab)
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
assert doc.text == doc2.text
+table(["Name", "Type", "Description"])
+row
+cell #[code attrs]
+cell ints
+cell A list of attribute ID ints.
+row
+cell #[code array]
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell The attribute values to load.
+footrow
+cell returns
+cell #[code Doc]
+cell Itself.
+h(2, "to_bytes") Doc.to_bytes
+tag method
p Serialize, i.e. export the document contents to a binary string.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
doc_bytes = doc.to_bytes()
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bytes
+cell
| A losslessly serialized copy of the #[code Doc], including all
| annotations.
+h(2, "from_bytes") Doc.from_bytes
+tag method
p Deserialize, i.e. import the document contents from a binary string.
+aside-code("Example").
from spacy.tokens import Doc
text = u'Give it back! He pleaded.'
doc = nlp(text)
bytes = doc.to_bytes()
doc2 = Doc(doc.vocab).from_bytes(bytes)
assert doc.text == doc2.text
+table(["Name", "Type", "Description"])
+row
+cell #[code data]
+cell bytes
+cell The string to load from.
+footrow
+cell returns
+cell #[code Doc]
+cell The #[code Doc] object.
+h(2, "merge") Doc.merge
+tag method
p
| Retokenize the document, such that the span at
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
| #[code start_idx] and #[code end_idx] do not mark start and end token
| boundaries, the document remains unchanged.
+aside-code("Example").
doc = nlp(u'Los Angeles start.')
doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
assert [t.text for t in doc] == [u'Los Angeles', u'start', u'.']
+table(["Name", "Type", "Description"])
+row
+cell #[code start_idx]
+cell int
+cell The character index of the start of the slice to merge.
+row
+cell #[code end_idx]
+cell int
+cell The character index after the end of the slice to merge.
+row
+cell #[code **attributes]
+cell -
+cell
| Attributes to assign to the merged token. By default,
| attributes are inherited from the syntactic root token of
| the span.
+footrow
+cell returns
+cell #[code Token]
+cell
| The newly merged token, or #[code None] if the start and end
| indices did not fall at token boundaries
+h(2, "print_tree") Doc.print_tree
+tag method
+tag-model("parse")
p
| Returns the parse trees in JSON (dict) format. Especially useful for
| web applications.
+aside-code("Example").
doc = nlp(u'Alice ate the pizza.')
trees = doc.print_tree()
# {'modifiers': [
# {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
# {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'},
# {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}
# ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}
+table(["Name", "Type", "Description"])
+row
+cell #[code light]
+cell bool
+cell Don't include lemmas or entities.
+row
+cell #[code flat]
+cell bool
+cell Don't include arcs or modifiers.
+footrow
+cell returns
+cell dict
+cell Parse tree as dict.
+h(2, "ents") Doc.ents
+tag property
+tag-model("NER")
p
| Iterate over the entities in the document. Yields named-entity
| #[code Span] objects, if the entity recognizer has been applied to the
| document.
+aside-code("Example").
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
ents = list(tokens.ents)
assert ents[0].label == 346
assert ents[0].label_ == 'PERSON'
assert ents[0].text == 'Mr. Best'
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Span]
+cell Entities in the document.
+h(2, "noun_chunks") Doc.noun_chunks
+tag property
+tag-model("parse")
p
| Iterate over the base noun phrases in the document. Yields base
| noun-phrase #[code Span] objects, if the document has been syntactically
| parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not
| permit other NPs to be nested within it, so no NP-level coordination, no
| prepositional phrases, and no relative clauses.
+aside-code("Example").
doc = nlp(u'A phrase with another phrase occurs.')
chunks = list(doc.noun_chunks)
assert chunks[0].text == "A phrase"
assert chunks[1].text == "another phrase"
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Span]
+cell Noun chunks in the document.
+h(2, "sents") Doc.sents
+tag property
+tag-model("parse")
p
| Iterate over the sentences in the document. Sentence spans have no label.
| To improve accuracy on informal texts, spaCy calculates sentence boundaries
| from the syntactic dependency parse. If the parser is disabled,
| the #[code sents] iterator will be unavailable.
+aside-code("Example").
doc = nlp(u"This is a sentence. Here's another...")
sents = list(doc.sents)
assert len(sents) == 2
assert [s.root.text for s in sents] == ["is", "'s"]
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Span]
+cell Sentences in the document.
+h(2, "has_vector") Doc.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| object.
+aside-code("Example").
doc = nlp(u'I like apples')
assert doc.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the document has a vector data attached.
+h(2, "vector") Doc.vector
+tag property
+tag-model("vectors")
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+aside-code("Example").
doc = nlp(u'I like apples')
assert doc.vector.dtype == 'float32'
assert doc.vector.shape == (300,)
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the document's semantics.
+h(2, "vector_norm") Doc.vector_norm
+tag property
+tag-model("vectors")
p
| The L2 norm of the document's vector representation.
+aside-code("Example").
doc1 = nlp(u'I like apples')
doc2 = nlp(u'I like oranges')
doc1.vector_norm # 4.54232424414368
doc2.vector_norm # 3.304373298575751
assert doc1.vector_norm != doc2.vector_norm
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell A unicode representation of the document text.
+row
+cell #[code text_with_ws]
+cell unicode
+cell
| An alias of #[code Doc.text], provided for duck-type compatibility
| with #[code Span] and #[code Token].
+row +row
+cell #[code mem] +cell #[code mem]
+cell #[code Pool] +cell #[code Pool]
@ -17,6 +516,11 @@ p A container for accessing linguistic annotations.
+cell #[code Vocab] +cell #[code Vocab]
+cell The store of lexical types. +cell The store of lexical types.
+row
+cell #[code tensor]
+cell object
+cell Container for dense vector representations.
+row +row
+cell #[code user_data] +cell #[code user_data]
+cell - +cell -
@ -59,358 +563,3 @@ p A container for accessing linguistic annotations.
+cell +cell
| A dictionary that allows customisation of properties of | A dictionary that allows customisation of properties of
| #[code Span] children. | #[code Span] children.
+h(2, "init") Doc.__init__
+tag method
p Construct a #[code Doc] object.
+aside("Note")
| The most common way to get a #[code Doc] object is via the #[code nlp]
| object. This method is usually only used for deserialization or preset
| tokenization.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code words]
+cell -
+cell A list of strings to add to the container.
+row
+cell #[code spaces]
+cell -
+cell
| A list of boolean values indicating whether each word has a
| subsequent space. Must have the same length as #[code words], if
| specified. Defaults to a sequence of #[code True].
+footrow
+cell return
+cell #[code Doc]
+cell The newly constructed object.
+h(2, "getitem") Doc.__getitem__
+tag method
p Get a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[0].text == 'Give'
assert doc[-1].text == '.'
span = doc[1:1]
assert span.text == 'it back'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The index of the token.
+footrow
+cell return
+cell #[code Token]
+cell The token at #[code doc[i]].
p Get a #[code Span] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
+cell tuple
+cell The slice of the document to get.
+footrow
+cell return
+cell #[code Span]
+cell The span at #[code doc[start : end]].
+h(2, "iter") Doc.__iter__
+tag method
p Iterate over #[code Token] objects.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A #[code Token] object.
+h(2, "len") Doc.__len__
+tag method
p Get the number of tokens in the document.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of tokens in the document.
+h(2, "similarity") Doc.similarity
+tag method
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "to_array") Doc.to_array
+tag method
p
| Export the document annotations to a numpy array of shape #[code N*M]
| where #[code N] is the length of the document and #[code M] is the number
| of attribute IDs to export. The values will be 32-bit integers.
+aside-code("Example").
from spacy import attrs
doc = nlp(text)
# All strings mapped to integers, for easy export to numpy
np_array = doc.to_array([attrs.LOWER, attrs.POS,
attrs.ENT_TYPE, attrs.IS_ALPHA])
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell
| The exported attributes as a 2D numpy array, with one row per
| token and one column per attribute.
+h(2, "count_by") Doc.count_by
+tag method
p Count the frequencies of a given attribute.
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_id]
+cell int
+cell The attribute ID
+footrow
+cell return
+cell dict
+cell A dictionary mapping attributes to integer counts.
+h(2, "from_array") Doc.from_array
+tag method
p Load attributes from a numpy array.
+table(["Name", "Type", "Description"])
+row
+cell #[code attr_ids]
+cell ints
+cell A list of attribute ID ints.
+row
+cell #[code values]
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+cell The attribute values to load.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "to_bytes") Doc.to_bytes
+tag method
p Export the document contents to a binary string.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bytes
+cell
| A losslessly serialized copy of the #[code Doc] including all
| annotations.
+h(2, "from_bytes") Doc.from_bytes
+tag method
p Import the document contents from a binary string.
+table(["Name", "Type", "Description"])
+row
+cell #[code byte_string]
+cell bytes
+cell The string to load from.
+footrow
+cell return
+cell #[code Doc]
+cell The #[code self] variable.
+h(2, "merge") Doc.merge
+tag method
p
| Retokenize the document, such that the span at
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
| #[code start_idx] and #[end_idx] do not mark start and end token
| boundaries, the document remains unchanged.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_idx]
+cell int
+cell The character index of the start of the slice to merge.
+row
+cell #[code end_idx]
+cell int
+cell The character index after the end of the slice to merge.
+row
+cell #[code **attributes]
+cell -
+cell
| Attributes to assign to the merged token. By default,
| attributes are inherited from the syntactic root token of
| the span.
+footrow
+cell return
+cell #[code Token]
+cell
| The newly merged token, or None if the start and end
| indices did not fall at token boundaries
+h(2, "read_bytes") Doc.read_bytes
+tag staticmethod
p A static method, used to read serialized #[code Doc] objects from a file.
+aside-code("Example").
from spacy.tokens.doc import Doc
loc = 'test_serialize.bin'
with open(loc, 'wb') as file_:
file_.write(nlp(u'This is a document.').to_bytes())
file_.write(nlp(u'This is another.').to_bytes())
docs = []
with open(loc, 'rb') as file_:
for byte_string in Doc.read_bytes(file_):
docs.append(Doc(nlp.vocab).from_bytes(byte_string))
assert len(docs) == 2
+table(["Name", "Type", "Description"])
+row
+cell file
+cell buffer
+cell A binary buffer to read the serialized annotations from.
+footrow
+cell yield
+cell bytes
+cell Binary strings from with documents can be loaded.
+h(2, "text") Doc.text
+tag property
p A unicode representation of the document text.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "text_with_ws") Doc.text_with_ws
+tag property
p
| An alias of #[code Doc.text], provided for duck-type compatibility with
| #[code Span] and #[code Token].
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the document.
+h(2, "sents") Doc.sents
+tag property
p Iterate over the sentences in the document.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Sentences in the document.
+h(2, "ents") Doc.ents
+tag property
p Iterate over the entities in the document.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Entities in the document.
+h(2, "noun_chunks") Doc.noun_chunks
+tag property
p
| Iterate over the base noun phrases in the document. A base noun phrase,
| or "NP chunk", is a noun phrase that does not permit other NPs to be
| nested within it.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Span]
+cell Noun chunks in the document
+h(2, "vector") Doc.vector
+tag property
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the document's semantics.
+h(2, "has_vector") Doc.has_vector
+tag property
p
| A boolean value indicating whether a word vector is associated with the
| object.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether the document has a vector data attached.
View File
@ -4,32 +4,6 @@ include ../../_includes/_mixins
p Annotate named entities on #[code Doc] objects. p Annotate named entities on #[code Doc] objects.
+h(2, "load") EntityRecognizer.load
+tag classmethod
p Load the statistical model from the supplied path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary. Must be shared by the documents to be processed.
+row
+cell #[code require]
+cell bool
+cell Whether to raise an error if the files are not found.
+footrow
+cell return
+cell #[code EntityRecognizer]
+cell The newly constructed object.
+h(2, "init") EntityRecognizer.__init__ +h(2, "init") EntityRecognizer.__init__
+tag method +tag method
@ -47,7 +21,7 @@ p Create an #[code EntityRecognizer].
+cell The statistical model. +cell The statistical model.
+footrow +footrow
+cell return +cell returns
+cell #[code EntityRecognizer] +cell #[code EntityRecognizer]
+cell The newly constructed object. +cell The newly constructed object.
@ -63,7 +37,7 @@ p Apply the entity recognizer, setting the NER tags onto the #[code Doc] object.
+cell The document to be processed. +cell The document to be processed.
+footrow +footrow
+cell return +cell returns
+cell #[code None] +cell #[code None]
+cell - +cell -
@ -91,7 +65,7 @@ p Process a stream of documents.
| parallel. | parallel.
+footrow +footrow
+cell yield +cell yields
+cell #[code Doc] +cell #[code Doc]
+cell Documents, in order. +cell Documents, in order.
@ -112,7 +86,7 @@ p Update the statistical model.
+cell The gold-standard annotations, to calculate the loss. +cell The gold-standard annotations, to calculate the loss.
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell The loss on this example. +cell The loss on this example.
@ -128,6 +102,6 @@ p Set up a stepwise state, to introspect and control the transition sequence.
+cell The document to step through. +cell The document to step through.
+footrow +footrow
+cell return +cell returns
+cell #[code StepwiseState] +cell #[code StepwiseState]
+cell A state object, to step through the annotation process. +cell A state object, to step through the annotation process.
View File
@ -4,6 +4,72 @@ include ../../_includes/_mixins
p Collection for training annotations. p Collection for training annotations.
+h(2, "init") GoldParse.__init__
+tag method
p Create a GoldParse.
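p
| The sketch below shows both accepted formats for the #[code entities]
| argument described in the table below; the example text, character
| offsets and #[code 'LOC'] label are purely illustrative.
+aside-code("Example").
from spacy.gold import GoldParse
doc = nlp(u'I like London')
# entities as (start_char, end_char, label) offsets into doc.text ...
gold = GoldParse(doc, entities=[(7, 13, 'LOC')])
# ... or the same annotation as per-token BILUO tag strings
gold = GoldParse(doc, entities=['O', 'O', 'U-LOC'])
assert len(gold) == len(doc)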
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The document the annotations refer to.
+row
+cell #[code words]
+cell iterable
+cell A sequence of unicode word strings.
+row
+cell #[code tags]
+cell iterable
+cell A sequence of strings, representing tag annotations.
+row
+cell #[code heads]
+cell iterable
+cell A sequence of integers, representing syntactic head offsets.
+row
+cell #[code deps]
+cell iterable
+cell A sequence of strings, representing the syntactic relation types.
+row
+cell #[code entities]
+cell iterable
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
+footrow
+cell returns
+cell #[code GoldParse]
+cell The newly constructed object.
+h(2, "len") GoldParse.__len__
+tag method
p Get the number of gold-standard tokens.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of gold-standard tokens.
+h(2, "is_projective") GoldParse.is_projective
+tag property
p
| Whether the provided syntactic annotations form a projective dependency
| tree.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether annotations form projective tree.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
@ -37,67 +103,57 @@ p Collection for training annotations.
+cell list +cell list
+cell The alignment from gold tokenization to candidate tokenization. +cell The alignment from gold tokenization to candidate tokenization.
+h(2, "init") GoldParse.__init__
+tag method
p Create a GoldParse. +h(2, "util") Utilities
+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
+tag function
p
| Encode labelled spans into per-token tags, using the
| #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
p
| Returns a list of unicode strings, describing the tags. Each tag string
| will be of the form either #[code ""], #[code "O"] or
| #[code "{action}-{label}"], where action is one of #[code "B"],
| #[code "I"], #[code "L"], #[code "U"]. The string #[code &quot;-&quot;]
| is used where the entity offsets don't align with the tokenization in the
| #[code Doc] object. The training algorithm will view these as missing
| values. #[code O] denotes a non-entity token. #[code B] denotes the
| beginning of a multi-token entity, #[code I] the inside of an entity
| of three or more tokens, and #[code L] the end of an entity of two or
| more tokens. #[code U] denotes a single-token entity.
+aside-code("Example").
from spacy.gold import biluo_tags_from_offsets
text = 'I like London.'
entities = [(len('I like '), len('I like London'), 'LOC')]
doc = tokenizer(text)
tags = biluo_tags_from_offsets(doc, entities)
assert tags == ['O', 'O', 'U-LOC', 'O']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code doc] +cell #[code doc]
+cell #[code Doc] +cell #[code Doc]
+cell The document the annotations refer to. +cell
| The document that the entity offsets refer to. The output tags
+row | will refer to the token boundaries within the document.
+cell #[code words]
+cell -
+cell A sequence of unicode word strings.
+row
+cell #[code tags]
+cell -
+cell A sequence of strings, representing tag annotations.
+row
+cell #[code heads]
+cell -
+cell A sequence of integers, representing syntactic head offsets.
+row
+cell #[code deps]
+cell -
+cell A sequence of strings, representing the syntactic relation types.
+row +row
+cell #[code entities] +cell #[code entities]
+cell - +cell iterable
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions. +cell
| A sequence of #[code (start, end, label)] triples. #[code start]
| and #[code end] should be character-offset integers denoting the
| slice into the original string.
+footrow +footrow
+cell return +cell returns
+cell #[code GoldParse] +cell list
+cell The newly constructed object. +cell
| Unicode strings, describing the
| #[+a("/docs/api/annotation#biluo") BILUO] tags.
+h(2, "len") GoldParse.__len__
+tag method
p Get the number of gold-standard tokens.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of gold-standard tokens.
+h(2, "is_projective") GoldParse.is_projective
+tag property
p
| Whether the provided syntactic annotations form a projective dependency
| tree.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether annotations form projective tree.
View File
@ -2,79 +2,69 @@
include ../../_includes/_mixins include ../../_includes/_mixins
p A text processing pipeline. p
| A text-processing pipeline. Usually you'll load this once per process,
| and pass the instance around your application.
+h(2, "attributes") Attributes +h(2, "init") Language.__init__
+tag method
p Initialise a #[code Language] object.
+aside-code("Example").
from spacy.language import Language
nlp = Language(pipeline=['token_vectors', 'tags',
'dependencies'])
from spacy.lang.en import English
nlp = English()
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code vocab] +cell #[code vocab]
+cell #[code Vocab] +cell #[code Vocab]
+cell A container for the lexical types. +cell
| A #[code Vocab] object. If #[code True], a vocab is created via
+row | #[code Language.Defaults.create_vocab].
+cell #[code tokenizer]
+cell #[code Tokenizer]
+cell Find word boundaries and create #[code Doc] object.
+row
+cell #[code tagger]
+cell #[code Tagger]
+cell Annotate #[code Doc] objects with POS tags.
+row
+cell #[code parser]
+cell #[code DependencyParser]
+cell Annotate #[code Doc] objects with syntactic dependencies.
+row
+cell #[code entity]
+cell #[code EntityRecognizer]
+cell Annotate #[code Doc] objects with named entities.
+row
+cell #[code matcher]
+cell #[code Matcher]
+cell Rule-based sequence matcher.
+row +row
+cell #[code make_doc] +cell #[code make_doc]
+cell #[code lambda text: Doc] +cell callable
+cell Create a #[code Doc] object from unicode text. +cell
| A function that takes text and returns a #[code Doc] object.
| Usually a #[code Tokenizer].
+row +row
+cell #[code pipeline] +cell #[code pipeline]
+cell - +cell list
+cell Sequence of annotation functions. +cell
| A list of annotation processes or IDs of annotation processes,
| e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
| up in #[code Language.Defaults.factories].
+h(2, "init") Language.__init__
+tag method
p Create or load the pipeline.
+table(["Name", "Type", "Description"])
+row +row
+cell #[code **overrides] +cell #[code meta]
+cell - +cell dict
+cell Keyword arguments indicating which defaults to override. +cell
| Custom meta data for the #[code Language] class. Is written to by
| models to add model meta data.
+footrow +footrow
+cell return +cell returns
+cell #[code Language] +cell #[code Language]
+cell The newly constructed object. +cell The newly constructed object.
+h(2, "call") Language.__call__ +h(2, "call") Language.__call__
+tag method +tag method
p Apply the pipeline to a single text. p
| Apply the pipeline to some text. The text can span multiple sentences,
| and can contain arbitrary whitespace. Alignment into the original string
| is preserved.
+aside-code("Example"). +aside-code("Example").
from spacy.en import English doc = nlp(u'An example sentence. Another sentence.')
nlp = English() assert (doc[0].text, doc[0].head.tag_) == ('An', 'NN')
doc = nlp('An example sentence. Another example sentence.')
doc[0].orth_, doc[0].head.tag_
# ('An', 'NN')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -83,24 +73,104 @@ p Apply the pipeline to a single text.
+cell The text to be processed. +cell The text to be processed.
+row +row
+cell #[code tag] +cell #[code **disabled]
+cell bool +cell -
+cell Whether to apply the part-of-speech tagger. +cell Elements of the pipeline that should not be run.
+row
+cell #[code parse]
+cell bool
+cell Whether to apply the syntactic dependency parser.
+row
+cell #[code entity]
+cell bool
+cell Whether to apply the named entity recognizer.
+footrow +footrow
+cell return +cell returns
+cell #[code Doc] +cell #[code Doc]
+cell A container for accessing the linguistic annotations. +cell A container for accessing the annotations.
+h(2, "update") Language.update
+tag method
p Update the models in the pipeline.
+aside-code("Example").
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
for epoch in trainer.epochs(gold):
for docs, golds in epoch:
state = nlp.update(docs, golds, sgd=optimizer)
+table(["Name", "Type", "Description"])
+row
+cell #[code docs]
+cell iterable
+cell A batch of #[code Doc] objects.
+row
+cell #[code golds]
+cell iterable
+cell A batch of #[code GoldParse] objects.
+row
+cell #[code drop]
+cell float
+cell The dropout rate.
+row
+cell #[code sgd]
+cell callable
+cell An optimizer.
+footrow
+cell returns
+cell dict
+cell Results from the update.
+h(2, "begin_training") Language.begin_training
+tag contextmanager
p
| Allocate models, pre-process training data and acquire a trainer and
| optimizer. Used as a contextmanager.
+aside-code("Example").
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
for epoch in trainer.epochs(gold):
for docs, golds in epoch:
state = nlp.update(docs, golds, sgd=optimizer)
+table(["Name", "Type", "Description"])
+row
+cell #[code gold_tuples]
+cell iterable
+cell Gold-standard training data.
+row
+cell #[code **cfg]
+cell -
+cell Config parameters.
+footrow
+cell yields
+cell tuple
+cell A trainer and an optimizer.
+h(2, "use_params") Language.use_params
+tag contextmanager
+tag method
p
| Replace weights of models in the pipeline with those provided in the
| params dictionary. Can be used as a contextmanager, in which case, models
| go back to their original weights after the block.
+aside-code("Example").
with nlp.use_params(optimizer.averages):
nlp.to_disk('/tmp/checkpoint')
+table(["Name", "Type", "Description"])
+row
+cell #[code params]
+cell dict
+cell A dictionary of parameters keyed by model ID.
+row
+cell #[code **cfg]
+cell -
+cell Config parameters.
+h(2, "pipe") Language.pipe +h(2, "pipe") Language.pipe
+tag method +tag method
@ -133,22 +203,142 @@ p
+cell The number of texts to buffer. +cell The number of texts to buffer.
+footrow +footrow
+cell yield +cell yields
+cell #[code Doc] +cell #[code Doc]
+cell Containers for accessing the linguistic annotations. +cell Documents in the order of the original text.
+h(2, "save_to_directory") Language.save_to_directory +h(2, "to_disk") Language.to_disk
+tag method +tag method
p Save the #[code Vocab], #[code StringStore] and pipeline to a directory. p Save the current state to a directory.
+aside-code("Example").
nlp.to_disk('/path/to/models')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code path] +cell #[code path]
+cell string or pathlib path +cell unicode or #[code Path]
+cell Path to save the model. +cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being saved.
+h(2, "from_disk") Language.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.language import Language
nlp = Language().from_disk('/path/to/models')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow +footrow
+cell return +cell returns
+cell #[code None] +cell #[code Language]
+cell The modified #[code Language] object.
+h(2, "to_bytes") Language.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
nlp_bytes = nlp.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell - +cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Language] object.
+h(2, "from_bytes") Language.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.lang.en import English
nlp_bytes = nlp.to_bytes()
nlp2 = English()
nlp2.from_bytes(nlp_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Language]
+cell The #[code Language] object.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A container for the lexical types.
+row
+cell #[code make_doc]
+cell #[code lambda text: Doc]
+cell Create a #[code Doc] object from unicode text.
+row
+cell #[code pipeline]
+cell list
+cell Sequence of annotation functions.
+row
+cell #[code meta]
+cell dict
+cell
| Custom meta data for the Language class. If a model is loaded,
| contains meta data of the model.
+h(2, "class-attributes") Class attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code Defaults]
+cell class
+cell
| Settings, data and factory methods for creating the
| #[code nlp] object and processing pipeline.
+row
+cell #[code lang]
+cell unicode
+cell
| Two-letter language ID, i.e.
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code].
View File
@ -2,7 +2,154 @@
include ../../_includes/_mixins include ../../_includes/_mixins
p An entry in the vocabulary. p
| An entry in the vocabulary. A #[code Lexeme] has no string context; it's
| a word-type, as opposed to a word token. It therefore has no
| part-of-speech tag, dependency parse, or lemma (if lemmatization depends
| on the part-of-speech tag).
+h(2, "init") Lexeme.__init__
+tag method
p Create a #[code Lexeme] object.
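p
| Note that in practice you'll rarely call the constructor directly; the
| sketch below assumes the usual route of looking the lexeme up in the
| vocabulary.
+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.text == u'apple'
assert apple.orth == nlp.vocab.strings[u'apple']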
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The parent vocabulary.
+row
+cell #[code orth]
+cell int
+cell The orth id of the lexeme.
+footrow
+cell returns
+cell #[code Lexeme]
+cell The newly constructed object.
+h(2, "set_flag") Lexeme.set_flag
+tag method
p Change the value of a boolean flag.
+aside-code("Example").
COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True)
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to set.
+row
+cell #[code value]
+cell bool
+cell The new value of the flag.
+h(2, "check_flag") Lexeme.check_flag
+tag method
p Check the value of a boolean flag.
+aside-code("Example").
is_my_library = lambda text: text in ['spaCy', 'Thinc']
MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
assert nlp.vocab[u'spaCy'].check_flag(MY_LIBRARY) == True
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to query.
+footrow
+cell returns
+cell bool
+cell The value of the flag.
+h(2, "similarity") Lexeme.similarity
+tag method
+tag-model("vectors")
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+aside-code("Example").
apple = nlp.vocab[u'apple']
orange = nlp.vocab[u'orange']
apple_orange = apple.similarity(orange)
orange_apple = orange.similarity(apple)
assert apple_orange == orange_apple
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "has_vector") Lexeme.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| lexeme.
+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the lexeme has a vector data attached.
+h(2, "vector") Lexeme.vector
+tag property
+tag-model("vectors")
p A real-valued meaning representation.
+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.vector.dtype == 'float32'
assert apple.vector.shape == (300,)
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the lexeme's semantics.
+h(2, "vector_norm") Lexeme.vector_norm
+tag property
+tag-model("vectors")
p The L2 norm of the lexeme's vector representation.
+aside-code("Example").
apple = nlp.vocab[u'apple']
pasta = nlp.vocab[u'pasta']
apple.vector_norm # 7.1346845626831055
pasta.vector_norm # 7.759851932525635
assert apple.vector_norm != pasta.vector_norm
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
@ -12,6 +159,16 @@ p An entry in the vocabulary.
+cell #[code Vocab] +cell #[code Vocab]
+cell +cell
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code lex_id]
+cell int
+cell ID of the lexeme's lexical type.
+row +row
+cell #[code lower] +cell #[code lower]
+cell int +cell int
@ -124,116 +281,9 @@ p An entry in the vocabulary.
+row +row
+cell #[code prob] +cell #[code prob]
+cell float +cell float
+cell Smoothed log probability estimate of token's type. +cell Smoothed log probability estimate of lexeme's type.
+row +row
+cell #[code sentiment] +cell #[code sentiment]
+cell float +cell float
+cell A scalar value indicating the positivity or negativity of the token. +cell A scalar value indicating the positivity or negativity of the lexeme.
+row
+cell #[code lex_id]
+cell int
+cell ID of the token's lexical type.
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+h(2, "init") Lexeme.__init__
+tag method
p Create a #[code Lexeme] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The parent vocabulary.
+row
+cell #[code orth]
+cell int
+cell The orth id of the lexeme.
+footrow
+cell return
+cell #[code Lexeme]
+cell The newly constructed object.
+h(2, "set_flag") Lexeme.set_flag
+tag method
p Change the value of a boolean flag.
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to set.
+row
+cell #[code value]
+cell bool
+cell The new value of the flag.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "check_flag") Lexeme.check_flag
+tag method
p Check the value of a boolean flag.
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to query.
+footrow
+cell return
+cell bool
+cell The value of the flag.
+h(2, "similarity") Lexeme.similarity
+tag method
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "vector") Lexeme.vector
+tag property
p A real-valued meaning representation.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A real-valued meaning representation.
+h(2, "has_vector") Lexeme.has_vector
+tag property
p A boolean value indicating whether a word vector is associated with the object.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether a word vector is associated with the object.
View File
@ -4,31 +4,26 @@ include ../../_includes/_mixins
p Match sequences of tokens, based on pattern rules. p Match sequences of tokens, based on pattern rules.
+h(2, "load") Matcher.load +infobox("⚠️ Deprecation note")
+tag classmethod | As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
| are deprecated and have been replaced with a simpler
p Load the matcher and patterns from a file path. | #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
+table(["Name", "Type", "Description"]) | is now called #[+api("matcher#get") #[code matcher.get]].
+row | #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
+cell #[code path] | and #[code Matcher.has_entity] (now redundant) have been removed.
+cell #[code Path]
+cell Path to a JSON-formatted patterns file.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary that the documents to match over will refer to.
+footrow
+cell return
+cell #[code Matcher]
+cell The newly constructed object.
+h(2, "init") Matcher.__init__ +h(2, "init") Matcher.__init__
+tag method +tag method
p Create the Matcher. p Create the rule-based #[code Matcher].
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER
patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
matcher = Matcher(nlp.vocab)
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -41,17 +36,38 @@ p Create the Matcher.
+row +row
+cell #[code patterns] +cell #[code patterns]
+cell dict +cell dict
+cell Patterns to add to the matcher. +cell Patterns to add to the matcher, keyed by ID.
+footrow +footrow
+cell return +cell returns
+cell #[code Matcher] +cell #[code Matcher]
+cell The newly constructed object. +cell The newly constructed object.
+h(2, "call") Matcher.__call__ +h(2, "call") Matcher.__call__
+tag method +tag method
p Find all token sequences matching the supplied patterns on the Doc. p Find all token sequences matching the supplied patterns on the #[code Doc].
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER
matcher = Matcher(nlp.vocab)
pattern = [{LOWER: "hello"}, {LOWER: "world"}]
matcher.add("HelloWorld", on_match=None, pattern)
doc = nlp(u'hello world!')
matches = matcher(doc)
+infobox("Important note")
| By default, the matcher #[strong does not perform any action] on matches,
| like tagging matched phrases with entity types. Instead, actions need to
| be specified when #[strong adding patterns or entities], by
| passing in a callback function as the #[code on_match] argument on
| #[+api("matcher#add") #[code add]]. This allows you to define custom
| actions per pattern within the same matcher. For example, you might only
| want to merge some entity types, and set custom flags for other matched
| patterns. For more details and examples, see the usage workflow on
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -60,23 +76,28 @@ p Find all token sequences matching the supplied patterns on the Doc.
+cell The document to match over. +cell The document to match over.
+footrow +footrow
+cell return +cell returns
+cell list +cell list
+cell +cell
| A list of#[code (entity_key, label_id, start, end)] tuples, | A list of #[code (match_id, start, end)] tuples, describing the
| describing the matches. A match tuple describes a | matches. A match tuple describes a span #[code doc[start:end]].
| #[code span doc[start:end]]. The #[code label_id] and | The #[code match_id] is the ID of the added match pattern.
| #[code entity_key] are both integers.
+h(2, "pipe") Matcher.pipe +h(2, "pipe") Matcher.pipe
+tag method +tag method
p Match a stream of documents, yielding them in turn. p Match a stream of documents, yielding them in turn.
+aside-code("Example").
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
for doc in matcher.pipe(texts, batch_size=50, n_threads=4):
pass
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code docs] +cell #[code docs]
+cell - +cell iterable
+cell A stream of documents. +cell A stream of documents.
+row +row
@ -93,87 +114,132 @@ p Match a stream of documents, yielding them in turn.
| multi-threading. | multi-threading.
+footrow +footrow
+cell yield +cell yields
+cell #[code Doc] +cell #[code Doc]
+cell Documents, in order. +cell Documents, in order.
+h(2, "add_entity") Matcher.add_entity +h(2, "len") Matcher.__len__
+tag method +tag method
p Add an entity to the matcher. p
| Get the number of rules added to the matcher. Note that this only returns
| the number of rules (identical with the number of IDs), not the number
| of individual patterns.
+aside-code("Example").
matcher = Matcher(nlp.vocab)
assert len(matcher) == 0
matcher.add('Rule', None, [{ORTH: 'test'}])
assert len(matcher) == 1
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of rules.
+h(2, "contains") Matcher.__contains__
+tag method
p Check whether the matcher contains rules for a match ID.
+aside-code("Example").
matcher = Matcher(nlp.vocab)
assert 'Rule' not in matcher
matcher.add('Rule', None, [{ORTH: 'test'}])
assert 'Rule' in matcher
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code entity_key] +cell #[code key]
+cell unicode / int
+cell An ID for the entity.
+row
+cell #[code attrs]
+cell -
+cell Attributes to associate with the Matcher.
+row
+cell #[code if_exists]
+cell unicode +cell unicode
+cell +cell The match ID.
| #[code 'raise'], #[code 'ignore'] or #[code 'update']. Controls +footrow
| what happens if the entity ID already exists. Defaults to +cell returns
| #[code 'raise']. +cell int
+cell Whether the matcher contains rules for this match ID.
+h(2, "add") Matcher.add
+tag method
p
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and
| a callback function to act on the matches. The callback function will
| receive the arguments #[code matcher], #[code doc], #[code i] and
| #[code matches]. If a pattern already exists for the given ID, the
| patterns will be extended. An #[code on_match] callback will be
| overwritten.
+aside-code("Example").
def on_match(matcher, doc, id, matches):
print('Matched!', matches)
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', on_match, [{LOWER: "hello"}, {LOWER: "world"}])
matcher.add('GoogleMaps', on_match, [{ORTH: "Google"}, {ORTH: "Maps"}])
doc = nlp(u'HELLO WORLD on Google Maps.')
matches = matcher(doc)
+table(["Name", "Type", "Description"])
+row +row
+cell #[code acceptor] +cell #[code match_id]
+cell - +cell unicode
+cell Callback function to filter matches of the entity. +cell An ID for the thing you're matching.
+row +row
+cell #[code on_match] +cell #[code on_match]
+cell - +cell callable or #[code None]
+cell Callback function to act on matches of the entity. +cell
| Callback function to act on matches. Takes the arguments
| #[code matcher], #[code doc], #[code i] and #[code matches].
+footrow +row
+cell return +cell #[code *patterns]
+cell #[code None] +cell list
+cell - +cell
| Match pattern. A pattern consists of a list of dicts, where each
| dict describes a token.
+h(2, "add_pattern") Matcher.add_pattern +h(2, "remove") Matcher.remove
+tag method +tag method
p Add a pattern to the matcher. p
| Remove a rule from the matcher. A #[code KeyError] is raised if the match
| ID does not exist.
+aside-code("Example").
matcher.add('Rule', None, [{ORTH: 'test'}])
assert 'Rule' in matcher
matcher.remove('Rule')
assert 'Rule' not in matcher
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code entity_key] +cell #[code key]
+cell unicode / int +cell unicode
+cell An ID for the entity. +cell The ID of the match rule.
+row +h(2, "get") Matcher.get
+cell #[code token_specs]
+cell -
+cell Description of the pattern to be matched.
+row
+cell #[code label]
+cell unicode / int
+cell Label to assign to the matched pattern. Defaults to #[code ""].
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "has_entity") Matcher.has_entity
+tag method +tag method
p Check whether the matcher has an entity. p
| Retrieve the pattern stored for a key. Returns the rule as an
| #[code (on_match, patterns)] tuple containing the callback and available
| patterns.
+aside-code("Example").
pattern = [{ORTH: 'test'}]
matcher.add('Rule', None, pattern)
(on_match, patterns) = matcher.get('Rule')
assert patterns == [pattern]
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code entity_key] +cell #[code key]
+cell unicode / int +cell unicode
+cell The entity key to check. +cell The ID of the match rule.
+footrow +footrow
+cell return +cell returns
+cell bool +cell tuple
+cell Whether the matcher has the entity. +cell The rule, as an #[code (on_match, patterns)] tuple.
View File
@ -0,0 +1,95 @@
//- 💫 DOCS > API > SPACY
include ../../_includes/_mixins
+h(2, "load") spacy.load
+tag function
+tag-model
p
| Load a model via its #[+a("/docs/usage/models#usage") shortcut link],
| the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. The #[code Language] class to initialise will be
| determined based on the model's settings.
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
nlp = spacy.load('en_core_web_sm') # package
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
+infobox("⚠️ Deprecation note")
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
| will also raise an error if no model could be loaded and never just
| return an empty #[code Language] object. If you need a blank language,
| you need to import it explicitly (#[code from spacy.lang.en import English])
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode or #[code Path]
+cell Model to load, i.e. shortcut link, package name or path.
+footrow
+cell returns
+cell #[code Language]
+cell A #[code Language] object with the loaded model.
+h(2, "info") spacy.info
+tag function
p
| The same as the #[+api("cli#info") #[code info] command]. Pretty-print
| information about your installation, models and local setup from within
| spaCy. To get the model meta data as a dictionary instead, you can
| use the #[code meta] attribute on your #[code nlp] object with a
| loaded model, e.g. #[code nlp.meta].
+aside-code("Example").
spacy.info()
spacy.info('en')
spacy.info('de', markdown=True)
+table(["Name", "Type", "Description"])
+row
+cell #[code model]
+cell unicode
+cell A model, i.e. shortcut link, package name or path (optional).
+row
+cell #[code markdown]
+cell bool
+cell Print information as Markdown.
+h(2, "explain") spacy.explain
+tag function
p
| Get a description for a given POS tag, dependency label or entity type.
| For a list of available terms, see
| #[+src(gh("spacy", "spacy/glossary.py")) glossary.py].
+aside-code("Example").
spacy.explain('NORP')
# Nationalities or religious or political groups
doc = nlp(u'Hello world')
for word in doc:
print(word.text, word.tag_, spacy.explain(word.tag_))
# Hello UH interjection
# world NN noun, singular or mass
+table(["Name", "Type", "Description"])
+row
+cell #[code term]
+cell unicode
+cell Term to explain.
+footrow
+cell returns
+cell unicode
+cell The explanation, or #[code None] if not found in the glossary.
View File
@ -2,66 +2,18 @@
include ../../_includes/_mixins include ../../_includes/_mixins
p A slice from a #[code Doc] object. p A slice from a #[+api("doc") #[code Doc]] object.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code start]
+cell int
+cell The token offset for the start of the span.
+row
+cell #[code end]
+cell int
+cell The token offset for the end of the span.
+row
+cell #[code start_char]
+cell int
+cell The character offset for the start of the span.
+row
+cell #[code end_char]
+cell int
+cell The character offset for the end of the span.
+row
+cell #[code label]
+cell int
+cell The span's label.
+row
+cell #[code label_]
+cell unicode
+cell The span's label.
+row
+cell #[code lemma_]
+cell unicode
+cell The span's lemma.
+row
+cell #[code ent_id]
+cell int
+cell The integer ID of the named entity the token is an instance of.
+row
+cell #[code ent_id_]
+cell unicode
+cell The string ID of the named entity the token is an instance of.
+h(2, "init") Span.__init__ +h(2, "init") Span.__init__
+tag method +tag method
p Create a Span object from the #[code slice doc[start : end]]. p Create a Span object from the #[code slice doc[start : end]].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert [t.text for t in span] == [u'it', u'back', u'!']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code doc] +cell #[code doc]
@ -89,7 +41,7 @@ p Create a Span object from the #[code slice doc[start : end]].
+cell A meaning representation of the span. +cell A meaning representation of the span.
+footrow +footrow
+cell return +cell returns
+cell #[code Span] +cell #[code Span]
+cell The newly constructed object. +cell The newly constructed object.
@ -98,6 +50,11 @@ p Create a Span object from the #[code slice doc[start : end]].
p Get a #[code Token] object. p Get a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert span[1].text == 'back'
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code i] +cell #[code i]
@ -105,12 +62,17 @@ p Get a #[code Token] object.
+cell The index of the token within the span. +cell The index of the token within the span.
+footrow +footrow
+cell return +cell returns
+cell #[code Token] +cell #[code Token]
+cell The token at #[code span[i]]. +cell The token at #[code span[i]].
p Get a #[code Span] object. p Get a #[code Span] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert span[1:3].text == 'back!'
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code start_end] +cell #[code start_end]
@ -118,7 +80,7 @@ p Get a #[code Span] object.
+cell The slice of the span to get. +cell The slice of the span to get.
+footrow +footrow
+cell return +cell returns
+cell #[code Span] +cell #[code Span]
+cell The span at #[code span[start : end]]. +cell The span at #[code span[start : end]].
@ -127,9 +89,14 @@ p Get a #[code Span] object.
p Iterate over #[code Token] objects. p Iterate over #[code Token] objects.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert [t.text for t in span] == ['it', 'back', '!']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell yield +cell yields
+cell #[code Token] +cell #[code Token]
+cell A #[code Token] object. +cell A #[code Token] object.
@ -138,19 +105,33 @@ p Iterate over #[code Token] objects.
p Get the number of tokens in the span. p Get the number of tokens in the span.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert len(span) == 3
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell The number of tokens in the span. +cell The number of tokens in the span.
+h(2, "similarity") Span.similarity +h(2, "similarity") Span.similarity
+tag method +tag method
+tag-model("vectors")
p p
| Make a semantic similarity estimate. The default estimate is cosine | Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors. | similarity using an average of word vectors.
+aside-code("Example").
doc = nlp(u'green apples and red oranges')
green_apples = doc[:2]
red_oranges = doc[3:]
apples_oranges = green_apples.similarity(red_oranges)
oranges_apples = red_oranges.similarity(green_apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code other] +cell #[code other]
@ -160,7 +141,7 @@ p
| #[code Span], #[code Token] and #[code Lexeme] objects. | #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow +footrow
+cell return +cell returns
+cell float +cell float
+cell A scalar similarity score. Higher is more similar. +cell A scalar similarity score. Higher is more similar.
@ -178,87 +159,205 @@ p Retokenize the document, such that the span is merged into a single token.
| are inherited from the syntactic root token of the span. | are inherited from the syntactic root token of the span.
+footrow +footrow
+cell return +cell returns
+cell #[code Token] +cell #[code Token]
+cell The newly merged token. +cell The newly merged token.
+h(2, "text") Span.text
+tag property
p A unicode representation of the span text.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The original verbatim text of the span.
+h(2, "text_with_ws") Span.text_with_ws
+tag property
p
| The text content of the span with a trailing whitespace character if the
| last token has one.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell unicode
+cell The text content of the span (with trailing whitespace).
+h(2, "sent") Span.sent
+tag property
p The sentence span that this span is a part of.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code Span]
+cell The sentence this is part of.
+h(2, "root") Span.root +h(2, "root") Span.root
+tag property +tag property
+tag-model("parse")
p p
| The token within the span that's highest in the parse tree. If there's a | The token within the span that's highest in the parse tree. If there's a
| tie, the earliest is preferred. | tie, the earliest is preferred.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
i, like, new, york, in_, autumn, dot = range(len(doc))
assert doc[new].head.text == 'York'
assert doc[york].head.text == 'like'
new_york = doc[new&#58;york+1]
assert new_york.root.text == 'York'
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell return +cell returns
+cell #[code Token] +cell #[code Token]
+cell The root token. +cell The root token.
+h(2, "lefts") Span.lefts +h(2, "lefts") Span.lefts
+tag property +tag property
+tag-model("parse")
p Tokens that are to the left of the span, whose head is within the span. p Tokens that are to the left of the span, whose head is within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
lefts = [t.text for t in doc[3:7].lefts]
assert lefts == [u'New']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell yield +cell yields
+cell #[code Token] +cell #[code Token]
+cell A left-child of a token of the span. +cell A left-child of a token of the span.
+h(2, "rights") Span.rights +h(2, "rights") Span.rights
+tag property +tag property
+tag-model("parse")
p Tokens that are to the right of the span, whose head is within the span. p Tokens that are to the right of the span, whose head is within the span.
+aside-code("Example").
doc = nlp(u'I like New York in Autumn.')
rights = [t.text for t in doc[2:4].rights]
assert rights == [u'in']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell yield +cell yields
+cell #[code Token] +cell #[code Token]
+cell A right-child of a token of the span. +cell A right-child of a token of the span.
+h(2, "subtree") Span.subtree +h(2, "subtree") Span.subtree
+tag property +tag property
+tag-model("parse")
p Tokens within the span and tokens that descend from them. p Tokens within the span and tokens that descend from them.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
subtree = [t.text for t in doc[:3].subtree]
assert subtree == [u'Give', u'it', u'back', u'!']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell yield +cell yields
+cell #[code Token] +cell #[code Token]
+cell A descendant of a token within the span. +cell A descendant of a token within the span.
+h(2, "has_vector") Span.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| object.
+aside-code("Example").
doc = nlp(u'I like apples')
assert doc[1:].has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the span has vector data attached.
+h(2, "vector") Span.vector
+tag property
+tag-model("vectors")
p
| A real-valued meaning representation. Defaults to an average of the
| token vectors.
+aside-code("Example").
doc = nlp(u'I like apples')
assert doc[1:].vector.dtype == 'float32'
assert doc[1:].vector.shape == (300,)
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the span's semantics.
+h(2, "vector_norm") Span.vector_norm
+tag property
+tag-model("vectors")
p
| The L2 norm of the span's vector representation.
+aside-code("Example").
doc = nlp(u'I like apples')
doc[1:].vector_norm # 4.800883928527915
doc[2:].vector_norm # 6.895897646384268
assert doc[1:].vector_norm != doc[2:].vector_norm
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code sent]
+cell #[code Span]
+cell The sentence span that this span is a part of.
+row
+cell #[code start]
+cell int
+cell The token offset for the start of the span.
+row
+cell #[code end]
+cell int
+cell The token offset for the end of the span.
+row
+cell #[code start_char]
+cell int
+cell The character offset for the start of the span.
+row
+cell #[code end_char]
+cell int
+cell The character offset for the end of the span.
+row
+cell #[code text]
+cell unicode
+cell A unicode representation of the span text.
+row
+cell #[code text_with_ws]
+cell unicode
+cell
| The text content of the span with a trailing whitespace character
| if the last token has one.
+row
+cell #[code label]
+cell int
+cell The span's label.
+row
+cell #[code label_]
+cell unicode
+cell The span's label.
+row
+cell #[code lemma_]
+cell unicode
+cell The span's lemma.
+row
+cell #[code ent_id]
+cell int
+cell The integer ID of the named entity the token is an instance of.
+row
+cell #[code ent_id_]
+cell unicode
+cell The string ID of the named entity the token is an instance of.

View File

@ -7,16 +7,22 @@ p Map strings to and from integer IDs.
+h(2, "init") StringStore.__init__ +h(2, "init") StringStore.__init__
+tag method +tag method
p Create the #[code StringStore]. p
| Create the #[code StringStore]. Note that a newly initialised store will
| always include an empty string #[code ''] at position #[code 0].
+aside-code("Example").
from spacy.strings import StringStore
stringstore = StringStore([u'apple', u'orange'])
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code strings] +cell #[code strings]
+cell - +cell iterable
+cell A sequence of unicode strings to add to the store. +cell A sequence of unicode strings to add to the store.
+footrow +footrow
+cell return +cell returns
+cell #[code StringStore] +cell #[code StringStore]
+cell The newly constructed object. +cell The newly constructed object.
@ -25,9 +31,13 @@ p Create the #[code StringStore].
p Get the number of strings in the store. p Get the number of strings in the store.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
assert len(stringstore) == 2
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell The number of strings in the store. +cell The number of strings in the store.
@ -36,22 +46,32 @@ p Get the number of strings in the store.
p Retrieve a string from a given integer ID, or vice versa. p Retrieve a string from a given integer ID, or vice versa.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
int_id = stringstore[u'apple'] # 1
assert stringstore[int_id] == u'apple'
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code string_or_id] +cell #[code string_or_id]
+cell bytes / unicode / int +cell bytes, unicode or int
+cell The value to encode. +cell The value to encode.
+footrow +footrow
+cell return +cell returns
+cell unicode / int +cell unicode or int
+cell The value to retrieved. +cell The value to be retrieved.
+h(2, "contains") StringStore.__contains__ +h(2, "contains") StringStore.__contains__
+tag method +tag method
p Check whether a string is in the store. p Check whether a string is in the store.
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
assert u'apple' in stringstore == True
assert u'cherry' in stringstore == False
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code string] +cell #[code string]
@ -59,49 +79,108 @@ p Check whether a string is in the store.
+cell The string to check. +cell The string to check.
+footrow +footrow
+cell return +cell returns
+cell bool +cell bool
+cell Whether the store contains the string. +cell Whether the store contains the string.
+h(2, "iter") StringStore.__iter__ +h(2, "iter") StringStore.__iter__
+tag method +tag method
p Iterate over the strings in the store, in order. p
| Iterate over the strings in the store, in order. Note that a newly
| initialised store will always include an empty string #[code ''] at
| position #[code 0].
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
all_strings = [s for s in stringstore]
assert all_strings == [u'', u'apple', u'orange']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell yield +cell yields
+cell unicode +cell unicode
+cell A string in the store. +cell A string in the store.
+h(2, "dump") StringStore.dump +h(2, "to_disk") StringStore.to_disk
+tag method +tag method
p Save the strings to a JSON file. p Save the current state to a directory.
+aside-code("Example").
stringstore.to_disk('/path/to/strings')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code file] +cell #[code path]
+cell buffer +cell unicode or #[code Path]
+cell The file to save the strings. +cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+footrow +h(2, "from_disk") StringStore.from_disk
+cell return
+cell #[code None]
+cell -
+h(2, "load") StringStore.load
+tag method +tag method
p Load the strings from a JSON file. p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.strings import StringStore
stringstore = StringStore().from_disk('/path/to/strings')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code file] +cell #[code path]
+cell buffer +cell unicode or #[code Path]
+cell The file from which to load the strings. +cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow +footrow
+cell return +cell returns
+cell #[code None] +cell #[code StringStore]
+cell The modified #[code StringStore] object.
+h(2, "to_bytes") StringStore.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
store_bytes = stringstore.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell - +cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code StringStore] object.
+h(2, "from_bytes") StringStore.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.strings import StringStore
store_bytes = stringstore.to_bytes()
new_store = StringStore().from_bytes(store_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code StringStore]
+cell The #[code StringStore] object.

View File

@ -4,32 +4,6 @@ include ../../_includes/_mixins
p Annotate part-of-speech tags on #[code Doc] objects. p Annotate part-of-speech tags on #[code Doc] objects.
+h(2, "load") Tagger.load
+tag classmethod
p Load the statistical model from the supplied path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocabulary. Must be shared by the documents to be processed.
+row
+cell #[code require]
+cell bool
+cell Whether to raise an error if the files are not found.
+footrow
+cell return
+cell #[code Tagger]
+cell The newly constructed object.
+h(2, "init") Tagger.__init__ +h(2, "init") Tagger.__init__
+tag method +tag method
@ -47,7 +21,7 @@ p Create a #[code Tagger].
+cell The statistical model. +cell The statistical model.
+footrow +footrow
+cell return +cell returns
+cell #[code Tagger] +cell #[code Tagger]
+cell The newly constructed object. +cell The newly constructed object.
@ -63,7 +37,7 @@ p Apply the tagger, setting the POS tags onto the #[code Doc] object.
+cell The tokens to be tagged. +cell The tokens to be tagged.
+footrow +footrow
+cell return +cell returns
+cell #[code None] +cell #[code None]
+cell - +cell -
@ -91,7 +65,7 @@ p Tag a stream of documents.
| parallel. | parallel.
+footrow +footrow
+cell yield +cell yields
+cell #[code Doc] +cell #[code Doc]
+cell Documents, in order. +cell Documents, in order.
@ -112,6 +86,6 @@ p Update the statistical model, with tags supplied for the given document.
+cell Manager for the gold-standard tags. +cell Manager for the gold-standard tags.
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell Number of tags predicted correctly. +cell Number of tags predicted correctly.

View File

@ -4,9 +4,296 @@ include ../../_includes/_mixins
p An individual token — i.e. a word, punctuation symbol, whitespace, etc. p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+h(2, "init") Token.__init__
+tag method
p Construct a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
assert token.text == u'Give'
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code offset]
+cell int
+cell The index of the token within the document.
+footrow
+cell returns
+cell #[code Token]
+cell The newly constructed object.
+h(2, "len") Token.__len__
+tag method
p The number of unicode characters in the token, i.e. #[code token.text].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
assert len(token) == 4
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of unicode characters in the token.
+h(2, "check_flag") Token.check_flag
+tag method
p Check the value of a boolean flag.
+aside-code("Example").
from spacy.attrs import IS_TITLE
doc = nlp(u'Give it back! He pleaded.')
token = doc[0]
assert token.check_flag(IS_TITLE) == True
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to check.
+footrow
+cell returns
+cell bool
+cell Whether the flag is set.
+h(2, "similarity") Token.similarity
+tag method
+tag-model("vectors")
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+aside-code("Example").
apples, _, oranges = nlp(u'apples and oranges')
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "nbor") Token.nbor
+tag method
p Get a neighboring token.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_nbor = doc[0].nbor()
assert give_nbor.text == u'it'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The relative position of the token to get. Defaults to #[code 1].
+footrow
+cell returns
+cell #[code Token]
+cell The token at position #[code self.doc[self.i+i]].
+h(2, "is_ancestor") Token.is_ancestor
+tag method
+tag-model("parse")
p
| Check whether this token is a parent, grandparent, etc. of another
| in the dependency tree.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give = doc[0]
it = doc[1]
assert give.is_ancestor(it)
+table(["Name", "Type", "Description"])
+row
+cell descendant
+cell #[code Token]
+cell Another token.
+footrow
+cell returns
+cell bool
+cell Whether this token is the ancestor of the descendant.
+h(2, "ancestors") Token.ancestors
+tag property
+tag-model("parse")
p A sequence of the token's syntactic ancestors (parents, grandparents, etc.).
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
it_ancestors = doc[1].ancestors
assert [t.text for t in it_ancestors] == [u'Give']
he_ancestors = doc[4].ancestors
assert [t.text for t in he_ancestors] == [u'pleaded']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell
| A sequence of ancestor tokens such that
| #[code ancestor.is_ancestor(self)].
+h(2, "conjuncts") Token.conjuncts
+tag property
+tag-model("parse")
p A sequence of coordinated tokens, not including the token itself.
+aside-code("Example").
doc = nlp(u'I like apples and oranges')
apples_conjuncts = doc[2].conjuncts
assert [t.text for t in apples_conjuncts] == [u'oranges']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A coordinated token.
+h(2, "children") Token.children
+tag property
+tag-model("parse")
p A sequence of the token's immediate syntactic children.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_children = doc[0].children
assert [t.text for t in give_children] == [u'it', u'back', u'!']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "subtree") Token.subtree
+tag property
+tag-model("parse")
p A sequence containing the token and all the token's syntactic descendants.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
give_subtree = doc[0].subtree
assert [t.text for t in give_subtree] == [u'Give', u'it', u'back', u'!']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A descendant token such that #[code self.is_ancestor(descendant)].
+h(2, "has_vector") Token.has_vector
+tag property
+tag-model("vectors")
p
| A boolean value indicating whether a word vector is associated with the
| token.
+aside-code("Example").
doc = nlp(u'I like apples')
apples = doc[2]
assert apples.has_vector
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell Whether the token has vector data attached.
+h(2, "vector") Token.vector
+tag property
+tag-model("vectors")
p A real-valued meaning representation.
+aside-code("Example").
doc = nlp(u'I like apples')
apples = doc[2]
assert apples.vector.dtype == 'float32'
assert apples.vector.shape == (300,)
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the token's semantics.
+h(2, "vector_norm") Span.vector_norm
+tag property
+tag-model("vectors")
p The L2 norm of the token's vector representation.
+aside-code("Example").
doc = nlp(u'I like apples and pasta')
apples = doc[2]
pasta = doc[4]
apples.vector_norm # 6.89589786529541
pasta.vector_norm # 7.759851932525635
assert apples.vector_norm != pasta.vector_norm
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell float
+cell The L2 norm of the vector representation.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code text_with_ws]
+cell unicode
+cell Text content, with trailing space character if present.
+row
+cell #[code whitespace]
+cell int
+cell Trailing space character if present.
+row
+cell #[code whitespace_]
+cell unicode
+cell Trailing space character if present.
+row +row
+cell #[code vocab] +cell #[code vocab]
+cell #[code Vocab] +cell #[code Vocab]
@ -17,14 +304,31 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+cell #[code Doc] +cell #[code Doc]
+cell The parent document. +cell The parent document.
+row
+cell #[code head]
+cell #[code Token]
+cell The syntactic parent, or "governor", of this token.
+row
+cell #[code left_edge]
+cell #[code Token]
+cell The leftmost token of this token's syntactic descendants.
+row
+cell #[code right_edge]
+cell #[code Token]
+cell The rightmost token of this token's syntactic descendants.
+row +row
+cell #[code i] +cell #[code i]
+cell int +cell int
+cell The index of the token within the parent document. +cell The index of the token within the parent document.
+row +row
+cell #[code ent_type] +cell #[code ent_type]
+cell int +cell int
+cell Named entity type. +cell Named entity type.
+row +row
+cell #[code ent_type_] +cell #[code ent_type_]
+cell unicode +cell unicode
@ -42,19 +346,23 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+cell unicode +cell unicode
+cell +cell
| IOB code of named entity tag. #[code "B"] | IOB code of named entity tag. #[code "B"]
| means the token begins an entity, #[code "I"] means it inside an | means the token begins an entity, #[code "I"] means it is inside
| entity, #[code "O"] means it is outside an entity, and | an entity, #[code "O"] means it is outside an entity, and
| #[code ""] means no entity tag is set. | #[code ""] means no entity tag is set.
+row +row
+cell #[code ent_id] +cell #[code ent_id]
+cell int +cell int
+cell ID of the entity the token is an instance of, if any. +cell
| ID of the entity the token is an instance of, if any. Usually
| assigned by patterns in the Matcher.
+row +row
+cell #[code ent_id_] +cell #[code ent_id_]
+cell unicode +cell unicode
+cell ID of the entity the token is an instance of, if any. +cell
| ID of the entity the token is an instance of, if any. Usually
| assigned by patterns in the Matcher.
+row +row
+cell #[code lemma] +cell #[code lemma]
@ -229,232 +537,3 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
+cell #[code lex_id] +cell #[code lex_id]
+cell int +cell int
+cell ID of the token's lexical type. +cell ID of the token's lexical type.
+row
+cell #[code text]
+cell unicode
+cell Verbatim text content.
+row
+cell #[code text_with_ws]
+cell unicode
+cell Text content, with trailing space character if present.
+row
+cell #[code whitespace]
+cell int
+cell Trailing space character if present.
+row
+cell #[code whitespace_]
+cell unicode
+cell Trailing space character if present.
+h(2, "init") Token.__init__
+tag method
p Construct a #[code Token] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code offset]
+cell int
+cell The index of the token within the document.
+footrow
+cell return
+cell #[code Token]
+cell The newly constructed object.
+h(2, "len") Token.__len__
+tag method
p Get the number of unicode characters in the token.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell int
+cell The number of unicode characters in the token.
+h(2, "check_flag") Token.check_flag
+tag method
p Check the value of a boolean flag.
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_id]
+cell int
+cell The attribute ID of the flag to check.
+footrow
+cell return
+cell bool
+cell Whether the flag is set.
+h(2, "nbor") Token.nbor
+tag method
p Get a neighboring token.
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The relative position of the token to get. Defaults to #[code 1].
+footrow
+cell return
+cell #[code Token]
+cell The token at position #[code self.doc[self.i+i]]
+h(2, "similarity") Token.similarity
+tag method
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
+table(["Name", "Type", "Description"])
+row
+cell other
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell return
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "is_ancestor") Token.is_ancestor
+tag method
p
| Check whether this token is a parent, grandparent, etc. of another
| in the dependency tree.
+table(["Name", "Type", "Description"])
+row
+cell descendant
+cell #[code Token]
+cell Another token.
+footrow
+cell return
+cell bool
+cell Whether this token is the ancestor of the descendant.
+h(2, "vector") Token.vector
+tag property
p A real-valued meaning representation.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A 1D numpy array representing the token's semantics.
+h(2, "has_vector") Token.has_vector
+tag property
p
| A boolean value indicating whether a word vector is associated with the
| object.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell bool
+cell Whether the token has a vector data attached.
+h(2, "head") Token.head
+tag property
p The syntactic parent, or "governor", of this token.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code Token]
+cell The head.
+h(2, "conjuncts") Token.conjuncts
+tag property
p A sequence of coordinated tokens, including the token itself.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A coordinated token.
+h(2, "children") Token.children
+tag property
p A sequence of the token's immediate syntactic children.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A child token such that #[code child.head==self].
+h(2, "subtree") Token.subtree
+tag property
p A sequence of all the token's syntactic descendents.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell A descendant token such that #[code self.is_ancestor(descendant)].
+h(2, "left_edge") Token.left_edge
+tag property
p The leftmost token of this token's syntactic descendants.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code Token]
+cell The first token such that #[code self.is_ancestor(token)].
+h(2, "right_edge") Token.right_edge
+tag property
p The rightmost token of this token's syntactic descendents.
+table(["Name", "Type", "Description"])
+footrow
+cell return
+cell #[code Token]
+cell The last token such that #[code self.is_ancestor(token)].
+h(2, "ancestors") Token.ancestors
+tag property
p The rightmost token of this token's syntactic descendants.
+table(["Name", "Type", "Description"])
+footrow
+cell yield
+cell #[code Token]
+cell
| A sequence of ancestor tokens such that
| #[code ancestor.is_ancestor(self)].

View File

@ -6,6 +6,283 @@ p
| Segment text, and create #[code Doc] objects with the discovered segment | Segment text, and create #[code Doc] objects with the discovered segment
| boundaries. | boundaries.
+h(2, "init") Tokenizer.__init__
+tag method
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
+aside-code("Example").
# Construction 1
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
# Construction 2
from spacy.lang.en import English
tokenizer = English().Defaults.create_tokenizer(nlp)
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code rules]
+cell dict
+cell Exceptions and special-cases for the tokenizer.
+row
+cell #[code prefix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match prefixes.
+row
+cell #[code suffix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match suffixes.
+row
+cell #[code infix_finditer]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).finditer] to find infixes.
+row
+cell #[code token_match]
+cell callable
+cell A boolean function matching strings to be recognised as tokens.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The newly constructed object.
+h(2, "call") Tokenizer.__call__
+tag method
p Tokenize a string.
+aside-code("Example").
tokens = tokenizer(u'This is a sentence')
assert len(tokens) == 4
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to tokenize.
+footrow
+cell returns
+cell #[code Doc]
+cell A container for linguistic annotations.
+h(2, "pipe") Tokenizer.pipe
+tag method
p Tokenize a stream of texts.
+aside-code("Example").
texts = [u'One document.', u'...', u'Lots of documents']
for doc in tokenizer.pipe(texts, batch_size=50):
    pass
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode texts.
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to accumulate in an internal buffer.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of threads to use, if the implementation supports
| multi-threading. The default tokenizer is single-threaded.
+footrow
+cell yields
+cell #[code Doc]
+cell A sequence of Doc objects, in order.
+h(2, "find_infix") Tokenizer.find_infix
+tag method
p Find internal split points of the string.
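+aside-code("Example").
# illustrative sketch, assuming the default English infix rules,
# which split on a hyphen between two letters
matches = tokenizer.find_infix(u'hard-working')
assert [m.group() for m in matches] == [u'-']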
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to split.
+footrow
+cell returns
+cell list
+cell
| A list of #[code re.MatchObject] objects that have #[code .start()]
| and #[code .end()] methods, denoting the placement of internal
| segment separators, e.g. hyphens.
+h(2, "find_prefix") Tokenizer.find_prefix
+tag method
p
| Find the length of a prefix that should be segmented from the string, or
| #[code None] if no prefix rules match.
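+aside-code("Example").
# illustrative sketch, assuming the default English punctuation rules,
# where an opening bracket is segmented as a one-character prefix
assert tokenizer.find_prefix(u'(Hello') == 1
assert tokenizer.find_prefix(u'Hello') is None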
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell returns
+cell int / #[code None]
+cell The length of the prefix if present, otherwise #[code None].
+h(2, "find_suffix") Tokenizer.find_suffix
+tag method
p
| Find the length of a suffix that should be segmented from the string, or
| #[code None] if no suffix rules match.
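+aside-code("Example").
# illustrative sketch, assuming the default English punctuation rules,
# where a trailing exclamation mark is a one-character suffix
assert tokenizer.find_suffix(u'Hello!') == 1
assert tokenizer.find_suffix(u'Hello') is None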
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell returns
+cell int / #[code None]
+cell The length of the suffix if present, otherwise #[code None].
+h(2, "add_special_case") Tokenizer.add_special_case
+tag method
p
| Add a special-case tokenization rule. This mechanism is also used to add
| custom tokenizer exceptions to the language data. See the usage workflow
| on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
| for more details and examples.
+aside-code("Example").
from spacy.attrs import ORTH, LEMMA
case = [{ORTH: u"do"}, {ORTH: u"n't", LEMMA: u"not"}]
tokenizer.add_special_case(u"don't", case)
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to specially tokenize.
+row
+cell #[code token_attrs]
+cell iterable
+cell
| A sequence of dicts, where each dict describes a token and its
| attributes. The #[code ORTH] fields of the attributes must
| exactly match the string when they are concatenated.
+h(2, "to_disk") Tokenizer.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
tokenizer.to_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Tokenizer.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
tokenizer = tokenizer.from_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The modified #[code Tokenizer] object.
+h(2, "to_bytes") Tokenizer.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
tokenizer_bytes = tokenizer.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Tokenizer] object.
+h(2, "from_bytes") Tokenizer.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer_bytes = tokenizer.to_bytes()
new_tokenizer = Tokenizer(nlp.vocab)
new_tokenizer.from_bytes(tokenizer_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The #[code Tokenizer] object.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
@ -35,215 +312,3 @@ p
| A function to find internal segment separators, e.g. hyphens. | A function to find internal segment separators, e.g. hyphens.
| Returns a (possibly empty) list of #[code re.MatchObject] | Returns a (possibly empty) list of #[code re.MatchObject]
| objects. | objects.
+h(2, "load") Tokenizer.load
+tag classmethod
p Load a #[code Tokenizer], reading unsupplied components from the path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code rules]
+cell dict
+cell Exceptions and special-cases for the tokenizer.
+row
+cell #[code prefix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match prefixes.
+row
+cell #[code suffix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match suffixes.
+row
+cell #[code infix_finditer]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).finditer] to find infixes.
+footrow
+cell return
+cell #[code Tokenizer]
+cell The newly constructed object.
+h(2, "init") Tokenizer.__init__
+tag method
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code rules]
+cell dict
+cell Exceptions and special-cases for the tokenizer.
+row
+cell #[code prefix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match prefixes.
+row
+cell #[code suffix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match suffixes.
+row
+cell #[code infix_finditer]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).finditer] to find infixes.
+footrow
+cell return
+cell #[code Tokenizer]
+cell The newly constructed object.
+h(2, "call") Tokenizer.__call__
+tag method
p Tokenize a string.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to tokenize.
+footrow
+cell return
+cell #[code Doc]
+cell A container for linguistic annotations.
+h(2, "pipe") Tokenizer.pipe
+tag method
p Tokenize a stream of texts.
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode texts.
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to accumulate in an internal buffer.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of threads to use, if the implementation supports
| multi-threading. The default tokenizer is single-threaded.
+footrow
+cell yield
+cell #[code Doc]
+cell A sequence of Doc objects, in order.
+h(2, "find_infix") Tokenizer.find_infix
+tag method
p Find internal split points of the string.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to split.
+footrow
+cell return
+cell #[code List[re.MatchObject]]
+cell
| A list of objects that have #[code .start()] and #[code .end()]
| methods, denoting the placement of internal segment separators,
| e.g. hyphens.
+h(2, "find_prefix") Tokenizer.find_prefix
+tag method
p
| Find the length of a prefix that should be segmented from the string, or
| #[code None] if no prefix rules match.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell return
+cell int / #[code None]
+cell The length of the prefix if present, otherwise #[code None].
+h(2, "find_suffix") Tokenizer.find_suffix
+tag method
p
| Find the length of a suffix that should be segmented from the string, or
| #[code None] if no suffix rules match.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell return
+cell int / #[code None]
+cell The length of the suffix if present, otherwise #[code None].
+h(2, "add_special_case") Tokenizer.add_special_case
+tag method
p Add a special-case tokenization rule.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to specially tokenize.
+row
+cell #[code token_attrs]
+cell -
+cell
| A sequence of dicts, where each dict describes a token and its
| attributes. The #[code ORTH] fields of the attributes must
| exactly match the string when they are concatenated.
+footrow
+cell return
+cell #[code None]
+cell -

View File

@ -14,7 +14,7 @@ p
| recommend having additional tests in place if your application depends on | recommend having additional tests in place if your application depends on
| any of spaCy's utilities. | any of spaCy's utilities.
+h(2, "get_data_path") get_data_path +h(2, "get_data_path") util.get_data_path
+tag function +tag function
p p
@ -28,11 +28,11 @@ p
+cell Only return path if it exists, otherwise return #[code None]. +cell Only return path if it exists, otherwise return #[code None].
+footrow +footrow
+cell return +cell returns
+cell #[code Path] / #[code None] +cell #[code Path] / #[code None]
+cell Data path or #[code None]. +cell Data path or #[code None].
+h(2, "set_data_path") set_data_path +h(2, "set_data_path") util.set_data_path
+tag function +tag function
p p
@ -49,7 +49,7 @@ p
+cell unicode or #[code Path] +cell unicode or #[code Path]
+cell Path to new data directory. +cell Path to new data directory.
+h(2, "get_lang_class") get_lang_class +h(2, "get_lang_class") util.get_lang_class
+tag function +tag function
p p
@ -70,11 +70,11 @@ p
+cell Two-letter language code, e.g. #[code 'en']. +cell Two-letter language code, e.g. #[code 'en'].
+footrow +footrow
+cell return +cell returns
+cell #[code Language] +cell #[code Language]
+cell Language class. +cell Language class.
+h(2, "resolve_model_path") resolve_model_path +h(2, "resolve_model_path") util.resolve_model_path
+tag function +tag function
p Resolve a model name or string to a model path. p Resolve a model name or string to a model path.
@ -90,11 +90,11 @@ p Resolve a model name or string to a model path.
+cell Package name, shortcut link or model path. +cell Package name, shortcut link or model path.
+footrow +footrow
+cell return +cell returns
+cell #[code Path] +cell #[code Path]
+cell Path to model data directory. +cell Path to model data directory.
+h(2, "is_package") is_package +h(2, "is_package") util.is_package
+tag function +tag function
p p
@ -112,11 +112,11 @@ p
+cell Name of package. +cell Name of package.
+footrow +footrow
+cell return +cell returns
+cell #[code bool] +cell #[code bool]
+cell #[code True] if installed package, #[code False] if not. +cell #[code True] if installed package, #[code False] if not.
+h(2, "get_model_package_path") get_model_package_path +h(2, "get_model_package_path") util.get_model_package_path
+tag function +tag function
p p
@ -134,11 +134,11 @@ p
+cell Name of installed package. +cell Name of installed package.
+footrow +footrow
+cell return +cell returns
+cell #[code Path] +cell #[code Path]
+cell Path to model data directory. +cell Path to model data directory.
+h(2, "parse_package_meta") parse_package_meta +h(2, "parse_package_meta") util.parse_package_meta
+tag function +tag function
p p
@ -163,11 +163,31 @@ p
+cell If #[code True], raise error if no #[code meta.json] is found. +cell If #[code True], raise error if no #[code meta.json] is found.
+footrow +footrow
+cell return +cell returns
+cell dict / #[code None] +cell dict / #[code None]
+cell Model meta data or #[code None]. +cell Model meta data or #[code None].
+h(2, "update_exc") update_exc +h(2, "is_in_jupyter") util.is_in_jupyter
+tag function
p
| Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter]
| notebook by detecting the IPython kernel. Mainly used for the
| #[+api("displacy") #[code displacy]] visualizer.
+aside-code("Example").
html = '&lt;h1&gt;Hello world!&lt;/h1&gt;'
if util.is_in_jupyter():
    from IPython.core.display import display, HTML
    display(HTML(html))
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell bool
+cell #[code True] if in Jupyter, #[code False] if not.
+h(2, "update_exc") util.update_exc
+tag function +tag function
p p
@ -194,12 +214,12 @@ p
+cell Exception dictionaries to add to the base exceptions, in order. +cell Exception dictionaries to add to the base exceptions, in order.
+footrow +footrow
+cell return +cell returns
+cell dict +cell dict
+cell Combined tokenizer exceptions. +cell Combined tokenizer exceptions.
+h(2, "prints") prints +h(2, "prints") util.prints
+tag function +tag function
p p

View File

@ -7,59 +7,6 @@ p
| #[code Vocab] instance also provides access to the #[code StringStore], | #[code Vocab] instance also provides access to the #[code StringStore],
| and owns underlying C-data that is shared between #[code Doc] objects. | and owns underlying C-data that is shared between #[code Doc] objects.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code strings]
+cell #[code StringStore]
+cell A table managing the string-to-int mapping.
+row
+cell #[code vectors_length]
+cell int
+cell The dimensionality of the word vectors, if present.
+h(2, "load") Vocab.load
+tag classmethod
p Load the vocabulary from a path.
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell #[code Path]
+cell The path to load from.
+row
+cell #[code lex_attr_getters]
+cell dict
+cell
| A dictionary mapping attribute IDs to functions to compute them.
| Defaults to #[code None].
+row
+cell #[code lemmatizer]
+cell -
+cell A lemmatizer. Defaults to #[code None].
+row
+cell #[code tag_map]
+cell dict
+cell
| A dictionary mapping fine-grained tags to coarse-grained
| parts-of-speech, and optionally morphological attributes.
+row
+cell #[code oov_prob]
+cell float
+cell The default probability for out-of-vocabulary words.
+footrow
+cell return
+cell #[code Vocab]
+cell The newly constructed object.
+h(2, "init") Vocab.__init__ +h(2, "init") Vocab.__init__
+tag method +tag method
@ -73,11 +20,6 @@ p Create the vocabulary.
| A dictionary mapping attribute IDs to functions to compute them. | A dictionary mapping attribute IDs to functions to compute them.
| Defaults to #[code None]. | Defaults to #[code None].
+row
+cell #[code lemmatizer]
+cell -
+cell A lemmatizer. Defaults to #[code None].
+row +row
+cell #[code tag_map] +cell #[code tag_map]
+cell dict +cell dict
@ -86,23 +28,34 @@ p Create the vocabulary.
| parts-of-speech, and optionally morphological attributes. | parts-of-speech, and optionally morphological attributes.
+row +row
+cell #[code oov_prob] +cell #[code lemmatizer]
+cell float +cell object
+cell The default probability for out-of-vocabulary words. +cell A lemmatizer. Defaults to #[code None].
+row
+cell #[code strings]
+cell #[code StringStore]
+cell
| A #[code StringStore] that maps strings to integers, and vice
| versa.
+footrow +footrow
+cell return +cell returns
+cell #[code Vocab] +cell #[code Vocab]
+cell The newly constructed object. +cell The newly constructed object.
+h(2, "len") Vocab.__len__ +h(2, "len") Vocab.__len__
+tag method +tag method
p Get the number of lexemes in the vocabulary. p Get the current number of lexemes in the vocabulary.
+aside-code("Example").
doc = nlp(u'This is a sentence.')
assert len(nlp.vocab) > 0
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell The number of lexemes in the vocabulary. +cell The number of lexemes in the vocabulary.
@ -113,6 +66,10 @@ p
| Retrieve a lexeme, given an int ID or a unicode string. If a previously | Retrieve a lexeme, given an int ID or a unicode string. If a previously
| unseen unicode string is given, a new lexeme is created and stored. | unseen unicode string is given, a new lexeme is created and stored.
+aside-code("Example").
apple = nlp.vocab.strings['apple']
assert nlp.vocab[apple] == nlp.vocab[u'apple']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code id_or_string] +cell #[code id_or_string]
@ -120,25 +77,37 @@ p
+cell The integer ID of a word, or its unicode string. +cell The integer ID of a word, or its unicode string.
+footrow +footrow
+cell return +cell returns
+cell #[code Lexeme] +cell #[code Lexeme]
+cell The lexeme indicated by the given ID. +cell The lexeme indicated by the given ID.
+h(2, "iter") Span.__iter__ +h(2, "iter") Vocab.__iter__
+tag method +tag method
p Iterate over the lexemes in the vocabulary. p Iterate over the lexemes in the vocabulary.
+aside-code("Example").
stop_words = (lex for lex in nlp.vocab if lex.is_stop)
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
+cell yield +cell yields
+cell #[code Lexeme] +cell #[code Lexeme]
+cell An entry in the vocabulary. +cell An entry in the vocabulary.
+h(2, "contains") Vocab.__contains__ +h(2, "contains") Vocab.__contains__
+tag method +tag method
p Check whether the string has an entry in the vocabulary. p
| Check whether the string has an entry in the vocabulary. To get the ID
| for a given string, you need to look it up in
| #[+api("vocab#attributes") #[code vocab.strings]].
+aside-code("Example").
apple = nlp.vocab.strings['apple']
oov = nlp.vocab.strings['dskfodkfos']
assert apple in nlp.vocab
assert oov not in nlp.vocab
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -147,32 +116,27 @@ p Check whether the string has an entry in the vocabulary.
+cell The ID string. +cell The ID string.
+footrow +footrow
+cell return +cell returns
+cell bool +cell bool
+cell Whether the string has an entry in the vocabulary. +cell Whether the string has an entry in the vocabulary.
+h(2, "resize_vectors") Vocab.resize_vectors
+tag method
p
| Set #[code vectors_length] to a new size, and allocate more memory for
| the #[code Lexeme] vectors if necessary. The memory will be zeroed.
+table(["Name", "Type", "Description"])
+row
+cell #[code new_size]
+cell int
+cell The new size of the vectors.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "add_flag") Vocab.add_flag +h(2, "add_flag") Vocab.add_flag
+tag method +tag method
p Set a new boolean flag to words in the vocabulary. p
| Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
| function will be called over the words currently in the vocab, and then
| applied to new words as they occur. You'll then be able to access the flag
| value on each token, using #[code token.check_flag(flag_id)].
+aside-code("Example").
def is_my_product(text):
    products = [u'spaCy', u'Thinc', u'displaCy']
    return text in products
MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
doc = nlp(u'I like spaCy')
assert doc[2].check_flag(MY_PRODUCT) == True
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
@ -189,90 +153,104 @@ p Set a new boolean flag to words in the vocabulary.
| available bit will be chosen. | available bit will be chosen.
+footrow +footrow
+cell return +cell returns
+cell int +cell int
+cell The integer ID by which the flag value can be checked. +cell The integer ID by which the flag value can be checked.
+h(2, "dump") Vocab.dump +h(2, "to_disk") Vocab.to_disk
+tag method +tag method
p Save the lexemes binary data to the given location. p Save the current state to a directory.
+aside-code("Example").
nlp.vocab.to_disk('/path/to/vocab')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code loc] +cell #[code path]
+cell #[code Path] +cell unicode or #[code Path]
+cell The path to load from.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "load_lexemes") Vocab.load_lexemes
+tag method
p
+table(["Name", "Type", "Description"])
+row
+cell #[code loc]
+cell unicode
+cell Path to load the lexemes.bin file from.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "dump_vectors") Vocab.dump_vectors
+tag method
p Save the word vectors to a binary file.
+table(["Name", "Type", "Description"])
+row
+cell #[code loc]
+cell #[code Path]
+cell The path to save to.
+footrow
+cell return
+cell #[code None]
+cell -
+h(2, "load_vectors") Vocab.load_vectors
+tag method
p Load vectors from a text-based file.
+table(["Name", "Type", "Description"])
+row
+cell #[code file_]
+cell buffer
+cell +cell
| The file to read from. Entries should be separated by newlines, | A path to a directory, which will be created if it doesn't exist.
| and each entry should be whitespace delimited. The first value | Paths may be either strings or #[code Path]-like objects.
| of the entry should be the word string, and subsequent entries
| should be the values of the vector.
+footrow +h(2, "from_disk") Vocab.from_disk
+cell return
+cell int
+cell The length of the vectors loaded.
+h(2, "load_vectors_from_bin_loc") Vocab.load_vectors_from_bin_loc
+tag method +tag method
p Load vectors from the location of a binary file. p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.vocab import Vocab
vocab = Vocab().from_disk('/path/to/vocab')
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code loc] +cell #[code path]
+cell unicode +cell unicode or #[code Path]
+cell The path of the binary file to load from. +cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow +footrow
+cell return +cell returns
+cell int +cell #[code Vocab]
+cell The length of the vectors loaded. +cell The modified #[code Vocab] object.
+h(2, "to_bytes") Vocab.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
vocab_bytes = nlp.vocab.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Vocab] object.
+h(2, "from_bytes") Vocab.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.vocab import Vocab
vocab_bytes = nlp.vocab.to_bytes()
vocab = Vocab()
vocab.from_bytes(vocab_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Vocab]
+cell The #[code Vocab] object.
+h(2, "attributes") Attributes
+aside-code("Example").
apple_id = nlp.vocab.strings['apple']
assert type(apple_id) == int
PERSON = nlp.vocab.strings['PERSON']
assert type(PERSON) == int
+table(["Name", "Type", "Description"])
+row
+cell #[code strings]
+cell #[code StringStore]
+cell A table managing the string-to-int mapping.

View File

@ -56,20 +56,22 @@ p
from ...attrs import LANG from ...attrs import LANG
from ...util import update_exc from ...util import update_exc
# create Defaults class in the module scope (necessary for pickling!)
class XxxxxDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
    # optional: replace flags with custom functions, e.g. like_num()
    lex_attr_getters.update(LEX_ATTRS)
    # merge base exceptions and custom tokenizer exceptions
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = set(STOP_WORDS)
# create actual Language class
class Xxxxx(Language): class Xxxxx(Language):
lang = 'xx' # language ISO code lang = 'xx' # language ISO code
Defaults = XxxxxDefaults # override defaults
# override defaults
class Defaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
# optional: replace flags with custom functions, e.g. like_num()
lex_attr_getters.update(LEX_ATTRS)
# merge base exceptions and custom tokenizer exceptions
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = set(STOP_WORDS)
# set default export - this allows the language class to be lazy-loaded # set default export - this allows the language class to be lazy-loaded
__all__ = ['Xxxxx'] __all__ = ['Xxxxx']

View File

@ -141,11 +141,11 @@ p
include ../api/_annotation/_named-entities include ../api/_annotation/_named-entities
+aside("Install") +aside("Install")
| The #[+api("load") spacy.load()] function configures a pipeline that | The #[+api("load") #[code spacy.load()]] function configures a pipeline that
| includes all of the available annotators for the given ID. In the example | includes all of the available annotators for the given ID. In the example
| above, the #[code 'en'] ID tells spaCy to load the default English | above, the #[code 'en'] ID tells spaCy to load the default English
| pipeline. If you have installed the data with | pipeline. If you have installed the data with
| #[code python -m spacy.en.download] this will include the entity | #[code python -m spacy download en], this will include the entity
| recognition model. | recognition model.
+h(2, "updating") Training and updating +h(2, "updating") Training and updating

View File

@ -4,58 +4,190 @@ include ../../_includes/_mixins
p p
| spaCy features a rule-matching engine that operates over tokens, similar | spaCy features a rule-matching engine that operates over tokens, similar
| to regular expressions. The rules can refer to token annotations and | to regular expressions. The rules can refer to token annotations (e.g.
| flags, and matches support callbacks to accept, modify and/or act on the | the token #[code text] or #[code tag_], and flags (e.g. #[code IS_PUNCT]).
| match. The rule matcher also allows you to associate patterns with | The rule matcher also lets you pass in a custom callback
| entity IDs, to allow some basic entity linking or disambiguation. | to act on matches for example, to merge entities and apply custom labels.
| You can also associate patterns with entity IDs, to allow some basic
| entity linking or disambiguation.
p Here's a minimal example. We first add a pattern that specifies three tokens: +aside("What about \"real\" regular expressions?")
+list("numbers") +h(2, "adding-patterns") Adding patterns
+item A token whose lower-case form matches "hello"
+item A token whose #[code is_punct] flag is set to #[code True]
+item A token whose lower-case form matches "world"
p p
| Once we've added the pattern, we can use the #[code matcher] as a | Let's say we want to enable spaCy to find a combination of three tokens:
| callable, to receive a list of #[code (ent_id, start, end)] tuples.
| Note that #[code LOWER] and #[code IS_PUNCT] are data attributes +list("numbers")
| of #[code spacy.attrs]. +item
| A token whose #[strong lower-case form matches "hello"], e.g. "Hello"
| or "HELLO".
+item
| A token whose #[strong #[code is_punct] flag is set to #[code True]],
| i.e. any punctuation.
+item
| A token whose #[strong lower-case form matches "world"], e.g. "World"
| or "WORLD".
+code. +code.
from spacy.matcher import Matcher [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}]
matcher = Matcher(nlp.vocab)
matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}])
doc = nlp(u'Hello, world!') p
| First, we initialise the #[code Matcher] with a vocab. The matcher must
| always share the same vocab with the documents it will operate on. We
| can now call #[+api("matcher#add") #[code matcher.add()]] with an ID and
| our custom pattern. The second argument lets you pass in an optional
| callback function to invoke on a successful match. For now, we set it
| to #[code None].
+code.
import spacy
from spacy.matcher import Matcher
from spacy.attrs import LOWER, IS_PUNCT # don't forget to import the attrs!
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
# add match ID "HelloWorld" with no callback and one pattern
matcher.add('HelloWorld', None,
    [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}])
doc = nlp(u'Hello, world! Hello world!')
matches = matcher(doc) matches = matcher(doc)
p p
| The returned matches include the ID, to let you associate the matches | The matcher returns a list of #[code (match_id, start, end)] tuples, in
| with the patterns. You can also group multiple patterns together, which | this case #[code [('HelloWorld', 0, 3)]], which maps to the span
| is useful when you have a knowledge base of entities you want to match, | #[code doc[0:3]] of our original document. Optionally, we could also
| and you want to write multiple patterns for each entity. | choose to add more than one pattern, for example to also match sequences
| without punctuation between "hello" and "world":
+h(2, "entities-patterns") Entities and patterns
+code. +code.
matcher.add_entity( matcher.add('HelloWorld', None,
"GoogleNow", # Entity ID -- Helps you act on the match. [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
{"ent_type": "PRODUCT", "wiki_en": "Google_Now"}, # Arbitrary attributes (optional) [{LOWER: 'hello'}, {LOWER: 'world'}])
)
p
| By default, the matcher will only return the matches and
| #[strong not do anything else], like merge entities or assign labels.
| This is all up to you and can be defined individually for each pattern,
| by passing in a callback function as the #[code on_match] argument on
| #[code add()]. This is useful, because it lets you write entirely custom
| and #[strong pattern-specific logic]. For example, you might want to
| merge #[em some] patterns into one token, while adding entity labels for
| other pattern types. You shouldn't have to create different matchers for
| each of those processes.
+h(2, "on_match") Adding #[code on_match] rules
p
| To move on to a more realistic example, let's say you're working with a
| large corpus of blog articles, and you want to match all mentions of
| "Google I/O" (which spaCy tokenizes as #[code ['Google', 'I', '/', 'O']]).
| To be safe, you match on the tokens' uppercase forms, so the pattern will
| still work if someone has written it as "Google i/o". You also add a
| second pattern with an added #[code {IS_DIGIT: True}] token, which will
| make sure you also match on "Google I/O 2017". If your pattern matches,
| spaCy should execute your custom callback function #[code add_event_ent].
+code.
import spacy
from spacy.matcher import Matcher
from spacy.attrs import ORTH, UPPER, LOWER, IS_DIGIT
nlp = spacy.load('en')
matcher = Matcher(nlp.vocab)
# Get the ID of the 'EVENT' entity type. This is required to set an entity.
EVENT = nlp.vocab.strings['EVENT']
def add_event_ent(matcher, doc, i, matches):
    # Get the current match and create a tuple of entity label, start and end.
    # Append the entity to the doc's entities. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    doc.ents += ((EVENT, start, end),)
matcher.add('GoogleIO', add_event_ent,
            [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}],
            [{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}, {IS_DIGIT: True}])
p
| In addition to mentions of "Google I/O", your data also contains some
| annoying pre-processing artefacts, like leftover HTML line breaks
| (e.g. #[code &lt;br&gt;] or #[code &lt;BR/&gt;]). While you're at it,
| you want to merge those into one token and flag them, to make sure you
| can easily ignore them later. So you add a second pattern and pass in a
| function #[code merge_and_flag]:
+code.
# Add a new custom flag to the vocab, which is always False by default.
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
BAD_HTML_FLAG = nlp.vocab.add_flag(lambda text: False)
def merge_and_flag(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start : end]
    span.merge(is_stop=True) # merge (and mark it as a stop word, just in case)
    span.set_flag(BAD_HTML_FLAG, True) # set BAD_HTML_FLAG
matcher.add('BAD_HTML', merge_and_flag,
            [{ORTH: '&lt;'}, {LOWER: 'br'}, {ORTH: '&gt;'}],
            [{ORTH: '&lt;'}, {LOWER: 'br/'}, {ORTH: '&gt;'}])
+aside("Tip: Visualizing matches")
| When working with entities, you can use #[+api("displacy") displaCy]
| to quickly generate a NER visualization from your updated #[code Doc],
| which can be exported as an HTML file:
+code.o-no-block.
from spacy import displacy
html = displacy.render(doc, style='ent', page=True,
options={'ents': ['EVENT']})
| For more info and examples, see the usage workflow on
| #[+a("/docs/usage/visualizers") visualizing spaCy].
p
| We can now call the matcher on our documents. The patterns will be
| matched in the order they occur in the text.
+code.
doc = nlp(LOTS_OF_TEXT)
matcher(doc)
+h(3, "on_match-callback") The callback function
p
| The matcher will first collect all matches over the document. It will
| then iterate over the matches, look up the callback for the match ID
| that was matched, and invoke it. When the callback is invoked, it is
| passed four arguments: the matcher itself, the document, the position of
| the current match, and the total list of matches. This allows you to
| write callbacks that consider the entire set of matched phrases, so that
| you can resolve overlaps and other conflicts in whatever way you prefer.
+table(["Argument", "Type", "Description"])
+row
+cell #[code matcher]
+cell #[code Matcher]
+cell The matcher instance.
+row
+cell #[code doc]
+cell #[code Doc]
+cell The document the matcher was used on.
+row
+cell #[code i]
+cell int
+cell Index of the current match (#[code matches[i]]).
+row
+cell #[code matches]
+cell list
+cell
| A list of #[code (match_id, start, end)] tuples, describing the
| matches. A match tuple describes a span #[code doc[start:end]].
| The #[code match_id] is the ID of the added match pattern.
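p
| For example, a callback along these lines (a sketch, not part of spaCy
| itself) could keep only the longest of several overlapping matches:
+code.
def keep_longest(matcher, doc, i, matches):
    # only act on the last match, so the full list of matches is available
    if i != len(matches) - 1:
        return
    seen_tokens = set()
    # sort by span length, longest first, and skip overlapping spans
    for match_id, start, end in sorted(matches, key=lambda m: m[2] - m[1], reverse=True):
        if seen_tokens.isdisjoint(range(start, end)):
            seen_tokens.update(range(start, end))
            print(doc[start:end].text)   # keep this match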
+h(2, "quantifiers") Using quantifiers +h(2, "quantifiers") Using quantifiers
@ -82,78 +214,4 @@ p
p
| There are no nested or scoped quantifiers. You can build those
| behaviours with #[code on_match] callbacks.
+h(2, "acceptor-functions") Acceptor functions
p
| The #[code acceptor] keyword of #[code matcher.add_entity()] allows you to
| pass a function to reject or modify matches. The function you pass should
| take five arguments: #[code doc], #[code ent_id], #[code label], #[code start],
| and #[code end]. You can return a falsey value to reject the match, or
| return a 4-tuple #[code (ent_id, label, start, end)].
+code.
from spacy.tokens.doc import Doc
def trim_title(doc, ent_id, label, start, end):
if doc[start].check_flag(IS_TITLE_TERM):
return (ent_id, label, start+1, end)
else:
return (ent_id, label, start, end)
titles = set(title.lower() for title in [u'Mr.', 'Dr.', 'Ms.', u'Admiral'])
IS_TITLE_TERM = matcher.vocab.add_flag(lambda string: string.lower() in titles)
matcher.add_entity('PersonName', acceptor=trim_title)
matcher.add_pattern('PersonName', [{LOWER: 'mr.'}, {LOWER: 'cruise'}])
matcher.add_pattern('PersonName', [{LOWER: 'dr.'}, {LOWER: 'seuss'}])
doc = Doc(matcher.vocab, words=[u'Mr.', u'Cruise', u'likes', 'Dr.', u'Seuss'])
for ent_id, label, start, end in matcher(doc):
print(doc[start:end].text)
# Cruise
# Seuss
p
| Passing an #[code acceptor] function allows you to match patterns with
| arbitrary logic that can't easily be expressed by a finite-state machine.
| You can look at the entirety of the
| matched phrase, and its context in the document, and decide to move
| the boundaries or reject the match entirely.
+h(2, "callback-functions") Callback functions
p
| In spaCy &lt;1.0, the #[code Matcher] automatically tagged matched phrases
| with entity types. Since spaCy 1.0, the matcher no longer acts on matches
| automatically. By default, the match list is returned for the user to act on.
| However, it's often more convenient to register the required actions as a
| callback. You can do this by passing a function to the #[code on_match]
| keyword argument of #[code matcher.add_entity].
+aside-code("Example").
def merge_phrases(matcher, doc, i, matches):
'''
Merge a phrase. We have to be careful here because we'll change the token indices.
To avoid problems, merge all the phrases once we're called on the last match.
'''
if i != len(matches)-1:
return None
# Get Span objects
spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
for ent_id, label, span in spans:
span.merge(label=label, tag='NNP' if label else span.root.tag_)
matcher.add_entity('GoogleNow', on_match=merge_phrases)
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded'])
matcher(doc)
print([w.text for w in doc])
# [u'Google Now', u'is', u'being', u'rebranded']
p
| The matcher will first collect all matches over the document. It will
| then iterate over the matches, look up the callback for the entity ID
| that was matched, and invoke it. When the callback is invoked, it is
| passed four arguments: the matcher itself, the document, the position of
| the current match, and the total list of matches. This allows you to
| write callbacks that consider the entire set of matched phrases, so that
| you can resolve overlaps and other conflicts in whatever way you prefer.

View File

@ -2,9 +2,218 @@
include ../../_includes/_mixins
p
| We also rewrote a large part of the documentation and usage workflows,
| and added more examples.
+h(2, "features") New features +h(2, "features") New features
+h(3, "features-displacy") displaCy visualizer with Jupyter support
+aside-code("Example").
from spacy import displacy
doc = nlp(u'This is a sentence about Facebook.')
displacy.serve(doc, style='dep') # run the web server
html = displacy.render(doc, style='ent') # generate HTML
p
| Our popular dependency and named entity visualizers are now an official
| part of the spaCy library! displaCy can run a simple web server, or
| generate raw HTML markup or SVG files to be exported. You can pass in one
| or more docs, and customise the style. displaCy also auto-detects whether
| you're running #[+a("https://jupyter.org") Jupyter] and will render the
| visualizations in your notebook.
+infobox
| #[strong API:] #[+api("displacy") #[code displacy]]
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy]
+h(3, "features-loading") Loading
+aside-code("Example").
nlp = spacy.load('en') # shortcut link
nlp = spacy.load('en_core_web_sm') # package
nlp = spacy.load('/path/to/en') # unicode path
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
p
| The improved #[code spacy.load] makes loading models easier and more
| transparent. You can load a model by supplying its
| #[+a("/docs/usage/models#usage") shortcut link], the name of an installed
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
| path or a #[code Path]-like object. spaCy will try resolving the load
| argument in this order. The #[code path] keyword argument is now deprecated.
p
| The #[code Language] class to initialise will be determined based on the
| model's settings. If no model is found, spaCy will let you know and won't
| just return an empty #[code Language] object anymore. If you want a blank
| language, you can always import the class directly, e.g.
| #[code from spacy.lang.en import English].
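p
| For example (a minimal sketch, no model data required):
+code.
from spacy.lang.en import English   # import the language class directly
nlp = English()                     # blank English pipeline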
+infobox
| #[strong API:] #[+api("spacy#load") #[code spacy.load]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-language") Improved language data and processing pipelines
+aside-code("Example").
from spacy.language import Language
nlp = Language(pipeline=['token_vectors', 'tags',
'dependencies'])
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
+h(3, "features-lemmatizer") Simple lookup-based lemmatization
+aside-code("Example").
LOOKUP = {
"aba": "abar",
"ababa": "abar",
"ababais": "abar",
"ababan": "abar",
"ababanes": "ababán"
}
p
| spaCy now supports simple lookup-based lemmatization. The data is stored
| in a dictionary mapping a string to its lemma. To determine a token's
| lemma, spaCy simply looks it up in the table. The lookup lemmatizer can
| be imported from #[code spacy.lemmatizerlookup]. It's initialised with
| the lookup table, and should be returned by the #[code create_lemmatizer]
| classmethod of the language's defaults.
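p
| As a rough sketch (the class and method names below follow the
| description above, but check the API docs for the exact signatures), a
| language's defaults could hook up a lookup table like this:
+code.
from spacy.lemmatizerlookup import Lemmatizer
from spacy.lang.es import Spanish
LOOKUP = {"aba": "abar", "ababa": "abar"}
class CustomSpanishDefaults(Spanish.Defaults):   # hypothetical defaults subclass
    @classmethod
    def create_lemmatizer(cls, nlp=None):
        return Lemmatizer(LOOKUP)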
+infobox
| #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
+h(3, "features-matcher") Revised matcher API
+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import LOWER, IS_PUNCT
matcher = Matcher(nlp.vocab)
matcher.add('HelloWorld', None,
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
[{LOWER: 'hello'}, {LOWER: 'world'}])
assert len(matcher) == 1
assert 'HelloWorld' in matcher
p
| Patterns can now be added to the matcher by calling
| #[+api("matcher-add") #[code matcher.add()]] with a match ID, an optional
| callback function to be invoked on each match, and one or more patterns.
| This allows you to write powerful, pattern-specific logic using only one
| matcher. For example, you might only want to merge some entity types,
| and set custom flags for other matched patterns.
+infobox
| #[strong API:] #[+api("matcher") #[code Matcher]]
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
+h(3, "features-serializer") Serialization
+infobox
| #[strong API:] #[+api("serializer") #[code Serializer]]
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
+h(3, "features-models") Neural network models for English, German, French and Spanish
+infobox
| #[strong Details:] #[+src(gh("spacy-models")) spacy-models]
| #[strong Usage:] #[+a("/docs/usage/models") Models]
+h(2, "incompat") Backwards incompatibilities +h(2, "incompat") Backwards incompatibilities
+table(["Old", "New"])
+row
+cell #[code Language.save_to_directory]
+cell #[+api("language#to_disk") #[code Language.to_disk]]
+row
+cell #[code Tokenizer.load]
+cell
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
+row
+cell #[code Tagger.load]
+cell
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
+row
+cell #[code DependencyParser.load]
+cell
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
+row
+cell #[code EntityRecognizer.load]
+cell
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
+row
+cell
| #[code Vocab.load]
| #[code Vocab.load_lexemes]
| #[code Vocab.load_vectors]
| #[code Vocab.load_vectors_from_bin_loc]
+cell
| #[+api("vocab#from_disk") #[code Vocab.from_disk]]
| #[+api("vocab#from_bytes") #[code Vocab.from_bytes]]
+row
+cell
| #[code Vocab.dump]
| #[code Vocab.dump_vectors]
+cell
| #[+api("vocab#to_disk") #[code Vocab.to_disk]]
| #[+api("vocab#to_bytes") #[code Vocab.to_bytes]]
+row
+cell
| #[code StringStore.load]
+cell
| #[+api("stringstore#from_disk") #[code StringStore.from_disk]]
| #[+api("stringstore#from_bytes") #[code StringStore.from_bytes]]
+row
+cell
| #[code StringStore.dump]
+cell
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
+row
+cell #[code Matcher.load]
+cell -
+row
+cell
| #[code Matcher.add_pattern]
| #[code Matcher.add_entity]
+cell #[+api("matcher#add") #[code Matcher.add]]
+row
+cell #[code Matcher.get_entity]
+cell #[+api("matcher#get") #[code Matcher.get]]
+row
+cell #[code Matcher.has_entity]
+cell #[+api("matcher#contains") #[code Matcher.__contains__]]
+row
+cell #[code Doc.read_bytes]
+cell
+row
+cell #[code Token.is_ancestor_of]
+cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
+h(2, "migrating") Migrating from spaCy 1.x +h(2, "migrating") Migrating from spaCy 1.x