mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
Merge docstrings
This commit is contained in:
commit
5db89053aa
|
@ -14,3 +14,4 @@ regex==2017.4.5
|
|||
ftfy>=4.4.2,<5.0.0
|
||||
pytest>=3.0.6,<4.0.0
|
||||
pip>=9.0.0,<10.0.0
|
||||
mock>=2.0.0,<3.0.0
|
||||
|
|
|
@ -20,7 +20,17 @@ def download(model, direct=False):
|
|||
compatibility = get_compatibility()
|
||||
version = get_version(model_name, compatibility)
|
||||
download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version))
|
||||
link(model_name, model, force=True)
|
||||
try:
|
||||
link(model_name, model, force=True)
|
||||
except:
|
||||
# Dirty, but since spacy.download and the auto-linking is mostly
|
||||
# a convenience wrapper, it's best to show a success message and
|
||||
# loading instructions, even if linking fails.
|
||||
prints("Creating a shortcut link for 'en' didn't work (maybe you "
|
||||
"don't have admin permissions?), but you can still load "
|
||||
"the model via its full package name:",
|
||||
"nlp = spacy.load('%s')" % model_name,
|
||||
title="Download successful")
|
||||
|
||||
|
||||
def get_json(url, desc):
|
||||
|
|
|
@ -11,15 +11,14 @@ from .. import util
|
|||
|
||||
def info(model=None, markdown=False):
|
||||
if model:
|
||||
data_path = util.get_data_path()
|
||||
data = util.parse_package_meta(data_path / model, require=True)
|
||||
model_path = Path(__file__).parent / data_path / model
|
||||
model_path = util.resolve_model_path(model)
|
||||
meta = util.parse_package_meta(model_path)
|
||||
if model_path.resolve() != model_path:
|
||||
data['link'] = path2str(model_path)
|
||||
data['source'] = path2str(model_path.resolve())
|
||||
meta['link'] = path2str(model_path)
|
||||
meta['source'] = path2str(model_path.resolve())
|
||||
else:
|
||||
data['source'] = path2str(model_path)
|
||||
print_info(data, 'model %s' % model, markdown)
|
||||
meta['source'] = path2str(model_path)
|
||||
print_info(meta, 'model %s' % model, markdown)
|
||||
else:
|
||||
data = {'spaCy version': about.__version__,
|
||||
'Location': path2str(Path(__file__).parent.parent),
|
||||
|
|
|
@ -306,25 +306,17 @@ cdef class GoldParse:
|
|||
|
||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
|
||||
deps=None, entities=None, make_projective=False):
|
||||
"""
|
||||
Create a GoldParse.
|
||||
"""Create a GoldParse.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The document the annotations refer to.
|
||||
words:
|
||||
A sequence of unicode word strings.
|
||||
tags:
|
||||
A sequence of strings, representing tag annotations.
|
||||
heads:
|
||||
A sequence of integers, representing syntactic head offsets.
|
||||
deps:
|
||||
A sequence of strings, representing the syntactic relation types.
|
||||
entities:
|
||||
A sequence of named entity annotations, either as BILUO tag strings,
|
||||
or as (start_char, end_char, label) tuples, representing the entity
|
||||
positions.
|
||||
Returns (GoldParse): The newly constructed object.
|
||||
doc (Doc): The document the annotations refer to.
|
||||
words (iterable): A sequence of unicode word strings.
|
||||
tags (iterable): A sequence of strings, representing tag annotations.
|
||||
heads (iterable): A sequence of integers, representing syntactic head offsets.
|
||||
deps (iterable): A sequence of strings, representing the syntactic relation types.
|
||||
entities (iterable): A sequence of named entity annotations, either as
|
||||
BILUO tag strings, or as `(start_char, end_char, label)` tuples,
|
||||
representing the entity positions.
|
||||
RETURNS (GoldParse): The newly constructed object.
|
||||
"""
|
||||
if words is None:
|
||||
words = [token.text for token in doc]
|
||||
|
@ -389,55 +381,45 @@ cdef class GoldParse:
|
|||
self.heads = proj_heads
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
Get the number of gold-standard tokens.
|
||||
"""Get the number of gold-standard tokens.
|
||||
|
||||
Returns (int): The number of gold-standard tokens.
|
||||
RETURNS (int): The number of gold-standard tokens.
|
||||
"""
|
||||
return self.length
|
||||
|
||||
@property
|
||||
def is_projective(self):
|
||||
"""
|
||||
Whether the provided syntactic annotations form a projective dependency
|
||||
tree.
|
||||
"""Whether the provided syntactic annotations form a projective
|
||||
dependency tree.
|
||||
"""
|
||||
return not nonproj.is_nonproj_tree(self.heads)
|
||||
|
||||
|
||||
def biluo_tags_from_offsets(doc, entities):
|
||||
"""
|
||||
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
||||
scheme (biluo).
|
||||
"""Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
||||
scheme (BILUO).
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The document that the entity offsets refer to. The output tags will
|
||||
refer to the token boundaries within the document.
|
||||
doc (Doc): The document that the entity offsets refer to. The output tags
|
||||
will refer to the token boundaries within the document.
|
||||
entities (iterable): A sequence of `(start, end, label)` triples. `start` and
|
||||
`end` should be character-offset integers denoting the slice into the
|
||||
original string.
|
||||
|
||||
entities (sequence):
|
||||
A sequence of (start, end, label) triples. start and end should be
|
||||
character-offset integers denoting the slice into the original string.
|
||||
RETURNS (list): A list of unicode strings, describing the tags. Each tag
|
||||
string will be of the form either "", "O" or "{action}-{label}", where
|
||||
action is one of "B", "I", "L", "U". The string "-" is used where the
|
||||
entity offsets don't align with the tokenization in the `Doc` object. The
|
||||
training algorithm will view these as missing values. "O" denotes a
|
||||
non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||
"I" the inside of an entity of three or more tokens, and "L" the end
|
||||
of an entity of two or more tokens. "U" denotes a single-token entity.
|
||||
|
||||
Returns:
|
||||
tags (list):
|
||||
A list of unicode strings, describing the tags. Each tag string will
|
||||
be of the form either "", "O" or "{action}-{label}", where action is one
|
||||
of "B", "I", "L", "U". The string "-" is used where the entity
|
||||
offsets don't align with the tokenization in the Doc object. The
|
||||
training algorithm will view these as missing values. "O" denotes
|
||||
a non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||
"I" the inside of an entity of three or more tokens, and "L" the end
|
||||
of an entity of two or more tokens. "U" denotes a single-token entity.
|
||||
|
||||
Example:
|
||||
text = 'I like London.'
|
||||
entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||
doc = nlp.tokenizer(text)
|
||||
|
||||
tags = biluo_tags_from_offsets(doc, entities)
|
||||
|
||||
assert tags == ['O', 'O', 'U-LOC', 'O']
|
||||
EXAMPLE:
|
||||
>>> text = 'I like London.'
|
||||
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||
>>> doc = nlp.tokenizer(text)
|
||||
>>> tags = biluo_tags_from_offsets(doc, entities)
|
||||
>>> assert tags == ['O', 'O', 'U-LOC', 'O']
|
||||
"""
|
||||
starts = {token.idx: token.i for token in doc}
|
||||
ends = {token.idx+len(token): token.i for token in doc}
|
||||
|
|
|
@ -13,21 +13,23 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class BengaliDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'bn'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
lemma_rules = LEMMA_RULES
|
||||
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
|
||||
|
||||
class Bengali(Language):
|
||||
lang = 'bn'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'bn'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = TAG_MAP
|
||||
stop_words = STOP_WORDS
|
||||
lemma_rules = LEMMA_RULES
|
||||
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
Defaults = BengaliDefaults
|
||||
|
||||
|
||||
__all__ = ['Bengali']
|
||||
|
|
|
@ -10,15 +10,17 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class DanishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'da'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Danish(Language):
|
||||
lang = 'da'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'da'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = DanishDefaults
|
||||
|
||||
|
||||
__all__ = ['Danish']
|
||||
|
|
|
@ -14,21 +14,23 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class GermanDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'de'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
stop_words = set(STOP_WORDS)
|
||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class German(Language):
|
||||
lang = 'de'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'de'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
tag_map = dict(TAG_MAP)
|
||||
stop_words = set(STOP_WORDS)
|
||||
syntax_iterators = dict(SYNTAX_ITERATORS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
Defaults = GermanDefaults
|
||||
|
||||
|
||||
__all__ = ['German']
|
||||
|
|
|
@ -32,7 +32,6 @@ class EnglishDefaults(Language.Defaults):
|
|||
|
||||
class English(Language):
|
||||
lang = 'en'
|
||||
|
||||
Defaults = EnglishDefaults
|
||||
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@ class SpanishDefaults(Language.Defaults):
|
|||
|
||||
class Spanish(Language):
|
||||
lang = 'es'
|
||||
|
||||
Defaults = SpanishDefaults
|
||||
|
||||
|
||||
__all__ = ['Spanish']
|
||||
|
|
|
@ -10,15 +10,17 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class FinnishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fi'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Finnish(Language):
|
||||
lang = 'fi'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fi'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = FinnishDefaults
|
||||
|
||||
|
||||
__all__ = ['Finnish']
|
||||
|
|
|
@ -13,22 +13,24 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class FrenchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fr'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
token_match = TOKEN_MATCH
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class French(Language):
|
||||
lang = 'fr'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'fr'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
token_match = TOKEN_MATCH
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
Defaults = FrenchDefaults
|
||||
|
||||
|
||||
__all__ = ['French']
|
||||
|
|
|
@ -9,15 +9,17 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class HebrewDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'he'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Hebrew(Language):
|
||||
lang = 'he'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'he'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = HebrewDefaults
|
||||
|
||||
|
||||
__all__ = ['Hebrew']
|
||||
|
|
|
@ -13,23 +13,25 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class HungarianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'hu'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
token_match = TOKEN_MATCH
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class Hungarian(Language):
|
||||
lang = 'hu'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'hu'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
prefixes = tuple(TOKENIZER_PREFIXES)
|
||||
suffixes = tuple(TOKENIZER_SUFFIXES)
|
||||
infixes = tuple(TOKENIZER_INFIXES)
|
||||
token_match = TOKEN_MATCH
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
Defaults = HungarianDefaults
|
||||
|
||||
|
||||
__all__ = ['Hungarian']
|
||||
|
|
|
@ -11,19 +11,21 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class ItalianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'it'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class Italian(Language):
|
||||
lang = 'it'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'it'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
Defaults = ItalianDefaults
|
||||
|
||||
|
||||
__all__ = ['Italian']
|
||||
|
|
|
@ -11,15 +11,17 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class NorwegianDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nb'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Norwegian(Language):
|
||||
lang = 'nb'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nb'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = NorwegianDefaults
|
||||
|
||||
|
||||
__all__ = ['Norwegian']
|
||||
|
|
|
@ -9,16 +9,17 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class DutchDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nl'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Dutch(Language):
|
||||
lang = 'nl'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'nl'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = DutchDefaults
|
||||
|
||||
|
||||
__all__ = ['Dutch']
|
||||
|
|
|
@ -9,15 +9,17 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class PolishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pl'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
|
||||
class Polish(Language):
|
||||
lang = 'pl'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pl'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = PolishDefaults
|
||||
|
||||
|
||||
__all__ = ['Polish']
|
||||
|
|
|
@ -13,20 +13,22 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class PortugueseDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pt'
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class Portuguese(Language):
|
||||
lang = 'pt'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'pt'
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
Defaults = PortugueseDefaults
|
||||
|
||||
|
||||
__all__ = ['Portuguese']
|
||||
|
|
|
@ -13,19 +13,21 @@ from ...attrs import LANG
|
|||
from ...util import update_exc
|
||||
|
||||
|
||||
class SwedishDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'sv'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
|
||||
|
||||
class Swedish(Language):
|
||||
lang = 'sv'
|
||||
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'sv'
|
||||
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None):
|
||||
return Lemmatizer(LOOKUP)
|
||||
Defaults = SwedishDefaults
|
||||
|
||||
|
||||
__all__ = ['Swedish']
|
||||
|
|
|
@ -116,14 +116,30 @@ class BaseDefaults(object):
|
|||
|
||||
|
||||
class Language(object):
|
||||
"""
|
||||
A text-processing pipeline. Usually you'll load this once per process, and
|
||||
pass the instance around your program.
|
||||
"""A text-processing pipeline. Usually you'll load this once per process,
|
||||
and pass the instance around your application.
|
||||
|
||||
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
||||
object and processing pipeline.
|
||||
lang (unicode): Two-letter language ID, i.e. ISO code.
|
||||
"""
|
||||
Defaults = BaseDefaults
|
||||
lang = None
|
||||
|
||||
def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
|
||||
"""Initialise a Language object.
|
||||
|
||||
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
|
||||
`Language.Defaults.create_vocab`.
|
||||
make_doc (callable): A function that takes text and returns a `Doc`
|
||||
object. Usually a `Tokenizer`.
|
||||
pipeline (list): A list of annotation processes or IDs of annotation,
|
||||
processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked
|
||||
up in `Language.Defaults.factories`.
|
||||
meta (dict): Custom meta data for the Language class. Is written to by
|
||||
models to add model meta data.
|
||||
RETURNS (Language): The newly constructed object.
|
||||
"""
|
||||
self.meta = dict(meta)
|
||||
|
||||
if vocab is True:
|
||||
|
@ -147,22 +163,17 @@ class Language(object):
|
|||
self.pipeline = []
|
||||
|
||||
def __call__(self, text, **disabled):
|
||||
"""
|
||||
Apply the pipeline to some text. The text can span multiple sentences,
|
||||
and can contain arbtrary whitespace. Alignment into the original string
|
||||
"""'Apply the pipeline to some text. The text can span multiple sentences,
|
||||
and can contain arbtrary whitespace. Alignment into the original string
|
||||
is preserved.
|
||||
|
||||
Args:
|
||||
text (unicode): The text to be processed.
|
||||
text (unicode): The text to be processed.
|
||||
**disabled: Elements of the pipeline that should not be run.
|
||||
RETURNS (Doc): A container for accessing the annotations.
|
||||
|
||||
Returns:
|
||||
doc (Doc): A container for accessing the annotations.
|
||||
|
||||
Example:
|
||||
>>> from spacy.en import English
|
||||
>>> nlp = English()
|
||||
EXAMPLE:
|
||||
>>> tokens = nlp('An example sentence. Another example sentence.')
|
||||
>>> tokens[0].orth_, tokens[0].head.tag_
|
||||
>>> tokens[0].text, tokens[0].head.tag_
|
||||
('An', 'NN')
|
||||
"""
|
||||
doc = self.make_doc(text)
|
||||
|
@ -174,6 +185,21 @@ class Language(object):
|
|||
return doc
|
||||
|
||||
def update(self, docs, golds, drop=0., sgd=None):
|
||||
"""Update the models in the pipeline.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
golds (iterable): A batch of `GoldParse` objects.
|
||||
drop (float): The droput rate.
|
||||
sgd (callable): An optimizer.
|
||||
RETURNS (dict): Results from the update.
|
||||
|
||||
EXAMPLE:
|
||||
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||
>>> for epoch in trainer.epochs(gold):
|
||||
>>> for docs, golds in epoch:
|
||||
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
||||
"""
|
||||
|
||||
grads = {}
|
||||
def get_grads(W, dW, key=None):
|
||||
grads[key] = (W, dW)
|
||||
|
@ -204,7 +230,20 @@ class Language(object):
|
|||
for doc, gold in docs_golds:
|
||||
yield doc, gold
|
||||
|
||||
def begin_training(self, get_gold_tuples, **cfg):
|
||||
def begin_training(self, gold_tuples, **cfg):
|
||||
"""Allocate models, pre-process training data and acquire a trainer and
|
||||
optimizer. Used as a contextmanager.
|
||||
|
||||
gold_tuples (iterable): Gold-standard training data.
|
||||
**cfg: Config parameters.
|
||||
YIELDS (tuple): A trainer and an optimizer.
|
||||
|
||||
EXAMPLE:
|
||||
>>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||
>>> for epoch in trainer.epochs(gold):
|
||||
>>> for docs, golds in epoch:
|
||||
>>> state = nlp.update(docs, golds, sgd=optimizer)
|
||||
"""
|
||||
# Populate vocab
|
||||
for _, annots_brackets in get_gold_tuples():
|
||||
for annots, _ in annots_brackets:
|
||||
|
@ -233,6 +272,17 @@ class Language(object):
|
|||
|
||||
@contextmanager
|
||||
def use_params(self, params, **cfg):
|
||||
"""Replace weights of models in the pipeline with those provided in the
|
||||
params dictionary. Can be used as a contextmanager, in which case,
|
||||
models go back to their original weights after the block.
|
||||
|
||||
params (dict): A dictionary of parameters keyed by model ID.
|
||||
**cfg: Config parameters.
|
||||
|
||||
EXAMPLE:
|
||||
>>> with nlp.use_params(optimizer.averages):
|
||||
>>> nlp.to_disk('/tmp/checkpoint')
|
||||
"""
|
||||
contexts = [pipe.use_params(params) for pipe
|
||||
in self.pipeline if hasattr(pipe, 'use_params')]
|
||||
# TODO: Having trouble with contextlib
|
||||
|
@ -250,16 +300,20 @@ class Language(object):
|
|||
pass
|
||||
|
||||
def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
|
||||
"""
|
||||
Process texts as a stream, and yield Doc objects in order.
|
||||
"""Process texts as a stream, and yield `Doc` objects in order. Supports
|
||||
GIL-free multi-threading.
|
||||
|
||||
Supports GIL-free multi-threading.
|
||||
texts (iterator): A sequence of texts to process.
|
||||
n_threads (int): The number of worker threads to use. If -1, OpenMP will
|
||||
decide how many to use at run time. Default is 2.
|
||||
batch_size (int): The number of texts to buffer.
|
||||
**disabled: Pipeline components to exclude.
|
||||
YIELDS (Doc): Documents in the order of the original text.
|
||||
|
||||
Arguments:
|
||||
texts (iterator)
|
||||
tag (bool)
|
||||
parse (bool)
|
||||
entity (bool)
|
||||
EXAMPLE:
|
||||
>>> texts = [u'One document.', u'...', u'Lots of documents']
|
||||
>>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
|
||||
>>> assert doc.is_parsed
|
||||
"""
|
||||
#docs = (self.make_doc(text) for text in texts)
|
||||
docs = texts
|
||||
|
@ -267,7 +321,6 @@ class Language(object):
|
|||
name = getattr(proc, 'name', None)
|
||||
if name in disabled and not disabled[name]:
|
||||
continue
|
||||
|
||||
if hasattr(proc, 'pipe'):
|
||||
docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
|
||||
else:
|
||||
|
@ -278,11 +331,12 @@ class Language(object):
|
|||
def to_disk(self, path, **exclude):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
Args:
|
||||
path: A path to a directory, which will be created if it doesn't
|
||||
exist. Paths may be either strings or pathlib.Path-like
|
||||
objects.
|
||||
**exclude: Prevent named attributes from being saved.
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
**exclude: Named attributes to prevent from being saved.
|
||||
|
||||
EXAMPLE:
|
||||
>>> nlp.to_disk('/path/to/models')
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
if not path.exists():
|
||||
|
@ -301,12 +355,17 @@ class Language(object):
|
|||
dill.dump(props, file_)
|
||||
|
||||
def from_disk(self, path, **exclude):
|
||||
"""Load the current state from a directory.
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
Args:
|
||||
path: A path to a directory. Paths may be either strings or
|
||||
pathlib.Path-like objects.
|
||||
**exclude: Prevent named attributes from being saved.
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (Language): The modified `Language` object.
|
||||
|
||||
EXAMPLE:
|
||||
>>> from spacy.language import Language
|
||||
>>> nlp = Language().from_disk('/path/to/models')
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
for name in path.iterdir():
|
||||
|
@ -320,10 +379,8 @@ class Language(object):
|
|||
def to_bytes(self, **exclude):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
Args:
|
||||
path: A path to a directory. Paths may be either strings or
|
||||
pathlib.Path-like objects.
|
||||
**exclude: Prevent named attributes from being serialized.
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `Language` object.
|
||||
"""
|
||||
props = dict(self.__dict__)
|
||||
for key in exclude:
|
||||
|
@ -334,13 +391,12 @@ class Language(object):
|
|||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
||||
Args:
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Prevent named attributes from being loaded.
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (Language): The `Language` object.
|
||||
"""
|
||||
props = dill.loads(bytes_data)
|
||||
for key, value in props.items():
|
||||
if key not in exclude:
|
||||
setattr(self, key, value)
|
||||
return self
|
||||
|
||||
|
|
|
@ -30,19 +30,16 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
|||
|
||||
|
||||
cdef class Lexeme:
|
||||
"""
|
||||
An entry in the vocabulary. A Lexeme has no string context --- it's a
|
||||
"""An entry in the vocabulary. A `Lexeme` has no string context – it's a
|
||||
word-type, as opposed to a word token. It therefore has no part-of-speech
|
||||
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
|
||||
tag).
|
||||
"""
|
||||
def __init__(self, Vocab vocab, int orth):
|
||||
"""
|
||||
Create a Lexeme object.
|
||||
"""Create a Lexeme object.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab): The parent vocabulary
|
||||
orth (int): The orth id of the lexeme.
|
||||
vocab (Vocab): The parent vocabulary
|
||||
orth (int): The orth id of the lexeme.
|
||||
Returns (Lexeme): The newly constructd object.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
|
@ -82,35 +79,28 @@ cdef class Lexeme:
|
|||
return self.c.orth
|
||||
|
||||
def set_flag(self, attr_id_t flag_id, bint value):
|
||||
"""
|
||||
Change the value of a boolean flag.
|
||||
"""Change the value of a boolean flag.
|
||||
|
||||
Arguments:
|
||||
flag_id (int): The attribute ID of the flag to set.
|
||||
value (bool): The new value of the flag.
|
||||
flag_id (int): The attribute ID of the flag to set.
|
||||
value (bool): The new value of the flag.
|
||||
"""
|
||||
Lexeme.c_set_flag(self.c, flag_id, value)
|
||||
|
||||
def check_flag(self, attr_id_t flag_id):
|
||||
"""
|
||||
Check the value of a boolean flag.
|
||||
"""Check the value of a boolean flag.
|
||||
|
||||
Arguments:
|
||||
flag_id (int): The attribute ID of the flag to query.
|
||||
Returns (bool): The value of the flag.
|
||||
flag_id (int): The attribute ID of the flag to query.
|
||||
RETURNS (bool): The value of the flag.
|
||||
"""
|
||||
return True if Lexeme.c_check_flag(self.c, flag_id) else False
|
||||
|
||||
def similarity(self, other):
|
||||
"""
|
||||
Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
"""Compute a semantic similarity estimate. Defaults to cosine over
|
||||
vectors.
|
||||
|
||||
Arguments:
|
||||
other:
|
||||
The object to compare with. By default, accepts Doc, Span,
|
||||
Token and Lexeme objects.
|
||||
Returns:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
other (object): The object to compare with. By default, accepts `Doc`,
|
||||
`Span`, `Token` and `Lexeme` objects.
|
||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||
"""
|
||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||
return 0.0
|
||||
|
@ -140,6 +130,11 @@ cdef class Lexeme:
|
|||
self.orth = self.c.orth
|
||||
|
||||
property has_vector:
|
||||
"""A boolean value indicating whether a word vector is associated with
|
||||
the object.
|
||||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef int i
|
||||
for i in range(self.vocab.vectors_length):
|
||||
|
@ -149,6 +144,10 @@ cdef class Lexeme:
|
|||
return False
|
||||
|
||||
property vector_norm:
|
||||
"""The L2 norm of the lexeme's vector representation.
|
||||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.l2_norm
|
||||
|
||||
|
@ -156,6 +155,11 @@ cdef class Lexeme:
|
|||
self.c.l2_norm = value
|
||||
|
||||
property vector:
|
||||
"""A real-valued meaning representation.
|
||||
|
||||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the lexeme's semantics.
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef int length = self.vocab.vectors_length
|
||||
if length == 0:
|
||||
|
@ -196,6 +200,14 @@ cdef class Lexeme:
|
|||
def __get__(self):
|
||||
return self.vocab.strings[self.c.orth]
|
||||
|
||||
property text:
|
||||
"""A unicode representation of the token text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the token.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.orth_
|
||||
|
||||
property lower:
|
||||
def __get__(self): return self.c.lower
|
||||
def __set__(self, int x): self.c.lower = x
|
||||
|
|
|
@ -87,7 +87,7 @@ ctypedef TokenPatternC* TokenPatternC_ptr
|
|||
ctypedef pair[int, TokenPatternC_ptr] StateC
|
||||
|
||||
|
||||
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
|
||||
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
|
||||
object token_specs) except NULL:
|
||||
pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
|
||||
cdef int i
|
||||
|
@ -99,15 +99,21 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, attr_t label,
|
|||
pattern[i].attrs[j].attr = attr
|
||||
pattern[i].attrs[j].value = value
|
||||
i = len(token_specs)
|
||||
pattern[i].attrs = <AttrValueC*>mem.alloc(3, sizeof(AttrValueC))
|
||||
pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
|
||||
pattern[i].attrs[0].attr = ID
|
||||
pattern[i].attrs[0].value = entity_id
|
||||
pattern[i].attrs[1].attr = ENT_TYPE
|
||||
pattern[i].attrs[1].value = label
|
||||
pattern[i].nr_attr = 0
|
||||
return pattern
|
||||
|
||||
|
||||
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
|
||||
while pattern.nr_attr != 0:
|
||||
pattern += 1
|
||||
id_attr = pattern[0].attrs[0]
|
||||
assert id_attr.attr == ID
|
||||
return id_attr.value
|
||||
|
||||
|
||||
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
|
||||
for attr in pattern.attrs[:pattern.nr_attr]:
|
||||
if get_token_attr(token, attr.attr) != attr.value:
|
||||
|
@ -159,14 +165,14 @@ def _convert_strings(token_specs, string_store):
|
|||
|
||||
|
||||
def merge_phrase(matcher, doc, i, matches):
|
||||
'''Callback to merge a phrase on match'''
|
||||
"""Callback to merge a phrase on match."""
|
||||
ent_id, label, start, end = matches[i]
|
||||
span = doc[start : end]
|
||||
span.merge(ent_type=label, ent_id=ent_id)
|
||||
|
||||
|
||||
cdef class Matcher:
|
||||
'''Match sequences of tokens, based on pattern rules.'''
|
||||
"""Match sequences of tokens, based on pattern rules."""
|
||||
cdef Pool mem
|
||||
cdef vector[TokenPatternC*] patterns
|
||||
cdef readonly Vocab vocab
|
||||
|
@ -175,37 +181,12 @@ cdef class Matcher:
|
|||
cdef public object _callbacks
|
||||
cdef public object _acceptors
|
||||
|
||||
@classmethod
|
||||
def load(cls, path, vocab):
|
||||
"""
|
||||
Load the matcher and patterns from a file path.
|
||||
def __init__(self, vocab):
|
||||
"""Create the Matcher.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
Path to a JSON-formatted patterns file.
|
||||
vocab (Vocab):
|
||||
The vocabulary that the documents to match over will refer to.
|
||||
Returns:
|
||||
Matcher: The newly constructed object.
|
||||
"""
|
||||
if (path / 'gazetteer.json').exists():
|
||||
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
|
||||
patterns = ujson.load(file_)
|
||||
else:
|
||||
patterns = {}
|
||||
return cls(vocab, patterns)
|
||||
|
||||
def __init__(self, vocab, patterns={}):
|
||||
"""
|
||||
Create the Matcher.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
The vocabulary object, which must be shared with the documents
|
||||
the matcher will operate on.
|
||||
patterns (dict): Patterns to add to the matcher.
|
||||
Returns:
|
||||
The newly constructed object.
|
||||
vocab (Vocab): The vocabulary object, which must be shared with the
|
||||
documents the matcher will operate on.
|
||||
RETURNS (Matcher): The newly constructed object.
|
||||
"""
|
||||
self._patterns = {}
|
||||
self._entities = {}
|
||||
|
@ -213,144 +194,111 @@ cdef class Matcher:
|
|||
self._callbacks = {}
|
||||
self.vocab = vocab
|
||||
self.mem = Pool()
|
||||
for entity_key, (etype, attrs, specs) in sorted(patterns.items()):
|
||||
self.add_entity(entity_key, attrs)
|
||||
for spec in specs:
|
||||
self.add_pattern(entity_key, spec, label=etype)
|
||||
|
||||
def __reduce__(self):
|
||||
return (self.__class__, (self.vocab, self._patterns), None, None)
|
||||
|
||||
property n_patterns:
|
||||
def __get__(self): return self.patterns.size()
|
||||
def __len__(self):
|
||||
"""Get the number of rules added to the matcher. Note that this only
|
||||
returns the number of rules (identical with the number of IDs), not the
|
||||
number of individual patterns.
|
||||
|
||||
def add_entity(self, entity_key, attrs=None, if_exists='raise',
|
||||
acceptor=None, on_match=None):
|
||||
RETURNS (int): The number of rules.
|
||||
"""
|
||||
Add an entity to the matcher.
|
||||
return len(self._patterns)
|
||||
|
||||
Arguments:
|
||||
entity_key (unicode or int):
|
||||
An ID for the entity.
|
||||
attrs:
|
||||
Attributes to associate with the Matcher.
|
||||
if_exists ('raise', 'ignore' or 'update'):
|
||||
Controls what happens if the entity ID already exists. Defaults to 'raise'.
|
||||
acceptor:
|
||||
Callback function to filter matches of the entity.
|
||||
on_match:
|
||||
Callback function to act on matches of the entity.
|
||||
Returns:
|
||||
None
|
||||
def __contains__(self, key):
|
||||
"""Check whether the matcher contains rules for a match ID.
|
||||
|
||||
key (unicode): The match ID.
|
||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||
"""
|
||||
if if_exists not in ('raise', 'ignore', 'update'):
|
||||
raise ValueError(
|
||||
"Unexpected value for if_exists: %s.\n"
|
||||
"Expected one of: ['raise', 'ignore', 'update']" % if_exists)
|
||||
if attrs is None:
|
||||
attrs = {}
|
||||
entity_key = self.normalize_entity_key(entity_key)
|
||||
if self.has_entity(entity_key):
|
||||
if if_exists == 'raise':
|
||||
raise KeyError(
|
||||
"Tried to add entity %s. Entity exists, and if_exists='raise'.\n"
|
||||
"Set if_exists='ignore' or if_exists='update', or check with "
|
||||
"matcher.has_entity()")
|
||||
elif if_exists == 'ignore':
|
||||
return
|
||||
self._entities[entity_key] = dict(attrs)
|
||||
self._patterns.setdefault(entity_key, [])
|
||||
self._acceptors[entity_key] = acceptor
|
||||
self._callbacks[entity_key] = on_match
|
||||
return len(self._patterns)
|
||||
|
||||
def add_pattern(self, entity_key, token_specs, label=""):
|
||||
def add(self, key, on_match, *patterns):
|
||||
"""Add a match-rule to the matcher.
|
||||
A match-rule consists of: an ID key, an on_match callback, and one or
|
||||
more patterns. If the key exists, the patterns are appended to the
|
||||
previous ones, and the previous on_match callback is replaced. The
|
||||
`on_match` callback will receive the arguments `(matcher, doc, i,
|
||||
matches)`. You can also set `on_match` to `None` to not perform any
|
||||
actions. A pattern consists of one or more `token_specs`, where a
|
||||
`token_spec` is a dictionary mapping attribute IDs to values. Token
|
||||
descriptors can also include quantifiers. There are currently important
|
||||
known problems with the quantifiers – see the docs.
|
||||
"""
|
||||
Add a pattern to the matcher.
|
||||
for pattern in patterns:
|
||||
if len(pattern) == 0:
|
||||
msg = ("Cannot add pattern for zero tokens to matcher.\n"
|
||||
"key: {key}\n")
|
||||
raise ValueError(msg.format(key=key))
|
||||
key = self._normalize_key(key)
|
||||
self._patterns.setdefault(key, [])
|
||||
self._callbacks[key] = on_match
|
||||
|
||||
Arguments:
|
||||
entity_key (unicode or int):
|
||||
An ID for the entity.
|
||||
token_specs:
|
||||
Description of the pattern to be matched.
|
||||
label:
|
||||
Label to assign to the matched pattern. Defaults to "".
|
||||
Returns:
|
||||
None
|
||||
for pattern in patterns:
|
||||
specs = _convert_strings(pattern, self.vocab.strings)
|
||||
self.patterns.push_back(init_pattern(self.mem, key, specs))
|
||||
self._patterns[key].append(specs)
|
||||
|
||||
def remove(self, key):
|
||||
"""Remove a rule from the matcher. A KeyError is raised if the key does
|
||||
not exist.
|
||||
|
||||
key (unicode): The ID of the match rule.
|
||||
"""
|
||||
token_specs = list(token_specs)
|
||||
if len(token_specs) == 0:
|
||||
msg = ("Cannot add pattern for zero tokens to matcher.\n"
|
||||
"entity_key: {entity_key}\n"
|
||||
"label: {label}")
|
||||
raise ValueError(msg.format(entity_key=entity_key, label=label))
|
||||
entity_key = self.normalize_entity_key(entity_key)
|
||||
if not self.has_entity(entity_key):
|
||||
self.add_entity(entity_key)
|
||||
if isinstance(label, basestring):
|
||||
label = self.vocab.strings[label]
|
||||
elif label is None:
|
||||
label = 0
|
||||
spec = _convert_strings(token_specs, self.vocab.strings)
|
||||
key = self._normalize_key(key)
|
||||
self._patterns.pop(key)
|
||||
self._callbacks.pop(key)
|
||||
cdef int i = 0
|
||||
while i < self.patterns.size():
|
||||
pattern_key = get_pattern_key(self.patterns.at(i))
|
||||
if pattern_key == key:
|
||||
self.patterns.erase(self.patterns.begin()+i)
|
||||
else:
|
||||
i += 1
|
||||
|
||||
self.patterns.push_back(init_pattern(self.mem, entity_key, label, spec))
|
||||
self._patterns[entity_key].append((label, token_specs))
|
||||
def has_key(self, key):
|
||||
"""Check whether the matcher has a rule with a given key.
|
||||
|
||||
def add(self, entity_key, label, attrs, specs, acceptor=None, on_match=None):
|
||||
self.add_entity(entity_key, attrs=attrs, if_exists='update',
|
||||
acceptor=acceptor, on_match=on_match)
|
||||
for spec in specs:
|
||||
self.add_pattern(entity_key, spec, label=label)
|
||||
|
||||
def normalize_entity_key(self, entity_key):
|
||||
if isinstance(entity_key, basestring):
|
||||
return self.vocab.strings[entity_key]
|
||||
else:
|
||||
return entity_key
|
||||
|
||||
def has_entity(self, entity_key):
|
||||
key (string or int): The key to check.
|
||||
RETURNS (bool): Whether the matcher has the rule.
|
||||
"""
|
||||
Check whether the matcher has an entity.
|
||||
key = self._normalize_key(key)
|
||||
return key in self._patterns
|
||||
|
||||
Arguments:
|
||||
entity_key (string or int): The entity key to check.
|
||||
Returns:
|
||||
bool: Whether the matcher has the entity.
|
||||
"""
|
||||
entity_key = self.normalize_entity_key(entity_key)
|
||||
return entity_key in self._entities
|
||||
def get(self, key, default=None):
|
||||
"""Retrieve the pattern stored for a key.
|
||||
|
||||
def get_entity(self, entity_key):
|
||||
key (unicode or int): The key to retrieve.
|
||||
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
|
||||
"""
|
||||
Retrieve the attributes stored for an entity.
|
||||
key = self._normalize_key(key)
|
||||
if key not in self._patterns:
|
||||
return default
|
||||
return (self._callbacks[key], self._patterns[key])
|
||||
|
||||
Arguments:
|
||||
entity_key (unicode or int): The entity to retrieve.
|
||||
Returns:
|
||||
The entity attributes if present, otherwise None.
|
||||
"""
|
||||
entity_key = self.normalize_entity_key(entity_key)
|
||||
if entity_key in self._entities:
|
||||
return self._entities[entity_key]
|
||||
else:
|
||||
return None
|
||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||
"""Match a stream of documents, yielding them in turn.
|
||||
|
||||
def __call__(self, Doc doc, acceptor=None):
|
||||
docs (iterable): A stream of documents.
|
||||
batch_size (int): The number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel, if the `Matcher` implementation supports multi-threading.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
Find all token sequences matching the supplied patterns on the Doc.
|
||||
for doc in docs:
|
||||
self(doc)
|
||||
yield doc
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The document to match over.
|
||||
Returns:
|
||||
list
|
||||
A list of (entity_key, label_id, start, end) tuples,
|
||||
describing the matches. A match tuple describes a span doc[start:end].
|
||||
The label_id and entity_key are both integers.
|
||||
def __call__(self, Doc doc):
|
||||
"""Find all token sequences matching the supplied patterns on the `Doc`.
|
||||
|
||||
doc (Doc): The document to match over.
|
||||
RETURNS (list): A list of `(key, label_id, start, end)` tuples,
|
||||
describing the matches. A match tuple describes a span
|
||||
`doc[start:end]`. The `label_id` and `key` are both integers.
|
||||
"""
|
||||
if acceptor is not None:
|
||||
raise ValueError(
|
||||
"acceptor keyword argument to Matcher deprecated. Specify acceptor "
|
||||
"functions when you add patterns instead.")
|
||||
cdef vector[StateC] partials
|
||||
cdef int n_partials = 0
|
||||
cdef int q = 0
|
||||
|
@ -388,13 +336,7 @@ cdef class Matcher:
|
|||
end = token_i+1
|
||||
ent_id = state.second[1].attrs[0].value
|
||||
label = state.second[1].attrs[1].value
|
||||
acceptor = self._acceptors.get(ent_id)
|
||||
if acceptor is None:
|
||||
matches.append((ent_id, label, start, end))
|
||||
else:
|
||||
match = acceptor(doc, ent_id, label, start, end)
|
||||
if match:
|
||||
matches.append(match)
|
||||
matches.append((ent_id, start, end))
|
||||
partials.resize(q)
|
||||
# Check whether we open any new patterns on this token
|
||||
for pattern in self.patterns:
|
||||
|
@ -419,13 +361,7 @@ cdef class Matcher:
|
|||
end = token_i+1
|
||||
ent_id = pattern[1].attrs[0].value
|
||||
label = pattern[1].attrs[1].value
|
||||
acceptor = self._acceptors.get(ent_id)
|
||||
if acceptor is None:
|
||||
matches.append((ent_id, label, start, end))
|
||||
else:
|
||||
match = acceptor(doc, ent_id, label, start, end)
|
||||
if match:
|
||||
matches.append(match)
|
||||
matches.append((ent_id, start, end))
|
||||
# Look for open patterns that are actually satisfied
|
||||
for state in partials:
|
||||
while state.second.quantifier in (ZERO, ZERO_PLUS):
|
||||
|
@ -435,36 +371,19 @@ cdef class Matcher:
|
|||
end = len(doc)
|
||||
ent_id = state.second.attrs[0].value
|
||||
label = state.second.attrs[0].value
|
||||
acceptor = self._acceptors.get(ent_id)
|
||||
if acceptor is None:
|
||||
matches.append((ent_id, label, start, end))
|
||||
else:
|
||||
match = acceptor(doc, ent_id, label, start, end)
|
||||
if match:
|
||||
matches.append(match)
|
||||
matches.append((ent_id, start, end))
|
||||
for i, (ent_id, label, start, end) in enumerate(matches):
|
||||
on_match = self._callbacks.get(ent_id)
|
||||
if on_match is not None:
|
||||
on_match(self, doc, i, matches)
|
||||
# TODO: only return (match_id, start, end)
|
||||
return matches
|
||||
|
||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||
"""
|
||||
Match a stream of documents, yielding them in turn.
|
||||
|
||||
Arguments:
|
||||
docs: A stream of documents.
|
||||
batch_size (int):
|
||||
The number of documents to accumulate into a working set.
|
||||
n_threads (int):
|
||||
The number of threads with which to work on the buffer in parallel,
|
||||
if the Matcher implementation supports multi-threading.
|
||||
Yields:
|
||||
Doc Documents, in order.
|
||||
"""
|
||||
for doc in docs:
|
||||
self(doc)
|
||||
yield doc
|
||||
def _normalize_key(self, key):
|
||||
if isinstance(key, basestring):
|
||||
return self.vocab.strings[key]
|
||||
else:
|
||||
return key
|
||||
|
||||
|
||||
def get_bilou(length):
|
||||
|
|
|
@ -38,33 +38,71 @@ from .parts_of_speech import X
|
|||
|
||||
|
||||
class TokenVectorEncoder(object):
|
||||
'''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
|
||||
"""Assign position-sensitive vectors to tokens, using a CNN or RNN."""
|
||||
name = 'tok2vec'
|
||||
|
||||
@classmethod
|
||||
def Model(cls, width=128, embed_size=5000, **cfg):
|
||||
"""Create a new statistical model for the class.
|
||||
|
||||
width (int): Output size of the model.
|
||||
embed_size (int): Number of vectors in the embedding table.
|
||||
**cfg: Config parameters.
|
||||
RETURNS (Model): A `thinc.neural.Model` or similar instance.
|
||||
"""
|
||||
width = util.env_opt('token_vector_width', width)
|
||||
embed_size = util.env_opt('embed_size', embed_size)
|
||||
return Tok2Vec(width, embed_size, preprocess=None)
|
||||
|
||||
def __init__(self, vocab, model=True, **cfg):
|
||||
"""Construct a new statistical model. Weights are not allocated on
|
||||
initialisation.
|
||||
|
||||
vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
|
||||
instance with the `Doc` objects it will process.
|
||||
model (Model): A `Model` instance or `True` allocate one later.
|
||||
**cfg: Config parameters.
|
||||
|
||||
EXAMPLE:
|
||||
>>> from spacy.pipeline import TokenVectorEncoder
|
||||
>>> tok2vec = TokenVectorEncoder(nlp.vocab)
|
||||
>>> tok2vec.model = tok2vec.Model(128, 5000)
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.doc2feats = doc2feats()
|
||||
self.model = model
|
||||
|
||||
def __call__(self, docs):
|
||||
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
||||
model. Vectors are set to the `Doc.tensor` attribute.
|
||||
|
||||
docs (Doc or iterable): One or more documents to add vectors to.
|
||||
RETURNS (dict or None): Intermediate computations.
|
||||
"""
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
tokvecses = self.predict(docs)
|
||||
self.set_annotations(docs, tokvecses)
|
||||
|
||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||
"""Process `Doc` objects as a stream.
|
||||
|
||||
stream (iterator): A sequence of `Doc` objects to process.
|
||||
batch_size (int): Number of `Doc` objects to group.
|
||||
n_threads (int): Number of threads.
|
||||
YIELDS (iterator): A sequence of `Doc` objects, in order of input.
|
||||
"""
|
||||
for docs in cytoolz.partition_all(batch_size, stream):
|
||||
tokvecses = self.predict(docs)
|
||||
self.set_annotations(docs, tokvecses)
|
||||
yield from docs
|
||||
|
||||
def predict(self, docs):
|
||||
"""Return a single tensor for a batch of documents.
|
||||
|
||||
docs (iterable): A sequence of `Doc` objects.
|
||||
RETURNS (object): Vector representations for each token in the documents.
|
||||
"""
|
||||
feats = self.doc2feats(docs)
|
||||
tokvecs = self.model(feats)
|
||||
return tokvecs
|
||||
|
@ -73,7 +111,26 @@ class TokenVectorEncoder(object):
|
|||
for doc, tokvecs in zip(docs, tokvecses):
|
||||
doc.tensor = tokvecs
|
||||
|
||||
def begin_update(self, docs, drop=0.):
|
||||
def set_annotations(self, docs, tokvecs):
|
||||
"""Set the tensor attribute for a batch of documents.
|
||||
|
||||
docs (iterable): A sequence of `Doc` objects.
|
||||
tokvecs (object): Vector representation for each token in the documents.
|
||||
"""
|
||||
start = 0
|
||||
for doc in docs:
|
||||
doc.tensor = tokvecs[start : start + len(doc)]
|
||||
start += len(doc)
|
||||
|
||||
def update(self, docs, golds, state=None, drop=0., sgd=None):
|
||||
"""Update the model.
|
||||
|
||||
docs (iterable): A batch of `Doc` objects.
|
||||
golds (iterable): A batch of `GoldParse` objects.
|
||||
drop (float): The droput rate.
|
||||
sgd (callable): An optimizer.
|
||||
RETURNS (dict): Results from the update.
|
||||
"""
|
||||
if isinstance(docs, Doc):
|
||||
docs = [docs]
|
||||
feats = self.doc2feats(docs)
|
||||
|
@ -81,14 +138,26 @@ class TokenVectorEncoder(object):
|
|||
return tokvecs, bp_tokvecs
|
||||
|
||||
def get_loss(self, docs, golds, scores):
|
||||
# TODO: implement
|
||||
raise NotImplementedError
|
||||
|
||||
def begin_training(self, gold_tuples, pipeline=None):
|
||||
"""Allocate models, pre-process training data and acquire a trainer and
|
||||
optimizer.
|
||||
|
||||
gold_tuples (iterable): Gold-standard training data.
|
||||
pipeline (list): The pipeline the model is part of.
|
||||
"""
|
||||
self.doc2feats = doc2feats()
|
||||
if self.model is True:
|
||||
self.model = self.Model()
|
||||
|
||||
def use_params(self, params):
|
||||
"""Replace weights of models in the pipeline with those provided in the
|
||||
params dictionary.
|
||||
|
||||
params (dict): A dictionary of parameters keyed by model ID.
|
||||
"""
|
||||
with self.model.use_params(params):
|
||||
yield
|
||||
|
||||
|
@ -189,9 +258,7 @@ class NeuralTagger(object):
|
|||
|
||||
|
||||
cdef class EntityRecognizer(LinearParser):
|
||||
"""
|
||||
Annotate named entities on Doc objects.
|
||||
"""
|
||||
"""Annotate named entities on Doc objects."""
|
||||
TransitionSystem = BiluoPushDown
|
||||
|
||||
feature_templates = get_feature_templates('ner')
|
||||
|
@ -203,9 +270,7 @@ cdef class EntityRecognizer(LinearParser):
|
|||
|
||||
|
||||
cdef class BeamEntityRecognizer(BeamParser):
|
||||
"""
|
||||
Annotate named entities on Doc objects.
|
||||
"""
|
||||
"""Annotate named entities on Doc objects."""
|
||||
TransitionSystem = BiluoPushDown
|
||||
|
||||
feature_templates = get_feature_templates('ner')
|
||||
|
|
|
@ -11,8 +11,6 @@ from preshed.maps cimport map_iter, key_t
|
|||
from .typedefs cimport hash_t
|
||||
from libc.stdint cimport uint32_t
|
||||
|
||||
import ujson
|
||||
|
||||
|
||||
cpdef hash_t hash_string(unicode string) except 0:
|
||||
chars = string.encode('utf8')
|
||||
|
@ -72,15 +70,12 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
|
|||
|
||||
|
||||
cdef class StringStore:
|
||||
"""
|
||||
Map strings to and from integer IDs.
|
||||
"""
|
||||
"""Map strings to and from integer IDs."""
|
||||
def __init__(self, strings=None, freeze=False):
|
||||
"""
|
||||
Create the StringStore.
|
||||
"""Create the StringStore.
|
||||
|
||||
Arguments:
|
||||
strings: A sequence of unicode strings to add to the store.
|
||||
strings (iterable): A sequence of unicode strings to add to the store.
|
||||
RETURNS (StringStore): The newly constructed object.
|
||||
"""
|
||||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
|
@ -106,23 +101,17 @@ cdef class StringStore:
|
|||
return (StringStore, (list(self),))
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
The number of strings in the store.
|
||||
"""The number of strings in the store.
|
||||
|
||||
Returns:
|
||||
int The number of strings in the store.
|
||||
RETURNS (int): The number of strings in the store.
|
||||
"""
|
||||
return self.size-1
|
||||
|
||||
def __getitem__(self, object string_or_id):
|
||||
"""
|
||||
Retrieve a string from a given integer ID, or vice versa.
|
||||
"""Retrieve a string from a given integer ID, or vice versa.
|
||||
|
||||
Arguments:
|
||||
string_or_id (bytes or unicode or int):
|
||||
The value to encode.
|
||||
Returns:
|
||||
unicode or int: The value to retrieved.
|
||||
string_or_id (bytes or unicode or int): The value to encode.
|
||||
Returns (unicode or int): The value to be retrieved.
|
||||
"""
|
||||
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
||||
return 0
|
||||
|
@ -163,13 +152,10 @@ cdef class StringStore:
|
|||
return utf8str - self.c
|
||||
|
||||
def __contains__(self, unicode string not None):
|
||||
"""
|
||||
Check whether a string is in the store.
|
||||
"""Check whether a string is in the store.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to check.
|
||||
Returns bool:
|
||||
Whether the store contains the string.
|
||||
string (unicode): The string to check.
|
||||
RETURNS (bool): Whether the store contains the string.
|
||||
"""
|
||||
if len(string) == 0:
|
||||
return True
|
||||
|
@ -177,10 +163,9 @@ cdef class StringStore:
|
|||
return self._map.get(key) is not NULL
|
||||
|
||||
def __iter__(self):
|
||||
"""
|
||||
Iterate over the strings in the store, in order.
|
||||
"""Iterate over the strings in the store, in order.
|
||||
|
||||
Yields: unicode A string in the store.
|
||||
YIELDS (unicode): A string in the store.
|
||||
"""
|
||||
cdef int i
|
||||
for i in range(self.size):
|
||||
|
@ -195,6 +180,41 @@ cdef class StringStore:
|
|||
strings.append(py_string)
|
||||
return (StringStore, (strings,), None, None, None)
|
||||
|
||||
def to_disk(self, path):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def from_disk(self, path):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
RETURNS (StringStore): The modified `StringStore` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `StringStore` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (StringStore): The `StringStore` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def set_frozen(self, bint is_frozen):
|
||||
# TODO
|
||||
self.is_frozen = is_frozen
|
||||
|
@ -235,40 +255,6 @@ cdef class StringStore:
|
|||
self.size += 1
|
||||
return &self.c[self.size-1]
|
||||
|
||||
def dump(self, file_):
|
||||
"""
|
||||
Save the strings to a JSON file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file to save the strings.
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
string_data = ujson.dumps(list(self))
|
||||
if not isinstance(string_data, unicode):
|
||||
string_data = string_data.decode('utf8')
|
||||
# TODO: OOV?
|
||||
file_.write(string_data)
|
||||
|
||||
def load(self, file_):
|
||||
"""
|
||||
Load the strings from a JSON file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file from which to load the strings.
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
strings = ujson.load(file_)
|
||||
if strings == ['']:
|
||||
return None
|
||||
cdef unicode string
|
||||
for string in strings:
|
||||
# explicit None/len check instead of simple truth testing
|
||||
# (bug in Cython <= 0.23.4)
|
||||
if string is not None and len(string):
|
||||
self.intern_unicode(string)
|
||||
|
||||
def _realloc(self):
|
||||
# We want to map straight to pointers, but they'll be invalidated if
|
||||
# we resize our array. So, first we remap to indices, then we resize,
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import ujson
|
||||
from collections import defaultdict
|
||||
|
||||
from cymem.cymem cimport Pool
|
||||
|
@ -15,7 +14,6 @@ from .tokens.doc cimport Doc
|
|||
from .attrs cimport TAG
|
||||
from .gold cimport GoldParse
|
||||
from .attrs cimport *
|
||||
from . import util
|
||||
|
||||
|
||||
cpdef enum:
|
||||
|
@ -108,55 +106,15 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
|||
|
||||
|
||||
cdef class Tagger:
|
||||
"""
|
||||
Annotate part-of-speech tags on Doc objects.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, vocab, require=False):
|
||||
"""
|
||||
Load the statistical model from the supplied path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
The path to load from.
|
||||
vocab (Vocab):
|
||||
The vocabulary. Must be shared by the documents to be processed.
|
||||
require (bool):
|
||||
Whether to raise an error if the files are not found.
|
||||
Returns (Tagger):
|
||||
The newly created object.
|
||||
"""
|
||||
# TODO: Change this to expect config.json when we don't have to
|
||||
# support old data.
|
||||
path = util.ensure_path(path)
|
||||
if (path / 'templates.json').exists():
|
||||
with (path / 'templates.json').open('r', encoding='utf8') as file_:
|
||||
templates = ujson.load(file_)
|
||||
elif require:
|
||||
raise IOError(
|
||||
"Required file %s/templates.json not found when loading Tagger" % str(path))
|
||||
else:
|
||||
templates = cls.feature_templates
|
||||
self = cls(vocab, model=None, feature_templates=templates)
|
||||
|
||||
if (path / 'model').exists():
|
||||
self.model.load(str(path / 'model'))
|
||||
elif require:
|
||||
raise IOError(
|
||||
"Required file %s/model not found when loading Tagger" % str(path))
|
||||
return self
|
||||
"""Annotate part-of-speech tags on Doc objects."""
|
||||
|
||||
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
|
||||
"""
|
||||
Create a Tagger.
|
||||
"""Create a Tagger.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
The vocabulary object. Must be shared with documents to be processed.
|
||||
model (thinc.linear.AveragedPerceptron):
|
||||
The statistical model.
|
||||
Returns (Tagger):
|
||||
The newly constructed object.
|
||||
vocab (Vocab): The vocabulary object. Must be shared with documents to
|
||||
be processed.
|
||||
model (thinc.linear.AveragedPerceptron): The statistical model.
|
||||
RETURNS (Tagger): The newly constructed object.
|
||||
"""
|
||||
if model is None:
|
||||
model = TaggerModel(cfg.get('features', self.feature_templates),
|
||||
|
@ -186,13 +144,9 @@ cdef class Tagger:
|
|||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
"""
|
||||
Apply the tagger, setting the POS tags onto the Doc object.
|
||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||
|
||||
Arguments:
|
||||
doc (Doc): The tokens to be tagged.
|
||||
Returns:
|
||||
None
|
||||
doc (Doc): The tokens to be tagged.
|
||||
"""
|
||||
if tokens.length == 0:
|
||||
return 0
|
||||
|
@ -215,34 +169,25 @@ cdef class Tagger:
|
|||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def pipe(self, stream, batch_size=1000, n_threads=2):
|
||||
"""
|
||||
Tag a stream of documents.
|
||||
"""Tag a stream of documents.
|
||||
|
||||
Arguments:
|
||||
stream: The sequence of documents to tag.
|
||||
batch_size (int):
|
||||
The number of documents to accumulate into a working set.
|
||||
n_threads (int):
|
||||
The number of threads with which to work on the buffer in parallel,
|
||||
if the Matcher implementation supports multi-threading.
|
||||
Yields:
|
||||
Doc Documents, in order.
|
||||
stream: The sequence of documents to tag.
|
||||
batch_size (int): The number of documents to accumulate into a working set.
|
||||
n_threads (int): The number of threads with which to work on the buffer
|
||||
in parallel, if the Matcher implementation supports multi-threading.
|
||||
YIELDS (Doc): Documents, in order.
|
||||
"""
|
||||
for doc in stream:
|
||||
self(doc)
|
||||
yield doc
|
||||
|
||||
def update(self, Doc tokens, GoldParse gold, itn=0):
|
||||
"""
|
||||
Update the statistical model, with tags supplied for the given document.
|
||||
"""Update the statistical model, with tags supplied for the given document.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
The document to update on.
|
||||
gold (GoldParse):
|
||||
Manager for the gold-standard tags.
|
||||
Returns (int):
|
||||
Number of tags correct.
|
||||
doc (Doc): The document to update on.
|
||||
gold (GoldParse): Manager for the gold-standard tags.
|
||||
RETURNS (int): Number of tags predicted correctly.
|
||||
"""
|
||||
gold_tag_strs = gold.tags
|
||||
assert len(tokens) == len(gold_tag_strs)
|
||||
|
|
|
@ -99,8 +99,8 @@ def test_doc_token_api_ancestors(en_tokenizer):
|
|||
assert [t.text for t in doc[1].ancestors] == ["saw"]
|
||||
assert [t.text for t in doc[2].ancestors] == []
|
||||
|
||||
assert doc[2].is_ancestor_of(doc[7])
|
||||
assert not doc[6].is_ancestor_of(doc[2])
|
||||
assert doc[2].is_ancestor(doc[7])
|
||||
assert not doc[6].is_ancestor(doc[2])
|
||||
|
||||
|
||||
def test_doc_token_api_head_setter(en_tokenizer):
|
||||
|
|
|
@ -2,8 +2,6 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import ujson
|
||||
|
||||
from cython.operator cimport dereference as deref
|
||||
from cython.operator cimport preincrement as preinc
|
||||
from cymem.cymem cimport Pool
|
||||
|
@ -12,75 +10,31 @@ from preshed.maps cimport PreshMap
|
|||
from .strings cimport hash_string
|
||||
cimport cython
|
||||
|
||||
from . import util
|
||||
from .tokens.doc cimport Doc
|
||||
|
||||
|
||||
cdef class Tokenizer:
|
||||
"""Segment text, and create Doc objects with the discovered segment
|
||||
boundaries.
|
||||
"""
|
||||
Segment text, and create Doc objects with the discovered segment boundaries.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
|
||||
infix_finditer=None, token_match=None):
|
||||
"""
|
||||
Load a Tokenizer, reading unsupplied components from the path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
The path to load from.
|
||||
vocab (Vocab):
|
||||
A storage container for lexical types.
|
||||
rules (dict):
|
||||
Exceptions and special-cases for the tokenizer.
|
||||
token_match:
|
||||
A boolean function matching strings that becomes tokens.
|
||||
prefix_search:
|
||||
Signature of re.compile(string).search
|
||||
suffix_search:
|
||||
Signature of re.compile(string).search
|
||||
infix_finditer:
|
||||
Signature of re.compile(string).finditer
|
||||
Returns Tokenizer
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
if rules is None:
|
||||
with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
|
||||
rules = ujson.load(file_)
|
||||
if prefix_search in (None, True):
|
||||
with (path / 'tokenizer' / 'prefix.txt').open() as file_:
|
||||
entries = file_.read().split('\n')
|
||||
prefix_search = util.compile_prefix_regex(entries).search
|
||||
if suffix_search in (None, True):
|
||||
with (path / 'tokenizer' / 'suffix.txt').open() as file_:
|
||||
entries = file_.read().split('\n')
|
||||
suffix_search = util.compile_suffix_regex(entries).search
|
||||
if infix_finditer in (None, True):
|
||||
with (path / 'tokenizer' / 'infix.txt').open() as file_:
|
||||
entries = file_.read().split('\n')
|
||||
infix_finditer = util.compile_infix_regex(entries).finditer
|
||||
return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
|
||||
|
||||
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
|
||||
"""
|
||||
Create a Tokenizer, to create Doc objects given unicode text.
|
||||
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
A storage container for lexical types.
|
||||
rules (dict):
|
||||
Exceptions and special-cases for the tokenizer.
|
||||
prefix_search:
|
||||
A function matching the signature of re.compile(string).search
|
||||
to match prefixes.
|
||||
suffix_search:
|
||||
A function matching the signature of re.compile(string).search
|
||||
to match suffixes.
|
||||
infix_finditer:
|
||||
A function matching the signature of re.compile(string).finditer
|
||||
to find infixes.
|
||||
token_match:
|
||||
A boolean function matching strings that becomes tokens.
|
||||
vocab (Vocab): A storage container for lexical types.
|
||||
rules (dict): Exceptions and special-cases for the tokenizer.
|
||||
prefix_search (callable): A function matching the signature of
|
||||
`re.compile(string).search` to match prefixes.
|
||||
suffix_search (callable): A function matching the signature of
|
||||
`re.compile(string).search` to match suffixes.
|
||||
`infix_finditer` (callable): A function matching the signature of
|
||||
`re.compile(string).finditer` to find infixes.
|
||||
token_match (callable): A boolean function matching strings to be
|
||||
recognised as tokens.
|
||||
RETURNS (Tokenizer): The newly constructed object.
|
||||
|
||||
EXAMPLE:
|
||||
>>> tokenizer = Tokenizer(nlp.vocab)
|
||||
>>> tokenizer = English().Defaults.create_tokenizer(nlp)
|
||||
"""
|
||||
self.mem = Pool()
|
||||
self._cache = PreshMap()
|
||||
|
@ -112,13 +66,10 @@ cdef class Tokenizer:
|
|||
|
||||
@cython.boundscheck(False)
|
||||
def __call__(self, unicode string):
|
||||
"""
|
||||
Tokenize a string.
|
||||
"""Tokenize a string.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to tokenize.
|
||||
Returns:
|
||||
Doc A container for linguistic annotations.
|
||||
string (unicode): The string to tokenize.
|
||||
RETURNS (Doc): A container for linguistic annotations.
|
||||
"""
|
||||
if len(string) >= (2 ** 30):
|
||||
raise ValueError(
|
||||
|
@ -166,18 +117,13 @@ cdef class Tokenizer:
|
|||
return tokens
|
||||
|
||||
def pipe(self, texts, batch_size=1000, n_threads=2):
|
||||
"""
|
||||
Tokenize a stream of texts.
|
||||
"""Tokenize a stream of texts.
|
||||
|
||||
Arguments:
|
||||
texts: A sequence of unicode texts.
|
||||
batch_size (int):
|
||||
The number of texts to accumulate in an internal buffer.
|
||||
n_threads (int):
|
||||
The number of threads to use, if the implementation supports
|
||||
multi-threading. The default tokenizer is single-threaded.
|
||||
Yields:
|
||||
Doc A sequence of Doc objects, in order.
|
||||
texts: A sequence of unicode texts.
|
||||
batch_size (int): The number of texts to accumulate in an internal buffer.
|
||||
n_threads (int): The number of threads to use, if the implementation
|
||||
supports multi-threading. The default tokenizer is single-threaded.
|
||||
YIELDS (Doc): A sequence of Doc objects, in order.
|
||||
"""
|
||||
for text in texts:
|
||||
yield self(text)
|
||||
|
@ -321,27 +267,23 @@ cdef class Tokenizer:
|
|||
self._cache.set(key, cached)
|
||||
|
||||
def find_infix(self, unicode string):
|
||||
"""
|
||||
Find internal split points of the string, such as hyphens.
|
||||
"""Find internal split points of the string, such as hyphens.
|
||||
|
||||
string (unicode): The string to segment.
|
||||
|
||||
Returns List[re.MatchObject]
|
||||
A list of objects that have .start() and .end() methods, denoting the
|
||||
placement of internal segment separators, e.g. hyphens.
|
||||
RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
|
||||
and `.end()` methods, denoting the placement of internal segment
|
||||
separators, e.g. hyphens.
|
||||
"""
|
||||
if self.infix_finditer is None:
|
||||
return 0
|
||||
return list(self.infix_finditer(string))
|
||||
|
||||
def find_prefix(self, unicode string):
|
||||
"""
|
||||
Find the length of a prefix that should be segmented from the string,
|
||||
"""Find the length of a prefix that should be segmented from the string,
|
||||
or None if no prefix rules match.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to segment.
|
||||
Returns (int or None): The length of the prefix if present, otherwise None.
|
||||
string (unicode): The string to segment.
|
||||
RETURNS (int): The length of the prefix if present, otherwise `None`.
|
||||
"""
|
||||
if self.prefix_search is None:
|
||||
return 0
|
||||
|
@ -349,13 +291,11 @@ cdef class Tokenizer:
|
|||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def find_suffix(self, unicode string):
|
||||
"""
|
||||
Find the length of a suffix that should be segmented from the string,
|
||||
"""Find the length of a suffix that should be segmented from the string,
|
||||
or None if no suffix rules match.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to segment.
|
||||
Returns (int or None): The length of the suffix if present, otherwise None.
|
||||
string (unicode): The string to segment.
|
||||
Returns (int): The length of the suffix if present, otherwise `None`.
|
||||
"""
|
||||
if self.suffix_search is None:
|
||||
return 0
|
||||
|
@ -363,23 +303,17 @@ cdef class Tokenizer:
|
|||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def _load_special_tokenization(self, special_cases):
|
||||
"""
|
||||
Add special-case tokenization rules.
|
||||
"""
|
||||
"""Add special-case tokenization rules."""
|
||||
for chunk, substrings in sorted(special_cases.items()):
|
||||
self.add_special_case(chunk, substrings)
|
||||
|
||||
def add_special_case(self, unicode string, substrings):
|
||||
"""
|
||||
Add a special-case tokenization rule.
|
||||
"""Add a special-case tokenization rule.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to specially tokenize.
|
||||
token_attrs:
|
||||
A sequence of dicts, where each dict describes a token and its
|
||||
attributes. The ORTH fields of the attributes must exactly match
|
||||
the string when they are concatenated.
|
||||
Returns None
|
||||
string (unicode): The string to specially tokenize.
|
||||
token_attrs (iterable): A sequence of dicts, where each dict describes
|
||||
a token and its attributes. The `ORTH` fields of the attributes must
|
||||
exactly match the string when they are concatenated.
|
||||
"""
|
||||
substrings = list(substrings)
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
|
@ -390,3 +324,38 @@ cdef class Tokenizer:
|
|||
self._specials.set(key, cached)
|
||||
self._cache.set(key, cached)
|
||||
self._rules[string] = substrings
|
||||
|
||||
def to_disk(self, path):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def from_disk(self, path):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
RETURNS (Tokenizer): The modified `Tokenizer` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `Tokenizer` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (Tokenizer): The `Tokenizer` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
|
|
@ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
|||
|
||||
|
||||
cdef class Doc:
|
||||
"""
|
||||
A sequence of `Token` objects. Access sentences and named entities,
|
||||
export annotations to numpy arrays, losslessly serialize to compressed
|
||||
binary strings.
|
||||
"""A sequence of Token objects. Access sentences and named entities, export
|
||||
annotations to numpy arrays, losslessly serialize to compressed binary strings.
|
||||
The `Doc` object holds an array of `TokenC` structs. The Python-level
|
||||
`Token` and `Span` objects are views of this array, i.e. they don't own
|
||||
the data themselves.
|
||||
|
||||
Aside: Internals
|
||||
The `Doc` object holds an array of `TokenC` structs.
|
||||
The Python-level `Token` and `Span` objects are views of this
|
||||
array, i.e. they don't own the data themselves.
|
||||
|
||||
Code: Construction 1
|
||||
doc = nlp.tokenizer(u'Some text')
|
||||
|
||||
Code: Construction 2
|
||||
doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)])
|
||||
EXAMPLE: Construction 1
|
||||
>>> doc = nlp(u'Some text')
|
||||
|
||||
Construction 2
|
||||
>>> from spacy.tokens import Doc
|
||||
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False])
|
||||
"""
|
||||
def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
|
||||
"""
|
||||
Create a Doc object.
|
||||
"""Create a Doc object.
|
||||
|
||||
Arguments:
|
||||
vocab:
|
||||
A Vocabulary object, which must match any models you want to
|
||||
use (e.g. tokenizer, parser, entity recognizer).
|
||||
|
||||
words:
|
||||
A list of unicode strings to add to the document as words. If None,
|
||||
defaults to empty list.
|
||||
|
||||
spaces:
|
||||
A list of boolean values, of the same length as words. True
|
||||
means that the word is followed by a space, False means it is not.
|
||||
If None, defaults to [True]*len(words)
|
||||
vocab (Vocab): A vocabulary object, which must match any models you want
|
||||
to use (e.g. tokenizer, parser, entity recognizer).
|
||||
words (list or None): A list of unicode strings to add to the document
|
||||
as words. If `None`, defaults to empty list.
|
||||
spaces (list or None): A list of boolean values, of the same length as
|
||||
words. True means that the word is followed by a space, False means
|
||||
it is not. If `None`, defaults to `[True]*len(words)`
|
||||
RETURNS (Doc): The newly constructed object.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
size = 20
|
||||
|
@ -158,20 +148,26 @@ cdef class Doc:
|
|||
self.is_parsed = True
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""
|
||||
doc[i]
|
||||
Get the Token object at position i, where i is an integer.
|
||||
"""Get a `Token` or `Span` object.
|
||||
|
||||
i (int or tuple) The index of the token, or the slice of the document to get.
|
||||
RETURNS (Token or Span): The token at `doc[i]]`, or the span at
|
||||
`doc[start : end]`.
|
||||
|
||||
EXAMPLE:
|
||||
>>> doc[i]
|
||||
Get the `Token` object at position `i`, where `i` is an integer.
|
||||
Negative indexing is supported, and follows the usual Python
|
||||
semantics, i.e. doc[-2] is doc[len(doc) - 2].
|
||||
doc[start : end]]
|
||||
Get a `Span` object, starting at position `start`
|
||||
and ending at position `end`, where `start` and
|
||||
`end` are token indices. For instance,
|
||||
`doc[2:5]` produces a span consisting of
|
||||
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
|
||||
are not supported, as `Span` objects must be contiguous (cannot have gaps).
|
||||
You can use negative indices and open-ended ranges, which have their
|
||||
normal Python semantics.
|
||||
semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`.
|
||||
|
||||
>>> doc[start : end]]
|
||||
Get a `Span` object, starting at position `start` and ending at
|
||||
position `end`, where `start` and `end` are token indices. For
|
||||
instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4.
|
||||
Stepped slices (e.g. `doc[start : end : step]`) are not supported,
|
||||
as `Span` objects must be contiguous (cannot have gaps). You can use
|
||||
negative indices and open-ended ranges, which have their normal
|
||||
Python semantics.
|
||||
"""
|
||||
if isinstance(i, slice):
|
||||
start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
|
||||
|
@ -186,14 +182,14 @@ cdef class Doc:
|
|||
return Token.cinit(self.vocab, &self.c[i], i, self)
|
||||
|
||||
def __iter__(self):
|
||||
"""
|
||||
for token in doc
|
||||
Iterate over `Token` objects, from which the annotations can
|
||||
be easily accessed. This is the main way of accessing Token
|
||||
objects, which are the main way annotations are accessed from
|
||||
Python. If faster-than-Python speeds are required, you can
|
||||
instead access the annotations as a numpy array, or access the
|
||||
underlying C data directly from Cython.
|
||||
"""Iterate over `Token` objects, from which the annotations can be
|
||||
easily accessed. This is the main way of accessing `Token` objects,
|
||||
which are the main way annotations are accessed from Python. If faster-
|
||||
than-Python speeds are required, you can instead access the annotations
|
||||
as a numpy array, or access the underlying C data directly from Cython.
|
||||
|
||||
EXAMPLE:
|
||||
>>> for token in doc
|
||||
"""
|
||||
cdef int i
|
||||
for i in range(self.length):
|
||||
|
@ -203,9 +199,12 @@ cdef class Doc:
|
|||
yield Token.cinit(self.vocab, &self.c[i], i, self)
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
len(doc)
|
||||
The number of tokens in the document.
|
||||
"""The number of tokens in the document.
|
||||
|
||||
RETURNS (int): The number of tokens in the document.
|
||||
|
||||
EXAMPLE:
|
||||
>>> len(doc)
|
||||
"""
|
||||
return self.length
|
||||
|
||||
|
@ -228,16 +227,12 @@ cdef class Doc:
|
|||
return self
|
||||
|
||||
def similarity(self, other):
|
||||
"""
|
||||
Make a semantic similarity estimate. The default estimate is cosine
|
||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||
similarity using an average of word vectors.
|
||||
|
||||
Arguments:
|
||||
other (object): The object to compare with. By default, accepts Doc,
|
||||
Span, Token and Lexeme objects.
|
||||
|
||||
Return:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
other (object): The object to compare with. By default, accepts `Doc`,
|
||||
`Span`, `Token` and `Lexeme` objects.
|
||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||
"""
|
||||
if 'similarity' in self.user_hooks:
|
||||
return self.user_hooks['similarity'](self, other)
|
||||
|
@ -246,8 +241,10 @@ cdef class Doc:
|
|||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
|
||||
property has_vector:
|
||||
"""
|
||||
A boolean value indicating whether a word vector is associated with the object.
|
||||
"""A boolean value indicating whether a word vector is associated with
|
||||
the object.
|
||||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'has_vector' in self.user_hooks:
|
||||
|
@ -256,10 +253,11 @@ cdef class Doc:
|
|||
return any(token.has_vector for token in self)
|
||||
|
||||
property vector:
|
||||
"""
|
||||
A real-valued meaning representation. Defaults to an average of the token vectors.
|
||||
"""A real-valued meaning representation. Defaults to an average of the
|
||||
token vectors.
|
||||
|
||||
Type: numpy.ndarray[ndim=1, dtype='float32']
|
||||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the document's semantics.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector' in self.user_hooks:
|
||||
|
@ -275,6 +273,10 @@ cdef class Doc:
|
|||
self._vector = value
|
||||
|
||||
property vector_norm:
|
||||
"""The L2 norm of the document's vector representation.
|
||||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector_norm' in self.user_hooks:
|
||||
return self.user_hooks['vector_norm'](self)
|
||||
|
@ -295,34 +297,37 @@ cdef class Doc:
|
|||
return self.text
|
||||
|
||||
property text:
|
||||
"""
|
||||
A unicode representation of the document text.
|
||||
"""A unicode representation of the document text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the document.
|
||||
"""
|
||||
def __get__(self):
|
||||
return u''.join(t.text_with_ws for t in self)
|
||||
|
||||
property text_with_ws:
|
||||
"""
|
||||
An alias of Doc.text, provided for duck-type compatibility with Span and Token.
|
||||
"""An alias of `Doc.text`, provided for duck-type compatibility with
|
||||
`Span` and `Token`.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the document.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.text
|
||||
|
||||
property ents:
|
||||
"""
|
||||
Yields named-entity `Span` objects, if the entity recognizer
|
||||
has been applied to the document. Iterate over the span to get
|
||||
individual Token objects, or access the label:
|
||||
"""Iterate over the entities in the document. Yields named-entity `Span`
|
||||
objects, if the entity recognizer has been applied to the document.
|
||||
|
||||
Example:
|
||||
from spacy.en import English
|
||||
nlp = English()
|
||||
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||
ents = list(tokens.ents)
|
||||
assert ents[0].label == 346
|
||||
assert ents[0].label_ == 'PERSON'
|
||||
assert ents[0].orth_ == 'Best'
|
||||
assert ents[0].text == 'Mr. Best'
|
||||
YIELDS (Span): Entities in the document.
|
||||
|
||||
EXAMPLE: Iterate over the span to get individual Token objects, or access
|
||||
the label:
|
||||
|
||||
>>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||
>>> ents = list(tokens.ents)
|
||||
>>> assert ents[0].label == 346
|
||||
>>> assert ents[0].label_ == 'PERSON'
|
||||
>>> assert ents[0].orth_ == 'Best'
|
||||
>>> assert ents[0].text == 'Mr. Best'
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef int i
|
||||
|
@ -387,12 +392,13 @@ cdef class Doc:
|
|||
self.c[start].ent_iob = 3
|
||||
|
||||
property noun_chunks:
|
||||
"""
|
||||
Yields base noun-phrase #[code Span] objects, if the document
|
||||
has been syntactically parsed. A base noun phrase, or
|
||||
'NP chunk', is a noun phrase that does not permit other NPs to
|
||||
be nested within it – so no NP-level coordination, no prepositional
|
||||
phrases, and no relative clauses.
|
||||
"""Iterate over the base noun phrases in the document. Yields base
|
||||
noun-phrase #[code Span] objects, if the document has been syntactically
|
||||
parsed. A base noun phrase, or "NP chunk", is a noun phrase that does
|
||||
not permit other NPs to be nested within it – so no NP-level
|
||||
coordination, no prepositional phrases, and no relative clauses.
|
||||
|
||||
YIELDS (Span): Noun chunks in the document.
|
||||
"""
|
||||
def __get__(self):
|
||||
if not self.is_parsed:
|
||||
|
@ -411,17 +417,15 @@ cdef class Doc:
|
|||
yield span
|
||||
|
||||
property sents:
|
||||
"""
|
||||
Yields sentence `Span` objects. Sentence spans have no label.
|
||||
To improve accuracy on informal texts, spaCy calculates sentence
|
||||
boundaries from the syntactic dependency parse. If the parser is disabled,
|
||||
`sents` iterator will be unavailable.
|
||||
"""Iterate over the sentences in the document. Yields sentence `Span`
|
||||
objects. Sentence spans have no label. To improve accuracy on informal
|
||||
texts, spaCy calculates sentence boundaries from the syntactic
|
||||
dependency parse. If the parser is disabled, the `sents` iterator will
|
||||
be unavailable.
|
||||
|
||||
Example:
|
||||
from spacy.en import English
|
||||
nlp = English()
|
||||
doc = nlp("This is a sentence. Here's another...")
|
||||
assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
|
||||
EXAMPLE:
|
||||
>>> doc = nlp("This is a sentence. Here's another...")
|
||||
>>> assert [s.root.text for s in doc.sents] == ["is", "'s"]
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'sents' in self.user_hooks:
|
||||
|
@ -467,24 +471,20 @@ cdef class Doc:
|
|||
|
||||
@cython.boundscheck(False)
|
||||
cpdef np.ndarray to_array(self, object py_attr_ids):
|
||||
"""
|
||||
Given a list of M attribute IDs, export the tokens to a numpy
|
||||
`ndarray` of shape (N, M), where `N` is the length
|
||||
of the document. The values will be 32-bit integers.
|
||||
"""Given a list of M attribute IDs, export the tokens to a numpy
|
||||
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
|
||||
The values will be 32-bit integers.
|
||||
|
||||
Example:
|
||||
from spacy import attrs
|
||||
doc = nlp(text)
|
||||
# All strings mapped to integers, for easy export to numpy
|
||||
np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])
|
||||
attr_ids (list[int]): A list of attribute ID ints.
|
||||
RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
|
||||
per word, and one column per attribute indicated in the input
|
||||
`attr_ids`.
|
||||
|
||||
Arguments:
|
||||
attr_ids (list[int]): A list of attribute ID ints.
|
||||
|
||||
Returns:
|
||||
feat_array (numpy.ndarray[long, ndim=2]):
|
||||
A feature matrix, with one row per word, and one column per attribute
|
||||
indicated in the input attr_ids.
|
||||
EXAMPLE:
|
||||
>>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
|
||||
>>> doc = nlp(text)
|
||||
>>> # All strings mapped to integers, for easy export to numpy
|
||||
>>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||
"""
|
||||
cdef int i, j
|
||||
cdef attr_id_t feature
|
||||
|
@ -499,27 +499,20 @@ cdef class Doc:
|
|||
return output
|
||||
|
||||
def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
|
||||
"""
|
||||
Produce a dict of {attribute (int): count (ints)} frequencies, keyed
|
||||
by the values of the given attribute ID.
|
||||
"""Count the frequencies of a given attribute. Produces a dict of
|
||||
`{attribute (int): count (ints)}` frequencies, keyed by the values of
|
||||
the given attribute ID.
|
||||
|
||||
Example:
|
||||
from spacy.en import English
|
||||
from spacy import attrs
|
||||
nlp = English()
|
||||
tokens = nlp(u'apple apple orange banana')
|
||||
tokens.count_by(attrs.ORTH)
|
||||
# {12800L: 1, 11880L: 2, 7561L: 1}
|
||||
tokens.to_array([attrs.ORTH])
|
||||
# array([[11880],
|
||||
# [11880],
|
||||
# [ 7561],
|
||||
# [12800]])
|
||||
attr_id (int): The attribute ID to key the counts.
|
||||
RETURNS (dict): A dictionary mapping attributes to integer counts.
|
||||
|
||||
Arguments:
|
||||
attr_id
|
||||
int
|
||||
The attribute ID to key the counts.
|
||||
EXAMPLE:
|
||||
>>> from spacy import attrs
|
||||
>>> doc = nlp(u'apple apple orange banana')
|
||||
>>> tokens.count_by(attrs.ORTH)
|
||||
{12800L: 1, 11880L: 2, 7561L: 1}
|
||||
>>> tokens.to_array([attrs.ORTH])
|
||||
array([[11880], [11880], [7561], [12800]])
|
||||
"""
|
||||
cdef int i
|
||||
cdef attr_t attr
|
||||
|
@ -567,8 +560,12 @@ cdef class Doc:
|
|||
self.c[i] = parsed[i]
|
||||
|
||||
def from_array(self, attrs, int[:, :] array):
|
||||
"""
|
||||
Write to a `Doc` object, from an `(M, N)` array of attributes.
|
||||
"""Load attributes from a numpy array. Write to a `Doc` object, from an
|
||||
`(M, N)` array of attributes.
|
||||
|
||||
attrs (ints): A list of attribute ID ints.
|
||||
array (numpy.ndarray[ndim=2, dtype='int32']) The attribute values to load.
|
||||
RETURNS (Doc): Itself.
|
||||
"""
|
||||
cdef int i, col
|
||||
cdef attr_id_t attr_id
|
||||
|
@ -597,8 +594,10 @@ cdef class Doc:
|
|||
return self
|
||||
|
||||
def to_bytes(self):
|
||||
"""
|
||||
Serialize, producing a byte string.
|
||||
"""Serialize, i.e. export the document contents to a binary string.
|
||||
|
||||
RETURNS (bytes): A losslessly serialized copy of the `Doc`, including
|
||||
all annotations.
|
||||
"""
|
||||
return dill.dumps(
|
||||
(self.text,
|
||||
|
@ -611,8 +610,10 @@ cdef class Doc:
|
|||
protocol=-1)
|
||||
|
||||
def from_bytes(self, data):
|
||||
"""
|
||||
Deserialize, loading from bytes.
|
||||
"""Deserialize, i.e. import the document contents from a binary string.
|
||||
|
||||
data (bytes): The string to load from.
|
||||
RETURNS (Doc): Itself.
|
||||
"""
|
||||
if self.length != 0:
|
||||
raise ValueError("Cannot load into non-empty Doc")
|
||||
|
@ -640,21 +641,16 @@ cdef class Doc:
|
|||
return self
|
||||
|
||||
def merge(self, int start_idx, int end_idx, *args, **attributes):
|
||||
"""
|
||||
Retokenize the document, such that the span at doc.text[start_idx : end_idx]
|
||||
is merged into a single token. If start_idx and end_idx do not mark start
|
||||
and end token boundaries, the document remains unchanged.
|
||||
"""Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
|
||||
is merged into a single token. If `start_idx` and `end_idx `do not mark
|
||||
start and end token boundaries, the document remains unchanged.
|
||||
|
||||
Arguments:
|
||||
start_idx (int): The character index of the start of the slice to merge.
|
||||
end_idx (int): The character index after the end of the slice to merge.
|
||||
**attributes:
|
||||
Attributes to assign to the merged token. By default, attributes
|
||||
are inherited from the syntactic root token of the span.
|
||||
Returns:
|
||||
token (Token):
|
||||
The newly merged token, or None if the start and end indices did
|
||||
not fall at token boundaries.
|
||||
start_idx (int): The character index of the start of the slice to merge.
|
||||
end_idx (int): The character index after the end of the slice to merge.
|
||||
**attributes: Attributes to assign to the merged token. By default,
|
||||
attributes are inherited from the syntactic root token of the span.
|
||||
RETURNS (Token): The newly merged token, or `None` if the start and end
|
||||
indices did not fall at token boundaries.
|
||||
"""
|
||||
cdef unicode tag, lemma, ent_type
|
||||
if len(args) == 3:
|
||||
|
@ -758,7 +754,29 @@ cdef class Doc:
|
|||
return self[start]
|
||||
|
||||
def print_tree(self, light=False, flat=False):
|
||||
"""Returns the parse trees in the JSON (Dict) format."""
|
||||
"""Returns the parse trees in JSON (dict) format.
|
||||
|
||||
light (bool): Don't include lemmas or entities.
|
||||
flat (bool): Don't include arcs or modifiers.
|
||||
RETURNS (dict): Parse tree as dict.
|
||||
|
||||
EXAMPLE:
|
||||
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
|
||||
>>> trees = doc.print_tree()
|
||||
>>> trees[1]
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
|
||||
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
|
||||
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
|
||||
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
|
||||
'POS_fine': 'NN', 'lemma': 'pizza'},
|
||||
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
|
||||
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
|
||||
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
|
||||
'POS_fine': 'VBD', 'lemma': 'eat'}
|
||||
"""
|
||||
return parse_tree(self, light=light, flat=flat)
|
||||
|
||||
|
||||
|
|
|
@ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
|
|||
|
||||
|
||||
def merge_ents(doc):
|
||||
"""
|
||||
Helper: merge adjacent entities into single tokens; modifies the doc.
|
||||
"""
|
||||
"""Helper: merge adjacent entities into single tokens; modifies the doc."""
|
||||
for ent in doc.ents:
|
||||
ent.merge(ent.root.tag_, ent.text, ent.label_)
|
||||
return doc
|
||||
|
||||
|
||||
def format_POS(token, light, flat):
|
||||
"""
|
||||
Helper: form the POS output for a token.
|
||||
"""
|
||||
"""Helper: form the POS output for a token."""
|
||||
subtree = dict([
|
||||
("word", token.text),
|
||||
("lemma", token.lemma_), # trigger
|
||||
|
@ -37,9 +33,8 @@ def format_POS(token, light, flat):
|
|||
|
||||
|
||||
def POS_tree(root, light=False, flat=False):
|
||||
"""
|
||||
Helper: generate a POS tree for a root token. The doc must have
|
||||
merge_ents(doc) ran on it.
|
||||
"""Helper: generate a POS tree for a root token. The doc must have
|
||||
`merge_ents(doc)` ran on it.
|
||||
"""
|
||||
subtree = format_POS(root, light=light, flat=flat)
|
||||
for c in root.children:
|
||||
|
@ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False):
|
|||
|
||||
|
||||
def parse_tree(doc, light=False, flat=False):
|
||||
"""
|
||||
Makes a copy of the doc, then construct a syntactic parse tree, similar to
|
||||
"""Makes a copy of the doc, then construct a syntactic parse tree, similar to
|
||||
the one used in displaCy. Generates the POS tree for all sentences in a doc.
|
||||
|
||||
Args:
|
||||
doc: The doc for parsing.
|
||||
doc (Doc): The doc for parsing.
|
||||
RETURNS (dict): The parse tree.
|
||||
|
||||
Returns:
|
||||
[parse_trees (Dict)]:
|
||||
|
||||
>>> from spacy.en import English
|
||||
>>> nlp = English()
|
||||
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
|
||||
>>> trees = doc.print_tree()
|
||||
[{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}]
|
||||
EXAMPLE:
|
||||
>>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
|
||||
>>> trees = doc.print_tree()
|
||||
>>> trees[1]
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
|
||||
'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
|
||||
{'modifiers': [
|
||||
{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
|
||||
'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
|
||||
'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
|
||||
'POS_fine': 'NN', 'lemma': 'pizza'},
|
||||
{'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
|
||||
'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
|
||||
'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
|
||||
'POS_fine': 'VBD', 'lemma': 'eat'}
|
||||
"""
|
||||
doc_clone = Doc(doc.vocab, words=[w.text for w in doc])
|
||||
doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE],
|
||||
|
|
|
@ -20,22 +20,17 @@ from .. import about
|
|||
|
||||
|
||||
cdef class Span:
|
||||
"""
|
||||
A slice from a Doc object.
|
||||
"""
|
||||
"""A slice from a Doc object."""
|
||||
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
|
||||
vector_norm=None):
|
||||
"""
|
||||
Create a Span object from the slice doc[start : end]
|
||||
"""Create a `Span` object from the slice `doc[start : end]`.
|
||||
|
||||
Arguments:
|
||||
doc (Doc): The parent document.
|
||||
start (int): The index of the first token of the span.
|
||||
end (int): The index of the first token after the span.
|
||||
label (int): A label to attach to the Span, e.g. for named entities.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||
Returns:
|
||||
Span The newly constructed object.
|
||||
doc (Doc): The parent document.
|
||||
start (int): The index of the first token of the span.
|
||||
end (int): The index of the first token after the span.
|
||||
label (int): A label to attach to the Span, e.g. for named entities.
|
||||
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
|
||||
RETURNS (Span): The newly constructed object.
|
||||
"""
|
||||
if not (0 <= start <= end <= len(doc)):
|
||||
raise IndexError
|
||||
|
@ -70,8 +65,11 @@ cdef class Span:
|
|||
def __hash__(self):
|
||||
return hash((self.doc, self.label, self.start_char, self.end_char))
|
||||
|
||||
|
||||
def __len__(self):
|
||||
"""Get the number of tokens in the span.
|
||||
|
||||
RETURNS (int): The number of tokens in the span.
|
||||
"""
|
||||
self._recalculate_indices()
|
||||
if self.end < self.start:
|
||||
return 0
|
||||
|
@ -83,6 +81,16 @@ cdef class Span:
|
|||
return self.text.encode('utf-8')
|
||||
|
||||
def __getitem__(self, object i):
|
||||
"""Get a `Token` or a `Span` object
|
||||
|
||||
i (int or tuple): The index of the token within the span, or slice of
|
||||
the span to get.
|
||||
RETURNS (Token or Span): The token at `span[i]`.
|
||||
|
||||
EXAMPLE:
|
||||
>>> span[0]
|
||||
>>> span[1:3]
|
||||
"""
|
||||
self._recalculate_indices()
|
||||
if isinstance(i, slice):
|
||||
start, end = normalize_slice(len(self), i.start, i.stop, i.step)
|
||||
|
@ -94,35 +102,31 @@ cdef class Span:
|
|||
return self.doc[self.start + i]
|
||||
|
||||
def __iter__(self):
|
||||
"""Iterate over `Token` objects.
|
||||
|
||||
YIELDS (Token): A `Token` object.
|
||||
"""
|
||||
self._recalculate_indices()
|
||||
for i in range(self.start, self.end):
|
||||
yield self.doc[i]
|
||||
|
||||
def merge(self, *args, **attributes):
|
||||
"""
|
||||
Retokenize the document, such that the span is merged into a single token.
|
||||
"""Retokenize the document, such that the span is merged into a single
|
||||
token.
|
||||
|
||||
Arguments:
|
||||
**attributes:
|
||||
Attributes to assign to the merged token. By default, attributes
|
||||
are inherited from the syntactic root token of the span.
|
||||
Returns:
|
||||
token (Token):
|
||||
The newly merged token.
|
||||
**attributes: Attributes to assign to the merged token. By default,
|
||||
attributes are inherited from the syntactic root token of the span.
|
||||
RETURNS (Token): The newly merged token.
|
||||
"""
|
||||
return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
|
||||
|
||||
def similarity(self, other):
|
||||
"""
|
||||
Make a semantic similarity estimate. The default estimate is cosine
|
||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||
similarity using an average of word vectors.
|
||||
|
||||
Arguments:
|
||||
other (object): The object to compare with. By default, accepts Doc,
|
||||
Span, Token and Lexeme objects.
|
||||
|
||||
Return:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
other (object): The object to compare with. By default, accepts `Doc`,
|
||||
`Span`, `Token` and `Lexeme` objects.
|
||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||
"""
|
||||
if 'similarity' in self.doc.user_span_hooks:
|
||||
self.doc.user_span_hooks['similarity'](self, other)
|
||||
|
@ -145,11 +149,9 @@ cdef class Span:
|
|||
self.end = end + 1
|
||||
|
||||
property sent:
|
||||
"""
|
||||
The sentence span that this span is a part of.
|
||||
"""The sentence span that this span is a part of.
|
||||
|
||||
Returns:
|
||||
Span The sentence this is part of.
|
||||
RETURNS (Span): The sentence span that the span is a part of.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'sent' in self.doc.user_span_hooks:
|
||||
|
@ -166,12 +168,23 @@ cdef class Span:
|
|||
return self.doc[root.l_edge : root.r_edge + 1]
|
||||
|
||||
property has_vector:
|
||||
"""A boolean value indicating whether a word vector is associated with
|
||||
the object.
|
||||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'has_vector' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['has_vector'](self)
|
||||
return any(token.has_vector for token in self)
|
||||
|
||||
property vector:
|
||||
"""A real-valued meaning representation. Defaults to an average of the
|
||||
token vectors.
|
||||
|
||||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the span's semantics.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['vector'](self)
|
||||
|
@ -180,6 +193,10 @@ cdef class Span:
|
|||
return self._vector
|
||||
|
||||
property vector_norm:
|
||||
"""The L2 norm of the document's vector representation.
|
||||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector_norm' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['vector'](self)
|
||||
|
@ -193,6 +210,7 @@ cdef class Span:
|
|||
return self._vector_norm
|
||||
|
||||
property sentiment:
|
||||
# TODO: docstring
|
||||
def __get__(self):
|
||||
if 'sentiment' in self.doc.user_span_hooks:
|
||||
return self.doc.user_span_hooks['sentiment'](self)
|
||||
|
@ -200,6 +218,10 @@ cdef class Span:
|
|||
return sum([token.sentiment for token in self]) / len(self)
|
||||
|
||||
property text:
|
||||
"""A unicode representation of the span text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the span.
|
||||
"""
|
||||
def __get__(self):
|
||||
text = self.text_with_ws
|
||||
if self[-1].whitespace_:
|
||||
|
@ -207,16 +229,21 @@ cdef class Span:
|
|||
return text
|
||||
|
||||
property text_with_ws:
|
||||
"""The text content of the span with a trailing whitespace character if
|
||||
the last token has one.
|
||||
|
||||
RETURNS (unicode): The text content of the span (with trailing whitespace).
|
||||
"""
|
||||
def __get__(self):
|
||||
return u''.join([t.text_with_ws for t in self])
|
||||
|
||||
property noun_chunks:
|
||||
"""
|
||||
Yields base noun-phrase #[code Span] objects, if the document
|
||||
has been syntactically parsed. A base noun phrase, or
|
||||
'NP chunk', is a noun phrase that does not permit other NPs to
|
||||
be nested within it – so no NP-level coordination, no prepositional
|
||||
phrases, and no relative clauses. For example:
|
||||
"""Yields base noun-phrase `Span` objects, if the document has been
|
||||
syntactically parsed. A base noun phrase, or "NP chunk", is a noun
|
||||
phrase that does not permit other NPs to be nested within it – so no
|
||||
NP-level coordination, no prepositional phrases, and no relative clauses.
|
||||
|
||||
YIELDS (Span): Base noun-phrase `Span` objects
|
||||
"""
|
||||
def __get__(self):
|
||||
if not self.doc.is_parsed:
|
||||
|
@ -235,49 +262,47 @@ cdef class Span:
|
|||
yield span
|
||||
|
||||
property root:
|
||||
"""
|
||||
The token within the span that's highest in the parse tree. If there's a
|
||||
tie, the earlist is prefered.
|
||||
"""The token within the span that's highest in the parse tree.
|
||||
If there's a tie, the earliest is prefered.
|
||||
|
||||
Returns:
|
||||
Token: The root token.
|
||||
RETURNS (Token): The root token.
|
||||
|
||||
i.e. has the shortest path to the root of the sentence (or is the root
|
||||
itself). If multiple words are equally high in the tree, the first word
|
||||
is taken. For example:
|
||||
EXAMPLE: The root token has the shortest path to the root of the sentence
|
||||
(or is the root itself). If multiple words are equally high in the
|
||||
tree, the first word is taken. For example:
|
||||
|
||||
>>> toks = nlp(u'I like New York in Autumn.')
|
||||
>>> toks = nlp(u'I like New York in Autumn.')
|
||||
|
||||
Let's name the indices --- easier than writing "toks[4]" etc.
|
||||
Let's name the indices – easier than writing `toks[4]` etc.
|
||||
|
||||
>>> i, like, new, york, in_, autumn, dot = range(len(toks))
|
||||
>>> i, like, new, york, in_, autumn, dot = range(len(toks))
|
||||
|
||||
The head of 'new' is 'York', and the head of 'York' is 'like'
|
||||
The head of 'new' is 'York', and the head of "York" is "like"
|
||||
|
||||
>>> toks[new].head.orth_
|
||||
'York'
|
||||
>>> toks[york].head.orth_
|
||||
'like'
|
||||
>>> toks[new].head.text
|
||||
'York'
|
||||
>>> toks[york].head.text
|
||||
'like'
|
||||
|
||||
Create a span for "New York". Its root is "York".
|
||||
Create a span for "New York". Its root is "York".
|
||||
|
||||
>>> new_york = toks[new:york+1]
|
||||
>>> new_york.root.orth_
|
||||
'York'
|
||||
>>> new_york = toks[new:york+1]
|
||||
>>> new_york.root.text
|
||||
'York'
|
||||
|
||||
Here's a more complicated case, raise by Issue #214
|
||||
Here's a more complicated case, raised by issue #214:
|
||||
|
||||
>>> toks = nlp(u'to, north and south carolina')
|
||||
>>> to, north, and_, south, carolina = toks
|
||||
>>> south.head.text, carolina.head.text
|
||||
('north', 'to')
|
||||
>>> toks = nlp(u'to, north and south carolina')
|
||||
>>> to, north, and_, south, carolina = toks
|
||||
>>> south.head.text, carolina.head.text
|
||||
('north', 'to')
|
||||
|
||||
Here 'south' is a child of 'north', which is a child of 'carolina'.
|
||||
Carolina is the root of the span:
|
||||
Here "south" is a child of "north", which is a child of "carolina".
|
||||
Carolina is the root of the span:
|
||||
|
||||
>>> south_carolina = toks[-2:]
|
||||
>>> south_carolina.root.text
|
||||
'carolina'
|
||||
>>> south_carolina = toks[-2:]
|
||||
>>> south_carolina.root.text
|
||||
'carolina'
|
||||
"""
|
||||
def __get__(self):
|
||||
self._recalculate_indices()
|
||||
|
@ -314,10 +339,10 @@ cdef class Span:
|
|||
return self.doc[root]
|
||||
|
||||
property lefts:
|
||||
"""
|
||||
Tokens that are to the left of the span, whose head is within the Span.
|
||||
""" Tokens that are to the left of the span, whose head is within the
|
||||
`Span`.
|
||||
|
||||
Yields: Token A left-child of a token of the span.
|
||||
YIELDS (Token):A left-child of a token of the span.
|
||||
"""
|
||||
def __get__(self):
|
||||
for token in reversed(self): # Reverse, so we get the tokens in order
|
||||
|
@ -326,10 +351,10 @@ cdef class Span:
|
|||
yield left
|
||||
|
||||
property rights:
|
||||
"""
|
||||
Tokens that are to the right of the Span, whose head is within the Span.
|
||||
"""Tokens that are to the right of the Span, whose head is within the
|
||||
`Span`.
|
||||
|
||||
Yields: Token A right-child of a token of the span.
|
||||
YIELDS (Token): A right-child of a token of the span.
|
||||
"""
|
||||
def __get__(self):
|
||||
for token in self:
|
||||
|
@ -338,10 +363,9 @@ cdef class Span:
|
|||
yield right
|
||||
|
||||
property subtree:
|
||||
"""
|
||||
Tokens that descend from tokens in the span, but fall outside it.
|
||||
"""Tokens that descend from tokens in the span, but fall outside it.
|
||||
|
||||
Yields: Token A descendant of a token within the span.
|
||||
YIELDS (Token): A descendant of a token within the span.
|
||||
"""
|
||||
def __get__(self):
|
||||
for word in self.lefts:
|
||||
|
@ -351,8 +375,9 @@ cdef class Span:
|
|||
yield from word.subtree
|
||||
|
||||
property ent_id:
|
||||
"""
|
||||
An (integer) entity ID. Usually assigned by patterns in the Matcher.
|
||||
"""An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
|
||||
|
||||
RETURNS (int): The entity ID.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.root.ent_id
|
||||
|
@ -362,9 +387,11 @@ cdef class Span:
|
|||
raise NotImplementedError(
|
||||
"Can't yet set ent_id from Span. Vote for this feature on the issue "
|
||||
"tracker: http://github.com/explosion/spaCy/issues")
|
||||
|
||||
property ent_id_:
|
||||
"""
|
||||
A (string) entity ID. Usually assigned by patterns in the Matcher.
|
||||
"""A (string) entity ID. Usually assigned by patterns in the `Matcher`.
|
||||
|
||||
RETURNS (unicode): The entity ID.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.root.ent_id_
|
||||
|
@ -376,26 +403,38 @@ cdef class Span:
|
|||
"tracker: http://github.com/explosion/spaCy/issues")
|
||||
|
||||
property orth_:
|
||||
# TODO: docstring
|
||||
def __get__(self):
|
||||
return ''.join([t.string for t in self]).strip()
|
||||
|
||||
property lemma_:
|
||||
"""The span's lemma.
|
||||
|
||||
RETURNS (unicode): The span's lemma.
|
||||
"""
|
||||
def __get__(self):
|
||||
return ' '.join([t.lemma_ for t in self]).strip()
|
||||
|
||||
property upper_:
|
||||
# TODO: docstring
|
||||
def __get__(self):
|
||||
return ''.join([t.string.upper() for t in self]).strip()
|
||||
|
||||
property lower_:
|
||||
# TODO: docstring
|
||||
def __get__(self):
|
||||
return ''.join([t.string.lower() for t in self]).strip()
|
||||
|
||||
property string:
|
||||
# TODO: docstring
|
||||
def __get__(self):
|
||||
return ''.join([t.string for t in self])
|
||||
|
||||
property label_:
|
||||
"""The span's label.
|
||||
|
||||
RETURNS (unicode): The span's label.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.doc.vocab.strings[self.label]
|
||||
|
||||
|
|
|
@ -23,10 +23,14 @@ from .. import about
|
|||
|
||||
|
||||
cdef class Token:
|
||||
"""
|
||||
An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
|
||||
"""
|
||||
"""An individual token – i.e. a word, punctuation symbol, whitespace, etc."""
|
||||
def __cinit__(self, Vocab vocab, Doc doc, int offset):
|
||||
"""Construct a `Token` object.
|
||||
|
||||
vocab (Vocab): A storage container for lexical types.
|
||||
doc (Doc): The parent document.
|
||||
offset (int): The index of the token within the document.
|
||||
"""
|
||||
self.vocab = vocab
|
||||
self.doc = doc
|
||||
self.c = &self.doc.c[offset]
|
||||
|
@ -36,8 +40,9 @@ cdef class Token:
|
|||
return hash((self.doc, self.i))
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
Number of unicode characters in token.text.
|
||||
"""The number of unicode characters in the token, i.e. `token.text`.
|
||||
|
||||
RETURNS (int): The number of unicode characters in the token.
|
||||
"""
|
||||
return self.c.lex.length
|
||||
|
||||
|
@ -75,37 +80,35 @@ cdef class Token:
|
|||
raise ValueError(op)
|
||||
|
||||
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
|
||||
"""
|
||||
Check the value of a boolean flag.
|
||||
"""Check the value of a boolean flag.
|
||||
|
||||
Arguments:
|
||||
flag_id (int): The ID of the flag attribute.
|
||||
Returns:
|
||||
is_set (bool): Whether the flag is set.
|
||||
flag_id (int): The ID of the flag attribute.
|
||||
RETURNS (bool): Whether the flag is set.
|
||||
|
||||
EXAMPLE:
|
||||
>>> from spacy.attrs import IS_TITLE
|
||||
>>> doc = nlp(u'Give it back! He pleaded.')
|
||||
>>> token = doc[0]
|
||||
>>> token.check_flag(IS_TITLE)
|
||||
True
|
||||
"""
|
||||
return Lexeme.c_check_flag(self.c.lex, flag_id)
|
||||
|
||||
def nbor(self, int i=1):
|
||||
"""
|
||||
Get a neighboring token.
|
||||
"""Get a neighboring token.
|
||||
|
||||
Arguments:
|
||||
i (int): The relative position of the token to get. Defaults to 1.
|
||||
Returns:
|
||||
neighbor (Token): The token at position self.doc[self.i+i]
|
||||
i (int): The relative position of the token to get. Defaults to 1.
|
||||
RETURNS (Token): The token at position `self.doc[self.i+i]`.
|
||||
"""
|
||||
return self.doc[self.i+i]
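A small usage sketch for nbor, assuming nlp is a loaded spaCy pipeline with the default English tokenizer:

    doc = nlp(u'Give it back')
    assert doc[0].nbor().text == u'it'        # the offset defaults to +1
    assert doc[1].nbor(-1).text == u'Give'    # negative offsets look backwards
    assert doc[1].nbor(1).i == 2              # i.e. self.doc[self.i + i]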
|
||||
|
||||
def similarity(self, other):
|
||||
"""
|
||||
Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
"""Make a semantic similarity estimate. The default estimate is cosine
|
||||
similarity using an average of word vectors.
|
||||
|
||||
Arguments:
|
||||
other:
|
||||
The object to compare with. By default, accepts Doc, Span,
|
||||
Token and Lexeme objects.
|
||||
Returns:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
other (object): The object to compare with. By default, accepts `Doc`,
|
||||
`Span`, `Token` and `Lexeme` objects.
|
||||
RETURNS (float): A scalar similarity score. Higher is more similar.
|
||||
"""
|
||||
if 'similarity' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['similarity'](self)
|
||||
|
@ -114,10 +117,14 @@ cdef class Token:
|
|||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
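A minimal sketch of token similarity, assuming a model with word vectors is installed and loaded as nlp; the exact scores depend on the vectors shipped with the model:

    doc = nlp(u'dog cat banana')
    dog, cat, banana = doc[0], doc[1], doc[2]
    print(dog.similarity(cat))      # typically fairly high for related words
    print(dog.similarity(banana))   # typically lower
    assert dog.similarity(cat) == cat.similarity(dog)  # cosine similarity is symmetric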
|
||||
|
||||
property lex_id:
|
||||
"""ID of the token's lexical type.
|
||||
|
||||
RETURNS (int): ID of the token's lexical type."""
|
||||
def __get__(self):
|
||||
return self.c.lex.id
|
||||
|
||||
property rank:
|
||||
# TODO: add docstring
|
||||
def __get__(self):
|
||||
return self.c.lex.id
|
||||
|
||||
|
@ -126,10 +133,19 @@ cdef class Token:
|
|||
return self.text_with_ws
|
||||
|
||||
property text:
|
||||
"""A unicode representation of the token text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the token.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.orth_
|
||||
|
||||
property text_with_ws:
|
||||
"""The text content of the token with a trailing whitespace character if
|
||||
it has one.
|
||||
|
||||
RETURNS (unicode): The text content of the span (with trailing whitespace).
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
|
||||
if self.c.spacy:
|
||||
|
@ -184,6 +200,10 @@ cdef class Token:
|
|||
return self.c.lex.suffix
|
||||
|
||||
property lemma:
|
||||
"""Base form of the word, with no inflectional suffixes.
|
||||
|
||||
RETURNS (int): Token lemma.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.lemma
|
||||
def __set__(self, int lemma):
|
||||
|
@ -206,8 +226,10 @@ cdef class Token:
|
|||
self.c.dep = label
|
||||
|
||||
property has_vector:
|
||||
"""
|
||||
A boolean value indicating whether a word vector is associated with the object.
|
||||
"""A boolean value indicating whether a word vector is associated with
|
||||
the object.
|
||||
|
||||
RETURNS (bool): Whether a word vector is associated with the object.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'has_vector' in self.doc.user_token_hooks:
|
||||
|
@ -220,10 +242,10 @@ cdef class Token:
|
|||
return False
|
||||
|
||||
property vector:
|
||||
"""
|
||||
A real-valued meaning representation.
|
||||
"""A real-valued meaning representation.
|
||||
|
||||
Type: numpy.ndarray[ndim=1, dtype='float32']
|
||||
RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array
|
||||
representing the token's semantics.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector' in self.doc.user_token_hooks:
|
||||
|
@ -239,15 +261,11 @@ cdef class Token:
|
|||
vector_view = <float[:length,]>self.c.lex.vector
|
||||
return numpy.asarray(vector_view)
|
||||
|
||||
property repvec:
|
||||
def __get__(self):
|
||||
raise AttributeError("repvec was renamed to vector in v0.100")
|
||||
|
||||
property has_repvec:
|
||||
def __get__(self):
|
||||
raise AttributeError("has_repvec was renamed to has_vector in v0.100")
|
||||
|
||||
property vector_norm:
|
||||
"""The L2 norm of the token's vector representation.
|
||||
|
||||
RETURNS (float): The L2 norm of the vector representation.
|
||||
"""
|
||||
def __get__(self):
|
||||
if 'vector_norm' in self.doc.user_token_hooks:
|
||||
return self.doc.user_token_hooks['vector_norm'](self)
|
||||
|
@ -324,28 +342,26 @@ cdef class Token:
|
|||
yield from word.subtree
|
||||
|
||||
property left_edge:
|
||||
"""
|
||||
The leftmost token of this token's syntactic descendents.
|
||||
"""The leftmost token of this token's syntactic descendents.
|
||||
|
||||
Returns: Token The first token such that self.is_ancestor(token)
|
||||
RETURNS (Token): The first token such that `self.is_ancestor(token)`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.doc[self.c.l_edge]
|
||||
|
||||
property right_edge:
|
||||
"""
|
||||
The rightmost token of this token's syntactic descendents.
|
||||
"""The rightmost token of this token's syntactic descendents.
|
||||
|
||||
Returns: Token The last token such that self.is_ancestor(token)
|
||||
RETURNS (Token): The last token such that `self.is_ancestor(token)`.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.doc[self.c.r_edge]
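An illustrative sketch for left_edge and right_edge, assuming a parser-equipped model is loaded as nlp; the exact edges depend on the predicted parse:

    doc = nlp(u'The quick brown fox jumped over the lazy dog.')
    fox = doc[3]
    print(fox.left_edge.text, fox.right_edge.text)   # e.g. u'The', u'fox'
    # the edges give you the full phrase around a token as a Span
    phrase = doc[fox.left_edge.i : fox.right_edge.i + 1]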
|
||||
|
||||
property ancestors:
|
||||
"""
|
||||
A sequence of this token's syntactic ancestors.
|
||||
"""A sequence of this token's syntactic ancestors.
|
||||
|
||||
Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
|
||||
YIELDS (Token): A sequence of ancestor tokens such that
|
||||
`ancestor.is_ancestor(self)`.
|
||||
"""
|
||||
def __get__(self):
|
||||
cdef const TokenC* head_ptr = self.c
|
||||
|
@ -357,33 +373,25 @@ cdef class Token:
|
|||
yield self.doc[head_ptr - (self.c - self.i)]
|
||||
i += 1
|
||||
|
||||
def is_ancestor_of(self, descendant):
|
||||
# TODO: Remove after backward compatibility check.
|
||||
return self.is_ancestor(descendant)
|
||||
|
||||
def is_ancestor(self, descendant):
|
||||
"""
|
||||
Check whether this token is a parent, grandparent, etc. of another
|
||||
"""Check whether this token is a parent, grandparent, etc. of another
|
||||
in the dependency tree.
|
||||
|
||||
Arguments:
|
||||
descendant (Token): Another token.
|
||||
Returns:
|
||||
is_ancestor (bool): Whether this token is the ancestor of the descendant.
|
||||
descendant (Token): Another token.
|
||||
RETURNS (bool): Whether this token is the ancestor of the descendant.
|
||||
"""
|
||||
if self.doc is not descendant.doc:
|
||||
return False
|
||||
return any(ancestor.i == self.i for ancestor in descendant.ancestors)
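A short sketch of ancestors and is_ancestor, assuming a parsed doc produced by a loaded pipeline nlp; the printed values depend on the parse:

    doc = nlp(u'Give it back! He pleaded.')
    give, it = doc[0], doc[1]
    print([t.text for t in it.ancestors])   # heads above 'it', e.g. [u'Give']
    print(give.is_ancestor(it))             # True if 'Give' heads 'it' in this parse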
|
||||
|
||||
property head:
|
||||
"""
|
||||
The syntactic parent, or "governor", of this token.
|
||||
"""The syntactic parent, or "governor", of this token.
|
||||
|
||||
Returns: Token
|
||||
RETURNS (Token): The token head.
|
||||
"""
|
||||
def __get__(self):
|
||||
"""
|
||||
The token predicted by the parser to be the head of the current token.
|
||||
"""The token predicted by the parser to be the head of the current
|
||||
token.
|
||||
"""
|
||||
return self.doc[self.i + self.c.head]
|
||||
def __set__(self, Token new_head):
|
||||
|
@ -399,7 +407,7 @@ cdef class Token:
|
|||
cdef int rel_newhead_i = new_head.i - self.i
|
||||
|
||||
# is the new head a descendant of the old head
|
||||
cdef bint is_desc = old_head.is_ancestor_of(new_head)
|
||||
cdef bint is_desc = old_head.is_ancestor(new_head)
|
||||
|
||||
cdef int new_edge
|
||||
cdef Token anc, child
|
||||
|
@ -477,10 +485,9 @@ cdef class Token:
|
|||
self.c.head = rel_newhead_i
|
||||
|
||||
property conjuncts:
|
||||
"""
|
||||
A sequence of coordinated tokens, including the token itself.
|
||||
"""A sequence of coordinated tokens, including the token itself.
|
||||
|
||||
Yields: Token A coordinated token
|
||||
YIELDS (Token): A coordinated token.
|
||||
"""
|
||||
def __get__(self):
|
||||
"""Get a list of conjoined words."""
|
||||
|
@ -495,25 +502,46 @@ cdef class Token:
|
|||
yield from word.conjuncts
|
||||
|
||||
property ent_type:
|
||||
"""Named entity type.
|
||||
|
||||
RETURNS (int): Named entity type.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.ent_type
|
||||
|
||||
property ent_iob:
|
||||
"""IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
|
||||
is assigned.
|
||||
|
||||
RETURNS (int): IOB code of named entity tag.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.ent_iob
|
||||
|
||||
property ent_type_:
|
||||
"""Named entity type.
|
||||
|
||||
RETURNS (unicode): Named entity type.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.ent_type]
|
||||
|
||||
property ent_iob_:
|
||||
"""IOB code of named entity tag. "B" means the token begins an entity,
|
||||
"I" means it is inside an entity, "O" means it is outside an entity, and
|
||||
"" means no entity tag is set.
|
||||
|
||||
RETURNS (unicode): IOB code of named entity tag.
|
||||
"""
|
||||
def __get__(self):
|
||||
iob_strings = ('', 'I', 'O', 'B')
|
||||
return iob_strings[self.c.ent_iob]
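A small sketch showing the entity attributes side by side, assuming a model with a named entity recognizer is loaded as nlp; the exact labels depend on the model:

    doc = nlp(u'San Francisco considers banning sidewalk delivery robots')
    for token in doc[:3]:
        # e.g. (u'San', u'B', u'GPE'), (u'Francisco', u'I', u'GPE'), (u'considers', u'O', u'')
        print(token.text, token.ent_iob_, token.ent_type_)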
|
||||
|
||||
property ent_id:
|
||||
"""
|
||||
An (integer) entity ID. Usually assigned by patterns in the Matcher.
|
||||
"""ID of the entity the token is an instance of, if any. Usually
|
||||
assigned by patterns in the Matcher.
|
||||
|
||||
RETURNS (int): ID of the entity.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.c.ent_id
|
||||
|
@ -522,8 +550,10 @@ cdef class Token:
|
|||
self.c.ent_id = key
|
||||
|
||||
property ent_id_:
|
||||
"""
|
||||
A (string) entity ID. Usually assigned by patterns in the Matcher.
|
||||
"""ID of the entity the token is an instance of, if any. Usually
|
||||
assigned by patterns in the Matcher.
|
||||
|
||||
RETURNS (unicode): ID of the entity.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.ent_id]
|
||||
|
@ -564,6 +594,10 @@ cdef class Token:
|
|||
return self.vocab.strings[self.c.lex.lang]
|
||||
|
||||
property lemma_:
|
||||
"""Base form of the word, with no inflectional suffixes.
|
||||
|
||||
RETURNS (unicode): Token lemma.
|
||||
"""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lemma]
|
||||
def __set__(self, unicode lemma_):
|
||||
|
|
|
@ -145,7 +145,8 @@ def parse_package_meta(package_path, require=True):
|
|||
|
||||
|
||||
def is_in_jupyter():
|
||||
"""Check if user is in a Jupyter notebook. Mainly used for displaCy.
|
||||
"""Check if user is running spaCy from a Jupyter notebook by detecting the
|
||||
IPython kernel. Mainly used for the displaCy visualizer.
|
||||
|
||||
RETURNS (bool): True if in Jupyter, False if not.
|
||||
"""
|
||||
|
|
484
spacy/vocab.pyx
484
spacy/vocab.pyx
|
@ -36,79 +36,22 @@ EMPTY_LEXEME.vector = EMPTY_VEC
|
|||
|
||||
|
||||
cdef class Vocab:
|
||||
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
|
||||
instance also provides access to the `StringStore`, and owns underlying
|
||||
C-data that is shared between `Doc` objects.
|
||||
"""
|
||||
A map container for a language's LexemeC structs.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, lex_attr_getters=None, lemmatizer=True,
|
||||
tag_map=True, oov_prob=True, **deprecated_kwargs):
|
||||
"""
|
||||
Deprecated --- replace in spaCy 2
|
||||
Load the vocabulary from a path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
The path to load from.
|
||||
lex_attr_getters (dict):
|
||||
A dictionary mapping attribute IDs to functions to compute them.
|
||||
Defaults to None.
|
||||
lemmatizer (object):
|
||||
A lemmatizer. Defaults to None.
|
||||
tag_map (dict):
|
||||
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
|
||||
and optionally morphological attributes.
|
||||
oov_prob (float):
|
||||
The default probability for out-of-vocabulary words.
|
||||
Returns:
|
||||
Vocab: The newly constructed vocab object.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
||||
if 'vectors' in deprecated_kwargs:
|
||||
raise AttributeError(
|
||||
"vectors argument to Vocab.load() deprecated. "
|
||||
"Install vectors after loading.")
|
||||
if tag_map is True and (path / 'vocab' / 'tag_map.json').exists():
|
||||
with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_:
|
||||
tag_map = ujson.load(file_)
|
||||
elif tag_map is True:
|
||||
tag_map = None
|
||||
if lex_attr_getters is not None \
|
||||
and oov_prob is True \
|
||||
and (path / 'vocab' / 'oov_prob').exists():
|
||||
with (path / 'vocab' / 'oov_prob').open('r', encoding='utf8') as file_:
|
||||
oov_prob = float(file_.read())
|
||||
lex_attr_getters[PROB] = lambda text: oov_prob
|
||||
if lemmatizer is True:
|
||||
lemmatizer = Lemmatizer.load(path)
|
||||
|
||||
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
|
||||
strings_list = ujson.load(file_)
|
||||
cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map,
|
||||
lemmatizer=lemmatizer,
|
||||
strings=strings_list)
|
||||
self.load_lexemes(path / 'vocab' / 'lexemes.bin')
|
||||
return self
|
||||
|
||||
|
||||
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
||||
strings=tuple(), **deprecated_kwargs):
|
||||
"""
|
||||
Create the vocabulary.
|
||||
"""Create the vocabulary.
|
||||
|
||||
lex_attr_getters (dict):
|
||||
A dictionary mapping attribute IDs to functions to compute them.
|
||||
Defaults to None.
|
||||
lemmatizer (object):
|
||||
A lemmatizer. Defaults to None.
|
||||
tag_map (dict):
|
||||
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
|
||||
and optionally morphological attributes.
|
||||
oov_prob (float):
|
||||
The default probability for out-of-vocabulary words.
|
||||
|
||||
Returns:
|
||||
Vocab: The newly constructed vocab object.
|
||||
lex_attr_getters (dict): A dictionary mapping attribute IDs to functions
|
||||
to compute them. Defaults to `None`.
|
||||
tag_map (dict): A dictionary mapping fine-grained tags to coarse-grained
|
||||
parts-of-speech, and optionally morphological attributes.
|
||||
lemmatizer (object): A lemmatizer. Defaults to `None`.
|
||||
strings (StringStore): StringStore that maps strings to integers, and
|
||||
vice versa.
|
||||
RETURNS (Vocab): The newly constructed vocab object.
|
||||
"""
|
||||
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
||||
|
||||
|
@ -148,33 +91,32 @@ cdef class Vocab:
|
|||
return langfunc('_') if langfunc else ''
|
||||
|
||||
def __len__(self):
|
||||
"""
|
||||
The current number of lexemes stored.
|
||||
"""The current number of lexemes stored.
|
||||
|
||||
RETURNS (int): The current number of lexemes stored.
|
||||
"""
|
||||
return self.length
|
||||
|
||||
def add_flag(self, flag_getter, int flag_id=-1):
|
||||
"""
|
||||
Set a new boolean flag to words in the vocabulary.
|
||||
|
||||
The flag_setter function will be called over the words currently in the
|
||||
def add_flag(self, flag_getter, int flag_id=-1):
|
||||
"""Set a new boolean flag to words in the vocabulary.
|
||||
|
||||
The flag_getter function will be called over the words currently in the
|
||||
vocab, and then applied to new words as they occur. You'll then be able
|
||||
to access the flag value on each token, using token.check_flag(flag_id).
|
||||
See also: `Lexeme.set_flag`, `Lexeme.check_flag`, `Token.set_flag`,
|
||||
`Token.check_flag`.
|
||||
|
||||
See also:
|
||||
Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.
|
||||
flag_getter (callable): A function `f(unicode) -> bool`, to get the flag
|
||||
value.
|
||||
flag_id (int): An integer between 1 and 63 (inclusive), specifying
|
||||
the bit at which the flag will be stored. If -1, the lowest
|
||||
available bit will be chosen.
|
||||
RETURNS (int): The integer ID by which the flag value can be checked.
|
||||
|
||||
Arguments:
|
||||
flag_getter:
|
||||
A function f(unicode) -> bool, to get the flag value.
|
||||
|
||||
flag_id (int):
|
||||
An integer between 1 and 63 (inclusive), specifying the bit at which the
|
||||
flag will be stored. If -1, the lowest available bit will be
|
||||
chosen.
|
||||
|
||||
Returns:
|
||||
flag_id (int): The integer ID by which the flag value can be checked.
|
||||
EXAMPLE:
|
||||
>>> MY_PRODUCT = nlp.vocab.add_flag(lambda text: text in ['spaCy', 'dislaCy'])
|
||||
>>> doc = nlp(u'I like spaCy')
|
||||
>>> assert doc[2].check_flag(MY_PRODUCT) == True
|
||||
"""
|
||||
if flag_id == -1:
|
||||
for bit in range(1, 64):
|
||||
|
@ -196,9 +138,8 @@ cdef class Vocab:
|
|||
return flag_id
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
||||
"""
|
||||
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||
"""
|
||||
if string == u'':
|
||||
|
@ -216,9 +157,8 @@ cdef class Vocab:
|
|||
return self._new_lexeme(mem, string)
|
||||
|
||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
||||
"""
|
||||
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
"""Get a pointer to a `LexemeC` from the lexicon, creating a new `Lexeme`
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||
"""
|
||||
if orth == 0:
|
||||
|
@ -263,24 +203,19 @@ cdef class Vocab:
|
|||
self.length += 1
|
||||
|
||||
def __contains__(self, unicode string):
|
||||
"""
|
||||
Check whether the string has an entry in the vocabulary.
|
||||
"""Check whether the string has an entry in the vocabulary.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The ID string.
|
||||
|
||||
Returns:
|
||||
bool Whether the string has an entry in the vocabulary.
|
||||
string (unicode): The ID string.
|
||||
RETURNS (bool): Whether the string has an entry in the vocabulary.
|
||||
"""
|
||||
key = hash_string(string)
|
||||
lex = self._by_hash.get(key)
|
||||
return lex is not NULL
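A minimal usage sketch, assuming nlp is a loaded pipeline:

    doc = nlp(u'I like apples')        # tokenizing text adds entries to the vocab
    assert u'apples' in nlp.vocab
    print(u'xyzzyplugh' in nlp.vocab)  # the check itself never creates an entry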
|
||||
|
||||
def __iter__(self):
|
||||
"""
|
||||
Iterate over the lexemes in the vocabulary.
|
||||
"""Iterate over the lexemes in the vocabulary.
|
||||
|
||||
Yields: Lexeme An entry in the vocabulary.
|
||||
YIELDS (Lexeme): An entry in the vocabulary.
|
||||
"""
|
||||
cdef attr_t orth
|
||||
cdef size_t addr
|
||||
|
@ -288,19 +223,19 @@ cdef class Vocab:
|
|||
yield Lexeme(self, orth)
|
||||
|
||||
def __getitem__(self, id_or_string):
|
||||
"""
|
||||
Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||
unseen unicode string is given, a new lexeme is created and stored.
|
||||
"""Retrieve a lexeme, given an int ID or a unicode string. If a
|
||||
previously unseen unicode string is given, a new lexeme is created and
|
||||
stored.
|
||||
|
||||
Arguments:
|
||||
id_or_string (int or unicode):
|
||||
The integer ID of a word, or its unicode string.
|
||||
id_or_string (int or unicode): The integer ID of a word, or its unicode
|
||||
string. If `int >= Lexicon.size`, `IndexError` is raised. If
|
||||
`id_or_string` is neither an int nor a unicode string, `ValueError`
|
||||
is raised.
|
||||
RETURNS (Lexeme): The lexeme indicated by the given ID.
|
||||
|
||||
If an int >= Lexicon.size, IndexError is raised. If id_or_string
|
||||
is neither an int nor a unicode string, ValueError is raised.
|
||||
|
||||
Returns:
|
||||
lexeme (Lexeme): The lexeme indicated by the given ID.
|
||||
EXAMPLE:
|
||||
>>> apple = nlp.vocab.strings['apple']
|
||||
>>> assert nlp.vocab[apple] == nlp.vocab[u'apple']
|
||||
"""
|
||||
cdef attr_t orth
|
||||
if type(id_or_string) == unicode:
|
||||
|
@ -324,15 +259,29 @@ cdef class Vocab:
|
|||
return tokens
|
||||
|
||||
def to_disk(self, path):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or `Path`-like objects.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
if not path.exists():
|
||||
path.mkdir()
|
||||
strings_loc = path / 'strings.json'
|
||||
with strings_loc.open('w', encoding='utf8') as file_:
|
||||
self.strings.dump(file_)
|
||||
self.dump(path / 'lexemes.bin')
|
||||
|
||||
# TODO: pickle
|
||||
# self.dump(path / 'lexemes.bin')
|
||||
|
||||
def from_disk(self, path):
|
||||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
RETURNS (Vocab): The modified `Vocab` object.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
|
||||
strings_list = ujson.load(file_)
|
||||
|
@ -340,6 +289,23 @@ cdef class Vocab:
|
|||
self.strings[string]
|
||||
self.load_lexemes(path / 'lexemes.bin')
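A usage sketch for the disk helpers, assuming nlp is a loaded pipeline and /tmp/vocab is writable; the on-disk layout (vocab/ subdirectory vs. flat files) is still in flux in this diff, so treat the paths as illustrative:

    from spacy.vocab import Vocab
    nlp.vocab.to_disk('/tmp/vocab')            # writes strings.json and lexemes.bin
    vocab = Vocab().from_disk('/tmp/vocab')    # documented to return the modified Vocab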
|
||||
|
||||
def to_bytes(self, **exclude):
|
||||
"""Serialize the current state to a binary string.
|
||||
|
||||
**exclude: Named attributes to prevent from being serialized.
|
||||
RETURNS (bytes): The serialized form of the `Vocab` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def from_bytes(self, bytes_data, **exclude):
|
||||
"""Load state from a binary string.
|
||||
|
||||
bytes_data (bytes): The data to load from.
|
||||
**exclude: Named attributes to prevent from being loaded.
|
||||
RETURNS (Vocab): The `Vocab` object.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def lexemes_to_bytes(self, **exclude):
|
||||
cdef hash_t key
|
||||
cdef size_t addr
|
||||
|
@ -365,9 +331,7 @@ cdef class Vocab:
|
|||
return byte_string
|
||||
|
||||
def lexemes_from_bytes(self, bytes bytes_data):
|
||||
"""
|
||||
Load the binary vocabulary data from the given string.
|
||||
"""
|
||||
"""Load the binary vocabulary data from the given string."""
|
||||
cdef LexemeC* lexeme
|
||||
cdef hash_t key
|
||||
cdef unicode py_str
|
||||
|
@ -391,16 +355,12 @@ cdef class Vocab:
|
|||
self.length += 1
|
||||
|
||||
# Deprecated --- delete these once stable
|
||||
|
||||
def dump_vectors(self, out_loc):
|
||||
"""
|
||||
Save the word vectors to a binary file.
|
||||
|
||||
Arguments:
|
||||
loc (Path): The path to save to.
|
||||
Returns:
|
||||
None
|
||||
#"""
|
||||
def dump_vectors(self, out_loc):
|
||||
"""Save the word vectors to a binary file.
|
||||
|
||||
loc (Path): The path to save to.
|
||||
"""
|
||||
cdef int32_t vec_len = self.vectors_length
|
||||
cdef int32_t word_len
|
||||
cdef bytes word_str
|
||||
|
@ -424,17 +384,14 @@ cdef class Vocab:
|
|||
|
||||
|
||||
def load_vectors(self, file_):
|
||||
"""
|
||||
Load vectors from a text-based file.
|
||||
"""Load vectors from a text-based file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file to read from. Entries should be separated by newlines,
|
||||
and each entry should be whitespace delimited. The first value of the entry
|
||||
should be the word string, and subsequent entries should be the values of the
|
||||
vector.
|
||||
file_ (buffer): The file to read from. Entries should be separated by
|
||||
newlines, and each entry should be whitespace delimited. The first value of the entry
|
||||
should be the word string, and subsequent entries should be the values of the
|
||||
vector.
|
||||
|
||||
Returns:
|
||||
vec_len (int): The length of the vectors loaded.
|
||||
RETURNS (int): The length of the vectors loaded.
|
||||
"""
|
||||
cdef LexemeC* lexeme
|
||||
cdef attr_t orth
|
||||
|
@ -464,14 +421,11 @@ cdef class Vocab:
|
|||
return vec_len
|
||||
|
||||
def load_vectors_from_bin_loc(self, loc):
|
||||
"""
|
||||
Load vectors from the location of a binary file.
|
||||
"""Load vectors from the location of a binary file.
|
||||
|
||||
Arguments:
|
||||
loc (unicode): The path of the binary file to load from.
|
||||
loc (unicode): The path of the binary file to load from.
|
||||
|
||||
Returns:
|
||||
vec_len (int): The length of the vectors loaded.
|
||||
RETURNS (int): The length of the vectors loaded.
|
||||
"""
|
||||
cdef CFile file_ = CFile(loc, b'rb')
|
||||
cdef int32_t word_len
|
||||
|
@ -526,12 +480,10 @@ cdef class Vocab:
|
|||
|
||||
|
||||
def resize_vectors(self, int new_size):
|
||||
"""
|
||||
Set vectors_length to a new size, and allocate more memory for the Lexeme
|
||||
vectors if necessary. The memory will be zeroed.
|
||||
"""Set vectors_length to a new size, and allocate more memory for the
|
||||
`Lexeme` vectors if necessary. The memory will be zeroed.
|
||||
|
||||
Arguments:
|
||||
new_size (int): The new size of the vectors.
|
||||
new_size (int): The new size of the vectors.
|
||||
"""
|
||||
cdef hash_t key
|
||||
cdef size_t addr
|
||||
|
@ -633,237 +585,3 @@ class VectorReadError(Exception):
|
|||
"Vector size: %d\n"
|
||||
"Max size: %d\n"
|
||||
"Min size: 1\n" % (loc, size, MAX_VEC_SIZE))
|
||||
|
||||
|
||||
#
|
||||
#Deprecated --- delete these once stable
|
||||
#
|
||||
# def dump_vectors(self, out_loc):
|
||||
# """
|
||||
# Save the word vectors to a binary file.
|
||||
#
|
||||
# Arguments:
|
||||
# loc (Path): The path to save to.
|
||||
# Returns:
|
||||
# None
|
||||
# #"""
|
||||
# cdef int32_t vec_len = self.vectors_length
|
||||
# cdef int32_t word_len
|
||||
# cdef bytes word_str
|
||||
# cdef char* chars
|
||||
#
|
||||
# cdef Lexeme lexeme
|
||||
# cdef CFile out_file = CFile(out_loc, 'wb')
|
||||
# for lexeme in self:
|
||||
# word_str = lexeme.orth_.encode('utf8')
|
||||
# vec = lexeme.c.vector
|
||||
# word_len = len(word_str)
|
||||
#
|
||||
# out_file.write_from(&word_len, 1, sizeof(word_len))
|
||||
# out_file.write_from(&vec_len, 1, sizeof(vec_len))
|
||||
#
|
||||
# chars = <char*>word_str
|
||||
# out_file.write_from(chars, word_len, sizeof(char))
|
||||
# out_file.write_from(vec, vec_len, sizeof(float))
|
||||
# out_file.close()
|
||||
#
|
||||
#
|
||||
#
|
||||
# def load_vectors(self, file_):
|
||||
# """
|
||||
# Load vectors from a text-based file.
|
||||
#
|
||||
# Arguments:
|
||||
# file_ (buffer): The file to read from. Entries should be separated by newlines,
|
||||
# and each entry should be whitespace delimited. The first value of the entry
|
||||
# should be the word string, and subsequent entries should be the values of the
|
||||
# vector.
|
||||
#
|
||||
# Returns:
|
||||
# vec_len (int): The length of the vectors loaded.
|
||||
# """
|
||||
# cdef LexemeC* lexeme
|
||||
# cdef attr_t orth
|
||||
# cdef int32_t vec_len = -1
|
||||
# cdef double norm = 0.0
|
||||
#
|
||||
# whitespace_pattern = re.compile(r'\s', re.UNICODE)
|
||||
#
|
||||
# for line_num, line in enumerate(file_):
|
||||
# pieces = line.split()
|
||||
# word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
|
||||
# if vec_len == -1:
|
||||
# vec_len = len(pieces)
|
||||
# elif vec_len != len(pieces):
|
||||
# raise VectorReadError.mismatched_sizes(file_, line_num,
|
||||
# vec_len, len(pieces))
|
||||
# orth = self.strings[word_str]
|
||||
# lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
|
||||
# lexeme.vector = <float*>self.mem.alloc(vec_len, sizeof(float))
|
||||
# for i, val_str in enumerate(pieces):
|
||||
# lexeme.vector[i] = float(val_str)
|
||||
# norm = 0.0
|
||||
# for i in range(vec_len):
|
||||
# norm += lexeme.vector[i] * lexeme.vector[i]
|
||||
# lexeme.l2_norm = sqrt(norm)
|
||||
# self.vectors_length = vec_len
|
||||
# return vec_len
|
||||
#
|
||||
# def load_vectors_from_bin_loc(self, loc):
|
||||
# """
|
||||
# Load vectors from the location of a binary file.
|
||||
#
|
||||
# Arguments:
|
||||
# loc (unicode): The path of the binary file to load from.
|
||||
#
|
||||
# Returns:
|
||||
# vec_len (int): The length of the vectors loaded.
|
||||
# """
|
||||
# cdef CFile file_ = CFile(loc, b'rb')
|
||||
# cdef int32_t word_len
|
||||
# cdef int32_t vec_len = 0
|
||||
# cdef int32_t prev_vec_len = 0
|
||||
# cdef float* vec
|
||||
# cdef Address mem
|
||||
# cdef attr_t string_id
|
||||
# cdef bytes py_word
|
||||
# cdef vector[float*] vectors
|
||||
# cdef int line_num = 0
|
||||
# cdef Pool tmp_mem = Pool()
|
||||
# while True:
|
||||
# try:
|
||||
# file_.read_into(&word_len, sizeof(word_len), 1)
|
||||
# except IOError:
|
||||
# break
|
||||
# file_.read_into(&vec_len, sizeof(vec_len), 1)
|
||||
# if prev_vec_len != 0 and vec_len != prev_vec_len:
|
||||
# raise VectorReadError.mismatched_sizes(loc, line_num,
|
||||
# vec_len, prev_vec_len)
|
||||
# if 0 >= vec_len >= MAX_VEC_SIZE:
|
||||
# raise VectorReadError.bad_size(loc, vec_len)
|
||||
#
|
||||
# chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
|
||||
# vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
|
||||
#
|
||||
# string_id = self.strings[chars[:word_len]]
|
||||
# # Insert words into vocab to add vector.
|
||||
# self.get_by_orth(self.mem, string_id)
|
||||
# while string_id >= vectors.size():
|
||||
# vectors.push_back(EMPTY_VEC)
|
||||
# assert vec != NULL
|
||||
# vectors[string_id] = vec
|
||||
# line_num += 1
|
||||
# cdef LexemeC* lex
|
||||
# cdef size_t lex_addr
|
||||
# cdef double norm = 0.0
|
||||
# cdef int i
|
||||
# for orth, lex_addr in self._by_orth.items():
|
||||
# lex = <LexemeC*>lex_addr
|
||||
# if lex.lower < vectors.size():
|
||||
# lex.vector = vectors[lex.lower]
|
||||
# norm = 0.0
|
||||
# for i in range(vec_len):
|
||||
# norm += lex.vector[i] * lex.vector[i]
|
||||
# lex.l2_norm = sqrt(norm)
|
||||
# else:
|
||||
# lex.vector = EMPTY_VEC
|
||||
# self.vectors_length = vec_len
|
||||
# return vec_len
|
||||
#
|
||||
#
|
||||
#def write_binary_vectors(in_loc, out_loc):
|
||||
# cdef CFile out_file = CFile(out_loc, 'wb')
|
||||
# cdef Address mem
|
||||
# cdef int32_t word_len
|
||||
# cdef int32_t vec_len
|
||||
# cdef char* chars
|
||||
# with bz2.BZ2File(in_loc, 'r') as file_:
|
||||
# for line in file_:
|
||||
# pieces = line.split()
|
||||
# word = pieces.pop(0)
|
||||
# mem = Address(len(pieces), sizeof(float))
|
||||
# vec = <float*>mem.ptr
|
||||
# for i, val_str in enumerate(pieces):
|
||||
# vec[i] = float(val_str)
|
||||
#
|
||||
# word_len = len(word)
|
||||
# vec_len = len(pieces)
|
||||
#
|
||||
# out_file.write_from(&word_len, 1, sizeof(word_len))
|
||||
# out_file.write_from(&vec_len, 1, sizeof(vec_len))
|
||||
#
|
||||
# chars = <char*>word
|
||||
# out_file.write_from(chars, len(word), sizeof(char))
|
||||
# out_file.write_from(vec, vec_len, sizeof(float))
|
||||
#
|
||||
#
|
||||
# def resize_vectors(self, int new_size):
|
||||
# """
|
||||
# Set vectors_length to a new size, and allocate more memory for the Lexeme
|
||||
# vectors if necessary. The memory will be zeroed.
|
||||
#
|
||||
# Arguments:
|
||||
# new_size (int): The new size of the vectors.
|
||||
# """
|
||||
# cdef hash_t key
|
||||
# cdef size_t addr
|
||||
# if new_size > self.vectors_length:
|
||||
# for key, addr in self._by_hash.items():
|
||||
# lex = <LexemeC*>addr
|
||||
# lex.vector = <float*>self.mem.realloc(lex.vector,
|
||||
# new_size * sizeof(lex.vector[0]))
|
||||
# self.vectors_length = new_size
|
||||
#
|
||||
#
|
||||
|
||||
#
|
||||
# def dump(self, loc=None):
|
||||
# """
|
||||
# Save the lexemes binary data to the given location, or
|
||||
# return a byte-string with the data if loc is None.
|
||||
#
|
||||
# Arguments:
|
||||
# loc (Path or None): The path to save to, or None.
|
||||
# """
|
||||
# if loc is None:
|
||||
# return self.to_bytes()
|
||||
# else:
|
||||
# return self.to_disk(loc)
|
||||
#
|
||||
# def load_lexemes(self, loc):
|
||||
# """
|
||||
# Load the binary vocabulary data from the given location.
|
||||
#
|
||||
# Arguments:
|
||||
# loc (Path): The path to load from.
|
||||
#
|
||||
# Returns:
|
||||
# None
|
||||
# """
|
||||
# fp = CFile(loc, 'rb',
|
||||
# on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
|
||||
# cdef LexemeC* lexeme = NULL
|
||||
# cdef SerializedLexemeC lex_data
|
||||
# cdef hash_t key
|
||||
# cdef unicode py_str
|
||||
# cdef attr_t orth = 0
|
||||
# assert sizeof(orth) == sizeof(lexeme.orth)
|
||||
# i = 0
|
||||
# while True:
|
||||
# try:
|
||||
# fp.read_into(&orth, 1, sizeof(orth))
|
||||
# except IOError:
|
||||
# break
|
||||
# lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
||||
# # Copy data from the file into the lexeme
|
||||
# fp.read_into(&lex_data.data, 1, sizeof(lex_data.data))
|
||||
# Lexeme.c_from_bytes(lexeme, lex_data)
|
||||
#
|
||||
# lexeme.vector = EMPTY_VEC
|
||||
# py_str = self.strings[lexeme.orth]
|
||||
# key = hash_string(py_str)
|
||||
# self._by_hash.set(key, lexeme)
|
||||
# self._by_orth.set(lexeme.orth, lexeme)
|
||||
# self.length += 1
|
||||
# i += 1
|
||||
# fp.close()
|
||||
|
|
|
@ -80,6 +80,7 @@
|
|||
}
|
||||
],
|
||||
|
||||
"ALPHA": true,
|
||||
"V_CSS": "1.6",
|
||||
"V_JS": "1.2",
|
||||
"DEFAULT_SYNTAX": "python",
|
||||
|
|
|
@ -34,17 +34,17 @@ mixin src(url)
|
|||
+a(url)
|
||||
block
|
||||
|
||||
| #[+icon("code", 16).o-icon--inline.u-color-subtle]
|
||||
| #[+icon("code", 16).o-icon--inline.u-color-theme]
|
||||
|
||||
|
||||
//- API link (with added tag and automatically generated path)
|
||||
path - [string] path to API docs page relative to /docs/api/
|
||||
|
||||
mixin api(path)
|
||||
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block
|
||||
+a("/docs/api/" + path, true)(target="_self").u-no-border.u-inline-block.u-nowrap
|
||||
block
|
||||
|
||||
| #[+icon("book", 18).o-icon--inline.u-color-subtle]
|
||||
| #[+icon("book", 18).o-icon--inline.u-color-theme]
|
||||
|
||||
|
||||
//- Help icon with tooltip
|
||||
|
@ -104,15 +104,31 @@ mixin button(url, trusted, ...style)
|
|||
language - [string] language for syntax highlighting (default: "python")
|
||||
supports basic relevant languages available for PrismJS
|
||||
|
||||
mixin code(label, language)
|
||||
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}")&attributes(attributes)
|
||||
mixin code(label, language, icon)
|
||||
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "")&attributes(attributes)
|
||||
if label
|
||||
h4.u-text-label.u-text-label--dark=label
|
||||
|
||||
if icon
|
||||
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
|
||||
.c-code-block__icon(class=classes[icon] || "")
|
||||
+icon(icon, 18)
|
||||
|
||||
code.c-code-block__content
|
||||
block
|
||||
|
||||
|
||||
//- Code blocks to display old/new versions
|
||||
|
||||
mixin code-old()
|
||||
+code(false, false, "reject").o-block-small
|
||||
block
|
||||
|
||||
mixin code-new()
|
||||
+code(false, false, "accept").o-block-small
|
||||
block
|
||||
|
||||
|
||||
//- CodePen embed
|
||||
slug - [string] ID of CodePen demo (taken from URL)
|
||||
height - [integer] height of demo embed iframe
|
||||
|
@ -164,6 +180,16 @@ mixin tag()
|
|||
block
|
||||
|
||||
|
||||
//- "Requires model" tag with tooltip and list of capabilities
|
||||
...capabs - [string] Required model capabilities, e.g. "vectors".
|
||||
|
||||
mixin tag-model(...capabs)
|
||||
- var intro = "To use this functionality, spaCy needs a model to be installed"
|
||||
- var ext = capabs.length ? " that supports the following capabilities: " + capabs.join(', ') : ""
|
||||
+tag Requires model
|
||||
+help(intro + ext + ".").u-color-theme
|
||||
|
||||
|
||||
//- List
|
||||
type - [string] "numbers", "letters", "roman" (bulleted list if none set)
|
||||
start - [integer] start number
|
||||
|
|
|
@ -9,6 +9,9 @@ nav.c-nav.u-text.js-nav(class=landing ? "c-nav--theme" : null)
|
|||
.u-text-label.u-padding-small.u-hidden-xs=SUBSECTION
|
||||
|
||||
ul.c-nav__menu
|
||||
if ALPHA
|
||||
- var NAVIGATION = { "Usage": "/docs/usage", "Reference": "/docs/api" }
|
||||
|
||||
each url, item in NAVIGATION
|
||||
li.c-nav__menu__item(class=(url == "/") ? "u-hidden-xs" : null)
|
||||
+a(url)=item
|
||||
|
|
|
@ -10,6 +10,14 @@ main.o-main.o-main--sidebar.o-main--aside
|
|||
if tag
|
||||
+tag=tag
|
||||
|
||||
if ALPHA
|
||||
+infobox("⚠️ You are viewing the spaCy v2.0 alpha docs")
|
||||
| This page is part of the alpha documentation for spaCy v2.0
|
||||
| and does not reflect the state of the latest stable release.
|
||||
| #[+a("#") See here] for more information on how to install
|
||||
| and test the new version. To read the official docs for
|
||||
| v1.x, #[+a("https://spacy.io/docs") go here].
|
||||
|
||||
!=yield
|
||||
|
||||
+grid.o-content.u-text
|
||||
|
|
|
@ -35,7 +35,10 @@ html(lang="en")
|
|||
link(rel="shortcut icon" href="/assets/img/favicon.ico")
|
||||
link(rel="icon" type="image/x-icon" href="/assets/img/favicon.ico")
|
||||
|
||||
if SUBSECTION == "usage"
|
||||
if ALPHA && SECTION == "docs"
|
||||
link(href="/assets/css/style_green.css?v#{V_CSS}" rel="stylesheet")
|
||||
|
||||
else if SUBSECTION == "usage"
|
||||
link(href="/assets/css/style_red.css?v#{V_CSS}" rel="stylesheet")
|
||||
|
||||
else
|
||||
|
|
|
@ -13,6 +13,17 @@
|
|||
white-space: pre
|
||||
direction: ltr
|
||||
|
||||
&.c-code-block--has-icon
|
||||
padding: 0
|
||||
display: flex
|
||||
|
||||
.c-code-block__icon
|
||||
padding: 0 0 0 1rem
|
||||
display: flex
|
||||
justify-content: center
|
||||
align-items: center
|
||||
border-left: 6px solid
|
||||
|
||||
|
||||
//- Code block content
|
||||
|
||||
|
@ -26,8 +37,8 @@
|
|||
|
||||
*:not(.c-code-block) > code
|
||||
font: normal 600 0.8em/#{1} $font-code
|
||||
background: rgba($color-front, 0.05)
|
||||
box-shadow: 1px 1px 0 rgba($color-front, 0.1)
|
||||
background: darken($color-theme-light, 5)
|
||||
box-shadow: 1px 1px 0 rgba($color-front, 0.05)
|
||||
text-shadow: 1px 1px 0 rgba($color-back, 0.5)
|
||||
color: $color-front
|
||||
padding: 0.1em 0.5em
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
background: rgba($color-subtle-light, 0.35)
|
||||
|
||||
&.c-table__row--foot
|
||||
background: rgba($color-theme, 0.025)
|
||||
background: $color-theme-light
|
||||
border-top: 2px solid $color-theme
|
||||
|
||||
.c-table__cell:first-child
|
||||
|
|
|
@ -11,9 +11,8 @@
|
|||
background: $color-front
|
||||
border-radius: 2px
|
||||
color: $color-back
|
||||
font-family: inherit
|
||||
font-size: 1.3rem
|
||||
line-height: 1.25
|
||||
font: normal 1.3rem/#{1.25} $font-primary
|
||||
text-transform: none
|
||||
opacity: 0
|
||||
padding: 0.5em 0.75em
|
||||
transform: translateX(-50%) translateY(-2px)
|
||||
|
|
|
@ -26,8 +26,7 @@ $font-code: 'Source Code Pro', Consolas, 'Andale Mono', Menlo, Monaco, Courier,
|
|||
|
||||
// Colors
|
||||
|
||||
$colors: ( blue: #09a3d5, red: #d9515d )
|
||||
$colors-light: (blue: #cceaf4, red: #f9d7da)
|
||||
$colors: ( blue: #09a3d5, red: #d9515d, green: #08c35e )
|
||||
|
||||
$color-back: #fff !default
|
||||
$color-front: #1a1e23 !default
|
||||
|
@ -35,7 +34,7 @@ $color-dark: lighten($color-front, 20) !default
|
|||
|
||||
$color-theme: map-get($colors, $theme)
|
||||
$color-theme-dark: darken(map-get($colors, $theme), 5)
|
||||
$color-theme-light: map-get($colors-light, $theme)
|
||||
$color-theme-light: rgba($color-theme, 0.05)
|
||||
|
||||
$color-subtle: #ddd !default
|
||||
$color-subtle-light: #f6f6f6 !default
|
||||
|
|
4	website/assets/css/style_green.sass (new file)
|
@ -0,0 +1,4 @@
|
|||
//- 💫 STYLESHEET (GREEN)
|
||||
|
||||
$theme: green
|
||||
@import style
|
|
@ -30,5 +30,11 @@
|
|||
<symbol id="help" viewBox="0 0 24 24">
|
||||
<path d="M12 6c2.203 0 3.984 1.781 3.984 3.984 0 2.484-3 2.766-3 5.016h-1.969c0-3.234 3-3 3-5.016 0-1.078-0.938-1.969-2.016-1.969s-2.016 0.891-2.016 1.969h-1.969c0-2.203 1.781-3.984 3.984-3.984zM12 20.016c4.406 0 8.016-3.609 8.016-8.016s-3.609-8.016-8.016-8.016-8.016 3.609-8.016 8.016 3.609 8.016 8.016 8.016zM12 2.016c5.531 0 9.984 4.453 9.984 9.984s-4.453 9.984-9.984 9.984-9.984-4.453-9.984-9.984 4.453-9.984 9.984-9.984zM11.016 18v-2.016h1.969v2.016h-1.969z"/>
|
||||
</symbol>
|
||||
<symbol id="reject" viewBox="0 0 24 24">
|
||||
<path d="M18.984 6.422l-5.578 5.578 5.578 5.578-1.406 1.406-5.578-5.578-5.578 5.578-1.406-1.406 5.578-5.578-5.578-5.578 1.406-1.406 5.578 5.578 5.578-5.578z"/>
|
||||
</symbol>
|
||||
<symbol id="accept" viewBox="0 0 24 24">
|
||||
<path d="M9 16.172l10.594-10.594 1.406 1.406-12 12-5.578-5.578 1.406-1.406z"/>
|
||||
</symbol>
|
||||
</defs>
|
||||
</svg>
|
||||
|
|
(SVG image: 5.4 KiB before, 5.8 KiB after)
BIN	website/assets/img/pattern_green.jpg (new file, 216 KiB; binary file not shown)
|
@ -2,8 +2,13 @@
|
|||
"sidebar": {
|
||||
"Introduction": {
|
||||
"Facts & Figures": "./",
|
||||
"Languages": "language-models",
|
||||
"Philosophy": "philosophy"
|
||||
"Languages": "language-models"
|
||||
},
|
||||
"Top-level": {
|
||||
"spacy": "spacy",
|
||||
"displacy": "displacy",
|
||||
"Utility Functions": "util",
|
||||
"Command line": "cli"
|
||||
},
|
||||
"Classes": {
|
||||
"Doc": "doc",
|
||||
|
@ -21,9 +26,6 @@
|
|||
"GoldParse": "goldparse"
|
||||
},
|
||||
"Other": {
|
||||
"Command line": "cli",
|
||||
"displaCy": "displacy",
|
||||
"Utility Functions": "util",
|
||||
"Annotation Specs": "annotation",
|
||||
"Feature Scheme": "features"
|
||||
}
|
||||
|
@ -43,6 +45,26 @@
|
|||
"title": "Philosophy"
|
||||
},
|
||||
|
||||
"spacy": {
|
||||
"title": "spaCy top-level functions",
|
||||
"next": "displacy"
|
||||
},
|
||||
|
||||
"displacy": {
|
||||
"title": "displaCy",
|
||||
"tag": "module",
|
||||
"next": "util"
|
||||
},
|
||||
|
||||
"util": {
|
||||
"title": "Utility Functions",
|
||||
"next": "cli"
|
||||
},
|
||||
|
||||
"cli": {
|
||||
"title": "Command Line Interface"
|
||||
},
|
||||
|
||||
"language": {
|
||||
"title": "Language",
|
||||
"tag": "class"
|
||||
|
@ -113,20 +135,6 @@
|
|||
"tag": "class"
|
||||
},
|
||||
|
||||
"cli": {
|
||||
"title": "Command Line Interface",
|
||||
"next": "displacy"
|
||||
},
|
||||
|
||||
"displacy": {
|
||||
"title": "displaCy",
|
||||
"tag": "module"
|
||||
},
|
||||
|
||||
"util": {
|
||||
"title": "Utility Functions"
|
||||
},
|
||||
|
||||
"annotation": {
|
||||
"title": "Annotation Specifications"
|
||||
},
|
||||
|
|
|
@ -71,6 +71,44 @@ include _annotation/_dep-labels
|
|||
|
||||
include _annotation/_named-entities
|
||||
|
||||
+h(3, "biluo") BILUO Scheme
|
||||
|
||||
p
|
||||
| spaCy translates character offsets into the BILUO scheme, in order to
|
||||
| decide the cost of each action given the current state of the entity
|
||||
| recognizer. The costs are then used to calculate the gradient of the
|
||||
| loss, to train the model.
|
||||
|
||||
+aside("Why BILUO, not IOB?")
|
||||
| There are several coding schemes for encoding entity annotations as
|
||||
| token tags. These coding schemes are equally expressive, but not
|
||||
| necessarily equally learnable.
|
||||
| #[+a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth]
|
||||
| showed that the minimal #[strong Begin], #[strong In], #[strong Out]
|
||||
| scheme was more difficult to learn than the #[strong BILUO] scheme that
|
||||
| we use, which explicitly marks boundary tokens.
|
||||
|
||||
+table([ "Tag", "Description" ])
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme B] EGIN]
|
||||
+cell The first token of a multi-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme I] N]
|
||||
+cell An inner token of a multi-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme L] AST]
|
||||
+cell The final token of a multi-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme U] NIT]
|
||||
+cell A single-token entity.
|
||||
|
||||
+row
|
||||
+cell #[code #[span.u-color-theme O] UT]
|
||||
+cell A non-entity token.
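To make the scheme concrete, here is how one sentence's entities would be encoded as per-token BILUO tags (a plain-Python illustration, independent of any particular spaCy API):

    # "I like London and Berlin."  ->  one tag per token
    tokens = ['I', 'like', 'London', 'and', 'Berlin', '.']
    biluo  = ['O', 'O',    'U-GPE',  'O',   'U-GPE',  'O']
    # a multi-token entity such as "New York City" would instead be tagged
    # ['B-GPE', 'I-GPE', 'L-GPE']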
|
||||
|
||||
+h(2, "json-input") JSON input format for training
|
||||
|
||||
p
|
||||
|
|
|
@ -10,11 +10,11 @@ p
|
|||
+aside("Why python -m?")
|
||||
| The problem with a global entry point is that it's resolved by looking up
|
||||
| entries in your #[code PATH] environment variable. This can give you
|
||||
| unexpected results, especially when using #[code virtualenv]. For
|
||||
| instance, you may have spaCy installed on your system but not in your
|
||||
| current environment. The command will then execute the wrong
|
||||
| spaCy installation. #[code python -m] prevents fallbacks to system modules
|
||||
| and makes sure the correct version of spaCy is used.
|
||||
| unexpected results, like executing the wrong spaCy installation
|
||||
| (especially when using #[code virtualenv]). #[code python -m] prevents
|
||||
| fallbacks to system modules and makes sure the correct spaCy version is
|
||||
| used. If you hate typing it every time, we recommend creating an
|
||||
| #[code alias] instead.
|
||||
|
||||
+h(2, "download") Download
|
||||
|
||||
|
@ -45,13 +45,24 @@ p
|
|||
+cell flag
|
||||
+cell Show help message and available arguments.
|
||||
|
||||
+infobox("Important note")
|
||||
| The #[code download] command is mostly intended as a convenient,
|
||||
| interactive wrapper – it performs compatibility checks and prints
|
||||
| detailed messages in case things go wrong. It's #[strong not recommended]
|
||||
| to use this command as part of an automated process. If you know which
|
||||
| model your project needs, you should consider a
|
||||
| #[+a("/docs/usage/models#download-pip") direct download via pip], or
|
||||
| uploading the model to a local PyPi installation and fetching it straight
|
||||
| from there. This will also allow you to add it as a versioned package
|
||||
| dependency to your project.
|
||||
|
||||
+h(2, "link") Link
|
||||
|
||||
p
|
||||
| Create a #[+a("/docs/usage/models#usage") shortcut link] for a model,
|
||||
| either a Python package or a local directory. This will let you load
|
||||
| models from any location via #[code spacy.load()].
|
||||
| models from any location using a custom name via
|
||||
| #[+api("spacy#load") #[code spacy.load()]].
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy link [origin] [link_name] [--force]
|
||||
|
@ -92,7 +103,7 @@ p
|
|||
+row
|
||||
+cell #[code model]
|
||||
+cell positional
|
||||
+cell Shortcut link of model (optional).
|
||||
+cell A model, i.e. shortcut link, package name or path (optional).
|
||||
|
||||
+row
|
||||
+cell #[code --markdown], #[code -md]
|
||||
|
@ -114,7 +125,7 @@ p
|
|||
| the input file. Currently only supports #[code .conllu].
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy convert [input_file] [output_dir] [--n_sents] [--morphology]
|
||||
python -m spacy convert [input_file] [output_dir] [--n-sents] [--morphology]
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
|
@ -128,7 +139,7 @@ p
|
|||
+cell Output directory for converted JSON file.
|
||||
|
||||
+row
|
||||
+cell #[code --n_sents], #[code -n]
|
||||
+cell #[code --n-sents], #[code -n]
|
||||
+cell option
|
||||
+cell Number of sentences per document.
|
||||
|
||||
|
@ -191,7 +202,7 @@ p
|
|||
| #[+a("/docs/api/annotation#json-input") JSON format].
|
||||
|
||||
+code(false, "bash").
|
||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner]
|
||||
python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n-iter] [--parser-L1] [--no-tagger] [--no-parser] [--no-ner]
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
|
@ -215,27 +226,37 @@ p
|
|||
+cell Location of JSON-formatted dev data (optional).
|
||||
|
||||
+row
|
||||
+cell #[code --n_iter], #[code -n]
|
||||
+cell #[code --n-iter], #[code -n]
|
||||
+cell option
|
||||
+cell Number of iterations (default: #[code 15]).
|
||||
|
||||
+row
|
||||
+cell #[code --parser_L1], #[code -L]
|
||||
+cell #[code --nsents]
|
||||
+cell option
|
||||
+cell Number of sentences (default: #[code 0]).
|
||||
|
||||
+row
|
||||
+cell #[code --parser-L1], #[code -L]
|
||||
+cell option
|
||||
+cell L1 regularization penalty for parser (default: #[code 0.0]).
|
||||
|
||||
+row
|
||||
+cell #[code --no_tagger], #[code -T]
|
||||
+cell #[code --use-gpu], #[code -g]
|
||||
+cell flag
|
||||
+cell Use GPU.
|
||||
|
||||
+row
|
||||
+cell #[code --no-tagger], #[code -T]
|
||||
+cell flag
|
||||
+cell Don't train tagger.
|
||||
|
||||
+row
|
||||
+cell #[code --no_parser], #[code -P]
|
||||
+cell #[code --no-parser], #[code -P]
|
||||
+cell flag
|
||||
+cell Don't train parser.
|
||||
|
||||
+row
|
||||
+cell #[code --no_ner], #[code -N]
|
||||
+cell #[code --no-ner], #[code -N]
|
||||
+cell flag
|
||||
+cell Don't train NER.
|
||||
|
||||
|
|
|
@ -4,32 +4,6 @@ include ../../_includes/_mixins
|
|||
|
||||
p Annotate syntactic dependencies on #[code Doc] objects.
|
||||
|
||||
+h(2, "load") DependencyParser.load
|
||||
+tag classmethod
|
||||
|
||||
p Load the statistical model from the supplied path.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell #[code Path]
|
||||
+cell The path to load from.
|
||||
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell The vocabulary. Must be shared by the documents to be processed.
|
||||
|
||||
+row
|
||||
+cell #[code require]
|
||||
+cell bool
|
||||
+cell Whether to raise an error if the files are not found.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code DependencyParser]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "init") DependencyParser.__init__
|
||||
+tag method
|
||||
|
||||
|
@ -47,7 +21,7 @@ p Create a #[code DependencyParser].
|
|||
+cell The statistical model.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code DependencyParser]
|
||||
+cell The newly constructed object.
|
||||
|
||||
|
@ -65,7 +39,7 @@ p
|
|||
+cell The document to be processed.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
|
@ -93,7 +67,7 @@ p Process a stream of documents.
|
|||
| parallel.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell Documents, in order.
|
||||
|
||||
|
@ -114,7 +88,7 @@ p Update the statistical model.
|
|||
+cell The gold-standard annotations, to calculate the loss.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The loss on this example.
|
||||
|
||||
|
@ -130,6 +104,6 @@ p Set up a stepwise state, to introspect and control the transition sequence.
|
|||
+cell The document to step through.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code StepwiseState]
|
||||
+cell A state object, to step through the annotation process.
|
||||
|
|
|
@ -8,7 +8,7 @@ p
|
|||
| #[+a("/docs/usage/visualizers") visualizing spaCy].
|
||||
|
||||
|
||||
+h(2, "serve") serve
|
||||
+h(2, "serve") displacy.serve
|
||||
+tag method
|
||||
|
||||
p
|
||||
|
@ -60,7 +60,7 @@ p
|
|||
+cell Port to serve visualization.
|
||||
+cell #[code 5000]
|
||||
|
||||
+h(2, "render") render
|
||||
+h(2, "render") displacy.render
|
||||
+tag method
|
||||
|
||||
p Render a dependency parse tree or named entity visualization.
|
||||
|
@ -112,7 +112,7 @@ p Render a dependency parse tree or named entity visualization.
|
|||
+cell #[code {}]
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell unicode
|
||||
+cell Rendered HTML markup.
|
||||
+cell
|
||||
|
@ -218,7 +218,7 @@ p
|
|||
+cell #[code colors]
|
||||
+cell dict
|
||||
+cell
|
||||
| Color overrides. Entity types in lowercase should be mapped to
|
||||
| Color overrides. Entity types in uppercase should be mapped to
|
||||
| color names or values.
|
||||
+cell #[code {}]
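A short sketch of the colors option described above, assuming nlp is a loaded model with an entity recognizer; the hex value is arbitrary:

    from spacy import displacy
    doc = nlp(u'Google was founded in September 1998.')
    colors = {'ORG': '#aa9cfc'}   # entity types in uppercase
    html = displacy.render(doc, style='ent', options={'colors': colors})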
|
||||
|
||||
|
|
|
@ -4,9 +4,508 @@ include ../../_includes/_mixins
|
|||
|
||||
p A container for accessing linguistic annotations.
|
||||
|
||||
p
|
||||
| A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects.
|
||||
| Access sentences and named entities, export annotations to numpy arrays,
|
||||
| losslessly serialize to compressed binary strings. The #[code Doc] object
|
||||
| holds an array of #[code TokenC] structs. The Python-level #[code Token]
|
||||
| and #[+api("span") #[code Span]] objects are views of this array, i.e.
|
||||
| they don't own the data themselves.
|
||||
|
||||
+aside-code("Example").
|
||||
# Construction 1
|
||||
doc = nlp(u'Some text')
|
||||
|
||||
# Construction 2
|
||||
from spacy.tokens import Doc
|
||||
doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
|
||||
spaces=[True, False, False])
|
||||
|
||||
+h(2, "init") Doc.__init__
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Construct a #[code Doc] object. The most common way to get a #[code Doc]
|
||||
| object is via the #[code nlp] object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code words]
|
||||
+cell -
|
||||
+cell A list of strings to add to the container.
|
||||
|
||||
+row
|
||||
+cell #[code spaces]
|
||||
+cell -
|
||||
+cell
|
||||
| A list of boolean values indicating whether each word has a
|
||||
| subsequent space. Must have the same length as #[code words], if
|
||||
| specified. Defaults to a sequence of #[code True].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "getitem") Doc.__getitem__
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Get a #[+api("token") #[code Token]] object at position #[code i], where
|
||||
| #[code i] is an integer. Negative indexing is supported, and follows the
|
||||
| usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]].
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
assert doc[0].text == 'Give'
|
||||
assert doc[-1].text == '.'
|
||||
span = doc[1:3]
|
||||
assert span.text == 'it back'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell The index of the token.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell The token at #[code doc[i]].
|
||||
|
||||
p
|
||||
| Get a #[+api("span") #[code Span]] object, starting at position
|
||||
| #[code start] (token index) and ending at position #[code end] (token
|
||||
| index).
|
||||
|
||||
p
|
||||
| For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3
|
||||
| and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not
|
||||
| supported, as #[code Span] objects must be contiguous (cannot have gaps).
|
||||
| You can use negative indices and open-ended ranges, which have their
|
||||
| normal Python semantics.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_end]
|
||||
+cell tuple
|
||||
+cell The slice of the document to get.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Span]
|
||||
+cell The span at #[code doc[start : end]].
|
||||
|
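p
| As a rough sketch of the slice semantics described above (assuming an
| #[code nlp] object with the English model is available), negative indices
| and open-ended ranges behave like standard Python slices:

+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[2:].text == 'back! He pleaded.'   # open-ended range
assert doc[:-2].text == 'Give it back! He'   # negative end index
# stepped slices, e.g. doc[0:4:2], are not supported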
||||
+h(2, "iter") Doc.__iter__
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Iterate over #[code Token] objects, from which the annotations can be
|
||||
| easily accessed.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back')
|
||||
assert [t.text for t in doc] == [u'Give', u'it', u'back']
|
||||
|
||||
p
|
||||
| This is the main way of accessing #[+api("token") #[code Token]] objects,
|
||||
| which are the main way annotations are accessed from Python. If
|
||||
| faster-than-Python speeds are required, you can instead access the
|
||||
| annotations as a numpy array, or access the underlying C data directly
|
||||
| from Cython.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A #[code Token] object.
|
||||
|
||||
+h(2, "len") Doc.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of tokens in the document.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
assert len(doc) == 7
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of tokens in the document.
|
||||
|
||||
+h(2, "similarity") Doc.similarity
|
||||
+tag method
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| Make a semantic similarity estimate. The default estimate is cosine
|
||||
| similarity using an average of word vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
apples = nlp(u'I like apples')
|
||||
oranges = nlp(u'I like oranges')
|
||||
apples_oranges = apples.similarity(oranges)
|
||||
oranges_apples = oranges.similarity(apples)
|
||||
assert apples_oranges == oranges_apples
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code other]
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "count_by") Doc.count_by
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Count the frequencies of a given attribute. Produces a dict of
|
||||
| #[code {attr (int): count (ints)}] frequencies, keyed by the values
|
||||
| of the given attribute ID.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import ORTH
|
||||
doc = nlp(u'apple apple orange banana')
|
||||
assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
|
||||
doc.to_array([ORTH])
|
||||
# array([[11880], [11880], [7561], [12800]])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_id]
|
||||
+cell int
|
||||
+cell The attribute ID.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell dict
|
||||
+cell A dictionary mapping attributes to integer counts.
|
||||
|
||||
+h(2, "to_array") Doc.to_array
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Export the document annotations to a numpy array of shape #[code N*M]
|
||||
| where #[code N] is the length of the document and #[code M] is the number
|
||||
| of attribute IDs to export. The values will be 32-bit integers.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
|
||||
doc = nlp(text)
|
||||
# All strings mapped to integers, for easy export to numpy
|
||||
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell ints
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell
|
||||
| The exported attributes as a 2D numpy array, with one row per
|
||||
| token and one column per attribute.
|
||||
|
||||
+h(2, "from_array") Doc.from_array
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Load attributes from a numpy array. Write to a #[code Doc] object, from
|
||||
| an #[code (M, N)] array of attributes.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
|
||||
from spacy.tokens import Doc
|
||||
doc = nlp(text)
|
||||
np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||
doc2 = Doc(doc.vocab)
|
||||
doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
|
||||
assert doc.text == doc2.text
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attrs]
|
||||
+cell ints
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+row
|
||||
+cell #[code array]
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell The attribute values to load.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell Itself.
|
||||
|
||||
+h(2, "to_bytes") Doc.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize, i.e. export the document contents to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
doc_bytes = doc.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell
|
||||
| A losslessly serialized copy of the #[code Doc], including all
|
||||
| annotations.
|
||||
|
||||
+h(2, "from_bytes") Doc.from_bytes
|
||||
+tag method
|
||||
|
||||
p Deserialize, i.e. import the document contents from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens import Doc
|
||||
text = u'Give it back! He pleaded.'
|
||||
doc = nlp(text)
|
||||
bytes = doc.to_bytes()
|
||||
doc2 = Doc(doc.vocab).from_bytes(bytes)
|
||||
assert doc.text == doc2.text
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code data]
|
||||
+cell bytes
|
||||
+cell The string to load from.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell The #[code Doc] object.
|
||||
|
||||
+h(2, "merge") Doc.merge
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Retokenize the document, such that the span at
|
||||
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
|
||||
| #[code start_idx] and #[code end_idx] do not mark start and end token
|
||||
| boundaries, the document remains unchanged.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Los Angeles start.')
|
||||
doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE')
|
||||
assert [t.text for t in doc] == [u'Los Angeles', u'start', u'.']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_idx]
|
||||
+cell int
|
||||
+cell The character index of the start of the slice to merge.
|
||||
|
||||
+row
|
||||
+cell #[code end_idx]
|
||||
+cell int
|
||||
+cell The character index after the end of the slice to merge.
|
||||
|
||||
+row
|
||||
+cell #[code **attributes]
|
||||
+cell -
|
||||
+cell
|
||||
| Attributes to assign to the merged token. By default,
|
||||
| attributes are inherited from the syntactic root token of
|
||||
| the span.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell
|
||||
| The newly merged token, or #[code None] if the start and end
|
||||
| indices did not fall at token boundaries.
|
||||
|
||||
+h(2, "print_tree") Doc.print_tree
|
||||
+tag method
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| Returns the parse trees in JSON (dict) format. Especially useful for
|
||||
| web applications.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp('Alice ate the pizza.')
|
||||
trees = doc.print_tree()
|
||||
# {'modifiers': [
|
||||
# {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
|
||||
# {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'},
|
||||
# {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}
|
||||
# ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code light]
|
||||
+cell bool
|
||||
+cell Don't include lemmas or entities.
|
||||
|
||||
+row
|
||||
+cell #[code flat]
|
||||
+cell bool
|
||||
+cell Don't include arcs or modifiers.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell dict
|
||||
+cell Parse tree as dict.
|
||||
|
||||
+h(2, "ents") Doc.ents
|
||||
+tag property
|
||||
+tag-model("NER")
|
||||
|
||||
p
|
||||
| Iterate over the entities in the document. Yields named-entity
|
||||
| #[code Span] objects, if the entity recognizer has been applied to the
|
||||
| document.
|
||||
|
||||
+aside-code("Example").
|
||||
tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
|
||||
ents = list(tokens.ents)
|
||||
assert ents[0].label == 346
|
||||
assert ents[0].label_ == 'PERSON'
|
||||
assert ents[0].text == 'Mr. Best'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Span]
|
||||
+cell Entities in the document.
|
||||
|
||||
+h(2, "noun_chunks") Doc.noun_chunks
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| Iterate over the base noun phrases in the document. Yields base
|
||||
| noun-phrase #[code Span] objects, if the document has been syntactically
|
||||
| parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not
|
||||
| permit other NPs to be nested within it – so no NP-level coordination, no
|
||||
| prepositional phrases, and no relative clauses.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'A phrase with another phrase occurs.')
|
||||
chunks = list(doc.noun_chunks)
|
||||
assert chunks[0].text == "A phrase"
|
||||
assert chunks[1].text == "another phrase"
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Span]
|
||||
+cell Noun chunks in the document.
|
||||
|
||||
+h(2, "sents") Doc.sents
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| Iterate over the sentences in the document. Sentence spans have no label.
|
||||
| To improve accuracy on informal texts, spaCy calculates sentence boundaries
|
||||
| from the syntactic dependency parse. If the parser is disabled,
|
||||
| the #[code sents] iterator will be unavailable.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u"This is a sentence. Here's another...")
|
||||
sents = list(doc.sents)
|
||||
assert len(sents) == 2
|
||||
assert [s.root.text for s in sents] == ["is", "'s"]
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Span]
|
||||
+cell Sentences in the document.
|
||||
|
||||
+h(2, "has_vector") Doc.has_vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
assert doc.has_vector
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the document has a vector data attached.
|
||||
|
||||
+h(2, "vector") Doc.vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| A real-valued meaning representation. Defaults to an average of the
|
||||
| token vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
assert doc.vector.dtype == 'float32'
|
||||
assert doc.vector.shape == (300,)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the document's semantics.
|
||||
|
||||
+h(2, "vector_norm") Doc.vector_norm
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| The L2 norm of the document's vector representation.
|
||||
|
||||
+aside-code("Example").
|
||||
doc1 = nlp(u'I like apples')
|
||||
doc2 = nlp(u'I like oranges')
|
||||
doc1.vector_norm # 4.54232424414368
|
||||
doc2.vector_norm # 3.304373298575751
|
||||
assert doc1.vector_norm != doc2.vector_norm
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell The L2 norm of the vector representation.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell A unicode representation of the document text.
|
||||
|
||||
+row
|
||||
+cell #[code text_with_ws]
|
||||
+cell unicode
|
||||
+cell
|
||||
| An alias of #[code Doc.text], provided for duck-type compatibility
|
||||
| with #[code Span] and #[code Token].
|
||||
|
||||
+row
|
||||
+cell #[code mem]
|
||||
+cell #[code Pool]
|
||||
|
@ -17,6 +516,11 @@ p A container for accessing linguistic annotations.
|
|||
+cell #[code Vocab]
|
||||
+cell The store of lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code tensor]
|
||||
+cell object
|
||||
+cell Container for dense vector representations.
|
||||
|
||||
+row
|
||||
+cell #[code user_data]
|
||||
+cell -
|
||||
|
@ -59,358 +563,3 @@ p A container for accessing linguistic annotations.
|
|||
+cell
|
||||
| A dictionary that allows customisation of properties of
|
||||
| #[code Span] children.
|
||||
|
||||
+h(2, "init") Doc.__init__
|
||||
+tag method
|
||||
|
||||
p Construct a #[code Doc] object.
|
||||
|
||||
+aside("Note")
|
||||
| The most common way to get a #[code Doc] object is via the #[code nlp]
|
||||
| object. This method is usually only used for deserialization or preset
|
||||
| tokenization.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code words]
|
||||
+cell -
|
||||
+cell A list of strings to add to the container.
|
||||
|
||||
+row
|
||||
+cell #[code spaces]
|
||||
+cell -
|
||||
+cell
|
||||
| A list of boolean values indicating whether each word has a
|
||||
| subsequent space. Must have the same length as #[code words], if
|
||||
| specified. Defaults to a sequence of #[code True].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Doc]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "getitem") Doc.__getitem__
|
||||
+tag method
|
||||
|
||||
p Get a #[code Token] object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
assert doc[0].text == 'Give'
|
||||
assert doc[-1].text == '.'
|
||||
span = doc[1:3]
|
||||
assert span.text == 'it back'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell The index of the token.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The token at #[code doc[i]].
|
||||
|
||||
p Get a #[code Span] object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_end]
|
||||
+cell tuple
|
||||
+cell The slice of the document to get.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Span]
|
||||
+cell The span at #[code doc[start : end]].
|
||||
|
||||
+h(2, "iter") Doc.__iter__
|
||||
+tag method
|
||||
|
||||
p Iterate over #[code Token] objects.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Token]
|
||||
+cell A #[code Token] object.
|
||||
|
||||
+h(2, "len") Doc.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of tokens in the document.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int
|
||||
+cell The number of tokens in the document.
|
||||
|
||||
+h(2, "similarity") Doc.similarity
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Make a semantic similarity estimate. The default estimate is cosine
|
||||
| similarity using an average of word vectors.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code other]
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "to_array") Doc.to_array
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Export the document annotations to a numpy array of shape #[code N*M]
|
||||
| where #[code N] is the length of the document and #[code M] is the number
|
||||
| of attribute IDs to export. The values will be 32-bit integers.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy import attrs
|
||||
doc = nlp(text)
|
||||
# All strings mapped to integers, for easy export to numpy
|
||||
np_array = doc.to_array([attrs.LOWER, attrs.POS,
|
||||
attrs.ENT_TYPE, attrs.IS_ALPHA])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell ints
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell
|
||||
| The exported attributes as a 2D numpy array, with one row per
|
||||
| token and one column per attribute.
|
||||
|
||||
+h(2, "count_by") Doc.count_by
|
||||
+tag method
|
||||
|
||||
p Count the frequencies of a given attribute.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_id]
|
||||
+cell int
|
||||
+cell The attribute ID
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell dict
|
||||
+cell A dictionary mapping attributes to integer counts.
|
||||
|
||||
+h(2, "from_array") Doc.from_array
|
||||
+tag method
|
||||
|
||||
p Load attributes from a numpy array.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code attr_ids]
|
||||
+cell ints
|
||||
+cell A list of attribute ID ints.
|
||||
|
||||
+row
|
||||
+cell #[code values]
|
||||
+cell #[code numpy.ndarray[ndim=2, dtype='int32']]
|
||||
+cell The attribute values to load.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "to_bytes") Doc.to_bytes
|
||||
+tag method
|
||||
|
||||
p Export the document contents to a binary string.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bytes
|
||||
+cell
|
||||
| A losslessly serialized copy of the #[code Doc] including all
|
||||
| annotations.
|
||||
|
||||
+h(2, "from_bytes") Doc.from_bytes
|
||||
+tag method
|
||||
|
||||
p Import the document contents from a binary string.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code byte_string]
|
||||
+cell bytes
|
||||
+cell The string to load from.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Doc]
|
||||
+cell The #[code self] variable.
|
||||
|
||||
+h(2, "merge") Doc.merge
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Retokenize the document, such that the span at
|
||||
| #[code doc.text[start_idx : end_idx]] is merged into a single token. If
|
||||
| #[code start_idx] and #[code end_idx] do not mark start and end token
|
||||
| boundaries, the document remains unchanged.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_idx]
|
||||
+cell int
|
||||
+cell The character index of the start of the slice to merge.
|
||||
|
||||
+row
|
||||
+cell #[code end_idx]
|
||||
+cell int
|
||||
+cell The character index after the end of the slice to merge.
|
||||
|
||||
+row
|
||||
+cell #[code **attributes]
|
||||
+cell -
|
||||
+cell
|
||||
| Attributes to assign to the merged token. By default,
|
||||
| attributes are inherited from the syntactic root token of
|
||||
| the span.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell
|
||||
| The newly merged token, or None if the start and end
|
||||
| indices did not fall at token boundaries.
|
||||
|
||||
+h(2, "read_bytes") Doc.read_bytes
|
||||
+tag staticmethod
|
||||
|
||||
p A static method, used to read serialized #[code Doc] objects from a file.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokens.doc import Doc
|
||||
loc = 'test_serialize.bin'
|
||||
with open(loc, 'wb') as file_:
|
||||
file_.write(nlp(u'This is a document.').to_bytes())
|
||||
file_.write(nlp(u'This is another.').to_bytes())
|
||||
docs = []
|
||||
with open(loc, 'rb') as file_:
|
||||
for byte_string in Doc.read_bytes(file_):
|
||||
docs.append(Doc(nlp.vocab).from_bytes(byte_string))
|
||||
assert len(docs) == 2
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell file
|
||||
+cell buffer
|
||||
+cell A binary buffer to read the serialized annotations from.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell bytes
|
||||
+cell Binary strings from which documents can be loaded.
|
||||
|
||||
+h(2, "text") Doc.text
|
||||
+tag property
|
||||
|
||||
p A unicode representation of the document text.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode
|
||||
+cell The original verbatim text of the document.
|
||||
|
||||
+h(2, "text_with_ws") Doc.text_with_ws
|
||||
+tag property
|
||||
|
||||
p
|
||||
| An alias of #[code Doc.text], provided for duck-type compatibility with
|
||||
| #[code Span] and #[code Token].
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode
|
||||
+cell The original verbatim text of the document.
|
||||
|
||||
+h(2, "sents") Doc.sents
|
||||
+tag property
|
||||
|
||||
p Iterate over the sentences in the document.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Span]
|
||||
+cell Sentences in the document.
|
||||
|
||||
+h(2, "ents") Doc.ents
|
||||
+tag property
|
||||
|
||||
p Iterate over the entities in the document.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Span]
|
||||
+cell Entities in the document.
|
||||
|
||||
+h(2, "noun_chunks") Doc.noun_chunks
|
||||
+tag property
|
||||
|
||||
p
|
||||
| Iterate over the base noun phrases in the document. A base noun phrase,
|
||||
| or "NP chunk", is a noun phrase that does not permit other NPs to be
|
||||
| nested within it.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Span]
|
||||
+cell Noun chunks in the document
|
||||
|
||||
+h(2, "vector") Doc.vector
|
||||
+tag property
|
||||
|
||||
p
|
||||
| A real-valued meaning representation. Defaults to an average of the
|
||||
| token vectors.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the document's semantics.
|
||||
|
||||
+h(2, "has_vector") Doc.has_vector
|
||||
+tag property
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether the document has a vector data attached.
|
||||
|
|
|
@ -4,32 +4,6 @@ include ../../_includes/_mixins
|
|||
|
||||
p Annotate named entities on #[code Doc] objects.
|
||||
|
||||
+h(2, "load") EntityRecognizer.load
|
||||
+tag classmethod
|
||||
|
||||
p Load the statistical model from the supplied path.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell #[code Path]
|
||||
+cell The path to load from.
|
||||
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell The vocabulary. Must be shared by the documents to be processed.
|
||||
|
||||
+row
|
||||
+cell #[code require]
|
||||
+cell bool
|
||||
+cell Whether to raise an error if the files are not found.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code EntityRecognizer]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "init") EntityRecognizer.__init__
|
||||
+tag method
|
||||
|
||||
|
@ -47,7 +21,7 @@ p Create an #[code EntityRecognizer].
|
|||
+cell The statistical model.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code EntityRecognizer]
|
||||
+cell The newly constructed object.
|
||||
|
||||
|
@ -63,7 +37,7 @@ p Apply the entity recognizer, setting the NER tags onto the #[code Doc] object.
|
|||
+cell The document to be processed.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
|
@ -91,7 +65,7 @@ p Process a stream of documents.
|
|||
| parallel.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell Documents, in order.
|
||||
|
||||
|
@ -112,7 +86,7 @@ p Update the statistical model.
|
|||
+cell The gold-standard annotations, to calculate the loss.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The loss on this example.
|
||||
|
||||
|
@ -128,6 +102,6 @@ p Set up a stepwise state, to introspect and control the transition sequence.
|
|||
+cell The document to step through.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code StepwiseState]
|
||||
+cell A state object, to step through the annotation process.
|
||||
|
|
|
@ -4,6 +4,72 @@ include ../../_includes/_mixins
|
|||
|
||||
p Collection for training annotations.
|
||||
|
||||
+h(2, "init") GoldParse.__init__
|
||||
+tag method
|
||||
|
||||
p Create a GoldParse.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The document the annotations refer to.
|
||||
|
||||
+row
|
||||
+cell #[code words]
|
||||
+cell iterable
|
||||
+cell A sequence of unicode word strings.
|
||||
|
||||
+row
|
||||
+cell #[code tags]
|
||||
+cell iterable
|
||||
+cell A sequence of strings, representing tag annotations.
|
||||
|
||||
+row
|
||||
+cell #[code heads]
|
||||
+cell iterable
|
||||
+cell A sequence of integers, representing syntactic head offsets.
|
||||
|
||||
+row
|
||||
+cell #[code deps]
|
||||
+cell iterable
|
||||
+cell A sequence of strings, representing the syntactic relation types.
|
||||
|
||||
+row
|
||||
+cell #[code entities]
|
||||
+cell iterable
|
||||
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code GoldParse]
|
||||
+cell The newly constructed object.
|
||||
|
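p
| A minimal sketch of constructing a #[code GoldParse] from character
| offsets (the sentence, offsets and labels below are illustrative, and an
| #[code nlp] object is assumed to be available):

+aside-code("Example").
from spacy.gold import GoldParse
doc = nlp(u'Facebook released React in 2014.')
# (start_char, end_char, label) offsets into the original text
entities = [(0, 8, 'ORG'), (18, 23, 'PRODUCT'), (27, 31, 'DATE')]
gold = GoldParse(doc, entities=entities)
assert len(gold) == len(doc)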
||||
+h(2, "len") GoldParse.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of gold-standard tokens.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of gold-standard tokens.
|
||||
|
||||
+h(2, "is_projective") GoldParse.is_projective
|
||||
+tag property
|
||||
|
||||
p
|
||||
| Whether the provided syntactic annotations form a projective dependency
|
||||
| tree.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether annotations form projective tree.
|
||||
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
@ -37,67 +103,57 @@ p Collection for training annotations.
|
|||
+cell list
|
||||
+cell The alignment from gold tokenization to candidate tokenization.
|
||||
|
||||
+h(2, "init") GoldParse.__init__
|
||||
+tag method
|
||||
|
||||
p Create a GoldParse.
|
||||
+h(2, "util") Utilities
|
||||
|
||||
+h(3, "biluo_tags_from_offsets") gold.biluo_tags_from_offsets
|
||||
+tag function
|
||||
|
||||
p
|
||||
| Encode labelled spans into per-token tags, using the
|
||||
| #[+a("/docs/api/annotation#biluo") BILUO scheme] (Begin/In/Last/Unit/Out).
|
||||
|
||||
p
|
||||
| Returns a list of unicode strings, describing the tags. Each tag string
|
||||
| will be of the form either #[code ""], #[code "O"] or
|
||||
| #[code "{action}-{label}"], where action is one of #[code "B"],
|
||||
| #[code "I"], #[code "L"], #[code "U"]. The string #[code "-"]
|
||||
| is used where the entity offsets don't align with the tokenization in the
|
||||
| #[code Doc] object. The training algorithm will view these as missing
|
||||
| values. #[code O] denotes a non-entity token. #[code B] denotes the
|
||||
| beginning of a multi-token entity, #[code I] the inside of an entity
|
||||
| of three or more tokens, and #[code L] the end of an entity of two or
|
||||
| more tokens. #[code U] denotes a single-token entity.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.gold import biluo_tags_from_offsets
|
||||
text = 'I like London.'
|
||||
entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||
doc = tokenizer(text)
|
||||
tags = biluo_tags_from_offsets(doc, entities)
|
||||
assert tags == ['O', 'O', 'U-LOC', 'O']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The document the annotations refer to.
|
||||
|
||||
+row
|
||||
+cell #[code words]
|
||||
+cell -
|
||||
+cell A sequence of unicode word strings.
|
||||
|
||||
+row
|
||||
+cell #[code tags]
|
||||
+cell -
|
||||
+cell A sequence of strings, representing tag annotations.
|
||||
|
||||
+row
|
||||
+cell #[code heads]
|
||||
+cell -
|
||||
+cell A sequence of integers, representing syntactic head offsets.
|
||||
|
||||
+row
|
||||
+cell #[code deps]
|
||||
+cell -
|
||||
+cell A sequence of strings, representing the syntactic relation types.
|
||||
+cell
|
||||
| The document that the entity offsets refer to. The output tags
|
||||
| will refer to the token boundaries within the document.
|
||||
|
||||
+row
|
||||
+cell #[code entities]
|
||||
+cell -
|
||||
+cell A sequence of named entity annotations, either as BILUO tag strings, or as #[code (start_char, end_char, label)] tuples, representing the entity positions.
|
||||
+cell iterable
|
||||
+cell
|
||||
| A sequence of #[code (start, end, label)] triples. #[code start]
|
||||
| and #[code end] should be character-offset integers denoting the
|
||||
| slice into the original string.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code GoldParse]
|
||||
+cell The newly constructed object.
|
||||
+cell returns
|
||||
+cell list
|
||||
+cell
|
||||
| Unicode strings, describing the
|
||||
| #[+a("/docs/api/annotation#biluo") BILUO] tags.
|
||||
|
||||
+h(2, "len") GoldParse.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of gold-standard tokens.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int
|
||||
+cell The number of gold-standard tokens.
|
||||
|
||||
+h(2, "is_projective") GoldParse.is_projective
|
||||
+tag property
|
||||
|
||||
p
|
||||
| Whether the provided syntactic annotations form a projective dependency
|
||||
| tree.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether annotations form projective tree.
|
||||
|
|
|
@ -2,79 +2,69 @@
|
|||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p A text processing pipeline.
|
||||
p
|
||||
| A text-processing pipeline. Usually you'll load this once per process,
|
||||
| and pass the instance around your application.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
+h(2, "init") Language.__init__
|
||||
+tag method
|
||||
|
||||
p Initialise a #[code Language] object.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.language import Language
|
||||
nlp = Language(pipeline=['token_vectors', 'tags',
|
||||
'dependencies'])
|
||||
|
||||
from spacy.lang.en import English
|
||||
nlp = English()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A container for the lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code tokenizer]
|
||||
+cell #[code Tokenizer]
|
||||
+cell Find word boundaries and create #[code Doc] object.
|
||||
|
||||
+row
|
||||
+cell #[code tagger]
|
||||
+cell #[code Tagger]
|
||||
+cell Annotate #[code Doc] objects with POS tags.
|
||||
|
||||
+row
|
||||
+cell #[code parser]
|
||||
+cell #[code DependencyParser]
|
||||
+cell Annotate #[code Doc] objects with syntactic dependencies.
|
||||
|
||||
+row
|
||||
+cell #[code entity]
|
||||
+cell #[code EntityRecognizer]
|
||||
+cell Annotate #[code Doc] objects with named entities.
|
||||
|
||||
+row
|
||||
+cell #[code matcher]
|
||||
+cell #[code Matcher]
|
||||
+cell Rule-based sequence matcher.
|
||||
+cell
|
||||
| A #[code Vocab] object. If #[code True], a vocab is created via
|
||||
| #[code Language.Defaults.create_vocab].
|
||||
|
||||
+row
|
||||
+cell #[code make_doc]
|
||||
+cell #[code lambda text: Doc]
|
||||
+cell Create a #[code Doc] object from unicode text.
|
||||
+cell callable
|
||||
+cell
|
||||
| A function that takes text and returns a #[code Doc] object.
|
||||
| Usually a #[code Tokenizer].
|
||||
|
||||
+row
|
||||
+cell #[code pipeline]
|
||||
+cell -
|
||||
+cell Sequence of annotation functions.
|
||||
+cell list
|
||||
+cell
|
||||
| A list of annotation processes or IDs of annotation processes,
|
||||
| e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
|
||||
| up in #[code Language.Defaults.factories].
|
||||
|
||||
|
||||
+h(2, "init") Language.__init__
|
||||
+tag method
|
||||
|
||||
p Create or load the pipeline.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **overrides]
|
||||
+cell -
|
||||
+cell Keyword arguments indicating which defaults to override.
|
||||
+cell #[code meta]
|
||||
+cell dict
|
||||
+cell
|
||||
| Custom meta data for the #[code Language] class. Is written to by
|
||||
| models to add model meta data.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "call") Language.__call__
|
||||
+tag method
|
||||
|
||||
p Apply the pipeline to a single text.
|
||||
p
|
||||
| Apply the pipeline to some text. The text can span multiple sentences,
|
||||
| and can contain arbitrary whitespace. Alignment into the original string
|
||||
| is preserved.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.en import English
|
||||
nlp = English()
|
||||
doc = nlp('An example sentence. Another example sentence.')
|
||||
doc[0].orth_, doc[0].head.tag_
|
||||
# ('An', 'NN')
|
||||
doc = nlp(u'An example sentence. Another sentence.')
|
||||
assert (doc[0].text, doc[0].head.tag_) == ('An', 'NN')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -83,24 +73,104 @@ p Apply the pipeline to a single text.
|
|||
+cell The text to be processed.
|
||||
|
||||
+row
|
||||
+cell #[code tag]
|
||||
+cell bool
|
||||
+cell Whether to apply the part-of-speech tagger.
|
||||
|
||||
+row
|
||||
+cell #[code parse]
|
||||
+cell bool
|
||||
+cell Whether to apply the syntactic dependency parser.
|
||||
|
||||
+row
|
||||
+cell #[code entity]
|
||||
+cell bool
|
||||
+cell Whether to apply the named entity recognizer.
|
||||
+cell #[code **disabled]
|
||||
+cell -
|
||||
+cell Elements of the pipeline that should not be run.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell A container for accessing the linguistic annotations.
|
||||
+cell A container for accessing the annotations.
|
||||
|
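p
| A hedged sketch of skipping pipeline components via #[code **disabled]
| (the keyword name below is an assumption – components are switched off by
| name, and the exact names depend on the pipeline):

+aside-code("Example").
# hypothetical: skip the dependency parse for this document only
doc = nlp(u'An example sentence. Another sentence.', parse=False)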
||||
+h(2, "update") Language.update
|
||||
+tag method
|
||||
|
||||
p Update the models in the pipeline.
|
||||
|
||||
+aside-code("Example").
|
||||
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||
for epoch in trainer.epochs(gold):
|
||||
for docs, golds in epoch:
|
||||
state = nlp.update(docs, golds, sgd=optimizer)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code docs]
|
||||
+cell iterable
|
||||
+cell A batch of #[code Doc] objects.
|
||||
|
||||
+row
|
||||
+cell #[code golds]
|
||||
+cell iterable
|
||||
+cell A batch of #[code GoldParse] objects.
|
||||
|
||||
+row
|
||||
+cell #[code drop]
|
||||
+cell float
|
||||
+cell The dropout rate.
|
||||
|
||||
+row
|
||||
+cell #[code sgd]
|
||||
+cell callable
|
||||
+cell An optimizer.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell dict
|
||||
+cell Results from the update.
|
||||
|
||||
+h(2, "begin_training") Language.begin_training
|
||||
+tag contextmanager
|
||||
|
||||
p
|
||||
| Allocate models, pre-process training data and acquire a trainer and
|
||||
| optimizer. Used as a contextmanager.
|
||||
|
||||
+aside-code("Example").
|
||||
with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
|
||||
for epoch in trainer.epochs(gold):
|
||||
for docs, golds in epoch:
|
||||
state = nlp.update(docs, golds, sgd=optimizer)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code gold_tuples]
|
||||
+cell iterable
|
||||
+cell Gold-standard training data.
|
||||
|
||||
+row
|
||||
+cell #[code **cfg]
|
||||
+cell -
|
||||
+cell Config parameters.
|
||||
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell tuple
|
||||
+cell A trainer and an optimizer.
|
||||
|
||||
+h(2, "use_params") Language.use_params
|
||||
+tag contextmanager
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Replace weights of models in the pipeline with those provided in the
|
||||
| params dictionary. Can be used as a contextmanager, in which case, models
|
||||
| go back to their original weights after the block.
|
||||
|
||||
+aside-code("Example").
|
||||
with nlp.use_params(optimizer.averages):
|
||||
nlp.to_disk('/tmp/checkpoint')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code params]
|
||||
+cell dict
|
||||
+cell A dictionary of parameters keyed by model ID.
|
||||
|
||||
+row
|
||||
+cell #[code **cfg]
|
||||
+cell -
|
||||
+cell Config parameters.
|
||||
|
||||
+h(2, "pipe") Language.pipe
|
||||
+tag method
|
||||
|
@ -133,22 +203,142 @@ p
|
|||
+cell The number of texts to buffer.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell Containers for accessing the linguistic annotations.
|
||||
+cell Documents in the order of the original text.
|
||||
|
||||
+h(2, "save_to_directory") Language.save_to_directory
|
||||
+h(2, "to_disk") Language.to_disk
|
||||
+tag method
|
||||
|
||||
p Save the #[code Vocab], #[code StringStore] and pipeline to a directory.
|
||||
p Save the current state to a directory.
|
||||
|
||||
+aside-code("Example").
|
||||
nlp.to_disk('/path/to/models')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell string or pathlib path
|
||||
+cell Path to save the model.
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being saved.
|
||||
|
||||
+h(2, "from_disk") Language.from_disk
|
||||
+tag method
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.language import Language
|
||||
nlp = Language().from_disk('/path/to/models')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory. Paths may be either strings or
|
||||
| #[code Path]-like objects.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell The modified #[code Language] object.
|
||||
|
||||
+h(2, "to_bytes") Language.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
nlp_bytes = nlp.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code Language] object.
|
||||
|
||||
+h(2, "from_bytes") Language.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.lang.en import English
|
||||
nlp_bytes = nlp.to_bytes()
|
||||
nlp2 = English()
|
||||
nlp2.from_bytes(nlp_bytes)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code bytes_data]
|
||||
+cell bytes
|
||||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell The #[code Language] object.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A container for the lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code make_doc]
|
||||
+cell #[code lambda text: Doc]
|
||||
+cell Create a #[code Doc] object from unicode text.
|
||||
|
||||
+row
|
||||
+cell #[code pipeline]
|
||||
+cell list
|
||||
+cell Sequence of annotation functions.
|
||||
|
||||
+row
|
||||
+cell #[code meta]
|
||||
+cell dict
|
||||
+cell
|
||||
| Custom meta data for the Language class. If a model is loaded,
|
||||
| contains meta data of the model.
|
||||
|
||||
+h(2, "class-attributes") Class attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code Defaults]
|
||||
+cell class
|
||||
+cell
|
||||
| Settings, data and factory methods for creating the
|
||||
| #[code nlp] object and processing pipeline.
|
||||
|
||||
+row
|
||||
+cell #[code lang]
|
||||
+cell unicode
|
||||
+cell
|
||||
| Two-letter language ID, i.e.
|
||||
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code].
|
||||
|
|
|
@ -2,7 +2,154 @@
|
|||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p An entry in the vocabulary.
|
||||
p
|
||||
| An entry in the vocabulary. A #[code Lexeme] has no string context – it's
|
||||
| a word-type, as opposed to a word token. It therefore has no
|
||||
| part-of-speech tag, dependency parse, or lemma (if lemmatization depends
|
||||
| on the part-of-speech tag).
|
||||
|
||||
+h(2, "init") Lexeme.__init__
|
||||
+tag method
|
||||
|
||||
p Create a #[code Lexeme] object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell The parent vocabulary.
|
||||
|
||||
+row
|
||||
+cell #[code orth]
|
||||
+cell int
|
||||
+cell The orth id of the lexeme.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Lexeme]
|
||||
+cell The newly constructed object.
|
||||
|
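p
| In practice a #[code Lexeme] is usually retrieved from the vocabulary by
| string or ID rather than constructed directly – a minimal sketch,
| assuming an #[code nlp] object is available:

+aside-code("Example").
apple = nlp.vocab[u'apple']
assert apple.text == u'apple'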
||||
+h(2, "set_flag") Lexeme.set_flag
|
||||
+tag method
|
||||
|
||||
p Change the value of a boolean flag.
|
||||
|
||||
+aside-code("Example").
|
||||
COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
|
||||
nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code flag_id]
|
||||
+cell int
|
||||
+cell The attribute ID of the flag to set.
|
||||
|
||||
+row
|
||||
+cell #[code value]
|
||||
+cell bool
|
||||
+cell The new value of the flag.
|
||||
|
||||
+h(2, "check_flag") Lexeme.check_flag
|
||||
+tag method
|
||||
|
||||
p Check the value of a boolean flag.
|
||||
|
||||
+aside-code("Example").
|
||||
is_my_library = lambda text: text in ['spaCy', 'Thinc']
|
||||
MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
|
||||
assert nlp.vocab[u'spaCy'].check_flag(MY_LIBRARY) == True
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code flag_id]
|
||||
+cell int
|
||||
+cell The attribute ID of the flag to query.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell The value of the flag.
|
||||
|
||||
+h(2, "similarity") Lexeme.similarity
|
||||
+tag method
|
||||
+tag-model("vectors")
|
||||
|
||||
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp.vocab[u'apple']
|
||||
orange = nlp.vocab[u'orange']
|
||||
apple_orange = apple.similarity(orange)
|
||||
orange_apple = orange.similarity(apple)
|
||||
assert apple_orange == orange_apple
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell other
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
|
||||
+h(2, "has_vector") Lexeme.has_vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| lexeme.
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp.vocab[u'apple']
|
||||
assert apple.has_vector
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the lexeme has a vector data attached.
|
||||
|
||||
+h(2, "vector") Lexeme.vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p A real-valued meaning representation.
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp.vocab[u'apple']
|
||||
assert apple.vector.dtype == 'float32'
|
||||
assert apple.vector.shape == (300,)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the lexeme's semantics.
|
||||
|
||||
+h(2, "vector_norm") Lexeme.vector_norm
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p The L2 norm of the lexeme's vector representation.
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp.vocab[u'apple']
|
||||
pasta = nlp.vocab[u'pasta']
|
||||
apple.vector_norm # 7.1346845626831055
|
||||
pasta.vector_norm # 7.759851932525635
|
||||
assert apple.vector_norm != pasta.vector_norm
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell The L2 norm of the vector representation.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
|
@ -12,6 +159,16 @@ p An entry in the vocabulary.
|
|||
+cell #[code Vocab]
|
||||
+cell
|
||||
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell Verbatim text content.
|
||||
|
||||
+row
|
||||
+cell #[code lex_id]
|
||||
+cell int
|
||||
+cell ID of the lexeme's lexical type.
|
||||
|
||||
+row
|
||||
+cell #[code lower]
|
||||
+cell int
|
||||
|
@ -124,116 +281,9 @@ p An entry in the vocabulary.
|
|||
+row
|
||||
+cell #[code prob]
|
||||
+cell float
|
||||
+cell Smoothed log probability estimate of token's type.
|
||||
+cell Smoothed log probability estimate of lexeme's type.
|
||||
|
||||
+row
|
||||
+cell #[code sentiment]
|
||||
+cell float
|
||||
+cell A scalar value indicating the positivity or negativity of the token.
|
||||
+row
|
||||
+cell #[code lex_id]
|
||||
+cell int
|
||||
+cell ID of the token's lexical type.
|
||||
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell Verbatim text content.
|
||||
|
||||
+h(2, "init") Lexeme.__init__
|
||||
+tag method
|
||||
|
||||
p Create a #[code Lexeme] object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell The parent vocabulary.
|
||||
|
||||
+row
|
||||
+cell #[code orth]
|
||||
+cell int
|
||||
+cell The orth id of the lexeme.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Lexeme]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "set_flag") Lexeme.set_flag
|
||||
+tag method
|
||||
|
||||
p Change the value of a boolean flag.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code flag_id]
|
||||
+cell int
|
||||
+cell The attribute ID of the flag to set.
|
||||
|
||||
+row
|
||||
+cell #[code value]
|
||||
+cell bool
|
||||
+cell The new value of the flag.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "check_flag") Lexeme.check_flag
|
||||
+tag method
|
||||
|
||||
p Check the value of a boolean flag.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code flag_id]
|
||||
+cell int
|
||||
+cell The attribute ID of the flag to query.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell The value of the flag.
|
||||
|
||||
+h(2, "similarity") Lexeme.similarity
|
||||
+tag method
|
||||
|
||||
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code other]
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "vector") Lexeme.vector
|
||||
+tag property
|
||||
|
||||
p A real-valued meaning representation.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A real-valued meaning representation.
|
||||
|
||||
+h(2, "has_vector") Lexeme.has_vector
|
||||
+tag property
|
||||
|
||||
p A boolean value indicating whether a word vector is associated with the object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether a word vector is associated with the object.
|
||||
+cell A scalar value indicating the positivity or negativity of the lexeme.
|
||||
|
|
|
@ -4,31 +4,26 @@ include ../../_includes/_mixins
|
|||
|
||||
p Match sequences of tokens, based on pattern rules.
|
||||
|
||||
+h(2, "load") Matcher.load
|
||||
+tag classmethod
|
||||
|
||||
p Load the matcher and patterns from a file path.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell #[code Path]
|
||||
+cell Path to a JSON-formatted patterns file.
|
||||
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell The vocabulary that the documents to match over will refer to.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Matcher]
|
||||
+cell The newly constructed object.
|
||||
+infobox("⚠️ Deprecation note")
|
||||
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
||||
| are deprecated and have been replaced with a simpler
|
||||
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
||||
| patterns and a callback for a given match ID. #[code Matcher.get_entity]
|
||||
| is now called #[+api("matcher#get") #[code matcher.get]].
|
||||
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
|
||||
| and #[code Matcher.has_entity] (now redundant) have been removed.
|
||||
|
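p
| As a rough migration sketch (the match ID and pattern below are
| illustrative), a pre-2.0 #[code add_entity]/#[code add_pattern] pair
| becomes a single call to #[+api("matcher#add") #[code Matcher.add]]:

+aside-code("Example").
from spacy.matcher import Matcher
from spacy.attrs import ORTH

matcher = Matcher(nlp.vocab)
# previously registered via matcher.add_entity(...) and matcher.add_pattern(...)
matcher.add('GoogleNow', None, [{ORTH: 'Google'}, {ORTH: 'Now'}])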
||||
+h(2, "init") Matcher.__init__
|
||||
+tag method
|
||||
|
||||
p Create the Matcher.
|
||||
p Create the rule-based #[code Matcher].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import LOWER
|
||||
|
||||
patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
|
||||
matcher = Matcher(nlp.vocab, patterns=patterns)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -41,17 +36,38 @@ p Create the Matcher.
|
|||
+row
|
||||
+cell #[code patterns]
|
||||
+cell dict
|
||||
+cell Patterns to add to the matcher.
|
||||
+cell Patterns to add to the matcher, keyed by ID.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Matcher]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "call") Matcher.__call__
|
||||
+tag method
|
||||
|
||||
p Find all token sequences matching the supplied patterns on the Doc.
|
||||
p Find all token sequences matching the supplied patterns on the #[code Doc].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import LOWER
|
||||
|
||||
matcher = Matcher(nlp.vocab)
|
||||
pattern = [{LOWER: "hello"}, {LOWER: "world"}]
|
||||
matcher.add("HelloWorld", on_match=None, pattern)
|
||||
doc = nlp(u'hello world!')
|
||||
matches = matcher(doc)
|
||||
|
||||
+infobox("Important note")
|
||||
| By default, the matcher #[strong does not perform any action] on matches,
|
||||
| like tagging matched phrases with entity types. Instead, actions need to
|
||||
| be specified when #[strong adding patterns or entities], by
|
||||
| passing in a callback function as the #[code on_match] argument on
|
||||
| #[+api("matcher#add") #[code add]]. This allows you to define custom
|
||||
| actions per pattern within the same matcher. For example, you might only
|
||||
| want to merge some entity types, and set custom flags for other matched
|
||||
| patterns. For more details and examples, see the usage workflow on
|
||||
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -60,23 +76,28 @@ p Find all token sequences matching the supplied patterns on the Doc.
|
|||
+cell The document to match over.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell list
|
||||
+cell
|
||||
| A list of#[code (entity_key, label_id, start, end)] tuples,
|
||||
| describing the matches. A match tuple describes a
|
||||
| #[code span doc[start:end]]. The #[code label_id] and
|
||||
| #[code entity_key] are both integers.
|
||||
| A list of #[code (match_id, start, end)] tuples, describing the
|
||||
| matches. A match tuple describes a span #[code doc[start:end]].
|
||||
| The #[code match_id] is the ID of the added match pattern.
|
||||
|
||||
+h(2, "pipe") Matcher.pipe
|
||||
+tag method
|
||||
|
||||
p Match a stream of documents, yielding them in turn.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.matcher import Matcher
|
||||
matcher = Matcher(nlp.vocab)
|
||||
for doc in matcher.pipe(texts, batch_size=50, n_threads=4):
|
||||
pass
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code docs]
|
||||
+cell -
|
||||
+cell iterable
|
||||
+cell A stream of documents.
|
||||
|
||||
+row
|
||||
|
@ -93,87 +114,132 @@ p Match a stream of documents, yielding them in turn.
|
|||
| multi-threading.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell Documents, in order.
|
||||
|
||||
+h(2, "add_entity") Matcher.add_entity
|
||||
+h(2, "len") Matcher.__len__
|
||||
+tag method
|
||||
|
||||
p Add an entity to the matcher.
|
||||
p
|
||||
| Get the number of rules added to the matcher. Note that this only returns
|
||||
| the number of rules (identical with the number of IDs), not the number
|
||||
| of individual patterns.
|
||||
|
||||
+aside-code("Example").
|
||||
matcher = Matcher(nlp.vocab)
|
||||
assert len(matcher) == 0
|
||||
matcher.add('Rule', None, [{ORTH: 'test'}])
|
||||
assert len(matcher) == 1
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of rules.
|
||||
|
||||
+h(2, "contains") Matcher.__contains__
|
||||
+tag method
|
||||
|
||||
p Check whether the matcher contains rules for a match ID.
|
||||
|
||||
+aside-code("Example").
|
||||
matcher = Matcher(nlp.vocab)
|
||||
assert 'Rule' not in matcher
|
||||
matcher.add('Rule', None, [{ORTH: 'test'}])
|
||||
assert 'Rule' in matcher
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code entity_key]
|
||||
+cell unicode / int
|
||||
+cell An ID for the entity.
|
||||
|
||||
+row
|
||||
+cell #[code attrs]
|
||||
+cell -
|
||||
+cell Attributes to associate with the Matcher.
|
||||
|
||||
+row
|
||||
+cell #[code if_exists]
|
||||
+cell #[code key]
|
||||
+cell unicode
|
||||
+cell
|
||||
| #[code 'raise'], #[code 'ignore'] or #[code 'update']. Controls
|
||||
| what happens if the entity ID already exists. Defaults to
|
||||
| #[code 'raise'].
|
||||
+cell The match ID.
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the matcher contains rules for this match ID.
|
||||
|
||||
+h(2, "add") Matcher.add
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and
|
||||
| a callback function to act on the matches. The callback function will
|
||||
| receive the arguments #[code matcher], #[code doc], #[code i] and
|
||||
| #[code matches]. If a pattern already exists for the given ID, the
|
||||
| patterns will be extended. An #[code on_match] callback will be
|
||||
| overwritten.
|
||||
|
||||
+aside-code("Example").
|
||||
def on_match(matcher, doc, id, matches):
|
||||
print('Matched!', matches)
|
||||
|
||||
matcher = Matcher(nlp.vocab)
|
||||
matcher.add('HelloWorld', on_match, [{LOWER: "hello"}, {LOWER: "world"}])
|
||||
matcher.add('GoogleMaps', on_match, [{ORTH: "Google"}, {ORTH: "Maps"}])
|
||||
doc = nlp(u'HELLO WORLD on Google Maps.')
|
||||
matches = matcher(doc)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code acceptor]
|
||||
+cell -
|
||||
+cell Callback function to filter matches of the entity.
|
||||
+cell #[code match_id]
|
||||
+cell unicode
|
||||
+cell An ID for the thing you're matching.
|
||||
|
||||
+row
|
||||
+cell #[code on_match]
|
||||
+cell -
|
||||
+cell Callback function to act on matches of the entity.
|
||||
+cell callable or #[code None]
|
||||
+cell
|
||||
| Callback function to act on matches. Takes the arguments
|
||||
| #[code matcher], #[code doc], #[code i] and #[code matches].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
+row
|
||||
+cell #[code *patterns]
|
||||
+cell list
|
||||
+cell
|
||||
| Match pattern. A pattern consists of a list of dicts, where each
|
||||
| dict describes a token.
|
||||
|
||||
+h(2, "add_pattern") Matcher.add_pattern
|
||||
+h(2, "remove") Matcher.remove
|
||||
+tag method
|
||||
|
||||
p Add a pattern to the matcher.
|
||||
p
|
||||
| Remove a rule from the matcher. A #[code KeyError] is raised if the match
|
||||
| ID does not exist.
|
||||
|
||||
+aside-code("Example").
|
||||
matcher.add('Rule', None, [{ORTH: 'test'}])
|
||||
assert 'Rule' in matcher
|
||||
matcher.remove('Rule')
|
||||
assert 'Rule' not in matcher
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code entity_key]
|
||||
+cell unicode / int
|
||||
+cell An ID for the entity.
|
||||
+cell #[code key]
|
||||
+cell unicode
|
||||
+cell The ID of the match rule.
|
||||
|
||||
+row
|
||||
+cell #[code token_specs]
|
||||
+cell -
|
||||
+cell Description of the pattern to be matched.
|
||||
|
||||
+row
|
||||
+cell #[code label]
|
||||
+cell unicode / int
|
||||
+cell Label to assign to the matched pattern. Defaults to #[code ""].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "has_entity") Matcher.has_entity
|
||||
+h(2, "get") Matcher.get
|
||||
+tag method
|
||||
|
||||
p Check whether the matcher has an entity.
|
||||
p
|
||||
| Retrieve the pattern stored for a key. Returns the rule as an
|
||||
| #[code (on_match, patterns)] tuple containing the callback and available
|
||||
| patterns.
|
||||
|
||||
+aside-code("Example").
|
||||
pattern = [{ORTH: 'test'}]
|
||||
matcher.add('Rule', None, pattern)
|
||||
(on_match, patterns) = matcher.get('Rule')
|
||||
assert patterns == [pattern]
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code entity_key]
|
||||
+cell unicode / int
|
||||
+cell The entity key to check.
|
||||
+cell #[code key]
|
||||
+cell unicode
|
||||
+cell The ID of the match rule.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether the matcher has the entity.
|
||||
+cell returns
|
||||
+cell tuple
|
||||
+cell The rule, as an #[code (on_match, patterns)] tuple.
|
||||
|
|
95
website/docs/api/spacy.jade
Normal file
|
@ -0,0 +1,95 @@
|
|||
//- 💫 DOCS > API > SPACY
|
||||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
+h(2, "load") spacy.load
|
||||
+tag function
|
||||
+tag-model
|
||||
|
||||
p
|
||||
| Load a model via its #[+a("/docs/usage/models#usage") shortcut link],
|
||||
| the name of an installed
|
||||
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
|
||||
| path or a #[code Path]-like object. spaCy will try resolving the load
|
||||
| argument in this order. The #[code Language] class to initialise will be
|
||||
| determined based on the model's settings.
|
||||
|
||||
+aside-code("Example").
|
||||
nlp = spacy.load('en') # shortcut link
|
||||
nlp = spacy.load('en_core_web_sm') # package
|
||||
nlp = spacy.load('/path/to/en') # unicode path
|
||||
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
|
||||
|
||||
+infobox("⚠️ Deprecation note")
|
||||
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
|
||||
| will also raise an error if no model could be loaded and never just
|
||||
| return an empty #[code Language] object. If you need a blank language,
|
||||
| you need to import it explicitly (#[code from spacy.lang.en import English])
|
||||
| or use #[+api("util#get_lang_class") #[code util.get_lang_class]].
|
||||
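p
| As a short sketch of the explicit import mentioned above, a blank
| #[code Language] subclass can be instantiated directly and used for
| tokenization only:

+aside-code("Example (sketch)").
    from spacy.lang.en import English
    nlp = English()
    doc = nlp.make_doc(u'A blank pipeline still tokenizes text.')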
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code name]
|
||||
+cell unicode or #[code Path]
|
||||
+cell Model to load, i.e. shortcut link, package name or path.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell A #[code Language] object with the loaded model.
|
||||
|
||||
+h(2, "info") spacy.info
|
||||
+tag function
|
||||
|
||||
p
|
||||
| The same as the #[+api("cli#info") #[code info] command]. Pretty-print
|
||||
| information about your installation, models and local setup from within
|
||||
| spaCy. To get the model meta data as a dictionary instead, you can
|
||||
| use the #[code meta] attribute on your #[code nlp] object with a
|
||||
| loaded model, e.g. #[code nlp.meta].
|
||||
|
||||
+aside-code("Example").
|
||||
spacy.info()
|
||||
spacy.info('en')
|
||||
spacy.info('de', markdown=True)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code model]
|
||||
+cell unicode
|
||||
+cell A model, i.e. shortcut link, package name or path (optional).
|
||||
|
||||
+row
|
||||
+cell #[code markdown]
|
||||
+cell bool
|
||||
+cell Print information as Markdown.
|
||||
|
||||
|
||||
+h(2, "explain") spacy.explain
|
||||
+tag function
|
||||
|
||||
p
|
||||
| Get a description for a given POS tag, dependency label or entity type.
|
||||
| For a list of available terms, see
|
||||
| #[+src(gh("spacy", "spacy/glossary.py")) glossary.py].
|
||||
|
||||
+aside-code("Example").
|
||||
spacy.explain('NORP')
|
||||
# Nationalities or religious or political groups
|
||||
|
||||
doc = nlp(u'Hello world')
|
||||
for word in doc:
|
||||
print(word.text, word.tag_, spacy.explain(word.tag_))
|
||||
# Hello UH interjection
|
||||
# world NN noun, singular or mass
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code term]
|
||||
+cell unicode
|
||||
+cell Term to explain.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell unicode
|
||||
+cell The explanation, or #[code None] if not found in the glossary.
|
|
@ -2,66 +2,18 @@
|
|||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p A slice from a #[code Doc] object.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The parent document.
|
||||
|
||||
+row
|
||||
+cell #[code start]
|
||||
+cell int
|
||||
+cell The token offset for the start of the span.
|
||||
|
||||
+row
|
||||
+cell #[code end]
|
||||
+cell int
|
||||
+cell The token offset for the end of the span.
|
||||
|
||||
+row
|
||||
+cell #[code start_char]
|
||||
+cell int
|
||||
+cell The character offset for the start of the span.
|
||||
|
||||
+row
|
||||
+cell #[code end_char]
|
||||
+cell int
|
||||
+cell The character offset for the end of the span.
|
||||
|
||||
+row
|
||||
+cell #[code label]
|
||||
+cell int
|
||||
+cell The span's label.
|
||||
|
||||
+row
|
||||
+cell #[code label_]
|
||||
+cell unicode
|
||||
+cell The span's label.
|
||||
|
||||
+row
|
||||
+cell #[code lemma_]
|
||||
+cell unicode
|
||||
+cell The span's lemma.
|
||||
|
||||
+row
|
||||
+cell #[code ent_id]
|
||||
+cell int
|
||||
+cell The integer ID of the named entity the token is an instance of.
|
||||
|
||||
+row
|
||||
+cell #[code ent_id_]
|
||||
+cell unicode
|
||||
+cell The string ID of the named entity the token is an instance of.
|
||||
p A slice from a #[+api("doc") #[code Doc]] object.
|
||||
|
||||
+h(2, "init") Span.__init__
|
||||
+tag method
|
||||
|
||||
p Create a Span object from the #[code slice doc[start : end]].
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
span = doc[1:4]
|
||||
assert [t.text for t in span] == [u'it', u'back', u'!']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
|
@ -89,7 +41,7 @@ p Create a Span object from the #[code slice doc[start : end]].
|
|||
+cell A meaning representation of the span.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Span]
|
||||
+cell The newly constructed object.
|
||||
|
||||
|
@ -98,6 +50,11 @@ p Create a Span object from the #[code slice doc[start : end]].
|
|||
|
||||
p Get a #[code Token] object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
span = doc[1:4]
|
||||
assert span[1].text == 'back'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code i]
|
||||
|
@ -105,12 +62,17 @@ p Get a #[code Token] object.
|
|||
+cell The index of the token within the span.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell The token at #[code span[i]].
|
||||
|
||||
p Get a #[code Span] object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
span = doc[1:4]
|
||||
assert span[1:3].text == 'back!'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code start_end]
|
||||
|
@ -118,7 +80,7 @@ p Get a #[code Span] object.
|
|||
+cell The slice of the span to get.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Span]
|
||||
+cell The span at #[code span[start : end]].
|
||||
|
||||
|
@ -127,9 +89,14 @@ p Get a #[code Span] object.
|
|||
|
||||
p Iterate over #[code Token] objects.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
span = doc[1:4]
|
||||
assert [t.text for t in span] == ['it', 'back', '!']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A #[code Token] object.
|
||||
|
||||
|
@ -138,19 +105,33 @@ p Iterate over #[code Token] objects.
|
|||
|
||||
p Get the number of tokens in the span.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
span = doc[1:4]
|
||||
assert len(span) == 3
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of tokens in the span.
|
||||
|
||||
+h(2, "similarity") Span.similarity
|
||||
+tag method
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| Make a semantic similarity estimate. The default estimate is cosine
|
||||
| similarity using an average of word vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'green apples and red oranges')
|
||||
green_apples = doc[:2]
|
||||
red_oranges = doc[3:]
|
||||
apples_oranges = green_apples.similarity(red_oranges)
|
||||
oranges_apples = red_oranges.similarity(green_apples)
|
||||
assert apples_oranges == oranges_apples
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code other]
|
||||
|
@ -160,7 +141,7 @@ p
|
|||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
|
@ -178,87 +159,205 @@ p Retokenize the document, such that the span is merged into a single token.
|
|||
| are inherited from the syntactic root token of the span.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell The newly merged token.
|
||||
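p
| A minimal sketch of the merge operation described above, assuming a
| loaded #[code nlp] pipeline (the token counts shown apply to this
| example text):

+aside-code("Example (sketch)").
    doc = nlp(u'I like New York in Autumn.')
    span = doc[2:4]
    span.merge()
    assert len(doc) == 6
    assert doc[2].text == u'New York'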
|
||||
+h(2, "text") Span.text
|
||||
+tag property
|
||||
|
||||
p A unicode representation of the span text.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode
|
||||
+cell The original verbatim text of the span.
|
||||
|
||||
+h(2, "text_with_ws") Span.text_with_ws
|
||||
+tag property
|
||||
|
||||
p
|
||||
| The text content of the span with a trailing whitespace character if the
|
||||
| last token has one.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode
|
||||
+cell The text content of the span (with trailing whitespace).
|
||||
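p
| A short sketch illustrating the difference between #[code Span.text]
| and #[code Span.text_with_ws], assuming a loaded #[code nlp] pipeline:

+aside-code("Example (sketch)").
    doc = nlp(u'Give it back! He pleaded.')
    assert doc[1:4].text == u'it back!'
    assert doc[1:4].text_with_ws == u'it back! '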
|
||||
+h(2, "sent") Span.sent
|
||||
+tag property
|
||||
|
||||
p The sentence span that this span is a part of.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Span]
|
||||
+cell The sentence this is part of.
|
||||
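p
| A short sketch, assuming a loaded pipeline whose parser (or another
| component) sets sentence boundaries:

+aside-code("Example (sketch)").
    doc = nlp(u'Give it back! He pleaded.')
    span = doc[1:3]
    assert span.sent.text == u'Give it back!'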
|
||||
+h(2, "root") Span.root
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| The token within the span that's highest in the parse tree. If there's a
|
||||
| tie, the earliest is preferred.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
i, like, new, york, in_, autumn, dot = range(len(doc))
|
||||
assert doc[new].head.text == 'York'
|
||||
assert doc[york].head.text == 'like'
|
||||
new_york = doc[new:york+1]
|
||||
assert new_york.root.text == 'York'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell The root token.
|
||||
|
||||
+h(2, "lefts") Span.lefts
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p Tokens that are to the left of the span, whose head is within the span.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
lefts = [t.text for t in doc[3:7].lefts]
|
||||
assert lefts == [u'New']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A left-child of a token of the span.
|
||||
|
||||
+h(2, "rights") Span.rights
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p Tokens that are to the right of the span, whose head is within the span.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like New York in Autumn.')
|
||||
rights = [t.text for t in doc[2:4].rights]
|
||||
assert rights == [u'in']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A right-child of a token of the span.
|
||||
|
||||
+h(2, "subtree") Span.subtree
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p Tokens that descend from tokens in the span, but fall outside it.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
subtree = [t.text for t in doc[:3].subtree]
|
||||
assert subtree == [u'Give', u'it', u'back', u'!']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A descendant of a token within the span.
|
||||
|
||||
+h(2, "has_vector") Span.has_vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
assert doc[1:].has_vector
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the span has a vector data attached.
|
||||
|
||||
+h(2, "vector") Span.vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| A real-valued meaning representation. Defaults to an average of the
|
||||
| token vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
assert doc[1:].vector.dtype == 'float32'
|
||||
assert doc[1:].vector.shape == (300,)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the span's semantics.
|
||||
|
||||
+h(2, "vector_norm") Span.vector_norm
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| The L2 norm of the span's vector representation.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
doc[1:].vector_norm # 4.800883928527915
|
||||
doc[2:].vector_norm # 6.895897646384268
|
||||
assert doc[1:].vector_norm != doc[2:].vector_norm
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell The L2 norm of the vector representation.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The parent document.
|
||||
|
||||
+row
|
||||
+cell #[code sent]
|
||||
+cell #[code Span]
|
||||
+cell The sentence span that this span is a part of.
|
||||
|
||||
+row
|
||||
+cell #[code start]
|
||||
+cell int
|
||||
+cell The token offset for the start of the span.
|
||||
|
||||
+row
|
||||
+cell #[code end]
|
||||
+cell int
|
||||
+cell The token offset for the end of the span.
|
||||
|
||||
+row
|
||||
+cell #[code start_char]
|
||||
+cell int
|
||||
+cell The character offset for the start of the span.
|
||||
|
||||
+row
|
||||
+cell #[code end_char]
|
||||
+cell int
|
||||
+cell The character offset for the end of the span.
|
||||
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell A unicode representation of the span text.
|
||||
|
||||
+row
|
||||
+cell #[code text_with_ws]
|
||||
+cell unicode
|
||||
+cell
|
||||
| The text content of the span with a trailing whitespace character
|
||||
| if the last token has one.
|
||||
|
||||
+row
|
||||
+cell #[code label]
|
||||
+cell int
|
||||
+cell The span's label.
|
||||
|
||||
+row
|
||||
+cell #[code label_]
|
||||
+cell unicode
|
||||
+cell The span's label.
|
||||
|
||||
+row
|
||||
+cell #[code lemma_]
|
||||
+cell unicode
|
||||
+cell The span's lemma.
|
||||
|
||||
+row
|
||||
+cell #[code ent_id]
|
||||
+cell int
|
||||
+cell The integer ID of the named entity the token is an instance of.
|
||||
|
||||
+row
|
||||
+cell #[code ent_id_]
|
||||
+cell unicode
|
||||
+cell The string ID of the named entity the token is an instance of.
|
||||
|
|
|
@ -7,16 +7,22 @@ p Map strings to and from integer IDs.
|
|||
+h(2, "init") StringStore.__init__
|
||||
+tag method
|
||||
|
||||
p Create the #[code StringStore].
|
||||
p
|
||||
| Create the #[code StringStore]. Note that a newly initialised store will
|
||||
| always include an empty string #[code ''] at position #[code 0].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.strings import StringStore
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code strings]
|
||||
+cell -
|
||||
+cell iterable
|
||||
+cell A sequence of unicode strings to add to the store.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code StringStore]
|
||||
+cell The newly constructed object.
|
||||
|
||||
|
@ -25,9 +31,13 @@ p Create the #[code StringStore].
|
|||
|
||||
p Get the number of strings in the store.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
assert len(stringstore) == 2
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of strings in the store.
|
||||
|
||||
|
@ -36,22 +46,32 @@ p Get the number of strings in the store.
|
|||
|
||||
p Retrieve a string from a given integer ID, or vice versa.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
int_id = stringstore[u'apple'] # 1
|
||||
assert stringstore[int_id] == u'apple'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string_or_id]
|
||||
+cell bytes / unicode / int
|
||||
+cell bytes, unicode or int
|
||||
+cell The value to encode.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell unicode / int
|
||||
+cell The value to retrieved.
|
||||
+cell returns
|
||||
+cell unicode or int
|
||||
+cell The value to be retrieved.
|
||||
|
||||
+h(2, "contains") StringStore.__contains__
|
||||
+tag method
|
||||
|
||||
p Check whether a string is in the store.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
assert u'apple' in stringstore
|
||||
assert u'cherry' not in stringstore
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
|
@ -59,49 +79,108 @@ p Check whether a string is in the store.
|
|||
+cell The string to check.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the store contains the string.
|
||||
|
||||
+h(2, "iter") StringStore.__iter__
|
||||
+tag method
|
||||
|
||||
p Iterate over the strings in the store, in order.
|
||||
p
|
||||
| Iterate over the strings in the store, in order. Note that a newly
|
||||
| initialised store will always include an empty string #[code ''] at
|
||||
| position #[code 0].
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
all_strings = [s for s in stringstore]
|
||||
assert all_strings == [u'', u'apple', u'orange']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell unicode
|
||||
+cell A string in the store.
|
||||
|
||||
+h(2, "dump") StringStore.dump
|
||||
+h(2, "to_disk") StringStore.to_disk
|
||||
+tag method
|
||||
|
||||
p Save the strings to a JSON file.
|
||||
p Save the current state to a directory.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore.to_disk('/path/to/strings')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code file]
|
||||
+cell buffer
|
||||
+cell The file to save the strings.
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "load") StringStore.load
|
||||
+h(2, "from_disk") Tokenizer.from_disk
|
||||
+tag method
|
||||
|
||||
p Load the strings from a JSON file.
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.strings import StringStore
|
||||
stringstore = StringStore().from_disk('/path/to/strings')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code file]
|
||||
+cell buffer
|
||||
+cell The file from which to load the strings.
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory. Paths may be either strings or
|
||||
| #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell returns
|
||||
+cell #[code StringStore]
|
||||
+cell The modified #[code StringStore] object.
|
||||
|
||||
+h(2, "to_bytes") Tokenizer.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
store_bytes = stringstore.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code StringStore] object.
|
||||
|
||||
+h(2, "from_bytes") Tokenizer.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.strings import StringStore
|
||||
store_bytes = stringstore.to_bytes()
|
||||
new_store = StringStore().from_bytes(store_bytes)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code bytes_data]
|
||||
+cell bytes
|
||||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code StringStore]
|
||||
+cell The #[code StringStore] object.
|
||||
|
|
|
@ -4,32 +4,6 @@ include ../../_includes/_mixins
|
|||
|
||||
p Annotate part-of-speech tags on #[code Doc] objects.
|
||||
|
||||
+h(2, "load") Tagger.load
|
||||
+tag classmethod
|
||||
|
||||
p Load the statistical model from the supplied path.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell #[code Path]
|
||||
+cell The path to load from.
|
||||
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell The vocabulary. Must be shared by the documents to be processed.
|
||||
|
||||
+row
|
||||
+cell #[code require]
|
||||
+cell bool
|
||||
+cell Whether to raise an error if the files are not found.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Tagger]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "init") Tagger.__init__
|
||||
+tag method
|
||||
|
||||
|
@ -47,7 +21,7 @@ p Create a #[code Tagger].
|
|||
+cell The statistical model.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Tagger]
|
||||
+cell The newly constructed object.
|
||||
|
||||
|
@ -63,7 +37,7 @@ p Apply the tagger, setting the POS tags onto the #[code Doc] object.
|
|||
+cell The tokens to be tagged.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
|
@ -91,7 +65,7 @@ p Tag a stream of documents.
|
|||
| parallel.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell Documents, in order.
|
||||
|
||||
|
@ -112,6 +86,6 @@ p Update the statistical model, with tags supplied for the given document.
|
|||
+cell Manager for the gold-standard tags.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell Number of tags predicted correctly.
|
||||
|
|
|
@ -4,9 +4,296 @@ include ../../_includes/_mixins
|
|||
|
||||
p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
|
||||
|
||||
+h(2, "init") Token.__init__
|
||||
+tag method
|
||||
|
||||
p Construct a #[code Token] object.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
token = doc[0]
|
||||
assert token.text == u'Give'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The parent document.
|
||||
|
||||
+row
|
||||
+cell #[code offset]
|
||||
+cell int
|
||||
+cell The index of the token within the document.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "len") Token.__len__
|
||||
+tag method
|
||||
|
||||
p The number of unicode characters in the token, i.e. #[code token.text].
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
token = doc[0]
|
||||
assert len(token) == 4
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of unicode characters in the token.
|
||||
|
||||
+h(2, "check_flag") Token.check_flag
|
||||
+tag method
|
||||
|
||||
p Check the value of a boolean flag.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import IS_TITLE
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
token = doc[0]
|
||||
assert token.check_flag(IS_TITLE) == True
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code flag_id]
|
||||
+cell int
|
||||
+cell The attribute ID of the flag to check.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the flag is set.
|
||||
|
||||
+h(2, "similarity") Token.similarity
|
||||
+tag method
|
||||
+tag-model("vectors")
|
||||
|
||||
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
|
||||
+aside-code("Example").
|
||||
apples, _, oranges = nlp(u'apples and oranges')
|
||||
apples_oranges = apples.similarity(oranges)
|
||||
oranges_apples = oranges.similarity(apples)
|
||||
assert apples_oranges == oranges_apples
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell other
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "nbor") Token.nbor
|
||||
+tag method
|
||||
|
||||
p Get a neighboring token.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
give_nbor = doc[0].nbor()
|
||||
assert give_nbor.text == u'it'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell The relative position of the token to get. Defaults to #[code 1].
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Token]
|
||||
+cell The token at position #[code self.doc[self.i+i]].
|
||||
|
||||
+h(2, "is_ancestor") Token.is_ancestor
|
||||
+tag method
|
||||
+tag-model("parse")
|
||||
|
||||
p
|
||||
| Check whether this token is a parent, grandparent, etc. of another
|
||||
| in the dependency tree.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
give = doc[0]
|
||||
it = doc[1]
|
||||
assert give.is_ancestor(it)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell descendant
|
||||
+cell #[code Token]
|
||||
+cell Another token.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether this token is the ancestor of the descendant.
|
||||
|
||||
+h(2, "ancestors") Token.ancestors
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p A sequence of this token's syntactic ancestors.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
it_ancestors = doc[1].ancestors
|
||||
assert [t.text for t in it_ancestors] == [u'Give']
|
||||
he_ancestors = doc[4].ancestors
|
||||
assert [t.text for t in he_ancestors] == [u'pleaded']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell
|
||||
| A sequence of ancestor tokens such that
|
||||
| #[code ancestor.is_ancestor(self)].
|
||||
|
||||
+h(2, "conjuncts") Token.conjuncts
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p A sequence of coordinated tokens, including the token itself.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples and oranges')
|
||||
apples_conjuncts = doc[2].conjuncts
|
||||
assert [t.text for t in apples_conjuncts] == [u'oranges']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A coordinated token.
|
||||
|
||||
+h(2, "children") Token.children
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p A sequence of the token's immediate syntactic children.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
give_children = doc[0].children
|
||||
assert [t.text for t in give_children] == [u'it', u'back', u'!']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A child token such that #[code child.head==self].
|
||||
|
||||
+h(2, "subtree") Token.subtree
|
||||
+tag property
|
||||
+tag-model("parse")
|
||||
|
||||
p A sequence of all the token's syntactic descendants.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'Give it back! He pleaded.')
|
||||
give_subtree = doc[0].subtree
|
||||
assert [t.text for t in give_subtree] == [u'Give', u'it', u'back', u'!']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Token]
|
||||
+cell A descendant token such that #[code self.is_ancestor(descendant)].
|
||||
|
||||
+h(2, "has_vector") Token.has_vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| token.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
apples = doc[2]
|
||||
assert apples.has_vector
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the token has a vector data attached.
|
||||
|
||||
+h(2, "vector") Token.vector
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p A real-valued meaning representation.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples')
|
||||
apples = doc[2]
|
||||
assert apples.vector.dtype == 'float32'
|
||||
assert apples.vector.shape == (300,)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the token's semantics.
|
||||
|
||||
+h(2, "vector_norm") Span.vector_norm
|
||||
+tag property
|
||||
+tag-model("vectors")
|
||||
|
||||
p The L2 norm of the token's vector representation.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I like apples and pasta')
|
||||
apples = doc[2]
|
||||
pasta = doc[4]
|
||||
apples.vector_norm # 6.89589786529541
|
||||
pasta.vector_norm # 7.759851932525635
|
||||
assert apples.vector_norm != pasta.vector_norm
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell float
|
||||
+cell The L2 norm of the vector representation.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell Verbatim text content.
|
||||
+row
|
||||
+cell #[code text_with_ws]
|
||||
+cell unicode
|
||||
+cell Text content, with trailing space character if present.
|
||||
|
||||
+row
|
||||
+cell #[code whitespace]
|
||||
+cell int
|
||||
+cell Trailing space character if present.
|
||||
+row
|
||||
+cell #[code whitespace_]
|
||||
+cell unicode
|
||||
+cell Trailing space character if present.
|
||||
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
|
@ -17,14 +304,31 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
|
|||
+cell #[code Doc]
|
||||
+cell The parent document.
|
||||
|
||||
+row
|
||||
+cell #[code head]
|
||||
+cell #[code Token]
|
||||
+cell The syntactic parent, or "governor", of this token.
|
||||
|
||||
+row
|
||||
+cell #[code left_edge]
|
||||
+cell #[code Token]
|
||||
+cell The leftmost token of this token's syntactic descendants.
|
||||
|
||||
+row
|
||||
+cell #[code right_edge]
|
||||
+cell #[code Token]
|
||||
+cell The rightmost token of this token's syntactic descendants.
|
||||
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell The index of the token within the parent document.
|
||||
|
||||
+row
|
||||
+cell #[code ent_type]
|
||||
+cell int
|
||||
+cell Named entity type.
|
||||
|
||||
+row
|
||||
+cell #[code ent_type_]
|
||||
+cell unicode
|
||||
|
@ -42,19 +346,23 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
|
|||
+cell unicode
|
||||
+cell
|
||||
| IOB code of named entity tag. #[code "B"]
|
||||
| means the token begins an entity, #[code "I"] means it inside an
|
||||
| entity, #[code "O"] means it is outside an entity, and
|
||||
| means the token begins an entity, #[code "I"] means it is inside
|
||||
| an entity, #[code "O"] means it is outside an entity, and
|
||||
| #[code ""] means no entity tag is set.
|
||||
|
||||
+row
|
||||
+cell #[code ent_id]
|
||||
+cell int
|
||||
+cell ID of the entity the token is an instance of, if any.
|
||||
+cell
|
||||
| ID of the entity the token is an instance of, if any. Usually
|
||||
| assigned by patterns in the Matcher.
|
||||
|
||||
+row
|
||||
+cell #[code ent_id_]
|
||||
+cell unicode
|
||||
+cell ID of the entity the token is an instance of, if any.
|
||||
+cell
|
||||
| ID of the entity the token is an instance of, if any. Usually
|
||||
| assigned by patterns in the Matcher.
|
||||
|
||||
+row
|
||||
+cell #[code lemma]
|
||||
|
@ -229,232 +537,3 @@ p An individual token — i.e. a word, punctuation symbol, whitespace, etc.
|
|||
+cell #[code lex_id]
|
||||
+cell int
|
||||
+cell ID of the token's lexical type.
|
||||
|
||||
+row
|
||||
+cell #[code text]
|
||||
+cell unicode
|
||||
+cell Verbatim text content.
|
||||
+row
|
||||
+cell #[code text_with_ws]
|
||||
+cell unicode
|
||||
+cell Text content, with trailing space character if present.
|
||||
|
||||
+row
|
||||
+cell #[code whitespace]
|
||||
+cell int
|
||||
+cell Trailing space character if present.
|
||||
+row
|
||||
+cell #[code whitespace_]
|
||||
+cell unicode
|
||||
+cell Trailing space character if present.
|
||||
|
||||
|
||||
+h(2, "init") Token.__init__
|
||||
+tag method
|
||||
|
||||
p Construct a #[code Token] object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The parent document.
|
||||
|
||||
+row
|
||||
+cell #[code offset]
|
||||
+cell int
|
||||
+cell The index of the token within the document.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "len") Token.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of unicode characters in the token.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int
|
||||
+cell The number of unicode characters in the token.
|
||||
|
||||
|
||||
+h(2, "check_flag") Token.check_flag
|
||||
+tag method
|
||||
|
||||
p Check the value of a boolean flag.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code flag_id]
|
||||
+cell int
|
||||
+cell The attribute ID of the flag to check.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether the flag is set.
|
||||
|
||||
+h(2, "nbor") Token.nbor
|
||||
+tag method
|
||||
|
||||
p Get a neighboring token.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell The relative position of the token to get. Defaults to #[code 1].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The token at position #[code self.doc[self.i+i]]
|
||||
|
||||
+h(2, "similarity") Token.similarity
|
||||
+tag method
|
||||
|
||||
p Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell other
|
||||
+cell -
|
||||
+cell
|
||||
| The object to compare with. By default, accepts #[code Doc],
|
||||
| #[code Span], #[code Token] and #[code Lexeme] objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell float
|
||||
+cell A scalar similarity score. Higher is more similar.
|
||||
|
||||
+h(2, "is_ancestor") Token.is_ancestor
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Check whether this token is a parent, grandparent, etc. of another
|
||||
| in the dependency tree.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell descendant
|
||||
+cell #[code Token]
|
||||
+cell Another token.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether this token is the ancestor of the descendant.
|
||||
|
||||
|
||||
+h(2, "vector") Token.vector
|
||||
+tag property
|
||||
|
||||
p A real-valued meaning representation.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
|
||||
+cell A 1D numpy array representing the token's semantics.
|
||||
|
||||
+h(2, "has_vector") Token.has_vector
|
||||
+tag property
|
||||
|
||||
p
|
||||
| A boolean value indicating whether a word vector is associated with the
|
||||
| object.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell bool
|
||||
+cell Whether the token has a vector data attached.
|
||||
|
||||
+h(2, "head") Token.head
|
||||
+tag property
|
||||
|
||||
p The syntactic parent, or "governor", of this token.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The head.
|
||||
|
||||
+h(2, "conjuncts") Token.conjuncts
|
||||
+tag property
|
||||
|
||||
p A sequence of coordinated tokens, including the token itself.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Token]
|
||||
+cell A coordinated token.
|
||||
|
||||
+h(2, "children") Token.children
|
||||
+tag property
|
||||
|
||||
p A sequence of the token's immediate syntactic children.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Token]
|
||||
+cell A child token such that #[code child.head==self].
|
||||
|
||||
+h(2, "subtree") Token.subtree
|
||||
+tag property
|
||||
|
||||
p A sequence of all the token's syntactic descendents.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Token]
|
||||
+cell A descendant token such that #[code self.is_ancestor(descendant)].
|
||||
|
||||
+h(2, "left_edge") Token.left_edge
|
||||
+tag property
|
||||
|
||||
p The leftmost token of this token's syntactic descendants.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The first token such that #[code self.is_ancestor(token)].
|
||||
|
||||
+h(2, "right_edge") Token.right_edge
|
||||
+tag property
|
||||
|
||||
p The rightmost token of this token's syntactic descendents.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Token]
|
||||
+cell The last token such that #[code self.is_ancestor(token)].
|
||||
|
||||
+h(2, "ancestors") Token.ancestors
|
||||
+tag property
|
||||
|
||||
p The rightmost token of this token's syntactic descendants.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Token]
|
||||
+cell
|
||||
| A sequence of ancestor tokens such that
|
||||
| #[code ancestor.is_ancestor(self)].
|
||||
|
|
|
@ -6,6 +6,283 @@ p
|
|||
| Segment text, and create #[code Doc] objects with the discovered segment
|
||||
| boundaries.
|
||||
|
||||
+h(2, "init") Tokenizer.__init__
|
||||
+tag method
|
||||
|
||||
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
|
||||
|
||||
+aside-code("Example").
|
||||
# Construction 1
|
||||
from spacy.tokenizer import Tokenizer
|
||||
tokenizer = Tokenizer(nlp.vocab)
|
||||
|
||||
# Construction 2
|
||||
from spacy.lang.en import English
|
||||
tokenizer = English().Defaults.create_tokenizer(nlp)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code rules]
|
||||
+cell dict
|
||||
+cell Exceptions and special-cases for the tokenizer.
|
||||
|
||||
+row
|
||||
+cell #[code prefix_search]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).search] to match prefixes.
|
||||
|
||||
+row
|
||||
+cell #[code suffix_search]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).search] to match suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code infix_finditer]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).finditer] to find infixes.
|
||||
|
||||
+row
|
||||
+cell #[code token_match]
|
||||
+cell callable
|
||||
+cell A boolean function matching strings to be recognised as tokens.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "call") Tokenizer.__call__
|
||||
+tag method
|
||||
|
||||
p Tokenize a string.
|
||||
|
||||
+aside-code("Example").
|
||||
tokens = tokenizer(u'This is a sentence')
|
||||
assert len(tokens) == 4
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to tokenize.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Doc]
|
||||
+cell A container for linguistic annotations.
|
||||
|
||||
+h(2, "pipe") Tokenizer.pipe
|
||||
+tag method
|
||||
|
||||
p Tokenize a stream of texts.
|
||||
|
||||
+aside-code("Example").
|
||||
texts = [u'One document.', u'...', u'Lots of documents']
|
||||
for doc in tokenizer.pipe(texts, batch_size=50):
|
||||
pass
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code texts]
|
||||
+cell -
|
||||
+cell A sequence of unicode texts.
|
||||
|
||||
+row
|
||||
+cell #[code batch_size]
|
||||
+cell int
|
||||
+cell The number of texts to accumulate in an internal buffer.
|
||||
|
||||
+row
|
||||
+cell #[code n_threads]
|
||||
+cell int
|
||||
+cell
|
||||
| The number of threads to use, if the implementation supports
|
||||
| multi-threading. The default tokenizer is single-threaded.
|
||||
|
||||
+footrow
|
||||
+cell yields
|
||||
+cell #[code Doc]
|
||||
+cell A sequence of Doc objects, in order.
|
||||
|
||||
+h(2, "find_infix") Tokenizer.find_infix
|
||||
+tag method
|
||||
|
||||
p Find internal split points of the string.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to split.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell list
|
||||
+cell
|
||||
| A list of #[code re.MatchObject] objects that have #[code .start()]
|
||||
| and #[code .end()] methods, denoting the placement of internal
|
||||
| segment separators, e.g. hyphens.
|
||||
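p
| A small sketch of how the returned match objects can be inspected,
| assuming a loaded #[code nlp] pipeline (the exact matches depend on
| the language's infix rules):

+aside-code("Example (sketch)").
    tokenizer = nlp.tokenizer
    for match in tokenizer.find_infix(u'New York-based'):
        print(match.start(), match.end())  # e.g. the position of the hyphen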
|
||||
+h(2, "find_prefix") Tokenizer.find_prefix
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Find the length of a prefix that should be segmented from the string, or
|
||||
| #[code None] if no prefix rules match.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to segment.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell int / #[code None]
|
||||
+cell The length of the prefix if present, otherwise #[code None].
|
||||
|
||||
+h(2, "find_suffix") Tokenizer.find_suffix
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Find the length of a suffix that should be segmented from the string, or
|
||||
| #[code None] if no suffix rules match.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to segment.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell int / #[code None]
|
||||
+cell The length of the suffix if present, otherwise #[code None].
|
||||
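p
| A sketch for the prefix and suffix helpers, assuming a loaded
| #[code nlp] pipeline with the default punctuation rules (the lengths
| given in the comments are illustrative only):

+aside-code("Example (sketch)").
    tokenizer = nlp.tokenizer
    prefix_len = tokenizer.find_prefix(u'"Hello"')  # e.g. 1 for the opening quote
    suffix_len = tokenizer.find_suffix(u'"Hello"')  # e.g. 1 for the closing quote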
|
||||
+h(2, "add_special_case") Tokenizer.add_special_case
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Add a special-case tokenization rule. This mechanism is also used to add
|
||||
| custom tokenizer exceptions to the language data. See the usage workflow
|
||||
| on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
|
||||
| for more details and examples.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.attrs import ORTH, LEMMA
|
||||
case = [{"don't": [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]}]
|
||||
tokenizer.add_special_case(u"don't", case)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to specially tokenize.
|
||||
|
||||
+row
|
||||
+cell #[code token_attrs]
|
||||
+cell iterable
|
||||
+cell
|
||||
| A sequence of dicts, where each dict describes a token and its
|
||||
| attributes. The #[code ORTH] fields of the attributes must
|
||||
| exactly match the string when they are concatenated.
|
||||
|
||||
+h(2, "to_disk") Tokenizer.to_disk
|
||||
+tag method
|
||||
|
||||
p Save the current state to a directory.
|
||||
|
||||
+aside-code("Example").
|
||||
tokenizer.to_disk('/path/to/tokenizer')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+h(2, "from_disk") Tokenizer.from_disk
|
||||
+tag method
|
||||
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokenizer import Tokenizer
|
||||
tokenizer = Tokenizer(nlp.vocab)
|
||||
tokenizer = tokenizer.from_disk('/path/to/tokenizer')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory. Paths may be either strings or
|
||||
| #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The modified #[code Tokenizer] object.
|
||||
|
||||
+h(2, "to_bytes") Tokenizer.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
tokenizer_bytes = tokenizer.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code Tokenizer] object.
|
||||
|
||||
+h(2, "from_bytes") Tokenizer.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.tokenizer import Tokenizer
|
||||
tokenizer_bytes = tokenizer.to_bytes()
|
||||
new_tokenizer = Tokenizer(nlp.vocab)
|
||||
new_tokenizer.from_bytes(tokenizer_bytes)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code bytes_data]
|
||||
+cell bytes
|
||||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Tokenizer]
|
||||
+cell The #[code Tokenizer] object.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
|
@ -35,215 +312,3 @@ p
|
|||
| A function to find internal segment separators, e.g. hyphens.
|
||||
| Returns a (possibly empty) list of #[code re.MatchObject]
|
||||
| objects.
|
||||
|
||||
+h(2, "load") Tokenizer.load
|
||||
+tag classmethod
|
||||
|
||||
p Load a #[code Tokenizer], reading unsupplied components from the path.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell #[code Path]
|
||||
+cell The path to load from.
|
||||
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code rules]
|
||||
+cell dict
|
||||
+cell Exceptions and special-cases for the tokenizer.
|
||||
|
||||
+row
|
||||
+cell #[code prefix_search]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).search] to match prefixes.
|
||||
|
||||
+row
|
||||
+cell #[code suffix_search]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).search] to match suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code infix_finditer]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).finditer] to find infixes.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Tokenizer]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "init") Tokenizer.__init__
|
||||
+tag method
|
||||
|
||||
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code vocab]
|
||||
+cell #[code Vocab]
|
||||
+cell A storage container for lexical types.
|
||||
|
||||
+row
|
||||
+cell #[code rules]
|
||||
+cell dict
|
||||
+cell Exceptions and special-cases for the tokenizer.
|
||||
|
||||
+row
|
||||
+cell #[code prefix_search]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).search] to match prefixes.
|
||||
|
||||
+row
|
||||
+cell #[code suffix_search]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).search] to match suffixes.
|
||||
|
||||
+row
|
||||
+cell #[code infix_finditer]
|
||||
+cell callable
|
||||
+cell
|
||||
| A function matching the signature of
|
||||
| #[code re.compile(string).finditer] to find infixes.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Tokenizer]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "call") Tokenizer.__call__
|
||||
+tag method
|
||||
|
||||
p Tokenize a string.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to tokenize.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Doc]
|
||||
+cell A container for linguistic annotations.
|
||||
|
||||
+h(2, "pipe") Tokenizer.pipe
|
||||
+tag method
|
||||
|
||||
p Tokenize a stream of texts.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code texts]
|
||||
+cell -
|
||||
+cell A sequence of unicode texts.
|
||||
|
||||
+row
|
||||
+cell #[code batch_size]
|
||||
+cell int
|
||||
+cell The number of texts to accumulate in an internal buffer.
|
||||
|
||||
+row
|
||||
+cell #[code n_threads]
|
||||
+cell int
|
||||
+cell
|
||||
| The number of threads to use, if the implementation supports
|
||||
| multi-threading. The default tokenizer is single-threaded.
|
||||
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell #[code Doc]
|
||||
+cell A sequence of Doc objects, in order.
|
||||
|
||||
+h(2, "find_infix") Tokenizer.find_infix
|
||||
+tag method
|
||||
|
||||
p Find internal split points of the string.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to split.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code List[re.MatchObject]]
|
||||
+cell
|
||||
| A list of objects that have #[code .start()] and #[code .end()]
|
||||
| methods, denoting the placement of internal segment separators,
|
||||
| e.g. hyphens.
|
||||
|
||||
+h(2, "find_prefix") Tokenizer.find_prefix
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Find the length of a prefix that should be segmented from the string, or
|
||||
| #[code None] if no prefix rules match.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to segment.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int / #[code None]
|
||||
+cell The length of the prefix if present, otherwise #[code None].
|
||||
|
||||
+h(2, "find_suffix") Tokenizer.find_suffix
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Find the length of a suffix that should be segmented from the string, or
|
||||
| #[code None] if no suffix rules match.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to segment.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int / #[code None]
|
||||
+cell The length of the suffix if present, otherwise #[code None].
|
||||
|
||||
+h(2, "add_special_case") Tokenizer.add_special_case
|
||||
+tag method
|
||||
|
||||
p Add a special-case tokenization rule.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to specially tokenize.
|
||||
|
||||
+row
|
||||
+cell #[code token_attrs]
|
||||
+cell -
|
||||
+cell
|
||||
| A sequence of dicts, where each dict describes a token and its
|
||||
| attributes. The #[code ORTH] fields of the attributes must
|
||||
| exactly match the string when they are concatenated.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
|
|
@ -14,7 +14,7 @@ p
|
|||
| recommend having additional tests in place if your application depends on
|
||||
| any of spaCy's utilities.
|
||||
|
||||
+h(2, "get_data_path") get_data_path
|
||||
+h(2, "get_data_path") util.get_data_path
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -28,11 +28,11 @@ p
|
|||
+cell Only return path if it exists, otherwise return #[code None].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Path] / #[code None]
|
||||
+cell Data path or #[code None].
|
||||
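p
    | For example, a quick check of where spaCy will look for model data
    | (a minimal sketch):

+aside-code("Example").
    from spacy import util
    data_path = util.get_data_path()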
|
||||
+h(2, "set_data_path") set_data_path
|
||||
+h(2, "set_data_path") util.set_data_path
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -49,7 +49,7 @@ p
|
|||
+cell unicode or #[code Path]
|
||||
+cell Path to new data directory.
|
||||
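p
    | For example (the path below is only a placeholder):

+aside-code("Example").
    from spacy import util
    util.set_data_path('/custom/data/path')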
|
||||
+h(2, "get_lang_class") get_lang_class
|
||||
+h(2, "get_lang_class") util.get_lang_class
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -70,11 +70,11 @@ p
|
|||
+cell Two-letter language code, e.g. #[code 'en'].
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Language]
|
||||
+cell Language class.
|
||||
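p
    | A minimal sketch (assuming the English language data is available, and
    | that instantiating the class gives you a blank pipeline):

+aside-code("Example").
    from spacy import util
    lang_class = util.get_lang_class('en')
    nlp = lang_class()  # blank Language instance for English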
|
||||
+h(2, "resolve_model_path") resolve_model_path
|
||||
+h(2, "resolve_model_path") util.resolve_model_path
|
||||
+tag function
|
||||
|
||||
p Resolve a model name or string to a model path.
|
||||
|
@ -90,11 +90,11 @@ p Resolve a model name or string to a model path.
|
|||
+cell Package name, shortcut link or model path.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Path]
|
||||
+cell Path to model data directory.
|
||||
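p
    | For example (assuming a model or shortcut link named #[code 'en'] is
    | installed on your system):

+aside-code("Example").
    from spacy import util
    model_path = util.resolve_model_path('en')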
|
||||
+h(2, "is_package") is_package
|
||||
+h(2, "is_package") util.is_package
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -112,11 +112,11 @@ p
|
|||
+cell Name of package.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code bool]
|
||||
+cell #[code True] if installed package, #[code False] if not.
|
||||
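p
    | For example (the package names below are only illustrative):

+aside-code("Example").
    from spacy import util
    util.is_package('en_core_web_sm')   # True if installed via pip
    util.is_package('xyz_no_such_pkg')  # False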
|
||||
+h(2, "get_model_package_path") get_model_package_path
|
||||
+h(2, "get_model_package_path") util.get_model_package_path
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -134,11 +134,11 @@ p
|
|||
+cell Name of installed package.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Path]
|
||||
+cell Path to model data directory.
|
||||
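p
    | For example (assuming the package is installed; the name is
    | illustrative):

+aside-code("Example").
    from spacy import util
    package_path = util.get_model_package_path('en_core_web_sm')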
|
||||
+h(2, "parse_package_meta") parse_package_meta
|
||||
+h(2, "parse_package_meta") util.parse_package_meta
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -163,11 +163,31 @@ p
|
|||
+cell If #[code True], raise error if no #[code meta.json] is found.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell dict / #[code None]
|
||||
+cell Model meta data or #[code None].
|
||||
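p
    | A minimal sketch, combining it with #[code resolve_model_path]
    | (assuming a model named #[code 'en'] is available and its
    | #[code meta.json] contains the usual #[code lang] and #[code version]
    | fields):

+aside-code("Example").
    from spacy import util
    model_path = util.resolve_model_path('en')
    meta = util.parse_package_meta(model_path, require=True)
    print(meta['lang'], meta['version'])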
|
||||
+h(2, "update_exc") update_exc
|
||||
+h(2, "is_in_jupyter") util.is_in_jupyter
|
||||
+tag function
|
||||
|
||||
p
|
||||
| Check if user is running spaCy from a #[+a("https://jupyter.org") Jupyter]
|
||||
| notebook by detecting the IPython kernel. Mainly used for the
|
||||
| #[+api("displacy") #[code displacy]] visualizer.
|
||||
|
||||
+aside-code("Example").
|
||||
html = '<h1>Hello world!</h1>'
|
||||
if util.is_in_jupyter():
|
||||
from IPython.core.display import display, HTML
|
||||
display(HTML(html))
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell #[code True] if in Jupyter, #[code False] if not.
|
||||
|
||||
+h(2, "update_exc") util.update_exc
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
@ -194,12 +214,12 @@ p
|
|||
+cell Exception dictionaries to add to the base exceptions, in order.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell dict
|
||||
+cell Combined tokenizer exceptions.
|
||||
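p
    | A rough sketch of how base and language-specific exceptions might be
    | merged (the dictionaries below are made up for illustration):

+aside-code("Example").
    from spacy.util import update_exc
    from spacy.attrs import ORTH, LEMMA
    BASE_EXCEPTIONS = {"a.m.": [{ORTH: "a.m."}]}
    TOKENIZER_EXCEPTIONS = {"smth": [{ORTH: "smth", LEMMA: "something"}]}
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)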
|
||||
|
||||
+h(2, "prints") prints
|
||||
+h(2, "prints") util.prints
|
||||
+tag function
|
||||
|
||||
p
|
||||
|
|
|
@ -7,59 +7,6 @@ p
|
|||
| #[code Vocab] instance also provides access to the #[code StringStore],
|
||||
| and owns underlying C-data that is shared between #[code Doc] objects.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code strings]
|
||||
+cell #[code StringStore]
|
||||
+cell A table managing the string-to-int mapping.
|
||||
|
||||
+row
|
||||
+cell #[code vectors_length]
|
||||
+cell int
|
||||
+cell The dimensionality of the word vectors, if present.
|
||||
|
||||
+h(2, "load") Vocab.load
|
||||
+tag classmethod
|
||||
|
||||
p Load the vocabulary from a path.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code path]
|
||||
+cell #[code Path]
|
||||
+cell The path to load from.
|
||||
|
||||
+row
|
||||
+cell #[code lex_attr_getters]
|
||||
+cell dict
|
||||
+cell
|
||||
| A dictionary mapping attribute IDs to functions to compute them.
|
||||
| Defaults to #[code None].
|
||||
|
||||
+row
|
||||
+cell #[code lemmatizer]
|
||||
+cell -
|
||||
+cell A lemmatizer. Defaults to #[code None].
|
||||
|
||||
+row
|
||||
+cell #[code tag_map]
|
||||
+cell dict
|
||||
+cell
|
||||
| A dictionary mapping fine-grained tags to coarse-grained
|
||||
| parts-of-speech, and optionally morphological attributes.
|
||||
|
||||
+row
|
||||
+cell #[code oov_prob]
|
||||
+cell float
|
||||
+cell The default probability for out-of-vocabulary words.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code Vocab]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "init") Vocab.__init__
|
||||
+tag method
|
||||
|
||||
|
@ -73,11 +20,6 @@ p Create the vocabulary.
|
|||
| A dictionary mapping attribute IDs to functions to compute them.
|
||||
| Defaults to #[code None].
|
||||
|
||||
+row
|
||||
+cell #[code lemmatizer]
|
||||
+cell -
|
||||
+cell A lemmatizer. Defaults to #[code None].
|
||||
|
||||
+row
|
||||
+cell #[code tag_map]
|
||||
+cell dict
|
||||
|
@ -86,23 +28,34 @@ p Create the vocabulary.
|
|||
| parts-of-speech, and optionally morphological attributes.
|
||||
|
||||
+row
|
||||
+cell #[code oov_prob]
|
||||
+cell float
|
||||
+cell The default probability for out-of-vocabulary words.
|
||||
+cell #[code lemmatizer]
|
||||
+cell object
|
||||
+cell A lemmatizer. Defaults to #[code None].
|
||||
|
||||
+row
|
||||
+cell #[code strings]
|
||||
+cell #[code StringStore]
|
||||
+cell
|
||||
| A #[code StringStore] that maps strings to integers, and vice
|
||||
| versa.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Vocab]
|
||||
+cell The newly constructed object.
|
||||
|
||||
+h(2, "len") Vocab.__len__
|
||||
+tag method
|
||||
|
||||
p Get the number of lexemes in the vocabulary.
|
||||
p Get the current number of lexemes in the vocabulary.
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'This is a sentence.')
|
||||
assert len(nlp.vocab) > 0
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The number of lexemes in the vocabulary.
|
||||
|
||||
|
@ -113,6 +66,10 @@ p
|
|||
| Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||
| unseen unicode string is given, a new lexeme is created and stored.
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp.vocab.strings['apple']
|
||||
assert nlp.vocab[apple] == nlp.vocab[u'apple']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code id_or_string]
|
||||
|
@ -120,25 +77,37 @@ p
|
|||
+cell The integer ID of a word, or its unicode string.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell #[code Lexeme]
|
||||
+cell The lexeme indicated by the given ID.
|
||||
|
||||
+h(2, "iter") Span.__iter__
|
||||
+h(2, "iter") Vocab.__iter__
|
||||
+tag method
|
||||
|
||||
p Iterate over the lexemes in the vocabulary.
|
||||
|
||||
+aside-code("Example").
|
||||
stop_words = (lex for lex in nlp.vocab if lex.is_stop)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
+cell yield
|
||||
+cell yields
|
||||
+cell #[code Lexeme]
|
||||
+cell An entry in the vocabulary.
|
||||
|
||||
+h(2, "contains") Vocab.__contains__
|
||||
+tag method
|
||||
|
||||
p Check whether the string has an entry in the vocabulary.
|
||||
p
|
||||
| Check whether the string has an entry in the vocabulary. To get the ID
|
||||
| for a given string, you need to look it up in
|
||||
| #[+api("vocab#attributes") #[code vocab.strings]].
|
||||
|
||||
+aside-code("Example").
|
||||
apple = nlp.vocab.strings['apple']
|
||||
oov = nlp.vocab.strings['dskfodkfos']
|
||||
assert apple in nlp.vocab
|
||||
assert oov not in nlp.vocab
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -147,32 +116,27 @@ p Check whether the string has an entry in the vocabulary.
|
|||
+cell The ID string.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell bool
|
||||
+cell Whether the string has an entry in the vocabulary.
|
||||
|
||||
+h(2, "resize_vectors") Vocab.resize_vectors
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Set #[code vectors_length] to a new size, and allocate more memory for
|
||||
| the #[code Lexeme] vectors if necessary. The memory will be zeroed.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code new_size]
|
||||
+cell int
|
||||
+cell The new size of the vectors.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
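p
    | A minimal sketch (assuming an #[code nlp] object with a loaded vocab):

+aside-code("Example").
    nlp.vocab.resize_vectors(300)
    assert nlp.vocab.vectors_length == 300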
|
||||
+h(2, "add_flag") Vocab.add_flag
|
||||
+tag method
|
||||
|
||||
p Set a new boolean flag to words in the vocabulary.
|
||||
p
|
||||
| Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
|
||||
| function will be called over the words currently in the vocab, and then
|
||||
| applied to new words as they occur. You'll then be able to access the flag
|
||||
| value on each token, using #[code token.check_flag(flag_id)].
|
||||
|
||||
+aside-code("Example").
|
||||
def is_my_product(text):
|
||||
products = [u'spaCy', u'Thinc', u'displaCy']
|
||||
return text in products
|
||||
|
||||
MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
|
||||
doc = nlp(u'I like spaCy')
|
||||
assert doc[2].check_flag(MY_PRODUCT) == True
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
|
@ -189,90 +153,104 @@ p Set a new boolean flag to words in the vocabulary.
|
|||
| available bit will be chosen.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell returns
|
||||
+cell int
|
||||
+cell The integer ID by which the flag value can be checked.
|
||||
|
||||
+h(2, "dump") Vocab.dump
|
||||
+h(2, "to_disk") Vocab.to_disk
|
||||
+tag method
|
||||
|
||||
p Save the lexemes binary data to the given location.
|
||||
p Save the current state to a directory.
|
||||
|
||||
+aside-code("Example").
|
||||
nlp.vocab.to_disk('/path/to/vocab')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code loc]
|
||||
+cell #[code Path]
|
||||
+cell The path to load from.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "load_lexemes") Vocab.load_lexemes
|
||||
+tag method
|
||||
|
||||
p
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code loc]
|
||||
+cell unicode
|
||||
+cell Path to load the lexemes.bin file from.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "dump_vectors") Vocab.dump_vectors
|
||||
+tag method
|
||||
|
||||
p Save the word vectors to a binary file.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code loc]
|
||||
+cell #[code Path]
|
||||
+cell The path to save to.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell #[code None]
|
||||
+cell -
|
||||
|
||||
+h(2, "load_vectors") Vocab.load_vectors
|
||||
+tag method
|
||||
|
||||
p Load vectors from a text-based file.
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code file_]
|
||||
+cell buffer
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| The file to read from. Entries should be separated by newlines,
|
||||
| and each entry should be whitespace delimited. The first value
|
||||
| of the entry should be the word string, and subsequent entries
|
||||
| should be the values of the vector.
|
||||
| A path to a directory, which will be created if it doesn't exist.
|
||||
| Paths may be either strings or #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int
|
||||
+cell The length of the vectors loaded.
|
||||
|
||||
+h(2, "load_vectors_from_bin_loc") Vocab.load_vectors_from_bin_loc
|
||||
+h(2, "from_disk") Vocab.from_disk
|
||||
+tag method
|
||||
|
||||
p Load vectors from the location of a binary file.
|
||||
p Loads state from a directory. Modifies the object in place and returns it.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.vocab import Vocab
|
||||
vocab = Vocab().from_disk('/path/to/vocab')
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code loc]
|
||||
+cell unicode
|
||||
+cell The path of the binary file to load from.
|
||||
+cell #[code path]
|
||||
+cell unicode or #[code Path]
|
||||
+cell
|
||||
| A path to a directory. Paths may be either strings or
|
||||
| #[code Path]-like objects.
|
||||
|
||||
+footrow
|
||||
+cell return
|
||||
+cell int
|
||||
+cell The length of the vectors loaded.
|
||||
+cell returns
|
||||
+cell #[code Vocab]
|
||||
+cell The modified #[code Vocab] object.
|
||||
|
||||
+h(2, "to_bytes") Vocab.to_bytes
|
||||
+tag method
|
||||
|
||||
p Serialize the current state to a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
vocab_bytes = nlp.vocab.to_bytes()
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being serialized.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell bytes
|
||||
+cell The serialized form of the #[code Vocab] object.
|
||||
|
||||
+h(2, "from_bytes") Vocab.from_bytes
|
||||
+tag method
|
||||
|
||||
p Load state from a binary string.
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.vocab import Vocab
|
||||
vocab_bytes = nlp.vocab.to_bytes()
|
||||
vocab = Vocab()
|
||||
vocab.from_bytes(vocab_bytes)
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code bytes_data]
|
||||
+cell bytes
|
||||
+cell The data to load from.
|
||||
|
||||
+row
|
||||
+cell #[code **exclude]
|
||||
+cell -
|
||||
+cell Named attributes to prevent from being loaded.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell #[code Vocab]
|
||||
+cell The #[code Vocab] object.
|
||||
|
||||
+h(2, "attributes") Attributes
|
||||
|
||||
+aside-code("Example").
|
||||
apple_id = nlp.vocab.strings['apple']
|
||||
assert type(apple_id) == int
|
||||
PERSON = nlp.vocab.strings['PERSON']
|
||||
assert type(PERSON) == int
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code strings]
|
||||
+cell #[code StringStore]
|
||||
+cell A table managing the string-to-int mapping.
|
||||
|
|
|
@ -56,20 +56,22 @@ p
|
|||
from ...attrs import LANG
|
||||
from ...util import update_exc
|
||||
|
||||
# create Defaults class in the module scope (necessary for pickling!)
|
||||
class XxxxxDefaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
|
||||
|
||||
# optional: replace flags with custom functions, e.g. like_num()
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
# merge base exceptions and custom tokenizer exceptions
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
|
||||
# create actual Language class
|
||||
class Xxxxx(Language):
|
||||
lang = 'xx' # language ISO code
|
||||
|
||||
# override defaults
|
||||
class Defaults(Language.Defaults):
|
||||
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
|
||||
lex_attr_getters[LANG] = lambda text: 'xx' # language ISO code
|
||||
|
||||
# optional: replace flags with custom functions, e.g. like_num()
|
||||
lex_attr_getters.update(LEX_ATTRS)
|
||||
|
||||
# merge base exceptions and custom tokenizer exceptions
|
||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||
stop_words = set(STOP_WORDS)
|
||||
Defaults = XxxxxDefaults # override defaults
|
||||
|
||||
# set default export – this allows the language class to be lazy-loaded
|
||||
__all__ = ['Xxxxx']
|
||||
|
|
|
@ -141,11 +141,11 @@ p
|
|||
include ../api/_annotation/_named-entities
|
||||
|
||||
+aside("Install")
|
||||
| The #[+api("load") spacy.load()] function configures a pipeline that
|
||||
| The #[+api("load") #[code spacy.load()]] function configures a pipeline that
|
||||
| includes all of the available annotators for the given ID. In the example
|
||||
| above, the #[code 'en'] ID tells spaCy to load the default English
|
||||
| pipeline. If you have installed the data with
|
||||
| #[code python -m spacy.en.download] this will include the entity
|
||||
| #[code python -m spacy download en], this will include the entity
|
||||
| recognition model.
|
||||
|
||||
+h(2, "updating") Training and updating
|
||||
|
|
|
@ -4,58 +4,190 @@ include ../../_includes/_mixins
|
|||
|
||||
p
|
||||
| spaCy features a rule-matching engine that operates over tokens, similar
|
||||
| to regular expressions. The rules can refer to token annotations and
|
||||
| flags, and matches support callbacks to accept, modify and/or act on the
|
||||
| match. The rule matcher also allows you to associate patterns with
|
||||
| entity IDs, to allow some basic entity linking or disambiguation.
|
||||
| to regular expressions. The rules can refer to token annotations (e.g.
|
||||
| the token #[code text] or #[code tag_]), and flags (e.g. #[code IS_PUNCT]).
|
||||
| The rule matcher also lets you pass in a custom callback
|
||||
| to act on matches – for example, to merge entities and apply custom labels.
|
||||
| You can also associate patterns with entity IDs, to allow some basic
|
||||
| entity linking or disambiguation.
|
||||
|
||||
p Here's a minimal example. We first add a pattern that specifies three tokens:
|
||||
+aside("What about \"real\" regular expressions?")
|
||||
|
||||
+list("numbers")
|
||||
+item A token whose lower-case form matches "hello"
|
||||
+item A token whose #[code is_punct] flag is set to #[code True]
|
||||
+item A token whose lower-case form matches "world"
|
||||
+h(2, "adding-patterns") Adding patterns
|
||||
|
||||
p
|
||||
| Once we've added the pattern, we can use the #[code matcher] as a
|
||||
| callable, to receive a list of #[code (ent_id, start, end)] tuples.
|
||||
| Note that #[code LOWER] and #[code IS_PUNCT] are data attributes
|
||||
| of #[code spacy.attrs].
|
||||
| Let's say we want to enable spaCy to find a combination of three tokens:
|
||||
|
||||
+list("numbers")
|
||||
+item
|
||||
| A token whose #[strong lower-case form matches "hello"], e.g. "Hello"
|
||||
| or "HELLO".
|
||||
+item
|
||||
| A token whose #[strong #[code is_punct] flag is set to #[code True]],
|
||||
| i.e. any punctuation.
|
||||
+item
|
||||
| A token whose #[strong lower-case form matches "world"], e.g. "World"
|
||||
| or "WORLD".
|
||||
|
||||
+code.
|
||||
from spacy.matcher import Matcher
|
||||
matcher = Matcher(nlp.vocab)
|
||||
matcher.add_pattern("HelloWorld", [{LOWER: "hello"}, {IS_PUNCT: True}, {LOWER: "world"}])
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}]
|
||||
|
||||
doc = nlp(u'Hello, world!')
|
||||
p
|
||||
| First, we initialise the #[code Matcher] with a vocab. The matcher must
|
||||
| always share the same vocab with the documents it will operate on. We
|
||||
| can now call #[+api("matcher#add") #[code matcher.add()]] with an ID and
|
||||
| our custom pattern. The second argument lets you pass in an optional
|
||||
| callback function to invoke on a successful match. For now, we set it
|
||||
| to #[code None].
|
||||
|
||||
+code.
|
||||
import spacy
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import LOWER, IS_PUNCT # don't forget to import the attrs!
|
||||
|
||||
nlp = spacy.load('en')
|
||||
matcher = Matcher(nlp.vocab)
|
||||
# add match ID "HelloWorld" with no callback and one pattern
|
||||
matcher.add('HelloWorld', None,
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}])
|
||||
|
||||
doc = nlp(u'Hello, world! Hello world!')
|
||||
matches = matcher(doc)
|
||||
|
||||
p
|
||||
| The returned matches include the ID, to let you associate the matches
|
||||
| with the patterns. You can also group multiple patterns together, which
|
||||
| is useful when you have a knowledge base of entities you want to match,
|
||||
| and you want to write multiple patterns for each entity.
|
||||
|
||||
+h(2, "entities-patterns") Entities and patterns
|
||||
| The matcher returns a list of #[code (match_id, start, end)] tuples – in
|
||||
| this case, #[code [('HelloWorld', 0, 2)]], which maps to the span
|
||||
| #[code doc[0:2]] of our original document. Optionally, we could also
|
||||
| choose to add more than one pattern, for example to also match sequences
|
||||
| without punctuation between "hello" and "world":
|
||||
|
||||
+code.
|
||||
matcher.add_entity(
|
||||
"GoogleNow", # Entity ID -- Helps you act on the match.
|
||||
{"ent_type": "PRODUCT", "wiki_en": "Google_Now"}, # Arbitrary attributes (optional)
|
||||
)
|
||||
matcher.add('HelloWorld', None,
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
|
||||
[{LOWER: 'hello'}, {LOWER: 'world'}])
|
||||
|
||||
matcher.add_pattern(
|
||||
"GoogleNow", # Entity ID -- Created if doesn't exist.
|
||||
[ # The pattern is a list of *Token Specifiers*.
|
||||
{ # This Token Specifier matches tokens whose orth field is "Google"
|
||||
ORTH: "Google"
|
||||
},
|
||||
{ # This Token Specifier matches tokens whose orth field is "Now"
|
||||
ORTH: "Now"
|
||||
}
|
||||
],
|
||||
label=None # Can associate a label to the pattern-match, to handle it better.
|
||||
)
|
||||
p
|
||||
| By default, the matcher will only return the matches and
|
||||
| #[strong not do anything else], like merge entities or assign labels.
|
||||
| This is all up to you and can be defined individually for each pattern,
|
||||
| by passing in a callback function as the #[code on_match] argument on
|
||||
| #[code add()]. This is useful, because it lets you write entirely custom
|
||||
| and #[strong pattern-specific logic]. For example, you might want to
|
||||
| merge #[em some] patterns into one token, while adding entity labels for
|
||||
| other pattern types. You shouldn't have to create different matchers for
|
||||
| each of those processes.
|
||||
|
||||
+h(2, "on_match") Adding #[code on_match] rules
|
||||
|
||||
p
|
||||
| To move on to a more realistic example, let's say you're working with a
|
||||
| large corpus of blog articles, and you want to match all mentions of
|
||||
| "Google I/O" (which spaCy tokenizes as #[code ['Google', 'I', '/', 'O']]).
|
||||
| To be safe, you only match on the uppercase versions, in case someone has
|
||||
| written it as "Google i/o". You also add a second pattern with an added
|
||||
| #[code {IS_DIGIT: True}] token – this will make sure you also match on
|
||||
| "Google I/O 2017". If your pattern matches, spaCy should execute your
|
||||
| custom callback function #[code add_event_ent].
|
||||
|
||||
+code.
|
||||
import spacy
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import ORTH, UPPER, LOWER, IS_DIGIT
|
||||
|
||||
nlp = spacy.load('en')
|
||||
matcher = Matcher(nlp.vocab)
|
||||
|
||||
matcher.add('GoogleIO', add_event_ent,
|
||||
[{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}],
|
||||
[{ORTH: 'Google'}, {UPPER: 'I'}, {ORTH: '/'}, {UPPER: 'O'}, {IS_DIGIT: True}])
|
||||
|
||||
# Get the ID of the 'EVENT' entity type. This is required to set an entity.
|
||||
EVENT = nlp.vocab.strings['EVENT']
|
||||
|
||||
def add_event_ent(matcher, doc, i, matches):
|
||||
# Get the current match and create tuple of entity label, start and end.
|
||||
# Append entity to the doc's entity. (Don't overwrite doc.ents!)
|
||||
match_id, start, end = matches[i]
|
||||
doc.ents += ((EVENT, start, end),)
|
||||
|
||||
p
|
||||
| In addition to mentions of "Google I/O", your data also contains some
|
||||
| annoying pre-processing artefacts, like leftover HTML line breaks
|
||||
| (e.g. #[code <br>] or #[code <BR/>]). While you're at it,
|
||||
| you want to merge those into one token and flag them, to make sure you
|
||||
| can easily ignore them later. So you add a second pattern and pass in a
|
||||
| function #[code merge_and_flag]:
|
||||
|
||||
+code.
|
||||
matcher.add('BAD_HTML', merge_and_flag,
|
||||
[{ORTH: '<'}, {LOWER: 'br'}, {ORTH: '>'}],
|
||||
[{ORTH: '<'}, {LOWER: 'br/'}, {ORTH: '>'}])
|
||||
|
||||
# Add a new custom flag to the vocab, which is always False by default.
|
||||
# BAD_HTML_FLAG will be the flag ID, which we can use to set it to True on the span.
|
||||
BAD_HTML_FLAG = doc.vocab.add_flag(lambda text: False)
|
||||
|
||||
def merge_and_flag(matcher, doc, i, matches):
|
||||
match_id, start, end = matches[i]
|
||||
span = doc[start : end]
|
||||
span.merge(is_stop=True) # merge (and mark it as a stop word, just in case)
|
||||
span.set_flag(BAD_HTML_FLAG, True) # set BAD_HTML_FLAG
|
||||
|
||||
+aside("Tip: Visualizing matches")
|
||||
| When working with entities, you can use #[+api("displacy") displaCy]
|
||||
| to quickly generate a NER visualization from your updated #[code Doc],
|
||||
| which can be exported as an HTML file:
|
||||
|
||||
+code.o-no-block.
|
||||
from spacy import displacy
|
||||
html = displacy.render(doc, style='ent', page=True,
|
||||
options={'ents': ['EVENT']})
|
||||
|
||||
| For more info and examples, see the usage workflow on
|
||||
| #[+a("/docs/usage/visualizers") visualizing spaCy].
|
||||
|
||||
p
|
||||
| We can now call the matcher on our documents. The patterns will be
|
||||
| matched in the order they occur in the text.
|
||||
|
||||
+code.
|
||||
doc = nlp(LOTS_OF_TEXT)
|
||||
matcher(doc)
|
||||
|
||||
+h(3, "on_match-callback") The callback function
|
||||
|
||||
p
|
||||
| The matcher will first collect all matches over the document. It will
|
||||
| then iterate over the matches, lookup the callback for the entity ID
|
||||
| that was matched, and invoke it. When the callback is invoked, it is
|
||||
| passed four arguments: the matcher itself, the document, the position of
|
||||
| the current match, and the total list of matches. This allows you to
|
||||
| write callbacks that consider the entire set of matched phrases, so that
|
||||
| you can resolve overlaps and other conflicts in whatever way you prefer.
|
||||
|
||||
+table(["Argument", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code matcher]
|
||||
+cell #[code Matcher]
|
||||
+cell The matcher instance.
|
||||
|
||||
+row
|
||||
+cell #[code doc]
|
||||
+cell #[code Doc]
|
||||
+cell The document the matcher was used on.
|
||||
|
||||
+row
|
||||
+cell #[code i]
|
||||
+cell int
|
||||
+cell Index of the current match (#[code matches[i]]).
|
||||
|
||||
+row
|
||||
+cell #[code matches]
|
||||
+cell list
|
||||
+cell
|
||||
| A list of #[code (match_id, start, end)] tuples, describing the
|
||||
| matches. A match tuple describes a span #[code doc[start:end]].
|
||||
| The #[code match_id] is the ID of the added match pattern.
|
||||
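p
    | Putting this together, a minimal callback that only prints the matched
    | span might look like this (a sketch, reusing the pattern from above;
    | the match ID is only illustrative):

+code.
    def print_match(matcher, doc, i, matches):
        # matches[i] is the match that triggered this callback
        match_id, start, end = matches[i]
        print('Matched:', doc[start:end].text)

    matcher.add('HelloWorldVerbose', print_match,
                [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}])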
|
||||
+h(2, "quantifiers") Using quantifiers
|
||||
|
||||
|
@ -82,78 +214,4 @@ p
|
|||
|
||||
p
|
||||
| There are no nested or scoped quantifiers. You can build those
|
||||
| behaviours with acceptors and
|
||||
| #[+api("matcher#add_entity") #[code on_match]] callbacks.
|
||||
|
||||
+h(2, "acceptor-functions") Acceptor functions
|
||||
|
||||
p
|
||||
| The #[code acceptor] keyword of #[code matcher.add_entity()] allows you to
|
||||
| pass a function to reject or modify matches. The function you pass should
|
||||
| take five arguments: #[code doc], #[code ent_id], #[code label], #[code start],
|
||||
| and #[code end]. You can return a falsey value to reject the match, or
|
||||
| return a 4-tuple #[code (ent_id, label, start, end)].
|
||||
|
||||
+code.
|
||||
from spacy.tokens.doc import Doc
|
||||
def trim_title(doc, ent_id, label, start, end):
|
||||
if doc[start].check_flag(IS_TITLE_TERM):
|
||||
return (ent_id, label, start+1, end)
|
||||
else:
|
||||
return (ent_id, label, start, end)
|
||||
titles = set(title.lower() for title in [u'Mr.', 'Dr.', 'Ms.', u'Admiral'])
|
||||
IS_TITLE_TERM = matcher.vocab.add_flag(lambda string: string.lower() in titles)
|
||||
matcher.add_entity('PersonName', acceptor=trim_title)
|
||||
matcher.add_pattern('PersonName', [{LOWER: 'mr.'}, {LOWER: 'cruise'}])
|
||||
matcher.add_pattern('PersonName', [{LOWER: 'dr.'}, {LOWER: 'seuss'}])
|
||||
doc = Doc(matcher.vocab, words=[u'Mr.', u'Cruise', u'likes', 'Dr.', u'Seuss'])
|
||||
for ent_id, label, start, end in matcher(doc):
|
||||
print(doc[start:end].text)
|
||||
# Cruise
|
||||
# Seuss
|
||||
|
||||
p
|
||||
| Passing an #[code acceptor] function allows you to match patterns with
|
||||
| arbitrary logic that can't easily be expressed by a finite-state machine.
|
||||
| You can look at the entirety of the
|
||||
| matched phrase, and its context in the document, and decide to move
|
||||
| the boundaries or reject the match entirely.
|
||||
|
||||
+h(2, "callback-functions") Callback functions
|
||||
|
||||
p
|
||||
| In spaCy <1.0, the #[code Matcher] automatically tagged matched phrases
|
||||
| with entity types. Since spaCy 1.0, the matcher no longer acts on matches
|
||||
| automatically. By default, the match list is returned for the user to action.
|
||||
| However, it's often more convenient to register the required actions as a
|
||||
| callback. You can do this by passing a function to the #[code on_match]
|
||||
| keyword argument of #[code matcher.add_entity].
|
||||
|
||||
+aside-code("Example").
|
||||
def merge_phrases(matcher, doc, i, matches):
|
||||
'''
|
||||
Merge a phrase. We have to be careful here because we'll change the token indices.
|
||||
To avoid problems, merge all the phrases once we're called on the last match.
|
||||
'''
|
||||
if i != len(matches)-1:
|
||||
return None
|
||||
# Get Span objects
|
||||
spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]
|
||||
for ent_id, label, span in spans:
|
||||
span.merge(label=label, tag='NNP' if label else span.root.tag_)
|
||||
|
||||
matcher.add_entity('GoogleNow', on_match=merge_phrases)
|
||||
matcher.add_pattern('GoogleNow', [{ORTH: 'Google'}, {ORTH: 'Now'}])
|
||||
doc = Doc(matcher.vocab, words=[u'Google', u'Now', u'is', u'being', u'rebranded'])
|
||||
matcher(doc)
|
||||
print([w.text for w in doc])
|
||||
# [u'Google Now', u'is', u'being', u'rebranded']
|
||||
|
||||
p
|
||||
| The matcher will first collect all matches over the document. It will
|
||||
| then iterate over the matches, look-up the callback for the entity ID
|
||||
| that was matched, and invoke it. When the callback is invoked, it is
|
||||
| passed four arguments: the matcher itself, the document, the position of
|
||||
| the current match, and the total list of matches. This allows you to
|
||||
| write callbacks that consider the entire set of matched phrases, so that
|
||||
| you can resolve overlaps and other conflicts in whatever way you prefer.
|
||||
| behaviours with #[code on_match] callbacks.
|
||||
|
|
|
@ -2,9 +2,218 @@
|
|||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p
|
||||
| We also re-wrote a large part of the documentation and usage workflows,
|
||||
| and added more examples.
|
||||
|
||||
+h(2, "features") New features
|
||||
|
||||
+h(3, "features-displacy") displaCy visualizer with Jupyter support
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy import displacy
|
||||
doc = nlp(u'This is a sentence about Facebook.')
|
||||
displacy.serve(doc, style='dep') # run the web server
|
||||
html = displacy.render(doc, style='ent') # generate HTML
|
||||
|
||||
p
|
||||
| Our popular dependency and named entity visualizers are now an official
|
||||
| part of the spaCy library! displaCy can run a simple web server, or
|
||||
| generate raw HTML markup or SVG files to be exported. You can pass in one
|
||||
| or more docs, and customise the style. displaCy also auto-detects whether
|
||||
| you're running #[+a("https://jupyter.org") Jupyter] and will render the
|
||||
| visualizations in your notebook.
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("displacy") #[code displacy]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/visualizers") Visualizing spaCy]
|
||||
|
||||
+h(3, "features-loading") Loading
|
||||
|
||||
+aside-code("Example").
|
||||
nlp = spacy.load('en') # shortcut link
|
||||
nlp = spacy.load('en_core_web_sm') # package
|
||||
nlp = spacy.load('/path/to/en') # unicode path
|
||||
nlp = spacy.load(Path('/path/to/en')) # pathlib Path
|
||||
|
||||
p
|
||||
| The improved #[code spacy.load] makes loading models easier and more
|
||||
| transparent. You can load a model by supplying its
|
||||
| #[+a("/docs/usage/models#usage") shortcut link], the name of an installed
|
||||
| #[+a("/docs/usage/saving-loading#generating") model package], a unicode
|
||||
| path or a #[code Path]-like object. spaCy will try resolving the load
|
||||
| argument in this order. The #[code path] keyword argument is now deprecated.
|
||||
|
||||
p
|
||||
| The #[code Language] class to initialise will be determined based on the
|
||||
| model's settings. If no model is found, spaCy will let you know and won't
|
||||
| just return an empty #[code Language] object anymore. If you want a blank
|
||||
| language, you can always import the class directly, e.g.
|
||||
| #[code from spacy.lang.en import English].
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("spacy#load") #[code spacy.load]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
||||
|
||||
+h(3, "features-language") Improved language data and processing pipelines
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.language import Language
|
||||
nlp = Language(pipeline=['token_vectors', 'tags',
|
||||
'dependencies'])
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
|
||||
|
||||
+h(3, "features-lemmatizer") Simple lookup-based lemmatization
|
||||
|
||||
+aside-code("Example").
|
||||
LOOKUP = {
|
||||
"aba": "abar",
|
||||
"ababa": "abar",
|
||||
"ababais": "abar",
|
||||
"ababan": "abar",
|
||||
"ababanes": "ababán"
|
||||
}
|
||||
|
||||
p
|
||||
| spaCy now supports simple lookup-based lemmatization. The data is stored
|
||||
| in a dictionary mapping a string to its lemma. To determine a token's
|
||||
| lemma, spaCy simply looks it up in the table. The lookup lemmatizer can
|
||||
| be imported from #[code spacy.lemmatizerlookup]. It's initialised with
|
||||
| the lookup table, and should be returned by the #[code create_lemmatizer]
|
||||
| classmethod of the language's defaults.
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("language") #[code Language]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/adding-languages") Adding languages]
|
||||
|
||||
+h(3, "features-matcher") Revised matcher API
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.matcher import Matcher
|
||||
from spacy.attrs import LOWER, IS_PUNCT
|
||||
matcher = Matcher(nlp.vocab)
|
||||
matcher.add('HelloWorld', None,
|
||||
[{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
|
||||
[{LOWER: 'hello'}, {LOWER: 'world'}])
|
||||
assert len(matcher) == 1
|
||||
assert 'HelloWorld' in matcher
|
||||
|
||||
p
|
||||
| Patterns can now be added to the matcher by calling
|
||||
| #[+api("matcher-add") #[code matcher.add()]] with a match ID, an optional
|
||||
| callback function to be invoked on each match, and one or more patterns.
|
||||
| This allows you to write powerful, pattern-specific logic using only one
|
||||
| matcher. For example, you might only want to merge some entity types,
|
||||
| and set custom flags for other matched patterns.
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("matcher") #[code Matcher]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/rule-based-matching") Rule-based matching]
|
||||
|
||||
+h(3, "features-serializer") Serialization
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("serializer") #[code Serializer]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
|
||||
|
||||
+h(3, "features-models") Neural network models for English, German, French and Spanish
|
||||
|
||||
+infobox
|
||||
| #[strong Details:] #[+src(gh("spacy-models")) spacy-models]
|
||||
| #[strong Usage:] #[+a("/docs/usage/models") Models]
|
||||
|
||||
+h(2, "incompat") Backwards incompatibilities
|
||||
|
||||
+table(["Old", "New"])
|
||||
+row
|
||||
+cell #[code Language.save_to_directory]
|
||||
+cell #[+api("language#to_disk") #[code Language.to_disk]]
|
||||
|
||||
+row
|
||||
+cell #[code Tokenizer.load]
|
||||
+cell
|
||||
| #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
|
||||
| #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Tagger.load]
|
||||
+cell
|
||||
| #[+api("tagger#from_disk") #[code Tagger.from_disk]]
|
||||
| #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code DependencyParser.load]
|
||||
+cell
|
||||
| #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
|
||||
| #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code EntityRecognizer.load]
|
||||
+cell
|
||||
| #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
|
||||
| #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell
|
||||
| #[code Vocab.load]
|
||||
| #[code Vocab.load_lexemes]
|
||||
| #[code Vocab.load_vectors]
|
||||
| #[code Vocab.load_vectors_from_bin_loc]
|
||||
+cell
|
||||
| #[+api("vocab#from_disk") #[code Vocab.from_disk]]
|
||||
| #[+api("vocab#from_bytes") #[code Vocab.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell
|
||||
| #[code Vocab.dump]
|
||||
| #[code Vocab.dump_vectors]
|
||||
+cell
|
||||
| #[+api("vocab#to_disk") #[code Vocab.to_disk]]
|
||||
| #[+api("vocab#to_bytes") #[code Vocab.to_bytes]]
|
||||
|
||||
+row
|
||||
+cell
|
||||
| #[code StringStore.load]
|
||||
+cell
|
||||
| #[+api("stringstore#from_disk") #[code StringStore.from_disk]]
|
||||
| #[+api("stringstore#from_bytes") #[code StringStore.from_bytes]]
|
||||
|
||||
+row
|
||||
+cell
|
||||
| #[code StringStore.dump]
|
||||
+cell
|
||||
| #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
|
||||
| #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
|
||||
|
||||
+row
|
||||
+cell #[code Matcher.load]
|
||||
+cell -
|
||||
|
||||
+row
|
||||
+cell
|
||||
| #[code Matcher.add_pattern]
|
||||
| #[code Matcher.add_entity]
|
||||
+cell #[+api("matcher#add") #[code Matcher.add]]
|
||||
|
||||
+row
|
||||
+cell #[code Matcher.get_entity]
|
||||
+cell #[+api("matcher#get") #[code Matcher.get]]
|
||||
|
||||
+row
|
||||
+cell #[code Matcher.has_entity]
|
||||
+cell #[+api("matcher#contains") #[code Matcher.__contains__]]
|
||||
|
||||
+row
|
||||
+cell #[code Doc.read_bytes]
|
||||
+cell
|
||||
|
||||
+row
|
||||
+cell #[code Token.is_ancestor_of]
|
||||
+cell #[+api("token#is_ancestor") #[code Token.is_ancestor]]
|
||||
|
||||
|
||||
|
||||
+h(2, "migrating") Migrating from spaCy 1.x
|
||||
|
|