Mirror of https://github.com/explosion/spaCy.git

Commit b86f8af0c1 — Fix doc strings
Parent: 18aab4f71e
@@ -212,6 +212,7 @@ def _consume_ent(tags):


 cdef class GoldParse:
+    """Collection for training annotations."""
     @classmethod
     def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
         _, words, tags, heads, deps, entities = annot_tuples
@@ -220,6 +221,25 @@ cdef class GoldParse:

     def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                  deps=None, entities=None, make_projective=False):
+        """Create a GoldParse.
+
+        Arguments:
+            doc (Doc):
+                The document the annotations refer to.
+            words:
+                A sequence of unicode word strings.
+            tags:
+                A sequence of strings, representing tag annotations.
+            heads:
+                A sequence of integers, representing syntactic head offsets.
+            deps:
+                A sequence of strings, representing the syntactic relation types.
+            entities:
+                A sequence of named entity annotations, either as BILUO tag strings,
+                or as (start_char, end_char, label) tuples, representing the entity
+                positions.
+        Returns (GoldParse): The newly constructed object.
+        """
         if words is None:
             words = [token.text for token in doc]
         if tags is None:
@@ -280,10 +300,16 @@ cdef class GoldParse:
         self.heads = proj_heads

     def __len__(self):
+        """Get the number of gold-standard tokens.
+
+        Returns (int): The number of gold-standard tokens.
+        """
         return self.length

     @property
     def is_projective(self):
+        """Whether the provided syntactic annotations form a projective dependency
+        tree."""
         return not nonproj.is_nonproj_tree(self.heads)

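Illustrative usage of the GoldParse API documented above (a minimal sketch, not taken from the commit; it assumes the spaCy 1.x interfaces spacy.en.English and spacy.gold.GoldParse, and the annotation values are invented for the example):

    from spacy.en import English
    from spacy.gold import GoldParse

    nlp = English()
    doc = nlp(u'Google bought DeepMind')
    # Keyword arguments mirror the __init__ signature shown in the hunk above.
    gold = GoldParse(doc,
                     words=[u'Google', u'bought', u'DeepMind'],
                     tags=[u'NNP', u'VBD', u'NNP'],
                     heads=[1, 1, 1],
                     deps=[u'nsubj', u'ROOT', u'dobj'],
                     entities=[u'U-ORG', u'O', u'U-ORG'])
    print(len(gold))           # number of gold-standard tokens
    print(gold.is_projective)  # whether the head annotations form a projective tree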
@@ -293,13 +293,14 @@ class Language(object):
         text (unicode): The text to be processed.

         Returns:
-            tokens (spacy.tokens.Doc):
+            doc (Doc): A container for accessing the annotations.

-        >>> from spacy.en import English
-        >>> nlp = English()
-        >>> tokens = nlp('An example sentence. Another example sentence.')
-        >>> tokens[0].orth_, tokens[0].head.tag_
-        ('An', 'NN')
+        Example:
+            >>> from spacy.en import English
+            >>> nlp = English()
+            >>> tokens = nlp('An example sentence. Another example sentence.')
+            >>> tokens[0].orth_, tokens[0].head.tag_
+            ('An', 'NN')
         """
         doc = self.make_doc(text)
         if self.entity and entity:
@@ -314,6 +315,16 @@ class Language(object):
         return doc

     def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
+        '''Process texts as a stream, and yield Doc objects in order.
+
+        Supports GIL-free multi-threading.
+
+        Arguments:
+            texts (iterator)
+            tag (bool)
+            parse (bool)
+            entity (bool)
+        '''
         skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
         stream = (self.make_doc(text) for text in texts)
         for proc in self.pipeline:
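Illustrative usage of the streaming API documented above (a minimal sketch assuming spacy.en.English, as in the doctest earlier in this file):

    from spacy.en import English

    nlp = English()
    texts = [u'One document.', u'Another document.']
    # Docs are yielded in the same order as the input texts.
    for doc in nlp.pipe(texts, n_threads=2, batch_size=1000):
        print(doc[0].text, doc[0].tag_)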
@@ -36,6 +36,13 @@ cdef class Lexeme:
     tag).
     """
     def __init__(self, Vocab vocab, int orth):
+        """Create a Lexeme object.
+
+        Arguments:
+            vocab (Vocab): The parent vocabulary.
+            orth (int): The orth id of the lexeme.
+        Returns (Lexeme): The newly constructed object.
+        """
         self.vocab = vocab
         self.orth = orth
         self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
@@ -73,12 +80,33 @@ cdef class Lexeme:
         return self.c.orth

     def set_flag(self, attr_id_t flag_id, bint value):
+        """Change the value of a boolean flag.
+
+        Arguments:
+            flag_id (int): The attribute ID of the flag to set.
+            value (bool): The new value of the flag.
+        """
         Lexeme.c_set_flag(self.c, flag_id, value)

     def check_flag(self, attr_id_t flag_id):
+        """Check the value of a boolean flag.
+
+        Arguments:
+            flag_id (int): The attribute ID of the flag to query.
+        Returns (bool): The value of the flag.
+        """
         return True if Lexeme.c_check_flag(self.c, flag_id) else False

     def similarity(self, other):
+        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
+        Arguments:
+            other:
+                The object to compare with. By default, accepts Doc, Span,
+                Token and Lexeme objects.
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
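Illustrative usage of the Lexeme flag API documented above (a minimal sketch; it assumes spacy.en.English and the built-in IS_ALPHA flag ID from spacy.attrs):

    from spacy.en import English
    from spacy.attrs import IS_ALPHA

    nlp = English()
    apple = nlp.vocab[u'apple']        # Vocab.__getitem__ returns a Lexeme
    print(apple.check_flag(IS_ALPHA))  # True: 'apple' is alphabetic
    # set_flag/check_flag also accept custom flag IDs registered via Vocab.add_flag.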
@@ -165,6 +165,7 @@ def _convert_strings(token_specs, string_store):


 cdef class Matcher:
+    '''Match sequences of tokens, based on pattern rules.'''
     cdef Pool mem
     cdef vector[TokenPatternC*] patterns
     cdef readonly Vocab vocab
@@ -175,6 +176,16 @@ cdef class Matcher:

     @classmethod
     def load(cls, path, vocab):
+        '''Load the matcher and patterns from a file path.
+
+        Arguments:
+            path (Path):
+                Path to a JSON-formatted patterns file.
+            vocab (Vocab):
+                The vocabulary that the documents to match over will refer to.
+        Returns:
+            Matcher: The newly constructed object.
+        '''
         if (path / 'gazetteer.json').exists():
             with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
                 patterns = json.load(file_)
@@ -183,6 +194,16 @@ cdef class Matcher:
         return cls(vocab, patterns)

     def __init__(self, vocab, patterns={}):
+        """Create the Matcher.
+
+        Arguments:
+            vocab (Vocab):
+                The vocabulary object, which must be shared with the documents
+                the matcher will operate on.
+            patterns (dict): Patterns to add to the matcher.
+        Returns:
+            The newly constructed object.
+        """
         self._patterns = {}
         self._entities = {}
         self._acceptors = {}
@@ -203,6 +224,22 @@ cdef class Matcher:

     def add_entity(self, entity_key, attrs=None, if_exists='raise',
                    acceptor=None, on_match=None):
+        """Add an entity to the matcher.
+
+        Arguments:
+            entity_key (unicode or int):
+                An ID for the entity.
+            attrs:
+                Attributes to associate with the Matcher.
+            if_exists ('raise', 'ignore' or 'update'):
+                Controls what happens if the entity ID already exists. Defaults to 'raise'.
+            acceptor:
+                Callback function to filter matches of the entity.
+            on_match:
+                Callback function to act on matches of the entity.
+        Returns:
+            None
+        """
         if if_exists not in ('raise', 'ignore', 'update'):
             raise ValueError(
                 "Unexpected value for if_exists: %s.\n"
@@ -224,6 +261,18 @@ cdef class Matcher:
         self._callbacks[entity_key] = on_match

     def add_pattern(self, entity_key, token_specs, label=""):
+        """Add a pattern to the matcher.
+
+        Arguments:
+            entity_key (unicode or int):
+                An ID for the entity.
+            token_specs:
+                Description of the pattern to be matched.
+            label:
+                Label to assign to the matched pattern. Defaults to "".
+        Returns:
+            None
+        """
         entity_key = self.normalize_entity_key(entity_key)
         if not self.has_entity(entity_key):
             self.add_entity(entity_key)
@@ -249,10 +298,24 @@ cdef class Matcher:
         return entity_key

     def has_entity(self, entity_key):
+        """Check whether the matcher has an entity.
+
+        Arguments:
+            entity_key (string or int): The entity key to check.
+        Returns:
+            bool: Whether the matcher has the entity.
+        """
         entity_key = self.normalize_entity_key(entity_key)
         return entity_key in self._entities

     def get_entity(self, entity_key):
+        """Retrieve the attributes stored for an entity.
+
+        Arguments:
+            entity_key (unicode or int): The entity to retrieve.
+        Returns:
+            The entity attributes if present, otherwise None.
+        """
         entity_key = self.normalize_entity_key(entity_key)
         if entity_key in self._entities:
             return self._entities[entity_key]
@@ -260,6 +323,17 @@ cdef class Matcher:
             return None

     def __call__(self, Doc doc, acceptor=None):
+        """Find all token sequences matching the supplied patterns on the Doc.
+
+        Arguments:
+            doc (Doc):
+                The document to match over.
+        Returns:
+            list
+                A list of (entity_key, label_id, start, end) tuples,
+                describing the matches. A match tuple describes a span doc[start:end].
+                The label_id and entity_key are both integers.
+        """
         if acceptor is not None:
             raise ValueError(
                 "acceptor keyword argument to Matcher deprecated. Specify acceptor "
@@ -340,6 +414,18 @@ cdef class Matcher:
         return matches

     def pipe(self, docs, batch_size=1000, n_threads=2):
+        """Match a stream of documents, yielding them in turn.
+
+        Arguments:
+            docs: A stream of documents.
+            batch_size (int):
+                The number of documents to accumulate into a working set.
+            n_threads (int):
+                The number of threads with which to work on the buffer in parallel,
+                if the Matcher implementation supports multi-threading.
+        Yields:
+            Doc Documents, in order.
+        """
         for doc in docs:
             self(doc)
             yield doc

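Illustrative usage of the Matcher API documented above (a minimal sketch assuming the spaCy 1.x pattern format, where token specs are dicts keyed by attribute IDs from spacy.attrs):

    from spacy.en import English
    from spacy.matcher import Matcher
    from spacy.attrs import ORTH

    nlp = English()
    matcher = Matcher(nlp.vocab)
    matcher.add_entity(u'GoogleNow')
    matcher.add_pattern(u'GoogleNow', [{ORTH: u'Google'}, {ORTH: u'Now'}])

    doc = nlp(u'I like Google Now best.')
    # __call__ returns (entity_key, label_id, start, end) tuples describing doc[start:end].
    for entity_key, label_id, start, end in matcher(doc):
        print(doc[start:end].text)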
@@ -11,6 +11,7 @@ from .attrs import DEP, ENT_TYPE


 cdef class EntityRecognizer(Parser):
+    """Annotate named entities on Doc objects."""
     TransitionSystem = BiluoPushDown

     feature_templates = get_feature_templates('ner')

@@ -73,6 +73,11 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except
 cdef class StringStore:
     '''Map strings to and from integer IDs.'''
     def __init__(self, strings=None, freeze=False):
+        '''Create the StringStore.
+
+        Arguments:
+            strings: A sequence of unicode strings to add to the store.
+        '''
         self.mem = Pool()
         self._map = PreshMap()
         self._oov = PreshMap()
@@ -89,9 +94,22 @@ cdef class StringStore:
         return self.size -1

     def __len__(self):
+        """The number of strings in the store.
+
+        Returns:
+            int The number of strings in the store.
+        """
         return self.size-1

     def __getitem__(self, object string_or_id):
+        """Retrieve a string from a given integer ID, or vice versa.
+
+        Arguments:
+            string_or_id (bytes or unicode or int):
+                The value to encode.
+        Returns:
+            unicode or int: The value retrieved.
+        """
         if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
             return 0
         elif string_or_id == 0:
@@ -127,12 +145,23 @@ cdef class StringStore:
         return utf8str - self.c

     def __contains__(self, unicode string not None):
+        """Check whether a string is in the store.
+
+        Arguments:
+            string (unicode): The string to check.
+        Returns bool:
+            Whether the store contains the string.
+        """
         if len(string) == 0:
             return True
         cdef hash_t key = hash_string(string)
         return self._map.get(key) is not NULL

     def __iter__(self):
+        """Iterate over the strings in the store, in order.
+
+        Yields: unicode A string in the store.
+        """
         cdef int i
         for i in range(self.size):
             yield _decode(&self.c[i]) if i > 0 else u''
@@ -185,6 +214,13 @@ cdef class StringStore:
         return &self.c[self.size-1]

     def dump(self, file_):
+        """Save the strings to a JSON file.
+
+        Arguments:
+            file_ (buffer): The file to save the strings.
+        Returns:
+            None
+        """
         string_data = json.dumps(list(self))
         if not isinstance(string_data, unicode):
             string_data = string_data.decode('utf8')
@@ -192,6 +228,13 @@ cdef class StringStore:
         file_.write(string_data)

     def load(self, file_):
+        """Load the strings from a JSON file.
+
+        Arguments:
+            file_ (buffer): The file from which to load the strings.
+        Returns:
+            None
+        """
         strings = json.load(file_)
         if strings == ['']:
             return None

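Illustrative usage of the StringStore API documented above (a minimal sketch; spacy.strings.StringStore is assumed importable as in spaCy 1.x):

    from spacy.strings import StringStore

    store = StringStore([u'apple', u'orange'])
    apple_id = store[u'apple']   # encode a string to an integer ID
    print(store[apple_id])       # decode the ID back to u'apple'
    print(u'apple' in store, len(store))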
@@ -74,8 +74,21 @@ cdef class ParserModel(AveragedPerceptron):


 cdef class Parser:
+    """Base class of the DependencyParser and EntityRecognizer."""
     @classmethod
     def load(cls, path, Vocab vocab, TransitionSystem=None, require=False):
+        """Load the statistical model from the supplied path.
+
+        Arguments:
+            path (Path):
+                The path to load from.
+            vocab (Vocab):
+                The vocabulary. Must be shared by the documents to be processed.
+            require (bool):
+                Whether to raise an error if the files are not found.
+        Returns (Parser):
+            The newly constructed object.
+        """
         with (path / 'config.json').open() as file_:
             cfg = json.load(file_)
         # TODO: remove this shim when we don't have to support older data
@@ -90,6 +103,16 @@ cdef class Parser:
         return self

     def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
+        """Create a Parser.
+
+        Arguments:
+            vocab (Vocab):
+                The vocabulary object. Must be shared with documents to be processed.
+            model (thinc.linear.AveragedPerceptron):
+                The statistical model.
+        Returns (Parser):
+            The newly constructed object.
+        """
         if TransitionSystem is None:
             TransitionSystem = self.TransitionSystem
         self.vocab = vocab
@@ -107,6 +130,13 @@ cdef class Parser:
         return (Parser, (self.vocab, self.moves, self.model), None, None)

     def __call__(self, Doc tokens):
+        """Apply the entity recognizer, setting the annotations onto the Doc object.
+
+        Arguments:
+            doc (Doc): The document to be processed.
+        Returns:
+            None
+        """
         cdef int nr_feat = self.model.nr_feat
         with nogil:
             status = self.parseC(tokens.c, tokens.length, nr_feat)
@@ -117,6 +147,16 @@ cdef class Parser:
         self.moves.finalize_doc(tokens)

     def pipe(self, stream, int batch_size=1000, int n_threads=2):
+        """Process a stream of documents.
+
+        Arguments:
+            stream: The sequence of documents to process.
+            batch_size (int):
+                The number of documents to accumulate into a working set.
+            n_threads (int):
+                The number of threads with which to work on the buffer in parallel.
+        Yields (Doc): Documents, in order.
+        """
         cdef Pool mem = Pool()
         cdef TokenC** doc_ptr = <TokenC**>mem.alloc(batch_size, sizeof(TokenC*))
         cdef int* lengths = <int*>mem.alloc(batch_size, sizeof(int))
@@ -194,6 +234,16 @@ cdef class Parser:
         return 0

     def update(self, Doc tokens, GoldParse gold):
+        """Update the statistical model.
+
+        Arguments:
+            doc (Doc):
+                The example document for the update.
+            gold (GoldParse):
+                The gold-standard annotations, to calculate the loss.
+        Returns (float):
+            The loss on this example.
+        """
         self.moves.preprocess_gold(gold)
         cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
         self.moves.initialize_state(stcls.c)
@@ -220,9 +270,24 @@ cdef class Parser:
         return loss

     def step_through(self, Doc doc):
+        """Set up a stepwise state, to introspect and control the transition sequence.
+
+        Arguments:
+            doc (Doc): The document to step through.
+        Returns (StepwiseState):
+            A state object, to step through the annotation process.
+        """
         return StepwiseState(self, doc)

     def from_transition_sequence(self, Doc doc, sequence):
+        """Control the annotations on a document by specifying a transition sequence
+        to follow.
+
+        Arguments:
+            doc (Doc): The document to annotate.
+            sequence: A sequence of action names, as unicode strings.
+        Returns: None
+        """
         with self.step_through(doc) as stepwise:
             for transition in sequence:
                 stepwise.transition(transition)
@@ -233,7 +298,6 @@ cdef class Parser:
         self.moves.add_action(action, label)

-

 cdef class StepwiseState:
     cdef readonly StateClass stcls
     cdef readonly Example eg

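Illustrative usage of Parser.update as documented above (a minimal training sketch; the head indices and dependency labels are invented example values, and nlp.make_doc, nlp.tagger and nlp.parser are assumed to be the spaCy 1.x pipeline attributes):

    from spacy.en import English
    from spacy.gold import GoldParse

    nlp = English()
    doc = nlp.make_doc(u'Google bought DeepMind')
    gold = GoldParse(doc, heads=[1, 1, 1], deps=[u'nsubj', u'ROOT', u'dobj'])
    nlp.tagger(doc)                      # tag first, so the parser's features are populated
    loss = nlp.parser.update(doc, gold)  # returns the loss on this example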
@@ -102,9 +102,21 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:


 cdef class Tagger:
-    """A part-of-speech tagger for English"""
+    """Annotate part-of-speech tags on Doc objects."""
     @classmethod
     def load(cls, path, vocab, require=False):
+        """Load the statistical model from the supplied path.
+
+        Arguments:
+            path (Path):
+                The path to load from.
+            vocab (Vocab):
+                The vocabulary. Must be shared by the documents to be processed.
+            require (bool):
+                Whether to raise an error if the files are not found.
+        Returns (Tagger):
+            The newly created object.
+        """
         # TODO: Change this to expect config.json when we don't have to
         # support old data.
         path = path if not isinstance(path, basestring) else pathlib.Path(path)
@@ -126,6 +138,16 @@ cdef class Tagger:
         return self

     def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
+        """Create a Tagger.
+
+        Arguments:
+            vocab (Vocab):
+                The vocabulary object. Must be shared with documents to be processed.
+            model (thinc.linear.AveragedPerceptron):
+                The statistical model.
+        Returns (Tagger):
+            The newly constructed object.
+        """
         if model is None:
             model = TaggerModel(cfg.get('features', self.feature_templates))
         self.vocab = vocab
@@ -154,8 +176,10 @@ cdef class Tagger:
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.

-        Args:
-            tokens (Doc): The tokens to be tagged.
+        Arguments:
+            doc (Doc): The tokens to be tagged.
+
+        Returns:
+            None
         """
         if tokens.length == 0:
             return 0
@@ -178,11 +202,33 @@ cdef class Tagger:
         tokens._py_tokens = [None] * tokens.length

     def pipe(self, stream, batch_size=1000, n_threads=2):
+        """Tag a stream of documents.
+
+        Arguments:
+            stream: The sequence of documents to tag.
+            batch_size (int):
+                The number of documents to accumulate into a working set.
+            n_threads (int):
+                The number of threads with which to work on the buffer in parallel,
+                if the Matcher implementation supports multi-threading.
+        Yields:
+            Doc Documents, in order.
+        """
         for doc in stream:
             self(doc)
             yield doc

     def update(self, Doc tokens, GoldParse gold):
+        """Update the statistical model, with tags supplied for the given document.
+
+        Arguments:
+            doc (Doc):
+                The document to update on.
+            gold (GoldParse):
+                Manager for the gold-standard tags.
+        Returns (int):
+            Number of tags correct.
+        """
         gold_tag_strs = gold.tags
         assert len(tokens) == len(gold_tag_strs)
         for tag in gold_tag_strs:

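Illustrative usage of the Tagger as documented above (a minimal sketch; nlp.make_doc and the callable nlp.tagger component are assumptions based on the spaCy 1.x pipeline referenced elsewhere in this diff):

    from spacy.en import English

    nlp = English()
    doc = nlp.make_doc(u'A simple example sentence.')
    nlp.tagger(doc)                      # applies POS tags to the Doc in place
    print([(w.text, w.tag_) for w in doc])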
@@ -219,6 +219,16 @@ cdef class Doc:
         return self.__str__()

     def similarity(self, other):
+        '''Make a semantic similarity estimate. The default estimate is cosine
+        similarity using an average of word vectors.
+
+        Arguments:
+            other (object): The object to compare with. By default, accepts Doc,
+                Span, Token and Lexeme objects.
+
+        Return:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if 'similarity' in self.user_hooks:
             return self.user_hooks['similarity'](self, other)
         if self.vector_norm == 0 or other.vector_norm == 0:
@@ -226,6 +236,9 @@ cdef class Doc:
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

     property has_vector:
+        '''
+        A boolean value indicating whether a word vector is associated with the object.
+        '''
         def __get__(self):
             if 'has_vector' in self.user_hooks:
                 return self.user_hooks['has_vector'](self)
@@ -233,6 +246,11 @@ cdef class Doc:
             return any(token.has_vector for token in self)

     property vector:
+        '''
+        A real-valued meaning representation. Defaults to an average of the token vectors.
+
+        Type: numpy.ndarray[ndim=1, dtype='float32']
+        '''
         def __get__(self):
             if 'vector' in self.user_hooks:
                 return self.user_hooks['vector'](self)
@@ -265,14 +283,16 @@ cdef class Doc:
     @property
     def string(self):
         return self.text

-    @property
-    def text_with_ws(self):
-        return self.text
-
-    @property
-    def text(self):
-        return u''.join(t.text_with_ws for t in self)
+    property text:
+        '''A unicode representation of the document text.'''
+        def __get__(self):
+            return u''.join(t.text_with_ws for t in self)
+
+    property text_with_ws:
+        '''An alias of Doc.text, provided for duck-type compatibility with Span and Token.'''
+        def __get__(self):
+            return self.text

     property ents:
         '''
@@ -567,7 +587,6 @@ cdef class Doc:
             set_children_from_heads(self.c, self.length)
         self.is_parsed = bool(HEAD in attrs or DEP in attrs)
         self.is_tagged = bool(TAG in attrs or POS in attrs)
-
         return self

     def to_bytes(self):
@@ -612,7 +631,22 @@ cdef class Doc:
             yield n_bytes_str + data

     def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Merge a multi-word expression into a single token."""
+        """Retokenize the document, such that the span at doc.text[start_idx : end_idx]
+        is merged into a single token. If start_idx and end_idx do not mark start
+        and end token boundaries, the document remains unchanged.
+
+        Arguments:
+            start_idx (int): The character index of the start of the slice to merge.
+            end_idx (int): The character index after the end of the slice to merge.
+            **attributes:
+                Attributes to assign to the merged token. By default, attributes
+                are inherited from the syntactic root token of the span.
+        Returns:
+            token (Token):
+                The newly merged token, or None if the start and end indices did
+                not fall at token boundaries.
+
+        """
         cdef unicode tag, lemma, ent_type
         if len(args) == 3:
             # TODO: Warn deprecation

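Illustrative usage of the retokenization API documented above (a minimal sketch; the character offsets are taken from a Span so that they fall on token boundaries):

    from spacy.en import English

    nlp = English()
    doc = nlp(u'I like New York in Autumn.')
    span = doc[2:4]                                 # the tokens u'New York'
    # Merge on character boundaries; attributes default to the span's root token.
    token = doc.merge(span.start_char, span.end_char)
    if token is not None:
        print(token.text)                           # u'New York'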
@@ -18,12 +18,23 @@ from ..lexeme cimport Lexeme

 cdef class Span:
     """A slice from a Doc object."""
-    def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None,
+    def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
                   vector_norm=None):
+        '''Create a Span object from the slice doc[start : end]
+
+        Arguments:
+            doc (Doc): The parent document.
+            start (int): The index of the first token of the span.
+            end (int): The index of the first token after the span.
+            label (int): A label to attach to the Span, e.g. for named entities.
+            vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        Returns:
+            Span The newly constructed object.
+        '''
         if not (0 <= start <= end <= len(tokens)):
             raise IndexError

-        self.doc = tokens
+        self.doc = doc
         self.start = start
         self.start_char = self.doc[start].idx if start < self.doc.length else 0
         self.end = end
@@ -78,9 +89,29 @@ cdef class Span:
             yield self.doc[i]

     def merge(self, *args, **attributes):
+        """Retokenize the document, such that the span is merged into a single token.
+
+        Arguments:
+            **attributes:
+                Attributes to assign to the merged token. By default, attributes
+                are inherited from the syntactic root token of the span.
+        Returns:
+            token (Token):
+                The newly merged token.
+        """
         self.doc.merge(self.start_char, self.end_char, *args, **attributes)

     def similarity(self, other):
+        '''Make a semantic similarity estimate. The default estimate is cosine
+        similarity using an average of word vectors.
+
+        Arguments:
+            other (object): The object to compare with. By default, accepts Doc,
+                Span, Token and Lexeme objects.
+
+        Return:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if 'similarity' in self.doc.user_span_hooks:
             self.doc.user_span_hooks['similarity'](self, other)
         if self.vector_norm == 0.0 or other.vector_norm == 0.0:
@@ -102,7 +133,11 @@ cdef class Span:
         self.end = end + 1

     property sent:
-        '''Get the sentence span that this span is a part of.'''
+        '''The sentence span that this span is a part of.
+
+        Returns:
+            Span The sentence this is part of.
+        '''
         def __get__(self):
             if 'sent' in self.doc.user_span_hooks:
                 return self.doc.user_span_hooks['sent'](self)
@@ -156,7 +191,12 @@ cdef class Span:
             return u''.join([t.text_with_ws for t in self])

     property root:
-        """The word of the span that is highest in the parse tree, i.e. has the
+        """The token within the span that's highest in the parse tree. If there's a tie, the earliest is preferred.
+
+        Returns:
+            Token: The root token.
+
+        i.e. has the
         shortest path to the root of the sentence (or is the root itself).

         If multiple words are equally high in the tree, the first word is taken.
@@ -231,7 +271,10 @@ cdef class Span:
         return self.doc[root]

     property lefts:
-        """Tokens that are to the left of the Span, whose head is within the Span."""
+        """Tokens that are to the left of the span, whose head is within the Span.
+
+        Yields: Token A left-child of a token of the span.
+        """
         def __get__(self):
             for token in reversed(self): # Reverse, so we get the tokens in order
                 for left in token.lefts:
@@ -239,7 +282,10 @@ cdef class Span:
                     yield left

     property rights:
-        """Tokens that are to the right of the Span, whose head is within the Span."""
+        """Tokens that are to the right of the Span, whose head is within the Span.
+
+        Yields: Token A right-child of a token of the span.
+        """
         def __get__(self):
             for token in self:
                 for right in token.rights:
@@ -247,6 +293,10 @@ cdef class Span:
                     yield right

     property subtree:
+        """Tokens that descend from tokens in the span, but fall outside it.
+
+        Yields: Token A descendant of a token within the span.
+        """
         def __get__(self):
             for word in self.lefts:
                 yield from word.subtree

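Illustrative usage of the Span properties documented above (a minimal sketch assuming spacy.en.English with the default parser enabled):

    from spacy.en import English

    nlp = English()
    doc = nlp(u'I like New York in Autumn.')
    span = doc[2:4]
    print(span.text, span.root.text)   # the span and its syntactic root
    print(span.sent.text)              # the sentence the span belongs to
    span.merge()                       # collapse the span into a single token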
@@ -30,8 +30,7 @@ from ..lexeme cimport Lexeme


 cdef class Token:
-    """An individual token --- i.e. a word, a punctuation symbol, etc. Created
-    via Doc.__getitem__ and Doc.__iter__.
+    """An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
     """
     def __cinit__(self, Vocab vocab, Doc doc, int offset):
         self.vocab = vocab
@@ -40,6 +39,7 @@ cdef class Token:
         self.i = offset

     def __len__(self):
+        '''Number of unicode characters in token.text'''
         return self.c.lex.length

     def __unicode__(self):
@@ -57,12 +57,35 @@ cdef class Token:
         return self.__str__()

     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
+        '''Check the value of a boolean flag.
+
+        Arguments:
+            flag_id (int): The ID of the flag attribute.
+        Returns:
+            is_set (bool): Whether the flag is set.
+        '''
         return Lexeme.c_check_flag(self.c.lex, flag_id)

     def nbor(self, int i=1):
+        '''Get a neighboring token.
+
+        Arguments:
+            i (int): The relative position of the token to get. Defaults to 1.
+        Returns:
+            neighbor (Token): The token at position self.doc[self.i+i]
+        '''
         return self.doc[self.i+i]

     def similarity(self, other):
+        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
+        Arguments:
+            other:
+                The object to compare with. By default, accepts Doc, Span,
+                Token and Lexeme objects.
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if 'similarity' in self.doc.user_token_hooks:
             return self.doc.user_token_hooks['similarity'](self)
         if self.vector_norm == 0 or other.vector_norm == 0:
@@ -158,6 +181,9 @@ cdef class Token:
         self.c.dep = label

     property has_vector:
+        '''
+        A boolean value indicating whether a word vector is associated with the object.
+        '''
         def __get__(self):
             if 'has_vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['has_vector'](self)
@@ -169,6 +195,11 @@ cdef class Token:
             return False

     property vector:
+        '''
+        A real-valued meaning representation.
+
+        Type: numpy.ndarray[ndim=1, dtype='float32']
+        '''
         def __get__(self):
             if 'vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['vector'](self)
@@ -241,11 +272,19 @@ cdef class Token:
                 yield t

     property children:
+        '''A sequence of the token's immediate syntactic children.
+
+        Yields: Token A child token such that child.head==self
+        '''
         def __get__(self):
             yield from self.lefts
             yield from self.rights

     property subtree:
+        '''A sequence of all the token's syntactic descendants.
+
+        Yields: Token A descendant token such that self.is_ancestor(descendant)
+        '''
         def __get__(self):
             for word in self.lefts:
                 yield from word.subtree
@@ -254,14 +293,26 @@ cdef class Token:
                 yield from word.subtree

     property left_edge:
+        '''The leftmost token of this token's syntactic descendants.
+
+        Returns: Token The first token such that self.is_ancestor(token)
+        '''
         def __get__(self):
             return self.doc[self.c.l_edge]

     property right_edge:
+        '''The rightmost token of this token's syntactic descendants.
+
+        Returns: Token The last token such that self.is_ancestor(token)
+        '''
         def __get__(self):
             return self.doc[self.c.r_edge]

     property ancestors:
+        '''A sequence of this token's syntactic ancestors.
+
+        Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
+        '''
         def __get__(self):
             cdef const TokenC* head_ptr = self.c
             # guard against infinite loop, no token can have
@@ -273,9 +324,27 @@ cdef class Token:
                 i += 1

     def is_ancestor_of(self, descendant):
+        # TODO: Remove after backward compatibility check.
+        return self.is_ancestor(descendant)
+
+    def is_ancestor(self, descendant):
+        '''Check whether this token is a parent, grandparent, etc. of another
+        in the dependency tree.
+
+        Arguments:
+            descendant (Token): Another token.
+        Returns:
+            is_ancestor (bool): Whether this token is the ancestor of the descendant.
+        '''
+        if self.doc is not other.doc:
+            return False
         return any( ancestor.i == self.i for ancestor in descendant.ancestors )

     property head:
+        '''The syntactic parent, or "governor", of this token.
+
+        Returns: Token
+        '''
         def __get__(self):
             """The token predicted by the parser to be the head of the current token."""
             return self.doc[self.i + self.c.head]
@@ -370,6 +439,10 @@ cdef class Token:
         self.c.head = rel_newhead_i

     property conjuncts:
+        '''A sequence of coordinated tokens, including the token itself.
+
+        Yields: Token A coordinated token
+        '''
         def __get__(self):
             """Get a list of conjoined words."""
             cdef Token word

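Illustrative usage of the Token navigation helpers documented above (a minimal sketch assuming spacy.en.English):

    from spacy.en import English

    nlp = English()
    doc = nlp(u'Give it back, he pleaded.')
    give = doc[0]
    print(give.nbor(1).text)                          # the next token: u'it'
    print(give.head.text)                             # the syntactic governor
    print([child.text for child in give.children])    # immediate syntactic children
    print(give.left_edge.text, give.right_edge.text)  # bounds of the subtree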
spacy/vocab.pyx | 115
@@ -52,6 +52,25 @@ cdef class Vocab:
     @classmethod
     def load(cls, path, lex_attr_getters=None, lemmatizer=True,
              tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
+        """
+        Load the vocabulary from a path.
+
+        Arguments:
+            path (Path):
+                The path to load from.
+            lex_attr_getters (dict):
+                A dictionary mapping attribute IDs to functions to compute them.
+                Defaults to None.
+            lemmatizer (object):
+                A lemmatizer. Defaults to None.
+            tag_map (dict):
+                A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
+                and optionally morphological attributes.
+            oov_prob (float):
+                The default probability for out-of-vocabulary words.
+        Returns:
+            Vocab: The newly constructed vocab object.
+        """
         util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
         if 'vectors' in deprecated_kwargs:
             raise AttributeError(
@@ -82,6 +101,22 @@ cdef class Vocab:

     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
                  serializer_freqs=None, **deprecated_kwargs):
+        '''Create the vocabulary.
+
+        lex_attr_getters (dict):
+            A dictionary mapping attribute IDs to functions to compute them.
+            Defaults to None.
+        lemmatizer (object):
+            A lemmatizer. Defaults to None.
+        tag_map (dict):
+            A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
+            and optionally morphological attributes.
+        oov_prob (float):
+            The default probability for out-of-vocabulary words.
+
+        Returns:
+            Vocab: The newly constructed vocab object.
+        '''
         util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)

         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@@ -134,6 +169,9 @@ cdef class Vocab:
         '''
         Set vectors_length to a new size, and allocate more memory for the Lexeme
        vectors if necessary. The memory will be zeroed.
+
+        Arguments:
+            new_size (int): The new size of the vectors.
         '''
         cdef hash_t key
         cdef size_t addr
@@ -145,11 +183,14 @@ cdef class Vocab:
         self.vectors_length = new_size

     def add_flag(self, flag_getter, int flag_id=-1):
-        '''Set a new boolean flag to words in the vocabulary. The flag_setter
-        function will be called over the words currently in the vocab, and then
-        applied to new words as they occur. You'll then be able to access the
-        flag value on each token, using token.check_flag(flag_id). See also:
-        Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.
+        '''Set a new boolean flag to words in the vocabulary.
+
+        The flag_setter function will be called over the words currently in the
+        vocab, and then applied to new words as they occur. You'll then be able
+        to access the flag value on each token, using token.check_flag(flag_id).
+
+        See also:
+            Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.

         Arguments:
             flag_getter:
@@ -246,11 +287,23 @@ cdef class Vocab:
         self.length += 1

     def __contains__(self, unicode string):
+        '''Check whether the string has an entry in the vocabulary.
+
+        Arguments:
+            string (unicode): The ID string.
+
+        Returns:
+            bool Whether the string has an entry in the vocabulary.
+        '''
         key = hash_string(string)
         lex = self._by_hash.get(key)
         return True if lex is not NULL else False

     def __iter__(self):
+        '''Iterate over the lexemes in the vocabulary.
+
+        Yields: Lexeme An entry in the vocabulary.
+        '''
         cdef attr_t orth
         cdef size_t addr
         for orth, addr in self._by_orth.items():
@@ -260,16 +313,15 @@ cdef class Vocab:
         '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
         unseen unicode string is given, a new lexeme is created and stored.

-        Args:
+        Arguments:
             id_or_string (int or unicode):
-                The integer ID of a word, or its unicode string. If an int >= Lexicon.size,
-                IndexError is raised. If id_or_string is neither an int nor a unicode string,
-                ValueError is raised.
+                The integer ID of a word, or its unicode string.
+
+                If an int >= Lexicon.size, IndexError is raised. If id_or_string
+                is neither an int nor a unicode string, ValueError is raised.

         Returns:
-            lexeme (Lexeme):
-                An instance of the Lexeme Python class, with data copied on
-                instantiation.
+            lexeme (Lexeme): The lexeme indicated by the given ID.
         '''
         cdef attr_t orth
         if type(id_or_string) == unicode:
@@ -295,6 +347,11 @@ cdef class Vocab:
         return tokens

     def dump(self, loc):
+        """Save the lexemes binary data to the given location.
+
+        Arguments:
+            loc (Path): The path to save to.
+        """
         if hasattr(loc, 'as_posix'):
             loc = loc.as_posix()
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
@@ -323,6 +380,14 @@ cdef class Vocab:
         fp.close()

     def load_lexemes(self, loc):
+        '''Load the binary vocabulary data from the given location.
+
+        Arguments:
+            loc (Path): The path to load from.
+
+        Returns:
+            None
+        '''
         fp = CFile(loc, 'rb',
                 on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
         cdef LexemeC* lexeme
@@ -363,6 +428,13 @@ cdef class Vocab:
         fp.close()

     def dump_vectors(self, out_loc):
+        '''Save the word vectors to a binary file.
+
+        Arguments:
+            loc (Path): The path to save to.
+        Returns:
+            None
+        '''
         cdef int32_t vec_len = self.vectors_length
         cdef int32_t word_len
         cdef bytes word_str
@@ -384,6 +456,17 @@ cdef class Vocab:
         out_file.close()

     def load_vectors(self, file_):
+        """Load vectors from a text-based file.
+
+        Arguments:
+            file_ (buffer): The file to read from. Entries should be separated by newlines,
+                and each entry should be whitespace delimited. The first value of the entry
+                should be the word string, and subsequent entries should be the values of the
+                vector.
+
+        Returns:
+            vec_len (int): The length of the vectors loaded.
+        """
         cdef LexemeC* lexeme
         cdef attr_t orth
         cdef int32_t vec_len = -1
@@ -409,6 +492,14 @@ cdef class Vocab:
         return vec_len

     def load_vectors_from_bin_loc(self, loc):
+        """Load vectors from the location of a binary file.
+
+        Arguments:
+            loc (unicode): The path of the binary file to load from.
+
+        Returns:
+            vec_len (int): The length of the vectors loaded.
+        """
         cdef CFile file_ = CFile(loc, b'rb')
         cdef int32_t word_len
         cdef int32_t vec_len = 0

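Illustrative usage of Vocab.add_flag as documented above (a minimal sketch; it assumes add_flag returns the assigned flag ID, which can then be queried with Token.check_flag):

    from spacy.en import English

    nlp = English()
    # The getter is applied to each word's string, now and for new words as they appear.
    IS_SHOUTING = nlp.vocab.add_flag(lambda text: text.isupper())
    doc = nlp(u'STOP right there')
    print([(w.text, w.check_flag(IS_SHOUTING)) for w in doc])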