mirror of https://github.com/explosion/spaCy.git

Fix doc strings

commit b86f8af0c1 (parent 18aab4f71e)
@@ -212,6 +212,7 @@ def _consume_ent(tags):


 cdef class GoldParse:
+    """Collection for training annotations."""
     @classmethod
     def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
         _, words, tags, heads, deps, entities = annot_tuples

@@ -220,6 +221,25 @@ cdef class GoldParse:
     def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
                  deps=None, entities=None, make_projective=False):
+        """Create a GoldParse.
+
+        Arguments:
+            doc (Doc):
+                The document the annotations refer to.
+            words:
+                A sequence of unicode word strings.
+            tags:
+                A sequence of strings, representing tag annotations.
+            heads:
+                A sequence of integers, representing syntactic head offsets.
+            deps:
+                A sequence of strings, representing the syntactic relation types.
+            entities:
+                A sequence of named entity annotations, either as BILUO tag strings,
+                or as (start_char, end_char, label) tuples, representing the entity
+                positions.
+        Returns (GoldParse): The newly constructed object.
+        """
         if words is None:
             words = [token.text for token in doc]
         if tags is None:

@@ -280,10 +300,16 @@
         self.heads = proj_heads

     def __len__(self):
+        """Get the number of gold-standard tokens.
+
+        Returns (int): The number of gold-standard tokens.
+        """
         return self.length

     @property
     def is_projective(self):
+        """Whether the provided syntactic annotations form a projective dependency
+        tree."""
         return not nonproj.is_nonproj_tree(self.heads)
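A minimal usage sketch of the GoldParse constructor documented above (spaCy 1.x-era API; the sentence, tags, and entity annotation are illustrative):

    from spacy.en import English
    from spacy.gold import GoldParse

    nlp = English()
    doc = nlp(u'London is big.')
    # Entities may be given as (start_char, end_char, label) tuples, per the docstring.
    gold = GoldParse(doc, tags=[u'NNP', u'VBZ', u'JJ', u'.'],
                     entities=[(0, 6, u'GPE')])
    print(len(gold))  # the number of gold-standard tokens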
@@ -293,13 +293,14 @@ class Language(object):
         text (unicode): The text to be processed.

         Returns:
-            tokens (spacy.tokens.Doc):
+            doc (Doc): A container for accessing the annotations.

-        >>> from spacy.en import English
-        >>> nlp = English()
-        >>> tokens = nlp('An example sentence. Another example sentence.')
-        >>> tokens[0].orth_, tokens[0].head.tag_
-        ('An', 'NN')
+        Example:
+            >>> from spacy.en import English
+            >>> nlp = English()
+            >>> tokens = nlp('An example sentence. Another example sentence.')
+            >>> tokens[0].orth_, tokens[0].head.tag_
+            ('An', 'NN')
         """
         doc = self.make_doc(text)
         if self.entity and entity:

@@ -314,6 +315,16 @@ class Language(object):
         return doc

     def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
+        '''Process texts as a stream, and yield Doc objects in order.
+
+        Supports GIL-free multi-threading.
+
+        Arguments:
+            texts (iterator)
+            tag (bool)
+            parse (bool)
+            entity (bool)
+        '''
         skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
         stream = (self.make_doc(text) for text in texts)
         for proc in self.pipeline:
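A brief sketch of the streaming API described by the new pipe() docstring (assumes the spaCy 1.x English model is installed):

    from spacy.en import English

    nlp = English()
    texts = [u'One document.', u'Another document.']
    # Docs come back in input order; pipeline stages can be toggled per stream.
    for doc in nlp.pipe(texts, batch_size=50, n_threads=4, entity=False):
        print(doc[0].orth_)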
@@ -36,6 +36,13 @@ cdef class Lexeme:
     tag).
     """
     def __init__(self, Vocab vocab, int orth):
+        """Create a Lexeme object.
+
+        Arguments:
+            vocab (Vocab): The parent vocabulary.
+            orth (int): The orth id of the lexeme.
+        Returns (Lexeme): The newly constructed object.
+        """
         self.vocab = vocab
         self.orth = orth
         self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)

@@ -73,12 +80,33 @@ cdef class Lexeme:
         return self.c.orth

     def set_flag(self, attr_id_t flag_id, bint value):
+        """Change the value of a boolean flag.
+
+        Arguments:
+            flag_id (int): The attribute ID of the flag to set.
+            value (bool): The new value of the flag.
+        """
         Lexeme.c_set_flag(self.c, flag_id, value)

     def check_flag(self, attr_id_t flag_id):
+        """Check the value of a boolean flag.
+
+        Arguments:
+            flag_id (int): The attribute ID of the flag to query.
+        Returns (bool): The value of the flag.
+        """
         return True if Lexeme.c_check_flag(self.c, flag_id) else False

     def similarity(self, other):
+        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
+        Arguments:
+            other:
+                The object to compare with. By default, accepts Doc, Span,
+                Token and Lexeme objects.
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if self.vector_norm == 0 or other.vector_norm == 0:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
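A short sketch of the flag and similarity methods documented above (spaCy 1.x; assumes word vectors ship with the English model):

    from spacy.en import English
    from spacy.attrs import IS_STOP

    nlp = English()
    apple = nlp.vocab[u'apple']
    orange = nlp.vocab[u'orange']
    # Cosine over the lexemes' word vectors, as the docstring describes.
    print(apple.similarity(orange))
    # Boolean flags are read with check_flag(), using an attribute ID.
    print(apple.check_flag(IS_STOP))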
@@ -165,6 +165,7 @@ def _convert_strings(token_specs, string_store):


 cdef class Matcher:
+    '''Match sequences of tokens, based on pattern rules.'''
     cdef Pool mem
     cdef vector[TokenPatternC*] patterns
     cdef readonly Vocab vocab

@@ -175,6 +176,16 @@ cdef class Matcher:

     @classmethod
     def load(cls, path, vocab):
+        '''Load the matcher and patterns from a file path.
+
+        Arguments:
+            path (Path):
+                Path to a JSON-formatted patterns file.
+            vocab (Vocab):
+                The vocabulary that the documents to match over will refer to.
+        Returns:
+            Matcher: The newly constructed object.
+        '''
         if (path / 'gazetteer.json').exists():
             with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
                 patterns = json.load(file_)

@@ -183,6 +194,16 @@ cdef class Matcher:
         return cls(vocab, patterns)

     def __init__(self, vocab, patterns={}):
+        """Create the Matcher.
+
+        Arguments:
+            vocab (Vocab):
+                The vocabulary object, which must be shared with the documents
+                the matcher will operate on.
+            patterns (dict): Patterns to add to the matcher.
+        Returns:
+            The newly constructed object.
+        """
         self._patterns = {}
         self._entities = {}
         self._acceptors = {}

@@ -203,6 +224,22 @@ cdef class Matcher:

     def add_entity(self, entity_key, attrs=None, if_exists='raise',
                    acceptor=None, on_match=None):
+        """Add an entity to the matcher.
+
+        Arguments:
+            entity_key (unicode or int):
+                An ID for the entity.
+            attrs:
+                Attributes to associate with the Matcher.
+            if_exists ('raise', 'ignore' or 'update'):
+                Controls what happens if the entity ID already exists. Defaults to 'raise'.
+            acceptor:
+                Callback function to filter matches of the entity.
+            on_match:
+                Callback function to act on matches of the entity.
+        Returns:
+            None
+        """
         if if_exists not in ('raise', 'ignore', 'update'):
             raise ValueError(
                 "Unexpected value for if_exists: %s.\n"

@@ -224,6 +261,18 @@ cdef class Matcher:
         self._callbacks[entity_key] = on_match

     def add_pattern(self, entity_key, token_specs, label=""):
+        """Add a pattern to the matcher.
+
+        Arguments:
+            entity_key (unicode or int):
+                An ID for the entity.
+            token_specs:
+                Description of the pattern to be matched.
+            label:
+                Label to assign to the matched pattern. Defaults to "".
+        Returns:
+            None
+        """
         entity_key = self.normalize_entity_key(entity_key)
         if not self.has_entity(entity_key):
             self.add_entity(entity_key)

@@ -249,10 +298,24 @@ cdef class Matcher:
         return entity_key

     def has_entity(self, entity_key):
+        """Check whether the matcher has an entity.
+
+        Arguments:
+            entity_key (string or int): The entity key to check.
+        Returns:
+            bool: Whether the matcher has the entity.
+        """
         entity_key = self.normalize_entity_key(entity_key)
         return entity_key in self._entities

     def get_entity(self, entity_key):
+        """Retrieve the attributes stored for an entity.
+
+        Arguments:
+            entity_key (unicode or int): The entity to retrieve.
+        Returns:
+            The entity attributes if present, otherwise None.
+        """
         entity_key = self.normalize_entity_key(entity_key)
         if entity_key in self._entities:
             return self._entities[entity_key]

@@ -260,6 +323,17 @@ cdef class Matcher:
         return None

     def __call__(self, Doc doc, acceptor=None):
+        """Find all token sequences matching the supplied patterns on the Doc.
+
+        Arguments:
+            doc (Doc):
+                The document to match over.
+        Returns:
+            list: A list of (entity_key, label_id, start, end) tuples,
+                describing the matches. A match tuple describes a span
+                doc[start:end]. The label_id and entity_key are both integers.
+        """
         if acceptor is not None:
             raise ValueError(
                 "acceptor keyword argument to Matcher deprecated. Specify acceptor "

@@ -340,6 +414,18 @@ cdef class Matcher:
         return matches

     def pipe(self, docs, batch_size=1000, n_threads=2):
+        """Match a stream of documents, yielding them in turn.
+
+        Arguments:
+            docs: A stream of documents.
+            batch_size (int):
+                The number of documents to accumulate into a working set.
+            n_threads (int):
+                The number of threads with which to work on the buffer in parallel,
+                if the Matcher implementation supports multi-threading.
+        Yields:
+            Doc: Documents, in order.
+        """
         for doc in docs:
             self(doc)
             yield doc
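A usage sketch of the add_entity/add_pattern API documented above (spaCy 1.x pattern format; the entity key and pattern are illustrative):

    from spacy.en import English
    from spacy.matcher import Matcher
    from spacy.attrs import ORTH

    nlp = English()
    matcher = Matcher(nlp.vocab)
    matcher.add_entity(u'GoogleNow')
    # One attribute dict per token to match.
    matcher.add_pattern(u'GoogleNow', [{ORTH: u'Google'}, {ORTH: u'Now'}])
    doc = nlp(u'I like Google Now best.')
    for entity_key, label_id, start, end in matcher(doc):
        print(doc[start:end].text)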
@@ -11,6 +11,7 @@ from .attrs import DEP, ENT_TYPE


 cdef class EntityRecognizer(Parser):
+    """Annotate named entities on Doc objects."""
     TransitionSystem = BiluoPushDown

     feature_templates = get_feature_templates('ner')
@@ -73,6 +73,11 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except
 cdef class StringStore:
+    '''Map strings to and from integer IDs.'''
     def __init__(self, strings=None, freeze=False):
+        '''Create the StringStore.
+
+        Arguments:
+            strings: A sequence of unicode strings to add to the store.
+        '''
         self.mem = Pool()
         self._map = PreshMap()
         self._oov = PreshMap()

@@ -89,9 +94,22 @@ cdef class StringStore:
         return self.size - 1

     def __len__(self):
+        """The number of strings in the store.
+
+        Returns:
+            int: The number of strings in the store.
+        """
         return self.size - 1

     def __getitem__(self, object string_or_id):
+        """Retrieve a string from a given integer ID, or vice versa.
+
+        Arguments:
+            string_or_id (bytes or unicode or int):
+                The value to encode.
+        Returns:
+            unicode or int: The value retrieved.
+        """
         if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
             return 0
         elif string_or_id == 0:

@@ -127,12 +145,23 @@ cdef class StringStore:
         return utf8str - self.c

     def __contains__(self, unicode string not None):
+        """Check whether a string is in the store.
+
+        Arguments:
+            string (unicode): The string to check.
+        Returns (bool):
+            Whether the store contains the string.
+        """
         if len(string) == 0:
             return True
         cdef hash_t key = hash_string(string)
         return self._map.get(key) is not NULL

     def __iter__(self):
+        """Iterate over the strings in the store, in order.
+
+        Yields: unicode: A string in the store.
+        """
         cdef int i
         for i in range(self.size):
             yield _decode(&self.c[i]) if i > 0 else u''

@@ -185,6 +214,13 @@ cdef class StringStore:
         return &self.c[self.size-1]

     def dump(self, file_):
+        """Save the strings to a JSON file.
+
+        Arguments:
+            file_ (buffer): The file to save the strings to.
+        Returns:
+            None
+        """
         string_data = json.dumps(list(self))
         if not isinstance(string_data, unicode):
             string_data = string_data.decode('utf8')

@@ -192,6 +228,13 @@ cdef class StringStore:
         file_.write(string_data)

     def load(self, file_):
+        """Load the strings from a JSON file.
+
+        Arguments:
+            file_ (buffer): The file from which to load the strings.
+        Returns:
+            None
+        """
         strings = json.load(file_)
         if strings == ['']:
             return None
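A small sketch of the two-way mapping described above:

    from spacy.strings import StringStore

    strings = StringStore([u'apple', u'orange'])
    apple_id = strings[u'apple']          # unicode -> integer ID
    assert strings[apple_id] == u'apple'  # integer ID -> unicode
    print(len(strings), u'apple' in strings)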
@@ -74,8 +74,21 @@ cdef class ParserModel(AveragedPerceptron):


 cdef class Parser:
+    """Base class of the DependencyParser and EntityRecognizer."""
     @classmethod
     def load(cls, path, Vocab vocab, TransitionSystem=None, require=False):
+        """Load the statistical model from the supplied path.
+
+        Arguments:
+            path (Path):
+                The path to load from.
+            vocab (Vocab):
+                The vocabulary. Must be shared by the documents to be processed.
+            require (bool):
+                Whether to raise an error if the files are not found.
+        Returns (Parser):
+            The newly constructed object.
+        """
         with (path / 'config.json').open() as file_:
             cfg = json.load(file_)
         # TODO: remove this shim when we don't have to support older data

@@ -90,6 +103,16 @@ cdef class Parser:
         return self

     def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
+        """Create a Parser.
+
+        Arguments:
+            vocab (Vocab):
+                The vocabulary object. Must be shared with documents to be processed.
+            model (thinc.linear.AveragedPerceptron):
+                The statistical model.
+        Returns (Parser):
+            The newly constructed object.
+        """
         if TransitionSystem is None:
             TransitionSystem = self.TransitionSystem
         self.vocab = vocab

@@ -107,6 +130,13 @@ cdef class Parser:
         return (Parser, (self.vocab, self.moves, self.model), None, None)

     def __call__(self, Doc tokens):
+        """Apply the parser, setting the annotations onto the Doc object.
+
+        Arguments:
+            doc (Doc): The document to be processed.
+        Returns:
+            None
+        """
         cdef int nr_feat = self.model.nr_feat
         with nogil:
             status = self.parseC(tokens.c, tokens.length, nr_feat)

@@ -117,6 +147,16 @@ cdef class Parser:
         self.moves.finalize_doc(tokens)

     def pipe(self, stream, int batch_size=1000, int n_threads=2):
+        """Process a stream of documents.
+
+        Arguments:
+            stream: The sequence of documents to process.
+            batch_size (int):
+                The number of documents to accumulate into a working set.
+            n_threads (int):
+                The number of threads with which to work on the buffer in parallel.
+        Yields (Doc): Documents, in order.
+        """
         cdef Pool mem = Pool()
         cdef TokenC** doc_ptr = <TokenC**>mem.alloc(batch_size, sizeof(TokenC*))
         cdef int* lengths = <int*>mem.alloc(batch_size, sizeof(int))

@@ -194,6 +234,16 @@ cdef class Parser:
         return 0

     def update(self, Doc tokens, GoldParse gold):
+        """Update the statistical model.
+
+        Arguments:
+            doc (Doc):
+                The example document for the update.
+            gold (GoldParse):
+                The gold-standard annotations, to calculate the loss.
+        Returns (float):
+            The loss on this example.
+        """
         self.moves.preprocess_gold(gold)
         cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
         self.moves.initialize_state(stcls.c)

@@ -220,9 +270,24 @@ cdef class Parser:
         return loss

     def step_through(self, Doc doc):
+        """Set up a stepwise state, to introspect and control the transition sequence.
+
+        Arguments:
+            doc (Doc): The document to step through.
+        Returns (StepwiseState):
+            A state object, to step through the annotation process.
+        """
         return StepwiseState(self, doc)

     def from_transition_sequence(self, Doc doc, sequence):
+        """Control the annotations on a document by specifying a transition sequence
+        to follow.
+
+        Arguments:
+            doc (Doc): The document to annotate.
+            sequence: A sequence of action names, as unicode strings.
+        Returns: None
+        """
         with self.step_through(doc) as stepwise:
             for transition in sequence:
                 stepwise.transition(transition)

@@ -233,7 +298,6 @@ cdef class Parser:
         self.moves.add_action(action, label)


 cdef class StepwiseState:
     cdef readonly StateClass stcls
     cdef readonly Example eg
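A hedged sketch of a single training step against the update() API documented above (spaCy 1.x; the gold heads and labels are illustrative):

    from spacy.en import English
    from spacy.gold import GoldParse

    nlp = English()
    doc = nlp.make_doc(u'Eat blue ham')
    # Head offsets index into the sentence; deps name the arcs.
    gold = GoldParse(doc, heads=[0, 2, 0], deps=[u'ROOT', u'amod', u'dobj'])
    nlp.tagger(doc)  # parser features assume tags are set
    loss = nlp.parser.update(doc, gold)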
@@ -102,9 +102,21 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:


 cdef class Tagger:
-    """A part-of-speech tagger for English"""
+    """Annotate part-of-speech tags on Doc objects."""
     @classmethod
     def load(cls, path, vocab, require=False):
+        """Load the statistical model from the supplied path.
+
+        Arguments:
+            path (Path):
+                The path to load from.
+            vocab (Vocab):
+                The vocabulary. Must be shared by the documents to be processed.
+            require (bool):
+                Whether to raise an error if the files are not found.
+        Returns (Tagger):
+            The newly created object.
+        """
         # TODO: Change this to expect config.json when we don't have to
         # support old data.
         path = path if not isinstance(path, basestring) else pathlib.Path(path)

@@ -126,6 +138,16 @@ cdef class Tagger:
         return self

     def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
+        """Create a Tagger.
+
+        Arguments:
+            vocab (Vocab):
+                The vocabulary object. Must be shared with documents to be processed.
+            model (thinc.linear.AveragedPerceptron):
+                The statistical model.
+        Returns (Tagger):
+            The newly constructed object.
+        """
         if model is None:
             model = TaggerModel(cfg.get('features', self.feature_templates))
         self.vocab = vocab

@@ -154,8 +176,10 @@ cdef class Tagger:
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.

-        Args:
-            tokens (Doc): The tokens to be tagged.
+        Arguments:
+            doc (Doc): The tokens to be tagged.
+        Returns:
+            None
         """
         if tokens.length == 0:
             return 0

@@ -178,11 +202,33 @@ cdef class Tagger:
         tokens._py_tokens = [None] * tokens.length

     def pipe(self, stream, batch_size=1000, n_threads=2):
+        """Tag a stream of documents.
+
+        Arguments:
+            stream: The sequence of documents to tag.
+            batch_size (int):
+                The number of documents to accumulate into a working set.
+            n_threads (int):
+                The number of threads with which to work on the buffer in parallel,
+                if the Tagger implementation supports multi-threading.
+        Yields:
+            Doc: Documents, in order.
+        """
         for doc in stream:
             self(doc)
             yield doc

     def update(self, Doc tokens, GoldParse gold):
+        """Update the statistical model, with tags supplied for the given document.
+
+        Arguments:
+            doc (Doc):
+                The document to update on.
+            gold (GoldParse):
+                Manager for the gold-standard tags.
+        Returns (int):
+            Number of tags correct.
+        """
         gold_tag_strs = gold.tags
         assert len(tokens) == len(gold_tag_strs)
         for tag in gold_tag_strs:
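Similarly, a sketch of a tagger update with gold tags, continuing the previous example (tag values are illustrative):

    doc = nlp.make_doc(u'Eat blue ham')
    gold = GoldParse(doc, tags=[u'VB', u'JJ', u'NN'])
    # Returns the number of correctly predicted tags for this example.
    n_correct = nlp.tagger.update(doc, gold)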
@@ -219,6 +219,16 @@ cdef class Doc:
         return self.__str__()

     def similarity(self, other):
+        '''Make a semantic similarity estimate. The default estimate is cosine
+        similarity using an average of word vectors.
+
+        Arguments:
+            other (object): The object to compare with. By default, accepts Doc,
+                Span, Token and Lexeme objects.
+
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if 'similarity' in self.user_hooks:
             return self.user_hooks['similarity'](self, other)
         if self.vector_norm == 0 or other.vector_norm == 0:

@@ -226,6 +236,9 @@ cdef class Doc:
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

     property has_vector:
+        '''
+        A boolean value indicating whether a word vector is associated with the object.
+        '''
         def __get__(self):
             if 'has_vector' in self.user_hooks:
                 return self.user_hooks['has_vector'](self)

@@ -233,6 +246,11 @@ cdef class Doc:
             return any(token.has_vector for token in self)

     property vector:
+        '''
+        A real-valued meaning representation. Defaults to an average of the token vectors.
+
+        Type: numpy.ndarray[ndim=1, dtype='float32']
+        '''
         def __get__(self):
             if 'vector' in self.user_hooks:
                 return self.user_hooks['vector'](self)

@@ -266,13 +284,15 @@ cdef class Doc:
     def string(self):
         return self.text

-    @property
-    def text_with_ws(self):
-        return self.text
+    property text:
+        '''A unicode representation of the document text.'''
+        def __get__(self):
+            return u''.join(t.text_with_ws for t in self)

-    @property
-    def text(self):
-        return u''.join(t.text_with_ws for t in self)
+    property text_with_ws:
+        '''An alias of Doc.text, provided for duck-type compatibility with Span and Token.'''
+        def __get__(self):
+            return self.text

     property ents:
         '''

@@ -567,7 +587,6 @@ cdef class Doc:
         set_children_from_heads(self.c, self.length)
         self.is_parsed = bool(HEAD in attrs or DEP in attrs)
         self.is_tagged = bool(TAG in attrs or POS in attrs)
-
         return self

     def to_bytes(self):

@@ -612,7 +631,22 @@ cdef class Doc:
         yield n_bytes_str + data

     def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Merge a multi-word expression into a single token."""
+        """Retokenize the document, such that the span at doc.text[start_idx : end_idx]
+        is merged into a single token. If start_idx and end_idx do not mark start
+        and end token boundaries, the document remains unchanged.
+
+        Arguments:
+            start_idx (int): The character index of the start of the slice to merge.
+            end_idx (int): The character index after the end of the slice to merge.
+            **attributes:
+                Attributes to assign to the merged token. By default, attributes
+                are inherited from the syntactic root token of the span.
+        Returns:
+            token (Token):
+                The newly merged token, or None if the start and end indices did
+                not fall at token boundaries.
+        """
         cdef unicode tag, lemma, ent_type
         if len(args) == 3:
             # TODO: Warn deprecation
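A sketch of the character-offset merge API documented above (the attribute keywords are illustrative):

    doc = nlp(u'I flew to Los Angeles.')
    start = doc.text.index(u'Los')
    end = start + len(u'Los Angeles')
    # Merge the slice "Los Angeles" into a single token, overriding some attributes.
    token = doc.merge(start, end, tag=u'NNP', lemma=u'Los Angeles', ent_type=u'GPE')
    print([t.text for t in doc])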
@@ -18,12 +18,23 @@ from ..lexeme cimport Lexeme

 cdef class Span:
     """A slice from a Doc object."""
-    def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None,
+    def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
                   vector_norm=None):
+        '''Create a Span object from the slice doc[start : end].
+
+        Arguments:
+            doc (Doc): The parent document.
+            start (int): The index of the first token of the span.
+            end (int): The index of the first token after the span.
+            label (int): A label to attach to the Span, e.g. for named entities.
+            vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        Returns:
+            Span: The newly constructed object.
+        '''
-        if not (0 <= start <= end <= len(tokens)):
+        if not (0 <= start <= end <= len(doc)):
             raise IndexError

-        self.doc = tokens
+        self.doc = doc
         self.start = start
         self.start_char = self.doc[start].idx if start < self.doc.length else 0
         self.end = end

@@ -78,9 +89,29 @@ cdef class Span:
         yield self.doc[i]

     def merge(self, *args, **attributes):
+        """Retokenize the document, such that the span is merged into a single token.
+
+        Arguments:
+            **attributes:
+                Attributes to assign to the merged token. By default, attributes
+                are inherited from the syntactic root token of the span.
+        Returns:
+            token (Token):
+                The newly merged token.
+        """
         self.doc.merge(self.start_char, self.end_char, *args, **attributes)

     def similarity(self, other):
+        '''Make a semantic similarity estimate. The default estimate is cosine
+        similarity using an average of word vectors.
+
+        Arguments:
+            other (object): The object to compare with. By default, accepts Doc,
+                Span, Token and Lexeme objects.
+
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if 'similarity' in self.doc.user_span_hooks:
             return self.doc.user_span_hooks['similarity'](self, other)
         if self.vector_norm == 0.0 or other.vector_norm == 0.0:

@@ -102,7 +133,11 @@ cdef class Span:
         self.end = end + 1

     property sent:
-        '''Get the sentence span that this span is a part of.'''
+        '''The sentence span that this span is a part of.
+
+        Returns:
+            Span: The sentence this is part of.
+        '''
         def __get__(self):
             if 'sent' in self.doc.user_span_hooks:
                 return self.doc.user_span_hooks['sent'](self)

@@ -156,7 +191,12 @@ cdef class Span:
         return u''.join([t.text_with_ws for t in self])

     property root:
-        """The word of the span that is highest in the parse tree, i.e. has the
+        """The token within the span that's highest in the parse tree. If there's
+        a tie, the earliest is preferred.
+
+        Returns:
+            Token: The root token.
+
+        i.e. has the
         shortest path to the root of the sentence (or is the root itself).

         If multiple words are equally high in the tree, the first word is taken.

@@ -231,7 +271,10 @@ cdef class Span:
             return self.doc[root]

     property lefts:
-        """Tokens that are to the left of the Span, whose head is within the Span."""
+        """Tokens that are to the left of the span, whose head is within the span.
+
+        Yields: Token A left-child of a token of the span.
+        """
         def __get__(self):
             for token in reversed(self):  # Reverse, so we get the tokens in order
                 for left in token.lefts:

@@ -239,7 +282,10 @@ cdef class Span:
                     yield left

     property rights:
-        """Tokens that are to the right of the Span, whose head is within the Span."""
+        """Tokens that are to the right of the span, whose head is within the span.
+
+        Yields: Token A right-child of a token of the span.
+        """
         def __get__(self):
             for token in self:
                 for right in token.rights:

@@ -247,6 +293,10 @@ cdef class Span:
                     yield right

     property subtree:
+        """Tokens that descend from tokens in the span, but fall outside it.
+
+        Yields: Token A descendant of a token within the span.
+        """
         def __get__(self):
             for word in self.lefts:
                 yield from word.subtree
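A sketch of span navigation and merging per the properties documented above:

    doc = nlp(u'The quick brown fox jumped over the lazy dog.')
    span = doc[0:4]                       # "The quick brown fox"
    print(span.root.text)                 # highest token in the parse tree
    print([t.text for t in span.lefts])   # left children attaching into the span
    span.merge()                          # collapse the span into one token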
@@ -30,8 +30,7 @@ from ..lexeme cimport Lexeme


 cdef class Token:
-    """An individual token --- i.e. a word, a punctuation symbol, etc. Created
-    via Doc.__getitem__ and Doc.__iter__.
+    """An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
     """
     def __cinit__(self, Vocab vocab, Doc doc, int offset):
         self.vocab = vocab

@@ -40,6 +39,7 @@ cdef class Token:
         self.i = offset

     def __len__(self):
+        '''Number of unicode characters in token.text.'''
         return self.c.lex.length

     def __unicode__(self):

@@ -57,12 +57,35 @@ cdef class Token:
         return self.__str__()

     cpdef bint check_flag(self, attr_id_t flag_id) except -1:
+        '''Check the value of a boolean flag.
+
+        Arguments:
+            flag_id (int): The ID of the flag attribute.
+        Returns:
+            is_set (bool): Whether the flag is set.
+        '''
         return Lexeme.c_check_flag(self.c.lex, flag_id)

     def nbor(self, int i=1):
+        '''Get a neighboring token.
+
+        Arguments:
+            i (int): The relative position of the token to get. Defaults to 1.
+        Returns:
+            neighbor (Token): The token at position self.doc[self.i+i]
+        '''
         return self.doc[self.i+i]

     def similarity(self, other):
+        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+
+        Arguments:
+            other:
+                The object to compare with. By default, accepts Doc, Span,
+                Token and Lexeme objects.
+        Returns:
+            score (float): A scalar similarity score. Higher is more similar.
+        '''
         if 'similarity' in self.doc.user_token_hooks:
             return self.doc.user_token_hooks['similarity'](self, other)
         if self.vector_norm == 0 or other.vector_norm == 0:

@@ -158,6 +181,9 @@ cdef class Token:
         self.c.dep = label

     property has_vector:
+        '''
+        A boolean value indicating whether a word vector is associated with the object.
+        '''
         def __get__(self):
             if 'has_vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['has_vector'](self)

@@ -169,6 +195,11 @@ cdef class Token:
             return False

     property vector:
+        '''
+        A real-valued meaning representation.
+
+        Type: numpy.ndarray[ndim=1, dtype='float32']
+        '''
         def __get__(self):
             if 'vector' in self.doc.user_token_hooks:
                 return self.doc.user_token_hooks['vector'](self)

@@ -241,11 +272,19 @@ cdef class Token:
             yield t

     property children:
+        '''A sequence of the token's immediate syntactic children.
+
+        Yields: Token A child token such that child.head==self
+        '''
         def __get__(self):
             yield from self.lefts
             yield from self.rights

     property subtree:
+        '''A sequence of all the token's syntactic descendants.
+
+        Yields: Token A descendant token such that self.is_ancestor(descendant)
+        '''
         def __get__(self):
             for word in self.lefts:
                 yield from word.subtree

@@ -254,14 +293,26 @@ cdef class Token:
             yield from word.subtree

     property left_edge:
+        '''The leftmost token of this token's syntactic descendants.
+
+        Returns: Token The first token such that self.is_ancestor(token)
+        '''
         def __get__(self):
             return self.doc[self.c.l_edge]

     property right_edge:
+        '''The rightmost token of this token's syntactic descendants.
+
+        Returns: Token The last token such that self.is_ancestor(token)
+        '''
         def __get__(self):
             return self.doc[self.c.r_edge]

     property ancestors:
+        '''A sequence of this token's syntactic ancestors.
+
+        Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
+        '''
         def __get__(self):
             cdef const TokenC* head_ptr = self.c
             # guard against infinite loop, no token can have

@@ -273,9 +324,27 @@ cdef class Token:
             i += 1

     def is_ancestor_of(self, descendant):
         # TODO: Remove after backward compatibility check.
         return self.is_ancestor(descendant)

     def is_ancestor(self, descendant):
+        '''Check whether this token is a parent, grandparent, etc. of another
+        in the dependency tree.
+
+        Arguments:
+            descendant (Token): Another token.
+        Returns:
+            is_ancestor (bool): Whether this token is the ancestor of the descendant.
+        '''
         if self.doc is not descendant.doc:
             return False
         return any(ancestor.i == self.i for ancestor in descendant.ancestors)

     property head:
+        '''The syntactic parent, or "governor", of this token.
+
+        Returns: Token
+        '''
         def __get__(self):
-            """The token predicted by the parser to be the head of the current token."""
             return self.doc[self.i + self.c.head]

@@ -370,6 +439,10 @@ cdef class Token:
         self.c.head = rel_newhead_i

     property conjuncts:
+        '''A sequence of coordinated tokens, including the token itself.
+
+        Yields: Token A coordinated token
+        '''
         def __get__(self):
             """Get a list of conjoined words."""
             cdef Token word
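A sketch of the token navigation helpers documented above:

    doc = nlp(u'Give it back, he pleaded.')
    give = doc[0]
    print(give.nbor(1).text)                       # 'it'
    print([child.text for child in give.children]) # immediate syntactic children
    print(give.is_ancestor(doc[1]))                # True if 'it' attaches under 'Give'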
spacy/vocab.pyx (115 changes)
@@ -52,6 +52,25 @@ cdef class Vocab:
     @classmethod
     def load(cls, path, lex_attr_getters=None, lemmatizer=True,
              tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
+        """
+        Load the vocabulary from a path.
+
+        Arguments:
+            path (Path):
+                The path to load from.
+            lex_attr_getters (dict):
+                A dictionary mapping attribute IDs to functions to compute them.
+                Defaults to None.
+            lemmatizer (object):
+                A lemmatizer. Defaults to None.
+            tag_map (dict):
+                A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
+                and optionally morphological attributes.
+            oov_prob (float):
+                The default probability for out-of-vocabulary words.
+        Returns:
+            Vocab: The newly constructed vocab object.
+        """
         util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
         if 'vectors' in deprecated_kwargs:
             raise AttributeError(

@@ -82,6 +101,22 @@ cdef class Vocab:

     def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
                  serializer_freqs=None, **deprecated_kwargs):
+        '''Create the vocabulary.
+
+        lex_attr_getters (dict):
+            A dictionary mapping attribute IDs to functions to compute them.
+            Defaults to None.
+        lemmatizer (object):
+            A lemmatizer. Defaults to None.
+        tag_map (dict):
+            A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
+            and optionally morphological attributes.
+        oov_prob (float):
+            The default probability for out-of-vocabulary words.
+
+        Returns:
+            Vocab: The newly constructed vocab object.
+        '''
         util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)

         lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}

@@ -134,6 +169,9 @@ cdef class Vocab:
         '''
         Set vectors_length to a new size, and allocate more memory for the Lexeme
         vectors if necessary. The memory will be zeroed.
+
+        Arguments:
+            new_size (int): The new size of the vectors.
         '''
         cdef hash_t key
         cdef size_t addr

@@ -145,11 +183,14 @@ cdef class Vocab:
         self.vectors_length = new_size

     def add_flag(self, flag_getter, int flag_id=-1):
-        '''Set a new boolean flag to words in the vocabulary. The flag_setter
-        function will be called over the words currently in the vocab, and then
-        applied to new words as they occur. You'll then be able to access the
-        flag value on each token, using token.check_flag(flag_id). See also:
-        Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.
+        '''Set a new boolean flag to words in the vocabulary.
+
+        The flag_getter function will be called over the words currently in the
+        vocab, and then applied to new words as they occur. You'll then be able
+        to access the flag value on each token, using token.check_flag(flag_id).
+
+        See also:
+            Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.

         Arguments:
             flag_getter:

@@ -246,11 +287,23 @@ cdef class Vocab:
         self.length += 1

     def __contains__(self, unicode string):
+        '''Check whether the string has an entry in the vocabulary.
+
+        Arguments:
+            string (unicode): The ID string.
+
+        Returns:
+            bool: Whether the string has an entry in the vocabulary.
+        '''
         key = hash_string(string)
         lex = self._by_hash.get(key)
         return True if lex is not NULL else False

     def __iter__(self):
+        '''Iterate over the lexemes in the vocabulary.
+
+        Yields: Lexeme An entry in the vocabulary.
+        '''
         cdef attr_t orth
         cdef size_t addr
         for orth, addr in self._by_orth.items():

@@ -260,16 +313,15 @@ cdef class Vocab:
         '''Retrieve a lexeme, given an int ID or a unicode string. If a previously
         unseen unicode string is given, a new lexeme is created and stored.

-        Args:
+        Arguments:
             id_or_string (int or unicode):
-                The integer ID of a word, or its unicode string. If an int >= Lexicon.size,
-                IndexError is raised. If id_or_string is neither an int nor a unicode string,
-                ValueError is raised.
+                The integer ID of a word, or its unicode string.
+
+                If an int >= Lexicon.size, IndexError is raised. If id_or_string
+                is neither an int nor a unicode string, ValueError is raised.

         Returns:
-            lexeme (Lexeme):
-                An instance of the Lexeme Python class, with data copied on
-                instantiation.
+            lexeme (Lexeme): The lexeme indicated by the given ID.
         '''
         cdef attr_t orth
         if type(id_or_string) == unicode:

@@ -295,6 +347,11 @@ cdef class Vocab:
         return tokens

     def dump(self, loc):
+        """Save the lexemes' binary data to the given location.
+
+        Arguments:
+            loc (Path): The path to save to.
+        """
         if hasattr(loc, 'as_posix'):
             loc = loc.as_posix()
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc

@@ -323,6 +380,14 @@ cdef class Vocab:
         fp.close()

     def load_lexemes(self, loc):
+        '''Load the binary vocabulary data from the given location.
+
+        Arguments:
+            loc (Path): The path to load from.
+
+        Returns:
+            None
+        '''
         fp = CFile(loc, 'rb',
             on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
         cdef LexemeC* lexeme

@@ -363,6 +428,13 @@ cdef class Vocab:
         fp.close()

     def dump_vectors(self, out_loc):
+        '''Save the word vectors to a binary file.
+
+        Arguments:
+            out_loc (Path): The path to save to.
+        Returns:
+            None
+        '''
         cdef int32_t vec_len = self.vectors_length
         cdef int32_t word_len
         cdef bytes word_str

@@ -384,6 +456,17 @@ cdef class Vocab:
         out_file.close()

     def load_vectors(self, file_):
+        """Load vectors from a text-based file.
+
+        Arguments:
+            file_ (buffer): The file to read from. Entries should be separated by newlines,
+                and each entry should be whitespace delimited. The first value of the entry
+                should be the word string, and subsequent entries should be the values of the
+                vector.
+
+        Returns:
+            vec_len (int): The length of the vectors loaded.
+        """
         cdef LexemeC* lexeme
         cdef attr_t orth
         cdef int32_t vec_len = -1

@@ -409,6 +492,14 @@ cdef class Vocab:
         return vec_len

     def load_vectors_from_bin_loc(self, loc):
+        """Load vectors from the location of a binary file.
+
+        Arguments:
+            loc (unicode): The path of the binary file to load from.
+
+        Returns:
+            vec_len (int): The length of the vectors loaded.
+        """
         cdef CFile file_ = CFile(loc, b'rb')
         cdef int32_t word_len
         cdef int32_t vec_len = 0
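A sketch of the add_flag workflow documented above (the MONTHS set and the flag's semantics are illustrative):

    from spacy.en import English

    nlp = English()
    MONTHS = set(u'january february march april may june july'.split())
    # add_flag returns the flag ID it assigned; the getter runs on each word string.
    IS_MONTH = nlp.vocab.add_flag(lambda text: text.lower() in MONTHS)
    doc = nlp(u'I was born in June.')
    print(doc[4].check_flag(IS_MONTH))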