Fix doc strings

This commit is contained in:
Matthew Honnibal 2016-11-01 12:25:36 +01:00
parent 18aab4f71e
commit b86f8af0c1
12 changed files with 592 additions and 39 deletions

View File

@ -212,6 +212,7 @@ def _consume_ent(tags):
cdef class GoldParse:
"""Collection for training annotations."""
@classmethod
def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
_, words, tags, heads, deps, entities = annot_tuples
@ -220,6 +221,25 @@ cdef class GoldParse:
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False):
"""Create a GoldParse.
Arguments:
doc (Doc):
The document the annotations refer to.
words:
A sequence of unicode word strings.
tags:
A sequence of strings, representing tag annotations.
heads:
A sequence of integers, representing syntactic head offsets.
deps:
A sequence of strings, representing the syntactic relation types.
entities:
A sequence of named entity annotations, either as BILUO tag strings,
or as (start_char, end_char, label) tuples, representing the entity
positions.
Returns (GoldParse): The newly constructed object.
"""
if words is None:
words = [token.text for token in doc]
if tags is None:
@ -280,10 +300,16 @@ cdef class GoldParse:
self.heads = proj_heads
def __len__(self):
    """Report how many gold-standard tokens this parse holds.

    Returns (int): The value stored in self.length.
    """
    n_tokens = self.length
    return n_tokens
@property
def is_projective(self):
    """Whether the provided syntactic annotations form a projective
    dependency tree.

    Returns (bool): The negation of nonproj.is_nonproj_tree(self.heads).
    """
    non_projective = nonproj.is_nonproj_tree(self.heads)
    return not non_projective

View File

@ -293,8 +293,9 @@ class Language(object):
text (unicode): The text to be processed.
Returns:
tokens (spacy.tokens.Doc):
doc (Doc): A container for accessing the annotations.
Example:
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp('An example sentence. Another example sentence.')
@ -314,6 +315,16 @@ class Language(object):
return doc
def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
'''Process texts as a stream, and yield Doc objects in order.
Supports GIL-free multi-threading.
Arguments:
texts (iterator)
tag (bool)
parse (bool)
entity (bool)
'''
skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
stream = (self.make_doc(text) for text in texts)
for proc in self.pipeline:

View File

@ -36,6 +36,13 @@ cdef class Lexeme:
tag).
"""
def __init__(self, Vocab vocab, int orth):
"""Create a Lexeme object.
Arguments:
vocab (Vocab): The parent vocabulary
orth (int): The orth id of the lexeme.
Returns (Lexeme): The newly constructed object.
"""
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
@ -73,12 +80,33 @@ cdef class Lexeme:
return self.c.orth
def set_flag(self, attr_id_t flag_id, bint value):
    """Set a boolean flag on this lexeme to a new value.

    Arguments:
        flag_id (int): The attribute ID of the flag to change.
        value (bool): The value the flag should take.
    """
    # Delegates to the C-level helper, which operates on the lexeme struct.
    Lexeme.c_set_flag(self.c, flag_id, value)
def check_flag(self, attr_id_t flag_id):
    """Query the value of a boolean flag on this lexeme.

    Arguments:
        flag_id (int): The attribute ID of the flag to query.
    Returns (bool): True if the flag is set, otherwise False.
    """
    # Normalise the C-level result to a Python bool.
    if Lexeme.c_check_flag(self.c, flag_id):
        return True
    return False
def similarity(self, other):
    '''Estimate semantic similarity as the cosine of the two vectors.

    Arguments:
        other:
            The object to compare with. It must expose `vector` and
            `vector_norm` attributes.
    Returns:
        score (float): A scalar similarity score. Higher is more similar.
            0.0 is returned when either vector norm is zero.
    '''
    # Guard against dividing by zero when either side has no vector.
    if self.vector_norm == 0:
        return 0.0
    if other.vector_norm == 0:
        return 0.0
    dot_product = numpy.dot(self.vector, other.vector)
    norm_product = self.vector_norm * other.vector_norm
    return dot_product / norm_product

View File

@ -165,6 +165,7 @@ def _convert_strings(token_specs, string_store):
cdef class Matcher:
'''Match sequences of tokens, based on pattern rules.'''
cdef Pool mem
cdef vector[TokenPatternC*] patterns
cdef readonly Vocab vocab
@ -175,6 +176,16 @@ cdef class Matcher:
@classmethod
def load(cls, path, vocab):
'''Load the matcher and patterns from a file path.
Arguments:
path (Path):
Path to a directory that may contain a JSON-formatted 'gazetteer.json' patterns file.
vocab (Vocab):
The vocabulary that the documents to match over will refer to.
Returns:
Matcher: The newly constructed object.
'''
if (path / 'gazetteer.json').exists():
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
patterns = json.load(file_)
@ -183,6 +194,16 @@ cdef class Matcher:
return cls(vocab, patterns)
def __init__(self, vocab, patterns={}):
"""Create the Matcher.
Arguments:
vocab (Vocab):
The vocabulary object, which must be shared with the documents
the matcher will operate on.
patterns (dict): Patterns to add to the matcher.
Returns:
The newly constructed object.
"""
self._patterns = {}
self._entities = {}
self._acceptors = {}
@ -203,6 +224,22 @@ cdef class Matcher:
def add_entity(self, entity_key, attrs=None, if_exists='raise',
acceptor=None, on_match=None):
"""Add an entity to the matcher.
Arguments:
entity_key (unicode or int):
An ID for the entity.
attrs:
Attributes to associate with the Matcher.
if_exists ('raise', 'ignore' or 'update'):
Controls what happens if the entity ID already exists. Defaults to 'raise'.
acceptor:
Callback function to filter matches of the entity.
on_match:
Callback function to act on matches of the entity.
Returns:
None
"""
if if_exists not in ('raise', 'ignore', 'update'):
raise ValueError(
"Unexpected value for if_exists: %s.\n"
@ -224,6 +261,18 @@ cdef class Matcher:
self._callbacks[entity_key] = on_match
def add_pattern(self, entity_key, token_specs, label=""):
"""Add a pattern to the matcher.
Arguments:
entity_key (unicode or int):
An ID for the entity.
token_specs:
Description of the pattern to be matched.
label:
Label to assign to the matched pattern. Defaults to "".
Returns:
None
"""
entity_key = self.normalize_entity_key(entity_key)
if not self.has_entity(entity_key):
self.add_entity(entity_key)
@ -249,10 +298,24 @@ cdef class Matcher:
return entity_key
def has_entity(self, entity_key):
    """Report whether an entity has been registered with the matcher.

    Arguments:
        entity_key (string or int): The entity key to look up. It is
            passed through normalize_entity_key() before the lookup.
    Returns:
        bool: Whether the matcher has the entity.
    """
    normalized_key = self.normalize_entity_key(entity_key)
    is_known = normalized_key in self._entities
    return is_known
def get_entity(self, entity_key):
"""Retrieve the attributes stored for an entity.
Arguments:
entity_key (unicode or int): The entity to retrieve.
Returns:
The entity attributes if present, otherwise None.
"""
entity_key = self.normalize_entity_key(entity_key)
if entity_key in self._entities:
return self._entities[entity_key]
@ -260,6 +323,17 @@ cdef class Matcher:
return None
def __call__(self, Doc doc, acceptor=None):
"""Find all token sequences matching the supplied patterns on the Doc.
Arguments:
doc (Doc):
The document to match over.
Returns:
list
A list of (entity_key, label_id, start, end) tuples,
describing the matches. A match tuple describes a span doc[start:end].
The label_id and entity_key are both integers.
"""
if acceptor is not None:
raise ValueError(
"acceptor keyword argument to Matcher deprecated. Specify acceptor "
@ -340,6 +414,18 @@ cdef class Matcher:
return matches
def pipe(self, docs, batch_size=1000, n_threads=2):
    """Run the matcher over a stream of documents, yielding each in turn.

    Arguments:
        docs: A stream of documents.
        batch_size (int):
            The number of documents to accumulate into a working set.
            Not used by this implementation, which handles one document
            at a time.
        n_threads (int):
            The number of threads with which to work on the buffer in
            parallel. Not used by this implementation.
    Yields:
        Doc Documents, in order, after the matcher has been called on them.
    """
    for document in docs:
        # The call is made for its effect; its result is not used here.
        self(document)
        yield document

View File

@ -11,6 +11,7 @@ from .attrs import DEP, ENT_TYPE
cdef class EntityRecognizer(Parser):
"""Annotate named entities on Doc objects."""
TransitionSystem = BiluoPushDown
feature_templates = get_feature_templates('ner')

View File

@ -73,6 +73,11 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except
cdef class StringStore:
'''Map strings to and from integer IDs.'''
def __init__(self, strings=None, freeze=False):
'''Create the StringStore.
Arguments:
strings: A sequence of unicode strings to add to the store.
'''
self.mem = Pool()
self._map = PreshMap()
self._oov = PreshMap()
@ -89,9 +94,22 @@ cdef class StringStore:
return self.size -1
def __len__(self):
    """Report the number of strings in the store.

    Returns:
        int The number of strings in the store, computed as self.size - 1.
    """
    n_strings = self.size - 1
    return n_strings
def __getitem__(self, object string_or_id):
"""Retrieve a string from a given integer ID, or vice versa.
Arguments:
string_or_id (bytes or unicode or int):
The value to encode.
Returns:
unicode or int: The retrieved value.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
@ -127,12 +145,23 @@ cdef class StringStore:
return utf8str - self.c
def __contains__(self, unicode string not None):
    """Test whether a string is present in the store.

    Arguments:
        string (unicode): The string to check.
    Returns bool:
        Whether the store contains the string. The empty string is
        always reported as present.
    """
    cdef hash_t key
    if not string:
        return True
    # Strings are looked up by hash rather than by value.
    key = hash_string(string)
    return self._map.get(key) is not NULL
def __iter__(self):
"""Iterate over the strings in the store, in order.
Yields: unicode A string in the store.
"""
cdef int i
for i in range(self.size):
yield _decode(&self.c[i]) if i > 0 else u''
@ -185,6 +214,13 @@ cdef class StringStore:
return &self.c[self.size-1]
def dump(self, file_):
"""Save the strings to a JSON file.
Arguments:
file_ (buffer): The file to save the strings.
Returns:
None
"""
string_data = json.dumps(list(self))
if not isinstance(string_data, unicode):
string_data = string_data.decode('utf8')
@ -192,6 +228,13 @@ cdef class StringStore:
file_.write(string_data)
def load(self, file_):
"""Load the strings from a JSON file.
Arguments:
file_ (buffer): The file from which to load the strings.
Returns:
None
"""
strings = json.load(file_)
if strings == ['']:
return None

View File

@ -74,8 +74,21 @@ cdef class ParserModel(AveragedPerceptron):
cdef class Parser:
"""Base class of the DependencyParser and EntityRecognizer."""
@classmethod
def load(cls, path, Vocab vocab, TransitionSystem=None, require=False):
"""Load the statistical model from the supplied path.
Arguments:
path (Path):
The path to load from.
vocab (Vocab):
The vocabulary. Must be shared by the documents to be processed.
require (bool):
Whether to raise an error if the files are not found.
Returns (Parser):
The newly constructed object.
"""
with (path / 'config.json').open() as file_:
cfg = json.load(file_)
# TODO: remove this shim when we don't have to support older data
@ -90,6 +103,16 @@ cdef class Parser:
return self
def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
"""Create a Parser.
Arguments:
vocab (Vocab):
The vocabulary object. Must be shared with documents to be processed.
model (thinc.linear.AveragedPerceptron):
The statistical model.
Returns (Parser):
The newly constructed object.
"""
if TransitionSystem is None:
TransitionSystem = self.TransitionSystem
self.vocab = vocab
@ -107,6 +130,13 @@ cdef class Parser:
return (Parser, (self.vocab, self.moves, self.model), None, None)
def __call__(self, Doc tokens):
"""Apply the parser, setting the annotations onto the Doc object.
Arguments:
tokens (Doc): The document to be processed.
Returns:
None
"""
cdef int nr_feat = self.model.nr_feat
with nogil:
status = self.parseC(tokens.c, tokens.length, nr_feat)
@ -117,6 +147,16 @@ cdef class Parser:
self.moves.finalize_doc(tokens)
def pipe(self, stream, int batch_size=1000, int n_threads=2):
"""Process a stream of documents.
Arguments:
stream: The sequence of documents to process.
batch_size (int):
The number of documents to accumulate into a working set.
n_threads (int):
The number of threads with which to work on the buffer in parallel.
Yields (Doc): Documents, in order.
"""
cdef Pool mem = Pool()
cdef TokenC** doc_ptr = <TokenC**>mem.alloc(batch_size, sizeof(TokenC*))
cdef int* lengths = <int*>mem.alloc(batch_size, sizeof(int))
@ -194,6 +234,16 @@ cdef class Parser:
return 0
def update(self, Doc tokens, GoldParse gold):
"""Update the statistical model.
Arguments:
tokens (Doc):
The example document for the update.
gold (GoldParse):
The gold-standard annotations, to calculate the loss.
Returns (float):
The loss on this example.
"""
self.moves.preprocess_gold(gold)
cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
self.moves.initialize_state(stcls.c)
@ -220,9 +270,24 @@ cdef class Parser:
return loss
def step_through(self, Doc doc):
    """Create a stepwise state for introspecting and controlling the
    transition sequence.

    Arguments:
        doc (Doc): The document to step through.
    Returns (StepwiseState):
        A state object built from this parser and the document.
    """
    stepwise_state = StepwiseState(self, doc)
    return stepwise_state
def from_transition_sequence(self, Doc doc, sequence):
"""Control the annotations on a document by specifying a transition sequence
to follow.
Arguments:
doc (Doc): The document to annotate.
sequence: A sequence of action names, as unicode strings.
Returns: None
"""
with self.step_through(doc) as stepwise:
for transition in sequence:
stepwise.transition(transition)
@ -233,7 +298,6 @@ cdef class Parser:
self.moves.add_action(action, label)
cdef class StepwiseState:
cdef readonly StateClass stcls
cdef readonly Example eg

View File

@ -102,9 +102,21 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
cdef class Tagger:
"""A part-of-speech tagger for English"""
"""Annotate part-of-speech tags on Doc objects."""
@classmethod
def load(cls, path, vocab, require=False):
"""Load the statistical model from the supplied path.
Arguments:
path (Path):
The path to load from.
vocab (Vocab):
The vocabulary. Must be shared by the documents to be processed.
require (bool):
Whether to raise an error if the files are not found.
Returns (Tagger):
The newly created object.
"""
# TODO: Change this to expect config.json when we don't have to
# support old data.
path = path if not isinstance(path, basestring) else pathlib.Path(path)
@ -126,6 +138,16 @@ cdef class Tagger:
return self
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
"""Create a Tagger.
Arguments:
vocab (Vocab):
The vocabulary object. Must be shared with documents to be processed.
model (thinc.linear.AveragedPerceptron):
The statistical model.
Returns (Tagger):
The newly constructed object.
"""
if model is None:
model = TaggerModel(cfg.get('features', self.feature_templates))
self.vocab = vocab
@ -154,8 +176,10 @@ cdef class Tagger:
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
Args:
tokens (Doc): The tokens to be tagged.
Arguments:
doc (Doc): The tokens to be tagged.
Returns:
None
"""
if tokens.length == 0:
return 0
@ -178,11 +202,33 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length
def pipe(self, stream, batch_size=1000, n_threads=2):
    """Tag a stream of documents, yielding each one after tagging.

    Arguments:
        stream: The sequence of documents to tag.
        batch_size (int):
            The number of documents to accumulate into a working set.
            Not used by this implementation, which tags one document at
            a time.
        n_threads (int):
            The number of threads with which to work on the buffer in
            parallel. Not used by this implementation.
    Yields:
        Doc Documents, in order.
    """
    for document in stream:
        # The tagger is called for its effect on the document.
        self(document)
        yield document
def update(self, Doc tokens, GoldParse gold):
"""Update the statistical model, with tags supplied for the given document.
Arguments:
tokens (Doc):
The document to update on.
gold (GoldParse):
Manager for the gold-standard tags.
Returns (int):
Number of tags correct.
"""
gold_tag_strs = gold.tags
assert len(tokens) == len(gold_tag_strs)
for tag in gold_tag_strs:

View File

@ -219,6 +219,16 @@ cdef class Doc:
return self.__str__()
def similarity(self, other):
'''Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
Arguments:
other (object): The object to compare with. By default, accepts Doc,
Span, Token and Lexeme objects.
Return:
score (float): A scalar similarity score. Higher is more similar.
'''
if 'similarity' in self.user_hooks:
return self.user_hooks['similarity'](self, other)
if self.vector_norm == 0 or other.vector_norm == 0:
@ -226,6 +236,9 @@ cdef class Doc:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
property has_vector:
'''
A boolean value indicating whether a word vector is associated with the object.
'''
def __get__(self):
if 'has_vector' in self.user_hooks:
return self.user_hooks['has_vector'](self)
@ -233,6 +246,11 @@ cdef class Doc:
return any(token.has_vector for token in self)
property vector:
'''
A real-valued meaning representation. Defaults to an average of the token vectors.
Type: numpy.ndarray[ndim=1, dtype='float32']
'''
def __get__(self):
if 'vector' in self.user_hooks:
return self.user_hooks['vector'](self)
@ -266,14 +284,16 @@ cdef class Doc:
def string(self):
return self.text
@property
def text_with_ws(self):
return self.text
@property
def text(self):
property text:
    '''A unicode representation of the document text.

    Returns (unicode): The concatenation of every token's text_with_ws.
    '''
    def __get__(self):
        return u''.join(t.text_with_ws for t in self)
property text_with_ws:
    '''Alias of Doc.text, so that Doc can be used interchangeably with
    Span and Token where text_with_ws is expected.

    Returns (unicode): The same value as Doc.text.
    '''
    def __get__(self):
        document_text = self.text
        return document_text
property ents:
'''
Yields named-entity `Span` objects, if the entity recognizer
@ -567,7 +587,6 @@ cdef class Doc:
set_children_from_heads(self.c, self.length)
self.is_parsed = bool(HEAD in attrs or DEP in attrs)
self.is_tagged = bool(TAG in attrs or POS in attrs)
return self
def to_bytes(self):
@ -612,7 +631,22 @@ cdef class Doc:
yield n_bytes_str + data
def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Merge a multi-word expression into a single token."""
"""Retokenize the document, such that the span at doc.text[start_idx : end_idx]
is merged into a single token. If start_idx and end_idx do not mark start
and end token boundaries, the document remains unchanged.
Arguments:
start_idx (int): The character index of the start of the slice to merge.
end_idx (int): The character index after the end of the slice to merge.
**attributes:
Attributes to assign to the merged token. By default, attributes
are inherited from the syntactic root token of the span.
Returns:
token (Token):
The newly merged token, or None if the start and end indices did
not fall at token boundaries.
"""
cdef unicode tag, lemma, ent_type
if len(args) == 3:
# TODO: Warn deprecation

View File

@ -18,12 +18,23 @@ from ..lexeme cimport Lexeme
cdef class Span:
"""A slice from a Doc object."""
def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None,
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
vector_norm=None):
'''Create a Span object from the slice doc[start : end]
Arguments:
doc (Doc): The parent document.
start (int): The index of the first token of the span.
end (int): The index of the first token after the span.
label (int): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
Returns:
Span The newly constructed object.
'''
if not (0 <= start <= end <= len(tokens)):
raise IndexError
self.doc = tokens
self.doc = doc
self.start = start
self.start_char = self.doc[start].idx if start < self.doc.length else 0
self.end = end
@ -78,9 +89,29 @@ cdef class Span:
yield self.doc[i]
def merge(self, *args, **attributes):
    """Retokenize the document, such that the span is merged into a single token.

    Delegates to self.doc.merge(), passing this span's start_char and
    end_char as the character boundaries.

    Arguments:
        *args:
            Positional arguments forwarded unchanged to Doc.merge.
        **attributes:
            Attributes to assign to the merged token. By default, attributes
            are inherited from the syntactic root token of the span.
    Returns:
        token (Token):
            The newly merged token, as returned by Doc.merge.
    """
    # Propagate the result: previously it was discarded, so callers always
    # received None despite the documented return value.
    return self.doc.merge(self.start_char, self.end_char, *args, **attributes)
def similarity(self, other):
'''Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
Arguments:
other (object): The object to compare with. By default, accepts Doc,
Span, Token and Lexeme objects.
Return:
score (float): A scalar similarity score. Higher is more similar.
'''
if 'similarity' in self.doc.user_span_hooks:
self.doc.user_span_hooks['similarity'](self, other)
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
@ -102,7 +133,11 @@ cdef class Span:
self.end = end + 1
property sent:
'''Get the sentence span that this span is a part of.'''
'''The sentence span that this span is a part of.
Returns:
Span The sentence this is part of.
'''
def __get__(self):
if 'sent' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sent'](self)
@ -156,7 +191,12 @@ cdef class Span:
return u''.join([t.text_with_ws for t in self])
property root:
"""The word of the span that is highest in the parse tree, i.e. has the
"""The token within the span that's highest in the parse tree. If there's a tie, the earliest is preferred.
Returns:
Token: The root token.
i.e. has the
shortest path to the root of the sentence (or is the root itself).
If multiple words are equally high in the tree, the first word is taken.
@ -231,7 +271,10 @@ cdef class Span:
return self.doc[root]
property lefts:
"""Tokens that are to the left of the Span, whose head is within the Span."""
"""Tokens that are to the left of the span, whose head is within the Span.
Yields: Token A left-child of a token of the span.
"""
def __get__(self):
for token in reversed(self): # Reverse, so we get the tokens in order
for left in token.lefts:
@ -239,7 +282,10 @@ cdef class Span:
yield left
property rights:
"""Tokens that are to the right of the Span, whose head is within the Span."""
"""Tokens that are to the right of the Span, whose head is within the Span.
Yields: Token A right-child of a token of the span.
"""
def __get__(self):
for token in self:
for right in token.rights:
@ -247,6 +293,10 @@ cdef class Span:
yield right
property subtree:
"""Tokens that descend from tokens in the span, but fall outside it.
Yields: Token A descendant of a token within the span.
"""
def __get__(self):
for word in self.lefts:
yield from word.subtree

View File

@ -30,8 +30,7 @@ from ..lexeme cimport Lexeme
cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
via Doc.__getitem__ and Doc.__iter__.
"""An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
"""
def __cinit__(self, Vocab vocab, Doc doc, int offset):
self.vocab = vocab
@ -40,6 +39,7 @@ cdef class Token:
self.i = offset
def __len__(self):
    '''Number of unicode characters in token.text.

    Returns (int): The length stored on the token's lexeme.
    '''
    n_chars = self.c.lex.length
    return n_chars
def __unicode__(self):
@ -57,12 +57,35 @@ cdef class Token:
return self.__str__()
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
    '''Query the value of a boolean flag on this token's lexeme.

    Arguments:
        flag_id (int): The ID of the flag attribute.
    Returns:
        is_set (bool): Whether the flag is set.
    '''
    cdef bint is_set = Lexeme.c_check_flag(self.c.lex, flag_id)
    return is_set
def nbor(self, int i=1):
    '''Fetch a token at a position relative to this one.

    Arguments:
        i (int): The relative position of the token to get. Defaults to 1.
    Returns:
        neighbor (Token): The token at position self.doc[self.i+i]
    '''
    neighbor_index = self.i + i
    return self.doc[neighbor_index]
def similarity(self, other):
'''Compute a semantic similarity estimate. Defaults to cosine over vectors.
Arguments:
other:
The object to compare with. By default, accepts Doc, Span,
Token and Lexeme objects.
Returns:
score (float): A scalar similarity score. Higher is more similar.
'''
if 'similarity' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['similarity'](self)
if self.vector_norm == 0 or other.vector_norm == 0:
@ -158,6 +181,9 @@ cdef class Token:
self.c.dep = label
property has_vector:
'''
A boolean value indicating whether a word vector is associated with the object.
'''
def __get__(self):
if 'has_vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['has_vector'](self)
@ -169,6 +195,11 @@ cdef class Token:
return False
property vector:
'''
A real-valued meaning representation.
Type: numpy.ndarray[ndim=1, dtype='float32']
'''
def __get__(self):
if 'vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector'](self)
@ -241,11 +272,19 @@ cdef class Token:
yield t
property children:
    '''The token's immediate syntactic children: everything from
    self.lefts, followed by everything from self.rights.

    Yields: Token A child token such that child.head==self
    '''
    def __get__(self):
        for left_child in self.lefts:
            yield left_child
        for right_child in self.rights:
            yield right_child
property subtree:
'''A sequence of all the token's syntactic descendents.
Yields: Token A descendent token such that self.is_ancestor(descendent)
'''
def __get__(self):
for word in self.lefts:
yield from word.subtree
@ -254,14 +293,26 @@ cdef class Token:
yield from word.subtree
property left_edge:
    '''The leftmost token of this token's syntactic descendents.

    Returns: Token The token at index self.c.l_edge of the parent document.
    '''
    def __get__(self):
        leftmost_index = self.c.l_edge
        return self.doc[leftmost_index]
property right_edge:
    '''The rightmost token of this token's syntactic descendents.

    Returns: Token The token at index self.c.r_edge of the parent document.
    '''
    def __get__(self):
        rightmost_index = self.c.r_edge
        return self.doc[rightmost_index]
property ancestors:
'''A sequence of this token's syntactic ancestors.
Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
'''
def __get__(self):
cdef const TokenC* head_ptr = self.c
# guard against infinite loop, no token can have
@ -273,9 +324,27 @@ cdef class Token:
i += 1
def is_ancestor_of(self, descendant):
    '''Backward-compatible alias that forwards to is_ancestor().

    Arguments:
        descendant (Token): Another token.
    Returns:
        The result of self.is_ancestor(descendant).
    '''
    # TODO: Remove after backward compatibility check.
    result = self.is_ancestor(descendant)
    return result
def is_ancestor(self, descendant):
    '''Check whether this token is a parent, grandparent, etc. of another
    in the dependency tree.

    Arguments:
        descendant (Token): Another token.
    Returns:
        is_ancestor (bool): Whether this token is the ancestor of the
            descendant. Always False when the two tokens belong to
            different documents.
    '''
    # Tokens from different documents can never be related. The original
    # compared against an undefined name `other`, raising NameError.
    if self.doc is not descendant.doc:
        return False
    # Ancestors are matched by their position in the shared document.
    return any(ancestor.i == self.i for ancestor in descendant.ancestors)
property head:
'''The syntactic parent, or "governor", of this token.
Returns: Token
'''
def __get__(self):
"""The token predicted by the parser to be the head of the current token."""
return self.doc[self.i + self.c.head]
@ -370,6 +439,10 @@ cdef class Token:
self.c.head = rel_newhead_i
property conjuncts:
'''A sequence of coordinated tokens, including the token itself.
Yields: Token A coordinated token
'''
def __get__(self):
"""Get a list of conjoined words."""
cdef Token word

View File

@ -52,6 +52,25 @@ cdef class Vocab:
@classmethod
def load(cls, path, lex_attr_getters=None, lemmatizer=True,
tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
"""
Load the vocabulary from a path.
Arguments:
path (Path):
The path to load from.
lex_attr_getters (dict):
A dictionary mapping attribute IDs to functions to compute them.
Defaults to None.
lemmatizer (object):
A lemmatizer. Defaults to True.
tag_map (dict):
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
and optionally morphological attributes.
oov_prob (float):
The default probability for out-of-vocabulary words.
Returns:
Vocab: The newly constructed vocab object.
"""
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
if 'vectors' in deprecated_kwargs:
raise AttributeError(
@ -82,6 +101,22 @@ cdef class Vocab:
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
serializer_freqs=None, **deprecated_kwargs):
'''Create the vocabulary.
lex_attr_getters (dict):
A dictionary mapping attribute IDs to functions to compute them.
Defaults to None.
lemmatizer (object):
A lemmatizer. Defaults to None.
tag_map (dict):
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
and optionally morphological attributes.
oov_prob (float):
The default probability for out-of-vocabulary words.
Returns:
Vocab: The newly constructed vocab object.
'''
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@ -134,6 +169,9 @@ cdef class Vocab:
'''
Set vectors_length to a new size, and allocate more memory for the Lexeme
vectors if necessary. The memory will be zeroed.
Arguments:
new_size (int): The new size of the vectors.
'''
cdef hash_t key
cdef size_t addr
@ -145,10 +183,13 @@ cdef class Vocab:
self.vectors_length = new_size
def add_flag(self, flag_getter, int flag_id=-1):
'''Set a new boolean flag to words in the vocabulary. The flag_setter
function will be called over the words currently in the vocab, and then
applied to new words as they occur. You'll then be able to access the
flag value on each token, using token.check_flag(flag_id). See also:
'''Set a new boolean flag to words in the vocabulary.
The flag_getter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able
to access the flag value on each token, using token.check_flag(flag_id).
See also:
Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.
Arguments:
@ -246,11 +287,23 @@ cdef class Vocab:
self.length += 1
def __contains__(self, unicode string):
    '''Test whether the string has an entry in the vocabulary.

    Arguments:
        string (unicode): The ID string.
    Returns:
        bool Whether the string has an entry in the vocabulary.
    '''
    # Entries are looked up by the hash of the string.
    key = hash_string(string)
    lex = self._by_hash.get(key)
    if lex is NULL:
        return False
    return True
def __iter__(self):
'''Iterate over the lexemes in the vocabulary.
Yields: Lexeme An entry in the vocabulary.
'''
cdef attr_t orth
cdef size_t addr
for orth, addr in self._by_orth.items():
@ -260,16 +313,15 @@ cdef class Vocab:
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
unseen unicode string is given, a new lexeme is created and stored.
Args:
Arguments:
id_or_string (int or unicode):
The integer ID of a word, or its unicode string. If an int >= Lexicon.size,
IndexError is raised. If id_or_string is neither an int nor a unicode string,
ValueError is raised.
The integer ID of a word, or its unicode string.
If an int >= Lexicon.size, IndexError is raised. If id_or_string
is neither an int nor a unicode string, ValueError is raised.
Returns:
lexeme (Lexeme):
An instance of the Lexeme Python class, with data copied on
instantiation.
lexeme (Lexeme): The lexeme indicated by the given ID.
'''
cdef attr_t orth
if type(id_or_string) == unicode:
@ -295,6 +347,11 @@ cdef class Vocab:
return tokens
def dump(self, loc):
"""Save the lexemes binary data to the given location.
Arguments:
loc (Path): The path to save to.
"""
if hasattr(loc, 'as_posix'):
loc = loc.as_posix()
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
@ -323,6 +380,14 @@ cdef class Vocab:
fp.close()
def load_lexemes(self, loc):
'''Load the binary vocabulary data from the given location.
Arguments:
loc (Path): The path to load from.
Returns:
None
'''
fp = CFile(loc, 'rb',
on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
cdef LexemeC* lexeme
@ -363,6 +428,13 @@ cdef class Vocab:
fp.close()
def dump_vectors(self, out_loc):
'''Save the word vectors to a binary file.
Arguments:
out_loc (Path): The path to save to.
Returns:
None
'''
cdef int32_t vec_len = self.vectors_length
cdef int32_t word_len
cdef bytes word_str
@ -384,6 +456,17 @@ cdef class Vocab:
out_file.close()
def load_vectors(self, file_):
"""Load vectors from a text-based file.
Arguments:
file_ (buffer): The file to read from. Entries should be separated by newlines,
and each entry should be whitespace delimited. The first value of the entry
should be the word string, and subsequent entries should be the values of the
vector.
Returns:
vec_len (int): The length of the vectors loaded.
"""
cdef LexemeC* lexeme
cdef attr_t orth
cdef int32_t vec_len = -1
@ -409,6 +492,14 @@ cdef class Vocab:
return vec_len
def load_vectors_from_bin_loc(self, loc):
"""Load vectors from the location of a binary file.
Arguments:
loc (unicode): The path of the binary file to load from.
Returns:
vec_len (int): The length of the vectors loaded.
"""
cdef CFile file_ = CFile(loc, b'rb')
cdef int32_t word_len
cdef int32_t vec_len = 0