Fix doc strings

Matthew Honnibal 2016-11-01 12:25:36 +01:00
parent 18aab4f71e
commit b86f8af0c1
12 changed files with 592 additions and 39 deletions

View File

@@ -212,6 +212,7 @@ def _consume_ent(tags):
cdef class GoldParse:
"""Collection for training annotations."""
@classmethod
def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
_, words, tags, heads, deps, entities = annot_tuples
@@ -220,6 +221,25 @@ cdef class GoldParse:
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
deps=None, entities=None, make_projective=False):
"""Create a GoldParse.
Arguments:
doc (Doc):
The document the annotations refer to.
words:
A sequence of unicode word strings.
tags:
A sequence of strings, representing tag annotations.
heads:
A sequence of integers, representing syntactic head offsets.
deps:
A sequence of strings, representing the syntactic relation types.
entities:
A sequence of named entity annotations, either as BILUO tag strings,
or as (start_char, end_char, label) tuples, representing the entity
positions.
Returns (GoldParse): The newly constructed object.
"""
if words is None:
words = [token.text for token in doc]
if tags is None:
@@ -280,10 +300,16 @@ cdef class GoldParse:
self.heads = proj_heads
def __len__(self):
"""Get the number of gold-standard tokens.
Returns (int): The number of gold-standard tokens.
"""
return self.length
@property
def is_projective(self):
"""Whether the provided syntactic annotations form a projective dependency
tree."""
return not nonproj.is_nonproj_tree(self.heads)
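A minimal usage sketch of the GoldParse constructor documented above (not part of this commit; the tag, head and dependency values are illustrative and assume a loaded English pipeline):

from spacy.en import English
from spacy.gold import GoldParse

nlp = English()
doc = nlp.tokenizer(u'I like London')
gold = GoldParse(doc,
                 tags=['PRP', 'VBP', 'NNP'],
                 heads=[1, 1, 1],
                 deps=['nsubj', 'ROOT', 'dobj'],
                 entities=['O', 'O', 'U-GPE'])
print(len(gold), gold.is_projective)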

View File

@@ -293,13 +293,14 @@ class Language(object):
text (unicode): The text to be processed.
Returns:
doc (Doc): A container for accessing the annotations.
Example:
>>> from spacy.en import English
>>> nlp = English()
>>> tokens = nlp('An example sentence. Another example sentence.')
>>> tokens[0].orth_, tokens[0].head.tag_
('An', 'NN')
"""
doc = self.make_doc(text)
if self.entity and entity:
@@ -314,6 +315,16 @@ class Language(object):
return doc
def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
'''Process texts as a stream, and yield Doc objects in order.
Supports GIL-free multi-threading.
Arguments:
texts (iterator)
tag (bool)
parse (bool)
entity (bool)
'''
skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
stream = (self.make_doc(text) for text in texts)
for proc in self.pipeline:

View File

@@ -36,6 +36,13 @@ cdef class Lexeme:
tag).
"""
def __init__(self, Vocab vocab, int orth):
"""Create a Lexeme object.
Arguments:
vocab (Vocab): The parent vocabulary
orth (int): The orth id of the lexeme.
Returns (Lexeme): The newly constructed object.
"""
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
@@ -73,12 +80,33 @@ cdef class Lexeme:
return self.c.orth
def set_flag(self, attr_id_t flag_id, bint value):
"""Change the value of a boolean flag.
Arguments:
flag_id (int): The attribute ID of the flag to set.
value (bool): The new value of the flag.
"""
Lexeme.c_set_flag(self.c, flag_id, value)
def check_flag(self, attr_id_t flag_id):
"""Check the value of a boolean flag.
Arguments:
flag_id (int): The attribute ID of the flag to query.
Returns (bool): The value of the flag.
"""
return True if Lexeme.c_check_flag(self.c, flag_id) else False
def similarity(self, other):
'''Compute a semantic similarity estimate. Defaults to cosine over vectors.
Arguments:
other:
The object to compare with. By default, accepts Doc, Span,
Token and Lexeme objects.
Returns:
score (float): A scalar similarity score. Higher is more similar.
'''
if self.vector_norm == 0 or other.vector_norm == 0:
return 0.0
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
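An illustrative sketch of check_flag, set_flag and similarity (not part of the commit; the similarity call assumes a model with word vectors is installed):

from spacy.en import English
from spacy.attrs import IS_TITLE, IS_STOP

nlp = English()
apple = nlp.vocab[u'apple']          # a Lexeme
print(apple.check_flag(IS_TITLE))    # False: "apple" is not titlecased
apple.set_flag(IS_STOP, True)        # treat "apple" as a stop word from now on
print(apple.is_stop)
print(apple.similarity(nlp.vocab[u'orange']))   # cosine over word vectors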

View File

@@ -165,6 +165,7 @@ def _convert_strings(token_specs, string_store):
cdef class Matcher:
'''Match sequences of tokens, based on pattern rules.'''
cdef Pool mem
cdef vector[TokenPatternC*] patterns
cdef readonly Vocab vocab
@@ -175,6 +176,16 @@ cdef class Matcher:
@classmethod
def load(cls, path, vocab):
'''Load the matcher and patterns from a file path.
Arguments:
path (Path):
Path to a JSON-formatted patterns file.
vocab (Vocab):
The vocabulary that the documents to match over will refer to.
Returns:
Matcher: The newly constructed object.
'''
if (path / 'gazetteer.json').exists():
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
patterns = json.load(file_)
@@ -183,6 +194,16 @@ cdef class Matcher:
return cls(vocab, patterns)
def __init__(self, vocab, patterns={}):
"""Create the Matcher.
Arguments:
vocab (Vocab):
The vocabulary object, which must be shared with the documents
the matcher will operate on.
patterns (dict): Patterns to add to the matcher.
Returns:
The newly constructed object.
"""
self._patterns = {}
self._entities = {}
self._acceptors = {}
@@ -203,6 +224,22 @@ cdef class Matcher:
def add_entity(self, entity_key, attrs=None, if_exists='raise',
acceptor=None, on_match=None):
"""Add an entity to the matcher.
Arguments:
entity_key (unicode or int):
An ID for the entity.
attrs:
Attributes to associate with the Matcher.
if_exists ('raise', 'ignore' or 'update'):
Controls what happens if the entity ID already exists. Defaults to 'raise'.
acceptor:
Callback function to filter matches of the entity.
on_match:
Callback function to act on matches of the entity.
Returns:
None
"""
if if_exists not in ('raise', 'ignore', 'update'):
raise ValueError(
"Unexpected value for if_exists: %s.\n"
@@ -224,6 +261,18 @@ cdef class Matcher:
self._callbacks[entity_key] = on_match
def add_pattern(self, entity_key, token_specs, label=""):
"""Add a pattern to the matcher.
Arguments:
entity_key (unicode or int):
An ID for the entity.
token_specs:
Description of the pattern to be matched.
label:
Label to assign to the matched pattern. Defaults to "".
Returns:
None
"""
entity_key = self.normalize_entity_key(entity_key)
if not self.has_entity(entity_key):
self.add_entity(entity_key)
@@ -249,10 +298,24 @@ cdef class Matcher:
return entity_key
def has_entity(self, entity_key):
"""Check whether the matcher has an entity.
Arguments:
entity_key (string or int): The entity key to check.
Returns:
bool: Whether the matcher has the entity.
"""
entity_key = self.normalize_entity_key(entity_key)
return entity_key in self._entities
def get_entity(self, entity_key):
"""Retrieve the attributes stored for an entity.
Arguments:
entity_key (unicode or int): The entity to retrieve.
Returns:
The entity attributes if present, otherwise None.
"""
entity_key = self.normalize_entity_key(entity_key)
if entity_key in self._entities:
return self._entities[entity_key]
@@ -260,6 +323,17 @@ cdef class Matcher:
return None
def __call__(self, Doc doc, acceptor=None):
"""Find all token sequences matching the supplied patterns on the Doc.
Arguments:
doc (Doc):
The document to match over.
Returns:
list
A list of (entity_key, label_id, start, end) tuples,
describing the matches. A match tuple describes a span doc[start:end].
The label_id and entity_key are both integers.
"""
if acceptor is not None:
raise ValueError(
"acceptor keyword argument to Matcher deprecated. Specify acceptor "
@@ -340,6 +414,18 @@ cdef class Matcher:
return matches
def pipe(self, docs, batch_size=1000, n_threads=2):
"""Match a stream of documents, yielding them in turn.
Arguments:
docs: A stream of documents.
batch_size (int):
The number of documents to accumulate into a working set.
n_threads (int):
The number of threads with which to work on the buffer in parallel,
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
"""
for doc in docs:
self(doc)
yield doc
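A sketch of the entity/pattern workflow documented above (illustrative only; the entity key, label and token specs are made up):

from spacy.en import English
from spacy.matcher import Matcher
from spacy.attrs import ORTH

nlp = English()
matcher = Matcher(nlp.vocab)
matcher.add_entity(u'GoogleNow', if_exists='ignore')
matcher.add_pattern(u'GoogleNow', [{ORTH: u'Google'}, {ORTH: u'Now'}], label=u'PRODUCT')
doc = nlp(u'I like Google Now best.')
for entity_key, label_id, start, end in matcher(doc):
    print(doc[start:end].text)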

View File

@@ -11,6 +11,7 @@ from .attrs import DEP, ENT_TYPE
cdef class EntityRecognizer(Parser):
"""Annotate named entities on Doc objects."""
TransitionSystem = BiluoPushDown
feature_templates = get_feature_templates('ner')

View File

@@ -73,6 +73,11 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, int length) except
cdef class StringStore:
'''Map strings to and from integer IDs.'''
def __init__(self, strings=None, freeze=False):
'''Create the StringStore.
Arguments:
strings: A sequence of unicode strings to add to the store.
'''
self.mem = Pool()
self._map = PreshMap()
self._oov = PreshMap()
@@ -89,9 +94,22 @@ cdef class StringStore:
return self.size -1
def __len__(self):
"""The number of strings in the store.
Returns:
int The number of strings in the store.
"""
return self.size-1
def __getitem__(self, object string_or_id):
"""Retrieve a string from a given integer ID, or vice versa.
Arguments:
string_or_id (bytes or unicode or int):
The value to encode.
Returns:
unicode or int: The retrieved value.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
return 0
elif string_or_id == 0:
@@ -127,12 +145,23 @@ cdef class StringStore:
return utf8str - self.c
def __contains__(self, unicode string not None):
"""Check whether a string is in the store.
Arguments:
string (unicode): The string to check.
Returns bool:
Whether the store contains the string.
"""
if len(string) == 0:
return True
cdef hash_t key = hash_string(string)
return self._map.get(key) is not NULL
def __iter__(self):
"""Iterate over the strings in the store, in order.
Yields: unicode A string in the store.
"""
cdef int i
for i in range(self.size):
yield _decode(&self.c[i]) if i > 0 else u''
@@ -185,6 +214,13 @@ cdef class StringStore:
return &self.c[self.size-1]
def dump(self, file_):
"""Save the strings to a JSON file.
Arguments:
file_ (buffer): The file to save the strings.
Returns:
None
"""
string_data = json.dumps(list(self))
if not isinstance(string_data, unicode):
string_data = string_data.decode('utf8')
@@ -192,6 +228,13 @@ cdef class StringStore:
file_.write(string_data)
def load(self, file_):
"""Load the strings from a JSON file.
Arguments:
file_ (buffer): The file from which to load the strings.
Returns:
None
"""
strings = json.load(file_)
if strings == ['']:
return None

View File

@@ -74,8 +74,21 @@ cdef class ParserModel(AveragedPerceptron):
cdef class Parser:
"""Base class of the DependencyParser and EntityRecognizer."""
@classmethod
def load(cls, path, Vocab vocab, TransitionSystem=None, require=False):
"""Load the statistical model from the supplied path.
Arguments:
path (Path):
The path to load from.
vocab (Vocab):
The vocabulary. Must be shared by the documents to be processed.
require (bool):
Whether to raise an error if the files are not found.
Returns (Parser):
The newly constructed object.
"""
with (path / 'config.json').open() as file_:
cfg = json.load(file_)
# TODO: remove this shim when we don't have to support older data
@@ -90,6 +103,16 @@ cdef class Parser:
return self
def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
"""Create a Parser.
Arguments:
vocab (Vocab):
The vocabulary object. Must be shared with documents to be processed.
model (thinc.linear.AveragedPerceptron):
The statistical model.
Returns (Parser):
The newly constructed object.
"""
if TransitionSystem is None:
TransitionSystem = self.TransitionSystem
self.vocab = vocab
@@ -107,6 +130,13 @@ cdef class Parser:
return (Parser, (self.vocab, self.moves, self.model), None, None)
def __call__(self, Doc tokens):
"""Apply the parser or entity recognizer, setting the annotations onto the Doc object.
Arguments:
doc (Doc): The document to be processed.
Returns:
None
"""
cdef int nr_feat = self.model.nr_feat
with nogil:
status = self.parseC(tokens.c, tokens.length, nr_feat)
@@ -117,6 +147,16 @@ cdef class Parser:
self.moves.finalize_doc(tokens)
def pipe(self, stream, int batch_size=1000, int n_threads=2):
"""Process a stream of documents.
Arguments:
stream: The sequence of documents to process.
batch_size (int):
The number of documents to accumulate into a working set.
n_threads (int):
The number of threads with which to work on the buffer in parallel.
Yields (Doc): Documents, in order.
"""
cdef Pool mem = Pool()
cdef TokenC** doc_ptr = <TokenC**>mem.alloc(batch_size, sizeof(TokenC*))
cdef int* lengths = <int*>mem.alloc(batch_size, sizeof(int))
@@ -194,6 +234,16 @@ cdef class Parser:
return 0
def update(self, Doc tokens, GoldParse gold):
"""Update the statistical model.
Arguments:
doc (Doc):
The example document for the update.
gold (GoldParse):
The gold-standard annotations, to calculate the loss.
Returns (float):
The loss on this example.
"""
self.moves.preprocess_gold(gold)
cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
self.moves.initialize_state(stcls.c)
@@ -220,9 +270,24 @@ cdef class Parser:
return loss
def step_through(self, Doc doc):
"""Set up a stepwise state, to introspect and control the transition sequence.
Arguments:
doc (Doc): The document to step through.
Returns (StepwiseState):
A state object, to step through the annotation process.
"""
return StepwiseState(self, doc)
def from_transition_sequence(self, Doc doc, sequence):
"""Control the annotations on a document by specifying a transition sequence
to follow.
Arguments:
doc (Doc): The document to annotate.
sequence: A sequence of action names, as unicode strings.
Returns: None
"""
with self.step_through(doc) as stepwise:
for transition in sequence:
stepwise.transition(transition)
@@ -233,7 +298,6 @@ cdef class Parser:
self.moves.add_action(action, label)
cdef class StepwiseState:
cdef readonly StateClass stcls
cdef readonly Example eg
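A rough sketch of a single training update with the API documented above (illustrative; the gold heads and labels are made up, and a loaded English model is assumed):

from spacy.en import English
from spacy.gold import GoldParse

nlp = English()
doc = nlp.tokenizer(u'I like London')
nlp.tagger(doc)                     # the parser's features use the POS tags
gold = GoldParse(doc, heads=[1, 1, 1], deps=['nsubj', 'ROOT', 'dobj'])
loss = nlp.parser.update(doc, gold)
print(loss)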

View File

@@ -102,9 +102,21 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
cdef class Tagger:
"""Annotate part-of-speech tags on Doc objects."""
@classmethod
def load(cls, path, vocab, require=False):
"""Load the statistical model from the supplied path.
Arguments:
path (Path):
The path to load from.
vocab (Vocab):
The vocabulary. Must be shared by the documents to be processed.
require (bool):
Whether to raise an error if the files are not found.
Returns (Tagger):
The newly created object.
"""
# TODO: Change this to expect config.json when we don't have to
# support old data.
path = path if not isinstance(path, basestring) else pathlib.Path(path)
@@ -126,6 +138,16 @@ cdef class Tagger:
return self
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
"""Create a Tagger.
Arguments:
vocab (Vocab):
The vocabulary object. Must be shared with documents to be processed.
model (thinc.linear.AveragedPerceptron):
The statistical model.
Returns (Tagger):
The newly constructed object.
"""
if model is None:
model = TaggerModel(cfg.get('features', self.feature_templates))
self.vocab = vocab
@@ -154,8 +176,10 @@ cdef class Tagger:
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
Arguments:
doc (Doc): The tokens to be tagged.
Returns:
None
"""
if tokens.length == 0:
return 0
@@ -178,11 +202,33 @@ cdef class Tagger:
tokens._py_tokens = [None] * tokens.length
def pipe(self, stream, batch_size=1000, n_threads=2):
"""Tag a stream of documents.
Arguments:
stream: The sequence of documents to tag.
batch_size (int):
The number of documents to accumulate into a working set.
n_threads (int):
The number of threads with which to work on the buffer in parallel,
if the Matcher implementation supports multi-threading.
Yields:
Doc Documents, in order.
"""
for doc in stream:
self(doc)
yield doc
def update(self, Doc tokens, GoldParse gold):
"""Update the statistical model, with tags supplied for the given document.
Arguments:
doc (Doc):
The document to update on.
gold (GoldParse):
Manager for the gold-standard tags.
Returns (int):
Number of tags correct.
"""
gold_tag_strs = gold.tags
assert len(tokens) == len(gold_tag_strs)
for tag in gold_tag_strs:
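An illustrative sketch of tagging and a single training update (not part of the commit; the gold tags are made up):

from spacy.en import English
from spacy.gold import GoldParse

nlp = English()
doc = nlp.tokenizer(u'A simple example')
nlp.tagger(doc)                                  # sets token.tag_ in place
print([(w.text, w.tag_) for w in doc])

doc2 = nlp.tokenizer(u'A simple example')
gold = GoldParse(doc2, tags=['DT', 'JJ', 'NN'])
n_correct = nlp.tagger.update(doc2, gold)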

View File

@@ -219,6 +219,16 @@ cdef class Doc:
return self.__str__()
def similarity(self, other):
'''Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
Arguments:
other (object): The object to compare with. By default, accepts Doc,
Span, Token and Lexeme objects.
Return:
score (float): A scalar similarity score. Higher is more similar.
'''
if 'similarity' in self.user_hooks:
return self.user_hooks['similarity'](self, other)
if self.vector_norm == 0 or other.vector_norm == 0:
@@ -226,6 +236,9 @@ cdef class Doc:
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
property has_vector:
'''
A boolean value indicating whether a word vector is associated with the object.
'''
def __get__(self):
if 'has_vector' in self.user_hooks:
return self.user_hooks['has_vector'](self)
@@ -233,6 +246,11 @@ cdef class Doc:
return any(token.has_vector for token in self)
property vector:
'''
A real-valued meaning representation. Defaults to an average of the token vectors.
Type: numpy.ndarray[ndim=1, dtype='float32']
'''
def __get__(self):
if 'vector' in self.user_hooks:
return self.user_hooks['vector'](self)
@@ -265,14 +283,16 @@ cdef class Doc:
@property
def string(self):
return self.text
property text:
'''A unicode representation of the document text.'''
def __get__(self):
return u''.join(t.text_with_ws for t in self)
property text_with_ws:
'''An alias of Doc.text, provided for duck-type compatibility with Span and Token.'''
def __get__(self):
return self.text
property ents:
'''
@@ -567,7 +587,6 @@ cdef class Doc:
set_children_from_heads(self.c, self.length)
self.is_parsed = bool(HEAD in attrs or DEP in attrs)
self.is_tagged = bool(TAG in attrs or POS in attrs)
return self return self
def to_bytes(self):
@@ -612,7 +631,22 @@ cdef class Doc:
yield n_bytes_str + data
def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at doc.text[start_idx : end_idx]
is merged into a single token. If start_idx and end_idx do not mark start
and end token boundaries, the document remains unchanged.
Arguments:
start_idx (int): The character index of the start of the slice to merge.
end_idx (int): The character index after the end of the slice to merge.
**attributes:
Attributes to assign to the merged token. By default, attributes
are inherited from the syntactic root token of the span.
Returns:
token (Token):
The newly merged token, or None if the start and end indices did
not fall at token boundaries.
"""
cdef unicode tag, lemma, ent_type
if len(args) == 3:
# TODO: Warn deprecation
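A sketch of merge() with character offsets as documented above (illustrative; it uses the older positional tag/lemma/ent_type form that the shim above still accepts):

from spacy.en import English

nlp = English()
doc = nlp(u'I flew to New York City.')
start = doc.text.index(u'New York City')
end = start + len(u'New York City')
doc.merge(start, end, u'NNP', u'New York City', u'GPE')   # tag, lemma, ent_type
print([w.text for w in doc])        # 'New York City' is now a single token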

View File

@@ -18,12 +18,23 @@ from ..lexeme cimport Lexeme
cdef class Span:
"""A slice from a Doc object."""
def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
vector_norm=None):
'''Create a Span object from the slice doc[start : end]
Arguments:
doc (Doc): The parent document.
start (int): The index of the first token of the span.
end (int): The index of the first token after the span.
label (int): A label to attach to the Span, e.g. for named entities.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
Returns:
Span The newly constructed object.
'''
if not (0 <= start <= end <= len(doc)):
raise IndexError
self.doc = doc
self.start = start
self.start_char = self.doc[start].idx if start < self.doc.length else 0
self.end = end
@@ -78,9 +89,29 @@ cdef class Span:
yield self.doc[i]
def merge(self, *args, **attributes):
"""Retokenize the document, such that the span is merged into a single token.
Arguments:
**attributes:
Attributes to assign to the merged token. By default, attributes
are inherited from the syntactic root token of the span.
Returns:
token (Token):
The newly merged token.
"""
self.doc.merge(self.start_char, self.end_char, *args, **attributes)
def similarity(self, other):
'''Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.
Arguments:
other (object): The object to compare with. By default, accepts Doc,
Span, Token and Lexeme objects.
Return:
score (float): A scalar similarity score. Higher is more similar.
'''
if 'similarity' in self.doc.user_span_hooks:
self.doc.user_span_hooks['similarity'](self, other)
if self.vector_norm == 0.0 or other.vector_norm == 0.0:
@@ -102,7 +133,11 @@ cdef class Span:
self.end = end + 1
property sent:
'''The sentence span that this span is a part of.
Returns:
Span The sentence this is part of.
'''
def __get__(self):
if 'sent' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['sent'](self)
@@ -156,7 +191,12 @@ cdef class Span:
return u''.join([t.text_with_ws for t in self])
property root:
"""The token within the span that's highest in the parse tree, i.e. has the
shortest path to the root of the sentence (or is the root itself).
If multiple tokens are equally high in the tree, the first token is taken.
Returns:
Token: The root token.
@@ -231,7 +271,10 @@ cdef class Span:
return self.doc[root]
property lefts:
"""Tokens that are to the left of the span, whose head is within the Span.
Yields: Token A left-child of a token of the span.
"""
def __get__(self):
for token in reversed(self): # Reverse, so we get the tokens in order
for left in token.lefts:
@@ -239,7 +282,10 @@ cdef class Span:
yield left
property rights:
"""Tokens that are to the right of the Span, whose head is within the Span.
Yields: Token A right-child of a token of the span.
"""
def __get__(self):
for token in self:
for right in token.rights:
@@ -247,6 +293,10 @@ cdef class Span:
yield right
property subtree:
"""Tokens that descend from tokens in the span, but fall outside it.
Yields: Token A descendant of a token within the span.
"""
def __get__(self):
for word in self.lefts:
yield from word.subtree
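An illustrative sketch of the Span helpers documented above (not part of the commit; the root token depends on the parse the model produces, and merge uses the older positional form):

from spacy.en import English

nlp = English()
doc = nlp(u'I like New York in Autumn.')
span = doc[2:4]                                   # "New York"
print(span.text, span.root.text)                  # root is typically "York"
span.merge(u'NNP', u'New York', u'GPE')            # tag, lemma, ent_type
print([w.text for w in doc])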

View File

@@ -30,8 +30,7 @@ from ..lexeme cimport Lexeme
cdef class Token:
"""An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
"""
def __cinit__(self, Vocab vocab, Doc doc, int offset):
self.vocab = vocab
@@ -40,6 +39,7 @@ cdef class Token:
self.i = offset
def __len__(self):
'''Number of unicode characters in token.text'''
return self.c.lex.length
def __unicode__(self):
@@ -57,12 +57,35 @@ cdef class Token:
return self.__str__()
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
'''Check the value of a boolean flag.
Arguments:
flag_id (int): The ID of the flag attribute.
Returns:
is_set (bool): Whether the flag is set.
'''
return Lexeme.c_check_flag(self.c.lex, flag_id)
def nbor(self, int i=1):
'''Get a neighboring token.
Arguments:
i (int): The relative position of the token to get. Defaults to 1.
Returns:
neighbor (Token): The token at position self.doc[self.i+i]
'''
return self.doc[self.i+i]
def similarity(self, other):
'''Compute a semantic similarity estimate. Defaults to cosine over vectors.
Arguments:
other:
The object to compare with. By default, accepts Doc, Span,
Token and Lexeme objects.
Returns:
score (float): A scalar similarity score. Higher is more similar.
'''
if 'similarity' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['similarity'](self)
if self.vector_norm == 0 or other.vector_norm == 0:
@@ -158,6 +181,9 @@ cdef class Token:
self.c.dep = label
property has_vector:
'''
A boolean value indicating whether a word vector is associated with the object.
'''
def __get__(self):
if 'has_vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['has_vector'](self)
@@ -169,6 +195,11 @@ cdef class Token:
return False
property vector:
'''
A real-valued meaning representation.
Type: numpy.ndarray[ndim=1, dtype='float32']
'''
def __get__(self):
if 'vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector'](self)
@@ -241,11 +272,19 @@ cdef class Token:
yield t
property children:
'''A sequence of the token's immediate syntactic children.
Yields: Token A child token such that child.head==self
'''
def __get__(self):
yield from self.lefts
yield from self.rights
property subtree:
'''A sequence of all the token's syntactic descendants.
Yields: Token A descendant token such that self.is_ancestor(descendant)
'''
def __get__(self):
for word in self.lefts:
yield from word.subtree
@@ -254,14 +293,26 @@ cdef class Token:
yield from word.subtree
property left_edge:
'''The leftmost token of this token's syntactic descendants.
Returns: Token The first token such that self.is_ancestor(token)
'''
def __get__(self):
return self.doc[self.c.l_edge]
property right_edge:
'''The rightmost token of this token's syntactic descendants.
Returns: Token The last token such that self.is_ancestor(token)
'''
def __get__(self):
return self.doc[self.c.r_edge]
property ancestors:
'''A sequence of this token's syntactic ancestors.
Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
'''
def __get__(self):
cdef const TokenC* head_ptr = self.c
# guard against infinite loop, no token can have
@@ -273,9 +324,27 @@ cdef class Token:
i += 1
def is_ancestor_of(self, descendant):
# TODO: Remove after backward compatibility check.
return self.is_ancestor(descendant)
def is_ancestor(self, descendant):
'''Check whether this token is a parent, grandparent, etc. of another
in the dependency tree.
Arguments:
descendant (Token): Another token.
Returns:
is_ancestor (bool): Whether this token is the ancestor of the descendant.
'''
if self.doc is not descendant.doc:
return False
return any( ancestor.i == self.i for ancestor in descendant.ancestors )
property head:
'''The syntactic parent, or "governor", of this token.
Returns: Token
'''
def __get__(self):
"""The token predicted by the parser to be the head of the current token."""
return self.doc[self.i + self.c.head]
@@ -370,6 +439,10 @@ cdef class Token:
self.c.head = rel_newhead_i
property conjuncts:
'''A sequence of coordinated tokens, including the token itself.
Yields: Token A coordinated token
'''
def __get__(self):
"""Get a list of conjoined words."""
cdef Token word
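A sketch of the navigation helpers documented above (illustrative; the exact tree shape depends on the parser):

from spacy.en import English

nlp = English()
doc = nlp(u'Give it back, he pleaded.')
give, it = doc[0], doc[1]
print(give.nbor(1).text)                  # "it"
print(give.is_ancestor(it))               # True if "it" attaches under "give"
print([w.text for w in give.subtree])
print([w.text for w in it.ancestors])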

View File

@@ -52,6 +52,25 @@ cdef class Vocab:
@classmethod
def load(cls, path, lex_attr_getters=None, lemmatizer=True,
tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
"""
Load the vocabulary from a path.
Arguments:
path (Path):
The path to load from.
lex_attr_getters (dict):
A dictionary mapping attribute IDs to functions to compute them.
Defaults to None.
lemmatizer (object):
A lemmatizer. Defaults to None.
tag_map (dict):
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
and optionally morphological attributes.
oov_prob (float):
The default probability for out-of-vocabulary words.
Returns:
Vocab: The newly constructed vocab object.
"""
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
if 'vectors' in deprecated_kwargs:
raise AttributeError(
@@ -82,6 +101,22 @@ cdef class Vocab:
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
serializer_freqs=None, **deprecated_kwargs):
'''Create the vocabulary.
lex_attr_getters (dict):
A dictionary mapping attribute IDs to functions to compute them.
Defaults to None.
lemmatizer (object):
A lemmatizer. Defaults to None.
tag_map (dict):
A dictionary mapping fine-grained tags to coarse-grained parts-of-speech,
and optionally morphological attributes.
oov_prob (float):
The default probability for out-of-vocabulary words.
Returns:
Vocab: The newly constructed vocab object.
'''
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
@@ -134,6 +169,9 @@ cdef class Vocab:
'''
Set vectors_length to a new size, and allocate more memory for the Lexeme
vectors if necessary. The memory will be zeroed.
Arguments:
new_size (int): The new size of the vectors.
'''
cdef hash_t key
cdef size_t addr
@@ -145,11 +183,14 @@ cdef class Vocab:
self.vectors_length = new_size
def add_flag(self, flag_getter, int flag_id=-1):
'''Set a new boolean flag to words in the vocabulary.
The flag_setter function will be called over the words currently in the
vocab, and then applied to new words as they occur. You'll then be able
to access the flag value on each token, using token.check_flag(flag_id).
See also:
Lexeme.set_flag, Lexeme.check_flag, Token.set_flag, Token.check_flag.
Arguments:
flag_getter:
@@ -246,11 +287,23 @@ cdef class Vocab:
self.length += 1
def __contains__(self, unicode string):
'''Check whether the string has an entry in the vocabulary.
Arguments:
string (unicode): The ID string.
Returns:
bool Whether the string has an entry in the vocabulary.
'''
key = hash_string(string)
lex = self._by_hash.get(key)
return True if lex is not NULL else False
def __iter__(self):
'''Iterate over the lexemes in the vocabulary.
Yields: Lexeme An entry in the vocabulary.
'''
cdef attr_t orth
cdef size_t addr
for orth, addr in self._by_orth.items():
@@ -260,16 +313,15 @@ cdef class Vocab:
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
unseen unicode string is given, a new lexeme is created and stored.
Arguments:
id_or_string (int or unicode):
The integer ID of a word, or its unicode string.
If an int >= Lexicon.size, IndexError is raised. If id_or_string
is neither an int nor a unicode string, ValueError is raised.
Returns:
lexeme (Lexeme): The lexeme indicated by the given ID.
'''
cdef attr_t orth
if type(id_or_string) == unicode:
@@ -295,6 +347,11 @@ cdef class Vocab:
return tokens
def dump(self, loc):
"""Save the lexemes binary data to the given location.
Arguments:
loc (Path): The path to save to.
"""
if hasattr(loc, 'as_posix'):
loc = loc.as_posix()
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
@@ -323,6 +380,14 @@ cdef class Vocab:
fp.close()
def load_lexemes(self, loc):
'''Load the binary vocabulary data from the given location.
Arguments:
loc (Path): The path to load from.
Returns:
None
'''
fp = CFile(loc, 'rb',
on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
cdef LexemeC* lexeme
@@ -363,6 +428,13 @@ cdef class Vocab:
fp.close()
def dump_vectors(self, out_loc):
'''Save the word vectors to a binary file.
Arguments:
out_loc (Path): The path to save to.
Returns:
None
'''
cdef int32_t vec_len = self.vectors_length
cdef int32_t word_len
cdef bytes word_str
@@ -384,6 +456,17 @@ cdef class Vocab:
out_file.close()
def load_vectors(self, file_):
"""Load vectors from a text-based file.
Arguments:
file_ (buffer): The file to read from. Entries should be separated by newlines,
and each entry should be whitespace delimited. The first value of the entry
should be the word string, and subsequent entries should be the values of the
vector.
Returns:
vec_len (int): The length of the vectors loaded.
"""
cdef LexemeC* lexeme
cdef attr_t orth
cdef int32_t vec_len = -1
@@ -409,6 +492,14 @@ cdef class Vocab:
return vec_len
def load_vectors_from_bin_loc(self, loc):
"""Load vectors from the location of a binary file.
Arguments:
loc (unicode): The path of the binary file to load from.
Returns:
vec_len (int): The length of the vectors loaded.
"""
cdef CFile file_ = CFile(loc, b'rb')
cdef int32_t word_len
cdef int32_t vec_len = 0