Improve docstrings for Doc object

Matthew Honnibal 2016-09-28 11:15:13 +02:00
parent 81a47c01d8
commit 1b520e7bab


@@ -59,10 +59,42 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
 cdef class Doc:
     """
-    Container class for annotated text. Constructed via English.__call__ or
-    Tokenizer.__call__.
+    A sequence of `Token` objects. Access sentences and named entities,
+    export annotations to numpy arrays, losslessly serialize to compressed
+    binary strings.
+
+    Aside: Internals
+        The `Doc` object holds an array of `TokenC` structs.
+        The Python-level `Token` and `Span` objects are views of this
+        array, i.e. they don't own the data themselves.
+
+    Code: Construction 1
+        doc = nlp.tokenizer(u'Some text')
+
+    Code: Construction 2
+        doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)])
     """

     def __init__(self, Vocab vocab, orths_and_spaces=None):
+        '''
+        Create a Doc object.
+
+        Aside: Implementation
+            This method of constructing a `Doc` object is usually only used
+            for deserialization. Standard usage is to construct the document
+            via a call to the language object.
+
+        Arguments:
+            vocab:
+                A Vocabulary object, which must match any models you want to
+                use (e.g. tokenizer, parser, entity recognizer).
+
+            orths_and_spaces:
+                A list of tokens in the document, as a sequence of
+                `(orth_id, has_space)` tuples, where `orth_id` is an
+                integer and `has_space` is a boolean indicating whether the
+                token has a trailing space.
+        '''
         self.vocab = vocab
         size = 20
         self.mem = Pool()
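
To make the two construction routes concrete: a minimal sketch, assuming spaCy 1.x
with the English data installed, and using the `spacy.tokens.doc` import path that
appears later in this diff.

    from spacy.en import English
    from spacy.tokens.doc import Doc

    nlp = English()

    # Construction 1: the standard route, via the tokenizer
    doc = nlp.tokenizer(u'Some text')
    assert [t.orth_ for t in doc] == [u'Some', u'text']

    # Construction 2: directly, as deserialization does
    doc2 = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)])
    assert len(doc2) == 2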
@ -102,11 +134,21 @@ cdef class Doc:
<const LexemeC*>self.vocab.get(self.mem, orth), has_space) <const LexemeC*>self.vocab.get(self.mem, orth), has_space)
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a Token or a Span from the Doc. '''
doc[i]
Returns: Get the Token object at position i, where i is an integer.
token (Token) or span (Span): Negative indexing is supported, and follows the usual Python
""" semantics, i.e. doc[-2] is doc[len(doc) - 2].
doc[start : end]]
Get a `Span` object, starting at position `start`
and ending at position `end`, where `start` and
`end` are token indices. For instance,
`doc[2:5]` produces a span consisting of
tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`)
are not supported, as `Span` objects must be contiguous (cannot have gaps).
You can use negative indices and open-ended ranges, which have their
normal Python semantics.
'''
if isinstance(i, slice): if isinstance(i, slice):
start, stop = normalize_slice(len(self), i.start, i.stop, i.step) start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
return Span(self, start, stop, label=0) return Span(self, start, stop, label=0)
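
A sketch of the indexing semantics described above. That a stepped slice raises
`ValueError` is my reading of `normalize_slice`, not something the diff states.

    doc = nlp.tokenizer(u'Give it back! He pleaded.')

    assert doc[0].orth_ == u'Give'
    assert doc[-2].orth_ == doc[len(doc) - 2].orth_   # negative indexing

    span = doc[2:5]                  # tokens 2, 3 and 4
    assert len(span) == 3

    try:
        doc[0:4:2]                   # stepped slices are rejected
    except ValueError:
        pass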
@ -120,11 +162,15 @@ cdef class Doc:
return Token.cinit(self.vocab, &self.c[i], i, self) return Token.cinit(self.vocab, &self.c[i], i, self)
def __iter__(self): def __iter__(self):
"""Iterate over the tokens. '''
for token in doc
Yields: Iterate over `Token` objects, from which the annotations can
token (Token): be easily accessed. This is the main way of accessing Token
""" objects, which are the main way annotations are accessed from
Python. If faster-than-Python speeds are required, you can
instead access the annotations as a numpy array, or access the
underlying C data directly from Cython.
'''
cdef int i cdef int i
for i in range(self.length): for i in range(self.length):
if self._py_tokens[i] is not None: if self._py_tokens[i] is not None:
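
The contrast the `__iter__` docstring draws, token-by-token iteration versus a
bulk numpy export, in sketch form:

    from spacy import attrs

    doc = nlp.tokenizer(u'Hello, world.')

    # Python-level access: one Token object per position
    orths = [token.orth_ for token in doc]
    assert len(orths) == len(doc)

    # Bulk access: a single numpy array, no per-token Python objects
    orth_ids = doc.to_array([attrs.ORTH])
    assert orth_ids.shape == (len(doc), 1)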
@@ -133,6 +179,10 @@ cdef class Doc:
                 yield Token.cinit(self.vocab, &self.c[i], i, self)

     def __len__(self):
+        '''
+        len(doc)
+            The number of tokens in the document.
+        '''
         return self.length

     def __unicode__(self):
@@ -161,7 +211,10 @@ cdef class Doc:
     property vector:
         def __get__(self):
             if self._vector is None:
-                self._vector = sum(t.vector for t in self) / len(self)
+                if len(self):
+                    self._vector = sum(t.vector for t in self) / len(self)
+                else:
+                    return numpy.zeros((self.vocab.vectors_length,), dtype='float32')
             return self._vector

         def __set__(self, value):
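
The guard added above avoids a division by zero for zero-length documents. A
sketch of the resulting behaviour, assuming a vocabulary with vectors loaded:

    import numpy
    from spacy.tokens.doc import Doc

    doc = nlp(u'apples and oranges')
    # doc.vector averages the token vectors
    avg = sum(t.vector for t in doc) / len(doc)
    assert numpy.allclose(doc.vector, avg)

    # An empty Doc now yields a zero vector instead of raising
    empty = Doc(nlp.vocab, orths_and_spaces=[])
    assert empty.vector.shape == (nlp.vocab.vectors_length,)
    assert not empty.vector.any()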
@@ -193,18 +246,22 @@ cdef class Doc:
         return u''.join(t.text_with_ws for t in self)

     property ents:
+        '''
+        Yields named-entity `Span` objects, if the entity recognizer
+        has been applied to the document. Iterate over the span to get
+        individual Token objects, or access the label:
+
+        Example:
+            from spacy.en import English
+            nlp = English()
+            tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
+            ents = list(tokens.ents)
+            assert ents[0].label == 346
+            assert ents[0].label_ == 'PERSON'
+            assert ents[0].orth_ == 'Best'
+            assert ents[0].text == 'Best'
+        '''
         def __get__(self):
-            """Yields named-entity Span objects.
-
-            Iterate over the span to get individual Token objects, or access the label:
-
-            >>> from spacy.en import English
-            >>> nlp = English()
-            >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.')
-            >>> ents = list(tokens.ents)
-            >>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0])
-            (112504, u'PERSON', u'Best ')
-            """
             cdef int i
             cdef const TokenC* token
             cdef int start = -1
@@ -263,44 +320,59 @@ cdef class Doc:
                 # Set start as B
                 self.c[start].ent_iob = 3

-    @property
-    def noun_chunks(self):
-        """Yield spans for base noun phrases."""
-        if not self.is_parsed:
-            raise ValueError(
-                "noun_chunks requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.%s.download all\n"
-                "to install the data" % self.vocab.lang)
-        # Accumulate the result before beginning to iterate over it. This prevents
-        # the tokenisation from being changed out from under us during the iteration.
-        # The tricky thing here is that Span accepts its tokenisation changing,
-        # so it's okay once we have the Span objects. See Issue #375
-        spans = []
-        for start, end, label in self.noun_chunks_iterator(self):
-            spans.append(Span(self, start, end, label=label))
-        for span in spans:
-            yield span
+    property noun_chunks:
+        '''
+        Yields base noun-phrase `Span` objects, if the document
+        has been syntactically parsed. A base noun phrase, or
+        'NP chunk', is a noun phrase that does not permit other NPs to
+        be nested within it, so no NP-level coordination, no prepositional
+        phrases, and no relative clauses.
+        '''
+        def __get__(self):
+            if not self.is_parsed:
+                raise ValueError(
+                    "noun_chunks requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.%s.download all\n"
+                    "to install the data" % self.vocab.lang)
+            # Accumulate the result before beginning to iterate over it. This prevents
+            # the tokenisation from being changed out from under us during the iteration.
+            # The tricky thing here is that Span accepts its tokenisation changing,
+            # so it's okay once we have the Span objects. See Issue #375
+            spans = []
+            for start, end, label in self.noun_chunks_iterator(self):
+                spans.append(Span(self, start, end, label=label))
+            for span in spans:
+                yield span

-    @property
-    def sents(self):
-        """
-        Yield a list of sentence Span objects, calculated from the dependency parse.
-        """
-        if not self.is_parsed:
-            raise ValueError(
-                "sentence boundary detection requires the dependency parse, which "
-                "requires data to be installed. If you haven't done so, run: "
-                "\npython -m spacy.%s.download all\n"
-                "to install the data" % self.vocab.lang)
-        cdef int i
-        start = 0
-        for i in range(1, self.length):
-            if self.c[i].sent_start:
-                yield Span(self, start, i)
-                start = i
-        if start != self.length:
-            yield Span(self, start, self.length)
+    property sents:
+        """
+        Yields sentence `Span` objects. Sentence spans have no label.
+        To improve accuracy on informal texts, spaCy calculates sentence
+        boundaries from the syntactic dependency parse. If the parser is
+        disabled, the `sents` iterator will be unavailable.
+
+        Example:
+            from spacy.en import English
+            nlp = English()
+            doc = nlp("This is a sentence. Here's another...")
+            assert [s.root.orth_ for s in doc.sents] == ["is", "'s"]
+        """
+        def __get__(self):
+            if not self.is_parsed:
+                raise ValueError(
+                    "sentence boundary detection requires the dependency parse, which "
+                    "requires data to be installed. If you haven't done so, run: "
+                    "\npython -m spacy.%s.download all\n"
+                    "to install the data" % self.vocab.lang)
+            cdef int i
+            start = 0
+            for i in range(1, self.length):
+                if self.c[i].sent_start:
+                    yield Span(self, start, i)
+                    start = i
+            if start != self.length:
+                yield Span(self, start, self.length)

     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
         if self.length == self.max_length:
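
Usage of the two rewritten properties, as a sketch; both require the parser data,
and the asserts reflect the expected parse rather than a guarantee:

    doc = nlp(u'The cat sat on the big red mat. It purred.')

    # Base NPs are flat: 'the big red mat' should come out as one
    # chunk, with no NP nested inside it
    chunks = [chunk.orth_ for chunk in doc.noun_chunks]

    sents = list(doc.sents)
    assert len(sents) == 2
    assert sents[0].root.orth_ == u'sat'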
@@ -324,8 +396,16 @@ cdef class Doc:
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
-        """Given a list of M attribute IDs, export the tokens to a numpy ndarray
-        of shape N*M, where N is the length of the sentence.
+        """
+        Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape (N, M), where `N` is the length of the
+        document. The values will be 32-bit integers.
+
+        Example:
+            from spacy import attrs
+            doc = nlp(text)
+            # All strings mapped to integers, for easy export to numpy
+            np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA])

         Arguments:
             attr_ids (list[int]): A list of attribute ID ints.
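
A small check of the shape and dtype the new docstring promises (the
`numpy.int32` comparison follows from its '32-bit integers' claim):

    import numpy
    from spacy import attrs

    doc = nlp(u'apple apple orange banana')
    np_array = doc.to_array([attrs.LOWER, attrs.POS])
    assert np_array.shape == (len(doc), 2)    # (N tokens, M attributes)
    assert np_array.dtype == numpy.int32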
@@ -351,16 +431,22 @@ cdef class Doc:
         """Produce a dict of {attribute (int): count (ints)} frequencies, keyed
         by the values of the given attribute ID.

-        >>> from spacy.en import English, attrs
-        >>> nlp = English()
-        >>> tokens = nlp(u'apple apple orange banana')
-        >>> tokens.count_by(attrs.ORTH)
-        {12800L: 1, 11880L: 2, 7561L: 1}
-        >>> tokens.to_array([attrs.ORTH])
-        array([[11880],
-               [11880],
-               [ 7561],
-               [12800]])
+        Example:
+            from spacy.en import English, attrs
+            nlp = English()
+            tokens = nlp(u'apple apple orange banana')
+            tokens.count_by(attrs.ORTH)
+            # {12800L: 1, 11880L: 2, 7561L: 1}
+            tokens.to_array([attrs.ORTH])
+            # array([[11880],
+            #        [11880],
+            #        [ 7561],
+            #        [12800]])
+
+        Arguments:
+            attr_id (int): The attribute ID to key the counts.
         """
         cdef int i
         cdef attr_t attr
@@ -408,6 +494,8 @@ cdef class Doc:
             self.c[i] = parsed[i]

     def from_array(self, attrs, array):
+        '''Write to a `Doc` object, from an `(N, M)` array of attributes.
+        '''
         cdef int i, col
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
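
from_array is the inverse of to_array, which is why the shape note matters. A
round-trip sketch; whether every attribute (TAG is used here) is writable in
this version is an assumption:

    from spacy import attrs
    from spacy.tokens.doc import Doc

    doc = nlp(u'apple apple orange banana')
    tags = doc.to_array([attrs.TAG])

    # Rebuild the same tokens, then write the tag annotations back on
    words = [(t.orth_, bool(t.whitespace_)) for t in doc]
    doc2 = Doc(nlp.vocab, orths_and_spaces=words)
    doc2.from_array([attrs.TAG], tags)
    assert [t.tag_ for t in doc2] == [t.tag_ for t in doc]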
@@ -448,16 +536,34 @@ cdef class Doc:
         return self

     def to_bytes(self):
+        '''Serialize, producing a byte string.'''
         byte_string = self.vocab.serializer.pack(self)
         cdef uint32_t length = len(byte_string)
         return struct.pack('I', length) + byte_string

     def from_bytes(self, data):
+        '''Deserialize, loading from bytes.'''
         self.vocab.serializer.unpack_into(data[4:], self)
         return self

     @staticmethod
     def read_bytes(file_):
+        '''
+        A static method, used to read serialized `Doc` objects from
+        a file.
+
+        Example:
+            from spacy.tokens.doc import Doc
+            loc = 'test_serialize.bin'
+            with open(loc, 'wb') as file_:
+                file_.write(nlp(u'This is a document.').to_bytes())
+                file_.write(nlp(u'This is another.').to_bytes())
+            docs = []
+            with open(loc, 'rb') as file_:
+                for byte_string in Doc.read_bytes(file_):
+                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
+            assert len(docs) == 2
+        '''
         keep_reading = True
         while keep_reading:
             try:
@@ -472,8 +578,7 @@ cdef class Doc:

     def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
               unicode ent_type):
-        """Merge a multi-word expression into a single token. Currently
-        experimental; API is likely to change."""
+        """Merge a multi-word expression into a single token."""
         cdef int start = token_by_start(self.c, self.length, start_idx)
         if start == -1:
             return None
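
Note that `merge` takes character offsets into the document text, not token
indices. A sketch; the tag, lemma and entity-type strings are illustrative:

    doc = nlp(u'I flew to New York City.')
    start = doc.text.index(u'New York City')
    end = start + len(u'New York City')

    # Collapse the three tokens into one, supplying new annotations
    token = doc.merge(start, end, u'NNP', u'New York City', u'GPE')
    assert token is not None
    assert len(doc) == 5     # 'I', 'flew', 'to', 'New York City', '.'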