Update docstrings and API docs for Span class

This commit is contained in:
ines 2017-05-19 00:31:31 +02:00
parent 8455cb1327
commit 0791f0aae6
2 changed files with 290 additions and 213 deletions

View File

@ -66,6 +66,10 @@ cdef class Span:
return hash((self.doc, self.label, self.start_char, self.end_char)) return hash((self.doc, self.label, self.start_char, self.end_char))
def __len__(self): def __len__(self):
"""Get the number of tokens in the span.
RETURNS (int): The number of tokens in the span.
"""
self._recalculate_indices() self._recalculate_indices()
if self.end < self.start: if self.end < self.start:
return 0 return 0
@ -77,6 +81,16 @@ cdef class Span:
return self.text.encode('utf-8') return self.text.encode('utf-8')
def __getitem__(self, object i): def __getitem__(self, object i):
"""Get a `Token` or a `Span` object
i (int or tuple): The index of the token within the span, or slice of
the span to get.
RETURNS (Token or Span): The token at `span[i]`.
EXAMPLE:
>>> span[0]
>>> span[1:3]
"""
self._recalculate_indices() self._recalculate_indices()
if isinstance(i, slice): if isinstance(i, slice):
start, end = normalize_slice(len(self), i.start, i.stop, i.step) start, end = normalize_slice(len(self), i.start, i.stop, i.step)
@ -88,12 +102,17 @@ cdef class Span:
return self.doc[self.start + i] return self.doc[self.start + i]
def __iter__(self): def __iter__(self):
"""Iterate over `Token` objects.
YIELDS (Token): A `Token` object.
"""
self._recalculate_indices() self._recalculate_indices()
for i in range(self.start, self.end): for i in range(self.start, self.end):
yield self.doc[i] yield self.doc[i]
def merge(self, *args, **attributes): def merge(self, *args, **attributes):
"""Retokenize the document, such that the span is merged into a single token. """Retokenize the document, such that the span is merged into a single
token.
**attributes: Attributes to assign to the merged token. By default, **attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span. attributes are inherited from the syntactic root token of the span.
@ -241,15 +260,15 @@ cdef class Span:
The head of 'new' is 'York', and the head of "York" is "like" The head of 'new' is 'York', and the head of "York" is "like"
>>> toks[new].head.orth_ >>> toks[new].head.text
'York' 'York'
>>> toks[york].head.orth_ >>> toks[york].head.text
'like' 'like'
Create a span for "New York". Its root is "York". Create a span for "New York". Its root is "York".
>>> new_york = toks[new:york+1] >>> new_york = toks[new:york+1]
>>> new_york.root.orth_ >>> new_york.root.text
'York' 'York'
Here's a more complicated case, raised by issue #214: Here's a more complicated case, raised by issue #214:
@ -370,7 +389,10 @@ cdef class Span:
return ''.join([t.string for t in self]).strip() return ''.join([t.string for t in self]).strip()
property lemma_: property lemma_:
# TODO: docstring """The span's lemma.
RETURNS (unicode): The span's lemma.
"""
def __get__(self): def __get__(self):
return ' '.join([t.lemma_ for t in self]).strip() return ' '.join([t.lemma_ for t in self]).strip()
@ -390,7 +412,10 @@ cdef class Span:
return ''.join([t.string for t in self]) return ''.join([t.string for t in self])
property label_: property label_:
# TODO: docstring """The span's label.
RETURNS (unicode): The span's label.
"""
def __get__(self): def __get__(self):
return self.doc.vocab.strings[self.label] return self.doc.vocab.strings[self.label]

View File

@ -2,7 +2,265 @@
include ../../_includes/_mixins include ../../_includes/_mixins
p A slice from a #[code Doc] object. p A slice from a #[+api("doc") #[code Doc]] object.
+h(2, "init") Span.__init__
+tag method
p Create a Span object from the #[code slice doc[start : end]].
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
print([token.text for token in span])
# ['it', 'back', '!']
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code start]
+cell int
+cell The index of the first token of the span.
+row
+cell #[code end]
+cell int
+cell The index of the first token after the span.
+row
+cell #[code label]
+cell int
+cell A label to attach to the span, e.g. for named entities.
+row
+cell #[code vector]
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A meaning representation of the span.
+footrow
+cell returns
+cell #[code Span]
+cell The newly constructed object.
+h(2, "getitem") Span.__getitem__
+tag method
p Get a #[code Token] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert span[1].text == 'back'
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The index of the token within the span.
+footrow
+cell returns
+cell #[code Token]
+cell The token at #[code span[i]].
p Get a #[code Span] object.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert span[1:3].text == 'back!'
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
+cell tuple
+cell The slice of the span to get.
+footrow
+cell returns
+cell #[code Span]
+cell The span at #[code span[start : end]].
+h(2, "iter") Span.__iter__
+tag method
p Iterate over #[code Token] objects.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
print([token.text for token in span])
# ['it', 'back', '!']
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A #[code Token] object.
+h(2, "len") Span.__len__
+tag method
p Get the number of tokens in the span.
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
span = doc[1:4]
assert len(span) == 3
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of tokens in the span.
+h(2, "similarity") Span.similarity
+tag method
+tag requires model
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+aside-code("Example").
doc = nlp(u'apples and oranges')
apples = doc[0]
oranges = doc[2]
apples_oranges = apples.similarity(oranges)
oranges_apples = oranges.similarity(apples)
assert apples_oranges == oranges_apples
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "merge") Span.merge
+tag method
p Retokenize the document, such that the span is merged into a single token.
+table(["Name", "Type", "Description"])
+row
+cell #[code **attributes]
+cell -
+cell
| Attributes to assign to the merged token. By default, attributes
| are inherited from the syntactic root token of the span.
+footrow
+cell returns
+cell #[code Token]
+cell The newly merged token.
+h(2, "text") Span.text
+tag property
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[1:4].text == 'it back!'
p A unicode representation of the span text.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell unicode
+cell The original verbatim text of the span.
+h(2, "text_with_ws") Span.text_with_ws
+tag property
+aside-code("Example").
doc = nlp(u'Give it back! He pleaded.')
assert doc[1:4].text_with_ws == 'it back! '
p
| The text content of the span with a trailing whitespace character if the
| last token has one.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell unicode
+cell The text content of the span (with trailing whitespace).
+h(2, "sent") Span.sent
+tag property
p The sentence span that this span is a part of.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code Span]
+cell The sentence this is part of.
+h(2, "root") Span.root
+tag property
p
| The token within the span that's highest in the parse tree. If there's a
| tie, the earliest is preferred.
+aside-code("Example").
tokens = nlp(u'I like New York in Autumn.')
i, like, new, york, in_, autumn, dot = range(len(tokens))
assert tokens[new].head.text == 'York'
assert tokens[york].head.text == 'like'
new_york = tokens[new&#58;york+1]
assert new_york.root.text == 'York'
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code Token]
+cell The root token.
+h(2, "lefts") Span.lefts
+tag property
p Tokens that are to the left of the span, whose head is within the span.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A left-child of a token of the span.
+h(2, "rights") Span.rights
+tag property
p Tokens that are to the right of the span, whose head is within the span.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A right-child of a token of the span.
+h(2, "subtree") Span.subtree
+tag property
p Tokens that descend from tokens in the span, but fall outside it.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A descendant of a token within the span.
+h(2, "attributes") Attributes +h(2, "attributes") Attributes
@ -56,209 +314,3 @@ p A slice from a #[code Doc] object.
+cell #[code ent_id_] +cell #[code ent_id_]
+cell unicode +cell unicode
+cell The string ID of the named entity the token is an instance of. +cell The string ID of the named entity the token is an instance of.
+h(2, "init") Span.__init__
+tag method
p Create a Span object from the #[code slice doc[start : end]].
+table(["Name", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The parent document.
+row
+cell #[code start]
+cell int
+cell The index of the first token of the span.
+row
+cell #[code end]
+cell int
+cell The index of the first token after the span.
+row
+cell #[code label]
+cell int
+cell A label to attach to the span, e.g. for named entities.
+row
+cell #[code vector]
+cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+cell A meaning representation of the span.
+footrow
+cell returns
+cell #[code Span]
+cell The newly constructed object.
+h(2, "getitem") Span.__getitem__
+tag method
p Get a #[code Token] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code i]
+cell int
+cell The index of the token within the span.
+footrow
+cell returns
+cell #[code Token]
+cell The token at #[code span[i]].
p Get a #[code Span] object.
+table(["Name", "Type", "Description"])
+row
+cell #[code start_end]
+cell tuple
+cell The slice of the span to get.
+footrow
+cell returns
+cell #[code Span]
+cell The span at #[code span[start : end]].
+h(2, "iter") Span.__iter__
+tag method
p Iterate over #[code Token] objects.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A #[code Token] object.
+h(2, "len") Span.__len__
+tag method
p Get the number of tokens in the span.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell int
+cell The number of tokens in the span.
+h(2, "similarity") Span.similarity
+tag method
p
| Make a semantic similarity estimate. The default estimate is cosine
| similarity using an average of word vectors.
+table(["Name", "Type", "Description"])
+row
+cell #[code other]
+cell -
+cell
| The object to compare with. By default, accepts #[code Doc],
| #[code Span], #[code Token] and #[code Lexeme] objects.
+footrow
+cell returns
+cell float
+cell A scalar similarity score. Higher is more similar.
+h(2, "merge") Span.merge
+tag method
p Retokenize the document, such that the span is merged into a single token.
+table(["Name", "Type", "Description"])
+row
+cell #[code **attributes]
+cell -
+cell
| Attributes to assign to the merged token. By default, attributes
| are inherited from the syntactic root token of the span.
+footrow
+cell returns
+cell #[code Token]
+cell The newly merged token.
+h(2, "text") Span.text
+tag property
p A unicode representation of the span text.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell unicode
+cell The original verbatim text of the span.
+h(2, "text_with_ws") Span.text_with_ws
+tag property
p
| The text content of the span with a trailing whitespace character if the
| last token has one.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell unicode
+cell The text content of the span (with trailing whitespace).
+h(2, "sent") Span.sent
+tag property
p The sentence span that this span is a part of.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code Span]
+cell The sentence this is part of.
+h(2, "root") Span.root
+tag property
p
| The token within the span that's highest in the parse tree. If there's a
| tie, the earliest is preferred.
+table(["Name", "Type", "Description"])
+footrow
+cell returns
+cell #[code Token]
+cell The root token.
+h(2, "lefts") Span.lefts
+tag property
p Tokens that are to the left of the span, whose head is within the span.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A left-child of a token of the span.
+h(2, "rights") Span.rights
+tag property
p Tokens that are to the right of the span, whose head is within the span.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A right-child of a token of the span.
+h(2, "subtree") Span.subtree
+tag property
p Tokens that descend from tokens in the span, but fall outside it.
+table(["Name", "Type", "Description"])
+footrow
+cell yields
+cell #[code Token]
+cell A descendant of a token within the span.