mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-15 03:56:23 +03:00
308 lines
11 KiB
Plaintext
308 lines
11 KiB
Plaintext
|
//- Docs > API > Span
|
||
|
//- ============================================================================
|
||
|
|
||
|
+section('span')
|
||
|
+h2('span', 'https://github.com/' + profiles.github + '/spaCy/blob/master/spacy/tokens/span.pyx#L19')
|
||
|
| #[+label('tag') class] Span
|
||
|
|
||
|
p
|
||
|
| A slice of a #[code Doc] object, consisting of zero or
|
||
|
| more tokens. Spans are usually used to represent sentences, named entities,
|
||
|
| and phrases.
|
||
|
|
||
|
+aside('Implementation').
|
||
|
#[code Span] objects are views – that is, they do not copy the
|
||
|
underlying C data. This makes them cheap to construct, as internally they are
|
||
|
simply a reference to the #[code Doc] object, a start position, an end
|
||
|
position, and a label ID.
|
||
|
|
||
|
+code('python', 'Overview').
|
||
|
class Span:
|
||
|
doc = Doc
|
||
|
start = int
|
||
|
end = int
|
||
|
label = int
|
||
|
|
||
|
def __init__(self, doc, start, end, label=0, vector=None, vector_norm=None):
|
||
|
return self
|
||
|
|
||
|
def __len__(self):
|
||
|
return int
|
||
|
def __getitem__(self, i):
|
||
|
return Token()
|
||
|
def __iter__(self):
|
||
|
yield Token()
|
||
|
|
||
|
def similarity(self, other):
|
||
|
return float
|
||
|
|
||
|
def merge(self, tag, lemma, ent_type):
|
||
|
return None
|
||
|
|
||
|
@property
|
||
|
def label_(self):
|
||
|
return unicode
|
||
|
|
||
|
@property
|
||
|
def vector(self):
|
||
|
return numpy.ndarray(dtype='float64')
|
||
|
@property
|
||
|
def vector_norm(self):
|
||
|
return float
|
||
|
|
||
|
@property
|
||
|
def text(self):
|
||
|
return unicode
|
||
|
@property
|
||
|
def text_with_ws(self):
|
||
|
return unicode
|
||
|
@property
|
||
|
def orth_(self):
|
||
|
return unicode
|
||
|
@property
|
||
|
def lemma_(self):
|
||
|
return unicode
|
||
|
|
||
|
@property
|
||
|
def root(self):
|
||
|
return Token()
|
||
|
@property
|
||
|
def lefts(self):
|
||
|
yield Token()
|
||
|
@property
|
||
|
def rights(self):
|
||
|
yield Token()
|
||
|
@property
|
||
|
def subtree(self):
|
||
|
yield Token()
|
||
|
|
||
|
+section('span-create')
|
||
|
+h3('span-init')
|
||
|
| #[+label('tag') Section] Create a Span
|
||
|
|
||
|
p
|
||
|
| Span instances are usually created via the #[code Doc] object.
|
||
|
|
||
|
+table(['Example', 'Description'], 'code')
|
||
|
+row
|
||
|
+cell #[code.lang-python span = doc[4 : 7]]
|
||
|
+cell Produce a span with tokens 4, 5 and 6.
|
||
|
+row
|
||
|
+cell #[code.lang-python span = Span(doc, start, end, label=spacy.symbols.PERSON)]
|
||
|
+cell Calling #[code Span.__init__] directly allows you to set a label.
|
||
|
+row
|
||
|
+cell #[code.lang-python for entity in doc.ents]
|
||
|
+cell See #[a(href="/docs#doc-spans-ents") Doc.ents]
|
||
|
+row
|
||
|
+cell #[code.lang-python for sentence in doc.sents]
|
||
|
+cell See #[a(href="/docs#doc-spans-sents") Doc.sents]
|
||
|
+row
|
||
|
+cell #[code.lang-python for noun_phrase in doc.noun_chunks]
|
||
|
+cell See #[a(href="/docs#doc-spans-nounchunks") Doc.noun_chunks]
|
||
|
|
||
|
+code('python', 'Definition').
|
||
|
def __init__(self, doc, start, end, label=0, vector=None, vector_norm=None):
|
||
|
return Span()
|
||
|
|
||
|
+table(['Name', 'Type', 'Description'], 'params')
|
||
|
+row
|
||
|
+cell doc
|
||
|
+cell Doc
|
||
|
+cell The parent doc object, to slice from.
|
||
|
+row
|
||
|
+cell start
|
||
|
+cell int
|
||
|
+cell The index of the first token in the slice.
|
||
|
+row
|
||
|
+cell end
|
||
|
+cell int
|
||
|
+cell The index of the first token #[em outside] the slice (since ranges are exclusive in Python).
|
||
|
+row
|
||
|
+cell label
|
||
|
+cell int or unicode
|
||
|
+cell A label for the span. Either a string, or an integer ID, that should refer to a string mapped by the #[code Doc] object's #[code StringStore].
|
||
|
+row
|
||
|
+cell vector
|
||
|
+cell
|
||
|
+cell
|
||
|
+row
|
||
|
+cell vector_norm
|
||
|
+cell
|
||
|
+cell
|
||
|
|
||
|
+section('span-merge')
|
||
|
+h3('span-merge')
|
||
|
| #[+label('tag') method] Span.merge
|
||
|
|
||
|
p
|
||
|
| Merge the span into a single token, modifying the underlying
|
||
|
| #[code.lang-python Doc] object in place.
|
||
|
|
||
|
+aside('Caveat').
|
||
|
Magic is done to allow you to call #[code.lang-python merge()]
|
||
|
without invalidating other #[code.lang-python Span] objects.
|
||
|
However, it's difficult to ensure all indices are recomputed
|
||
|
correctly. Please report any errors encountered on the issue
|
||
|
tracker.
|
||
|
|
||
|
+code('python', 'Example').
|
||
|
for ent in doc.ents:
|
||
|
ent.merge(ent.root.tag_, ent.text, ent.label_)
|
||
|
for np in doc.noun_chunks:
|
||
|
while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'):
|
||
|
np = np[1:]
|
||
|
np.merge(np.root.tag_, np.text, np.root.ent_type_)
|
||
|
|
||
|
+code('python', 'Definition').
|
||
|
def merge(self, tag, lemma, ent_type):
|
||
|
return None
|
||
|
|
||
|
+table(['Name', 'Type', 'Description'], 'params')
|
||
|
+row
|
||
|
+cell tag
|
||
|
+cell unicode
|
||
|
+cell The fine-grained part-of-speech tag to assign to the new token.
|
||
|
+row
|
||
|
+cell lemma
|
||
|
+cell unicode
|
||
|
+cell The lemma string for the new token.
|
||
|
+row
|
||
|
+cell ent_type
|
||
|
+cell unicode
|
||
|
+cell The named entity type to assign to the new token.
|
||
|
|
||
|
+section('span-similarity')
|
||
|
+h3('span-similarity')
|
||
|
| #[+label('tag') method] Span.similarity
|
||
|
|
||
|
p
|
||
|
| Estimate the semantic similarity between the span and another #[code Span],
|
||
|
| #[code Doc], #[code Token] or #[code Lexeme].
|
||
|
|
||
|
+aside('Algorithm').
|
||
|
Similarity is estimated
|
||
|
using the cosine metric, between #[code Span.vector] and #[code other.vector].
|
||
|
By default, #[code Span.vector] is computed by averaging the vectors
|
||
|
of its tokens.
|
||
|
|
||
|
+code('python', 'Example').
|
||
|
doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
|
||
|
apples_sent, boots_sent = doc.sents
|
||
|
fruit = doc.vocab[u'fruit']
|
||
|
assert apples_sent.similarity(fruit) > boots_sent.similarity(fruit)
|
||
|
|
||
|
+code('python', 'Definition').
|
||
|
def similarity(self, other):
|
||
|
return float
|
||
|
|
||
|
+table(['Name', 'Type', 'Description'], 'params')
|
||
|
+row
|
||
|
+cell other
|
||
|
+cell Token, Span, Doc or Lexeme
|
||
|
+cell The other object to judge similarity with.
|
||
|
|
||
|
+section('span-sequence')
|
||
|
+h3('span-sequence')
|
||
|
| #[+label('tag') section] Span as a Sequence
|
||
|
|
||
|
p.
|
||
|
#[code Span] objects act as a sequence of #[code Token] objects. In
|
||
|
this way they mirror the API of the #[code Doc] object.
|
||
|
|
||
|
+table(['Name', 'Description'], 'params')
|
||
|
+row
|
||
|
+cell #[code.lang-python token = span[i]]
|
||
|
+cell
|
||
|
| Get the #[code Token] object at position #[em i], where
|
||
|
| #[code i] is an offset within the #[code Span], not the
|
||
|
| document. That is, if you have #[code.lang-python span = doc[4:6]],
|
||
|
| then #[code.lang-python span[0].i == 4].
|
||
|
|
||
|
+row
|
||
|
+cell #[code.lang-python for token in span]
|
||
|
+cell.
|
||
|
Iterate over the #[code Token] objects in the span.
|
||
|
|
||
|
+row
|
||
|
+cell __len__
|
||
|
+cell.
|
||
|
Number of tokens in the span.
|
||
|
|
||
|
+row
|
||
|
+cell text
|
||
|
+cell.
|
||
|
The text content of the span, obtained from
|
||
|
#[code ''.join(token.text_with_ws for token in span)].
|
||
|
|
||
|
+row
|
||
|
+cell start
|
||
|
+cell.
|
||
|
The start offset of the span, i.e. #[code span[0].i].
|
||
|
|
||
|
+row
|
||
|
+cell end
|
||
|
+cell.
|
||
|
The end offset of the span, i.e. #[code span[-1].i + 1].
|
||
|
|
||
|
+section('span-navigating-parse')
|
||
|
+h3('span-navigating-parse')
|
||
|
| #[+label('tag') Section] Span and the Syntactic Parse
|
||
|
|
||
|
p.
|
||
|
Span objects allow similar access to the syntactic parse as individual
|
||
|
tokens.
|
||
|
|
||
|
+table(['Name', 'Type', 'Description'], 'params')
|
||
|
+row
|
||
|
+cell root
|
||
|
+cell #[code.lang-python Token]
|
||
|
+cell
|
||
|
| The word with the shortest path to the root of the sentence is
|
||
|
| the root of the span.
|
||
|
+row
|
||
|
+cell lefts
|
||
|
+cell #[code.lang-python yield Token]
|
||
|
+cell Tokens that are to the left of the span, whose head is within it.
|
||
|
+row
|
||
|
+cell rights
|
||
|
+cell #[code.lang-python yield Token]
|
||
|
+cell Tokens that are to the right of the span, whose head is within it.
|
||
|
|
||
|
+row
|
||
|
+cell subtree
|
||
|
+cell #[code.lang-python yield Token]
|
||
|
+cell
|
||
|
| Tokens in the range #[code (start, end+1)], where #[code start]
|
||
|
| is the index of the leftmost word descended from a token in the
|
||
|
| span, and #[code end] is the index of the rightmost token descended
|
||
|
| from a token in the span.
|
||
|
|
||
|
+section('span-strings')
|
||
|
+h3('span-strings')
|
||
|
| #[+label('tag') section] Span's Strings API
|
||
|
|
||
|
p.
|
||
|
You can access the textual content of the span, and different views of
|
||
|
it, with the following properties.
|
||
|
|
||
|
+table(['Name', 'Type', 'Description'], 'params')
|
||
|
+row
|
||
|
+cell text_with_ws
|
||
|
+cell unicode
|
||
|
+cell.
|
||
|
The form of the span as it appears in the string, including
|
||
|
trailing whitespace. This is useful when you need to use linguistic
|
||
|
features to add inline mark-up to the string.
|
||
|
|
||
|
+row
|
||
|
+cell lemma / lemma_
|
||
|
+cell int / unicode
|
||
|
+cell.
|
||
|
Whitespace-concatenated lemmas of each token in the span.
|
||
|
|
||
|
+row
|
||
|
+cell label / label_
|
||
|
+cell int / unicode
|
||
|
+cell.
|
||
|
The span label, used particularly for named entities.
|