//- Docs > API > Span //- ============================================================================ +section('span') +h2('span', 'https://github.com/' + profiles.github + '/spaCy/blob/master/spacy/tokens/span.pyx#L19') | #[+label('tag') class] Span p | A slice of a #[code Doc] object, consisting of zero or | more tokens. Spans are usually used to represent sentences, named entities, | or phrases. +aside('Implementation') #[code Span] objects are views – that is, they do not copy the underlying C data. This makes them cheap to construct, as internally they are simply a reference to the #[code Doc] object, a start position, an end position, and a label ID. +code('python', 'Overview'). class Span: doc = Doc start = int end = int label = int def __init__(self, doc, start, end, label=0, vector=None, vector_norm=None): return self def __len__(self): return int def __getitem__(self, i): return Token() def __iter__(self): yield Token() def similarity(self, other): return float def merge(self, tag, lemma, ent_type): return None @property def label_(self): return unicode @property def vector(self): return numpy.ndarray(dtype='float64') @property def vector_norm(self): return float @property def text(self): return unicode @property def text_with_ws(self): return unicode @property def orth_(self): return unicode @property def lemma_(self): return unicode @property def root(self): return Token() @property def lefts(self): yield Token() @property def rights(self): yield Token() @property def subtree(self): yield Token() +section('span-create') +h3('span-init') | #[+label('tag') Section] Create a Span p | Span instances are usually created via the #[code Doc] object. +table(['Example', 'Description'], 'code') +row +cell #[code.lang-python span = doc[4 : 7]] +cell Produce a span with tokens 4, 5 and 6. +row +cell #[code.lang-python span = Span(doc, start, end, label=spacy.symbols.PERSON)] +cell Calling #[code Span.__init__] directly allows you to set a label. 
+row +cell #[code.lang-python for entity in doc.ents] +cell See #[a(href="/docs#doc-spans-ents") Doc.ents] +row +cell #[code.lang-python for sentence in doc.sents] +cell See #[a(href="/docs#doc-spans-sents") Doc.sents] +row +cell #[code.lang-python for noun_phrase in doc.noun_chunks] +cell See #[a(href="/docs#doc-spans-nounchunks") Doc.noun_chunks] +code('python', 'Definition'). def __init__(self, doc, start, end, label=0, vector=None, vector_norm=None): return Span() +table(['Name', 'Type', 'Description'], 'params') +row +cell doc +cell Doc +cell The parent doc object, to slice from. +row +cell start +cell int +cell The index of the first token in the slice. +row +cell end +cell int +cell The index of the first token #[em outside] the slice (since ranges are exclusive in Python). +row +cell label +cell int or unicode +cell A label for the span. Either a string, or an integer ID, that should refer to a string mapped by the #[code Doc] object's #[code StringStore]. +row +cell vector +cell +cell +row +cell vector_norm +cell +cell +section('span-merge') +h3('span-merge') | #[+label('tag') method] Span.merge p | Merge the span into a single token, modifying the underlying | #[code.lang-python Doc] object in place. +aside('Caveat'). Magic is done to allow you to call #[code.lang-python merge()] without invalidating other #[code.lang-python Span] objects. However, it's difficult to ensure all indices are recomputed correctly. Please report any errors encountered on the issue tracker. +code('python', 'Example'). for ent in doc.ents: ent.merge(ent.root.tag_, ent.text, ent.label_) for np in doc.noun_chunks: while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'): np = np[1:] np.merge(np.root.tag_, np.text, np.root.ent_type_) +code('python', 'Definition'). def merge(self, tag, lemma, ent_type): return None +table(['Name', 'Type', 'Description'], 'params') +row +cell tag +cell unicode +cell The fine-grained part-of-speech tag to assign to the new token. 
+row +cell lemma +cell unicode +cell The lemma string for the new token. +row +cell ent_type +cell unicode +cell The named entity type to assign to the new token. +section('span-similarity') +h3('span-similarity') | #[+label('tag') method] Span.similarity p | Estimate the semantic similarity between the span and another #[code Span], #[code Doc], #[code Token] or #[code Lexeme]. +aside('Algorithm'). Similarity is estimated using the cosine metric, between #[code Span.vector] and #[code other.vector]. By default, #[code Span.vector] is computed by averaging the vectors of its tokens. +code('python', 'Example'). doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") apples_sent, boots_sent = doc.sents fruit = doc.vocab[u'fruit'] assert apples_sent.similarity(fruit) > boots_sent.similarity(fruit) +code('python', 'Definition'). def similarity(self, other): return float +table(['Name', 'Type', 'Description'], 'params') +row +cell other +cell Token, Span, Doc or Lexeme +cell The other object to judge similarity with. +section('span-sequence') +h3('span-sequence') | #[+label('tag') section] Span as a Sequence p. #[code Span] objects act as a sequence of #[code Token] objects. In this way they mirror the API of the #[code Doc] object. +table(['Name', 'Description'], 'params') +row +cell #[code.lang-python token = span[i]] +cell | Get the #[code Token] object at position #[em i], where | #[code i] is an offset within the #[code Span], not the | document. That is, if you have #[code.lang-python span = doc[4:6]], | then #[code.lang-python span[0].i == 4] +row +cell #[code.lang-python for token in span] +cell. Iterate over the #[code Token] objects in the span. +row +cell __len__ +cell. Number of tokens in the span. +row +cell text +cell. The text content of the span, obtained from #[code ''.join(token.text_with_ws for token in span)]. +row +cell start +cell. The start offset of the span, i.e. #[code span[0].i]. +row +cell end +cell. 
The end offset of the span, i.e. #[code span[-1].i + 1]. +section('span-navigating-parse') +h3('span-navigativing-parse') | #[+label('tag') Section] Span and the Syntactic Parse p. Span objects allow similar access to the syntactic parse as individual tokens. +table(['Name', 'Type', 'Description'], 'params') +row +cell root +cell #[code.lang-python Token] +cell | The word with the shortest path to the root of the sentence is | the root of the span. +row +cell lefts +cell #[code.lang-python yield Token] +cell Tokens that are to the left of the span, whose head is within it. +row +cell rights +cell #[code.lang-python yield Token] +cell Tokens that are to the right of the span, whose head is within it. +row +cell subtree +cell #[code.lang-python yield Token] +cell | Tokens in the range #[code (start, end+1)], where #[code start] | is the index of the leftmost word descended from a token in the | span, and #[code end] is the index of the rightmost token descended | from a token in the span. +section('span-strings') +h3('span-strings') | #[+label('tag') section] Span's Strings API p. You can access the textual content of the span, and different views of it, with the following properties. +table(['Name', 'Type', 'Description'], 'params') +row +cell text_with_ws +cell unicode +cell. The form of the span as it appears in the string, including trailing whitespace. This is useful when you need to use linguistic features to add inline mark-up to the string. +row +cell lemma / lemma_ +cell int / unicode +cell. Whitespace-concatenated lemmas of each token in the span. +row +cell label / label_ +cell int / unicode +cell. The span label, used particularly for named entities.