mirror of https://github.com/explosion/spaCy.git (synced 2025-11-01 00:17:44 +03:00)
	Update docstrings and API docs for Doc class
This commit is contained in:
parent 0f513850ab
commit b87066ff10
				|  | @ -63,40 +63,30 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Doc: | cdef class Doc: | ||||||
|     """ |     """A sequence of Token objects. Access sentences and named entities, export | ||||||
|     A sequence of `Token` objects. Access sentences and named entities, |     annotations to numpy arrays, losslessly serialize to compressed binary strings. | ||||||
|     export annotations to numpy arrays, losslessly serialize to compressed |     The `Doc` object holds an array of `TokenC` structs. The Python-level | ||||||
|     binary strings. |     `Token` and `Span` objects are views of this array, i.e. they don't own | ||||||
|  |     the data themselves. | ||||||
| 
 | 
 | ||||||
|     Aside: Internals |     EXAMPLE: Construction 1 | ||||||
|         The `Doc` object holds an array of `TokenC` structs. |         >>> doc = nlp(u'Some text') | ||||||
|         The Python-level `Token` and `Span` objects are views of this |  | ||||||
|         array, i.e. they don't own the data themselves. |  | ||||||
| 
 |  | ||||||
|     Code: Construction 1 |  | ||||||
|         doc = nlp.tokenizer(u'Some text') |  | ||||||
| 
 |  | ||||||
|     Code: Construction 2 |  | ||||||
|         doc = Doc(nlp.vocab, orths_and_spaces=[(u'Some', True), (u'text', True)]) |  | ||||||
| 
 | 
 | ||||||
|  |         Construction 2 | ||||||
|  |         >>> from spacy.tokens import Doc | ||||||
|  |         >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], spaces=[True, False, False]) | ||||||
|     """ |     """ | ||||||
|     def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): |     def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): | ||||||
|         """ |         """Create a Doc object. | ||||||
|         Create a Doc object. |  | ||||||
| 
 | 
 | ||||||
|         Arguments: |         vocab (Vocab): A vocabulary object, which must match any models you want | ||||||
|             vocab: |             to use (e.g. tokenizer, parser, entity recognizer). | ||||||
|                 A Vocabulary object, which must match any models you want to |         words (list or None): A list of unicode strings to add to the document | ||||||
|                 use (e.g. tokenizer, parser, entity recognizer). |             as words. If `None`, defaults to empty list. | ||||||
| 
 |         spaces (list or None): A list of boolean values, of the same length as | ||||||
|             words: |             words. True means that the word is followed by a space, False means | ||||||
|                 A list of unicode strings to add to the document as words. If None, |             it is not. If `None`, defaults to `[True]*len(words)` | ||||||
|                 defaults to empty list. |         RETURNS (Doc): The newly constructed object. | ||||||
| 
 |  | ||||||
|             spaces: |  | ||||||
|                 A list of boolean values, of the same length as words. True |  | ||||||
|                 means that the word is followed by a space, False means it is not. |  | ||||||
|                 If None, defaults to [True]*len(words) |  | ||||||
|         """ |         """ | ||||||
|         self.vocab = vocab |         self.vocab = vocab | ||||||
|         size = 20 |         size = 20 | ||||||
|  | @ -158,20 +148,22 @@ cdef class Doc: | ||||||
|             self.is_parsed = True |             self.is_parsed = True | ||||||
| 
 | 
 | ||||||
|     def __getitem__(self, object i): |     def __getitem__(self, object i): | ||||||
|         """ |         """Get a `Token` or `Span` object. | ||||||
|         doc[i] | 
 | ||||||
|             Get the Token object at position i, where i is an integer. |         EXAMPLE: | ||||||
|  |             >>> doc[i] | ||||||
|  |             Get the `Token` object at position `i`, where `i` is an integer. | ||||||
|             Negative indexing is supported, and follows the usual Python |             Negative indexing is supported, and follows the usual Python | ||||||
|             semantics, i.e. doc[-2] is doc[len(doc) - 2]. |             semantics, i.e. `doc[-2]` is `doc[len(doc) - 2]`. | ||||||
|         doc[start : end]] | 
 | ||||||
|             Get a `Span` object, starting at position `start` |             >>> doc[start : end] | ||||||
|             and ending at position `end`, where `start` and |             Get a `Span` object, starting at position `start` and ending at | ||||||
|             `end` are token indices. For instance, |             position `end`, where `start` and `end` are token indices. For | ||||||
|             `doc[2:5]` produces a span consisting of |             instance, `doc[2:5]` produces a span consisting of tokens 2, 3 and 4. | ||||||
|             tokens 2, 3 and 4. Stepped slices (e.g. `doc[start : end : step]`) |             Stepped slices (e.g. `doc[start : end : step]`) are not supported, | ||||||
|             are not supported, as `Span` objects must be contiguous (cannot have gaps). |             as `Span` objects must be contiguous (cannot have gaps). You can use | ||||||
|             You can use negative indices and open-ended ranges, which have their |             negative indices and open-ended ranges, which have their normal | ||||||
|             normal Python semantics. |             Python semantics. | ||||||
|         """ |         """ | ||||||
|         if isinstance(i, slice): |         if isinstance(i, slice): | ||||||
|             start, stop = normalize_slice(len(self), i.start, i.stop, i.step) |             start, stop = normalize_slice(len(self), i.start, i.stop, i.step) | ||||||
|  | @ -186,14 +178,14 @@ cdef class Doc: | ||||||
|             return Token.cinit(self.vocab, &self.c[i], i, self) |             return Token.cinit(self.vocab, &self.c[i], i, self) | ||||||
| 
 | 
 | ||||||
|     def __iter__(self): |     def __iter__(self): | ||||||
|         """ |         """Iterate over `Token`  objects, from which the annotations can be | ||||||
|         for token in doc |         easily accessed. This is the main way of accessing `Token` objects, | ||||||
|             Iterate over `Token`  objects, from which the annotations can |         which are the main way annotations are accessed from Python. If faster- | ||||||
|             be easily accessed. This is the main way of accessing Token |         than-Python speeds are required, you can instead access the annotations | ||||||
|             objects, which are the main way annotations are accessed from |         as a numpy array, or access the underlying C data directly from Cython. | ||||||
|             Python. If faster-than-Python speeds are required, you can | 
 | ||||||
|             instead access the annotations as a numpy array, or access the |         EXAMPLE: | ||||||
|             underlying C data directly from Cython. |             >>> for token in doc: | ||||||
|         """ |         """ | ||||||
|         cdef int i |         cdef int i | ||||||
|         for i in range(self.length): |         for i in range(self.length): | ||||||
|  | @ -203,9 +195,10 @@ cdef class Doc: | ||||||
|                 yield Token.cinit(self.vocab, &self.c[i], i, self) |                 yield Token.cinit(self.vocab, &self.c[i], i, self) | ||||||
| 
 | 
 | ||||||
|     def __len__(self): |     def __len__(self): | ||||||
|         """ |         """The number of tokens in the document. | ||||||
|         len(doc) | 
 | ||||||
|             The number of tokens in the document. |         EXAMPLE: | ||||||
|  |             >>> len(doc) | ||||||
|         """ |         """ | ||||||
|         return self.length |         return self.length | ||||||
| 
 | 
 | ||||||
|  | @ -228,16 +221,12 @@ cdef class Doc: | ||||||
|         return self |         return self | ||||||
| 
 | 
 | ||||||
|     def similarity(self, other): |     def similarity(self, other): | ||||||
|         """ |         """Make a semantic similarity estimate. The default estimate is cosine | ||||||
|         Make a semantic similarity estimate. The default estimate is cosine |  | ||||||
|         similarity using an average of word vectors. |         similarity using an average of word vectors. | ||||||
| 
 | 
 | ||||||
|         Arguments: |         other (object): The object to compare with. By default, accepts `Doc`, | ||||||
|             other (object): The object to compare with. By default, accepts Doc, |             `Span`, `Token` and `Lexeme` objects. | ||||||
|                 Span, Token and Lexeme objects. |         RETURNS (float): A scalar similarity score. Higher is more similar. | ||||||
| 
 |  | ||||||
|         Return: |  | ||||||
|             score (float): A scalar similarity score. Higher is more similar. |  | ||||||
|         """ |         """ | ||||||
|         if 'similarity' in self.user_hooks: |         if 'similarity' in self.user_hooks: | ||||||
|             return self.user_hooks['similarity'](self, other) |             return self.user_hooks['similarity'](self, other) | ||||||
|  | @ -246,8 +235,10 @@ cdef class Doc: | ||||||
|         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) |         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) | ||||||
| 
 | 
 | ||||||
|     property has_vector: |     property has_vector: | ||||||
|         """ |         """A boolean value indicating whether a word vector is associated with | ||||||
|         A boolean value indicating whether a word vector is associated with the object. |         the object. | ||||||
|  | 
 | ||||||
|  |         RETURNS (bool): Whether a word vector is associated with the object. | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             if 'has_vector' in self.user_hooks: |             if 'has_vector' in self.user_hooks: | ||||||
|  | @ -256,10 +247,11 @@ cdef class Doc: | ||||||
|             return any(token.has_vector for token in self) |             return any(token.has_vector for token in self) | ||||||
| 
 | 
 | ||||||
|     property vector: |     property vector: | ||||||
|         """ |         """A real-valued meaning representation. Defaults to an average of the | ||||||
|         A real-valued meaning representation. Defaults to an average of the token vectors. |         token vectors. | ||||||
| 
 | 
 | ||||||
|         Type: numpy.ndarray[ndim=1, dtype='float32'] |         RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array | ||||||
|  |             representing the document's semantics. | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             if 'vector' in self.user_hooks: |             if 'vector' in self.user_hooks: | ||||||
|  | @ -275,6 +267,7 @@ cdef class Doc: | ||||||
|             self._vector = value |             self._vector = value | ||||||
| 
 | 
 | ||||||
|     property vector_norm: |     property vector_norm: | ||||||
|  |         # TODO: docstrings / docs | ||||||
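|  |         # Sketch for the TODO above (wording not final): the L2 norm of the | ||||||
|  |         # document's vector representation. | ||||||
|  |         # RETURNS (float): The vector norm. | ||||||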
|         def __get__(self): |         def __get__(self): | ||||||
|             if 'vector_norm' in self.user_hooks: |             if 'vector_norm' in self.user_hooks: | ||||||
|                 return self.user_hooks['vector_norm'](self) |                 return self.user_hooks['vector_norm'](self) | ||||||
|  | @ -295,34 +288,37 @@ cdef class Doc: | ||||||
|         return self.text |         return self.text | ||||||
| 
 | 
 | ||||||
|     property text: |     property text: | ||||||
|         """ |         """A unicode representation of the document text. | ||||||
|         A unicode representation of the document text. | 
 | ||||||
|  |         RETURNS (unicode): The original verbatim text of the document. | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             return u''.join(t.text_with_ws for t in self) |             return u''.join(t.text_with_ws for t in self) | ||||||
| 
 | 
 | ||||||
|     property text_with_ws: |     property text_with_ws: | ||||||
|         """ |         """An alias of `Doc.text`, provided for duck-type compatibility with | ||||||
|         An alias of Doc.text, provided for duck-type compatibility with Span and Token. |         `Span` and `Token`. | ||||||
|  | 
 | ||||||
|  |         RETURNS (unicode): The original verbatim text of the document. | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             return self.text |             return self.text | ||||||
| 
 | 
 | ||||||
|     property ents: |     property ents: | ||||||
|         """ |         """Iterate over the entities in the document. Yields named-entity `Span` | ||||||
|         Yields named-entity `Span` objects, if the entity recognizer |         objects, if the entity recognizer has been applied to the document. | ||||||
|         has been applied to the document. Iterate over the span to get |  | ||||||
|         individual Token objects, or access the label: |  | ||||||
| 
 | 
 | ||||||
|         Example: |         YIELDS (Span): Entities in the document. | ||||||
|             from spacy.en import English | 
 | ||||||
|             nlp = English() |         EXAMPLE: Iterate over the span to get individual Token objects, or access | ||||||
|             tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') |             the label: | ||||||
|             ents = list(tokens.ents) | 
 | ||||||
|             assert ents[0].label == 346 |             >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') | ||||||
|             assert ents[0].label_ == 'PERSON' |             >>> ents = list(tokens.ents) | ||||||
|             assert ents[0].orth_ == 'Best' |             >>> assert ents[0].label == 346 | ||||||
|             assert ents[0].text == 'Mr. Best' |             >>> assert ents[0].label_ == 'PERSON' | ||||||
|  |             >>> assert ents[0].orth_ == 'Best' | ||||||
|  |             >>> assert ents[0].text == 'Mr. Best' | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             cdef int i |             cdef int i | ||||||
|  | @ -387,12 +383,13 @@ cdef class Doc: | ||||||
|                     self.c[start].ent_iob = 3 |                     self.c[start].ent_iob = 3 | ||||||
| 
 | 
 | ||||||
|     property noun_chunks: |     property noun_chunks: | ||||||
|         """ |         """Iterate over the base noun phrases in the document. Yields base | ||||||
|         Yields base noun-phrase #[code Span] objects, if the document |         noun-phrase #[code Span] objects, if the document has been syntactically | ||||||
|         has been syntactically parsed. A base noun phrase, or |         parsed. A base noun phrase, or "NP chunk", is a noun phrase that does | ||||||
|         'NP chunk', is a noun phrase that does not permit other NPs to |         not permit other NPs to be nested within it – so no NP-level | ||||||
|         be nested within it – so no NP-level coordination, no prepositional |         coordination, no prepositional phrases, and no relative clauses. | ||||||
|         phrases, and no relative clauses. | 
 | ||||||
|  |         YIELDS (Span): Noun chunks in the document. | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             if not self.is_parsed: |             if not self.is_parsed: | ||||||
|  | @ -411,17 +408,15 @@ cdef class Doc: | ||||||
|                 yield span |                 yield span | ||||||
| 
 | 
 | ||||||
|     property sents: |     property sents: | ||||||
|         """ |         """Iterate over the sentences in the document. Yields sentence `Span` | ||||||
|         Yields sentence `Span` objects. Sentence spans have no label. |         objects. Sentence spans have no label. To improve accuracy on informal | ||||||
|         To improve accuracy on informal texts, spaCy calculates sentence |         texts, spaCy calculates sentence boundaries from the syntactic | ||||||
|         boundaries from the syntactic dependency parse. If the parser is disabled, |         dependency parse. If the parser is disabled, the `sents` iterator will | ||||||
|         `sents` iterator will be unavailable. |         be unavailable. | ||||||
| 
 | 
 | ||||||
|         Example: |         EXAMPLE: | ||||||
|             from spacy.en import English |             >>> doc = nlp("This is a sentence. Here's another...") | ||||||
|             nlp = English() |             >>> assert [s.root.text for s in doc.sents] == ["is", "'s"] | ||||||
|             doc = nlp("This is a sentence. Here's another...") |  | ||||||
|             assert [s.root.orth_ for s in doc.sents] == ["is", "'s"] |  | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             if 'sents' in self.user_hooks: |             if 'sents' in self.user_hooks: | ||||||
|  | @ -467,24 +462,20 @@ cdef class Doc: | ||||||
| 
 | 
 | ||||||
|     @cython.boundscheck(False) |     @cython.boundscheck(False) | ||||||
|     cpdef np.ndarray to_array(self, object py_attr_ids): |     cpdef np.ndarray to_array(self, object py_attr_ids): | ||||||
|         """ |         """Given a list of M attribute IDs, export the tokens to a numpy | ||||||
|         Given a list of M attribute IDs, export the tokens to a numpy |         `ndarray` of shape `(N, M)`, where `N` is the length of the document. | ||||||
|         `ndarray` of shape (N, M), where `N` is the length |         The values will be 32-bit integers. | ||||||
|         of the document. The values will be 32-bit integers. |  | ||||||
| 
 | 
 | ||||||
|         Example: |         attr_ids (list[int]): A list of attribute ID ints. | ||||||
|             from spacy import attrs |         RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row | ||||||
|             doc = nlp(text) |             per word, and one column per attribute indicated in the input | ||||||
|             # All strings mapped to integers, for easy export to numpy |             `attr_ids`. | ||||||
|             np_array = doc.to_array([attrs.LOWER, attrs.POS, attrs.ENT_TYPE, attrs.IS_ALPHA]) |  | ||||||
| 
 | 
 | ||||||
|         Arguments: |         EXAMPLE: | ||||||
|             attr_ids (list[int]): A list of attribute ID ints. |             >>> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA | ||||||
| 
 |             >>> doc = nlp(text) | ||||||
|         Returns: |             >>> # All strings mapped to integers, for easy export to numpy | ||||||
|             feat_array (numpy.ndarray[long, ndim=2]): |             >>> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) | ||||||
|               A feature matrix, with one row per word, and one column per attribute |  | ||||||
|               indicated in the input attr_ids. |  | ||||||
|         """ |         """ | ||||||
|         cdef int i, j |         cdef int i, j | ||||||
|         cdef attr_id_t feature |         cdef attr_id_t feature | ||||||
|  | @ -499,27 +490,20 @@ cdef class Doc: | ||||||
|         return output |         return output | ||||||
| 
 | 
 | ||||||
|     def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): |     def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): | ||||||
|         """ |         """Count the frequencies of a given attribute. Produces a dict of | ||||||
|         Produce a dict of {attribute (int): count (ints)} frequencies, keyed |         `{attribute (int): count (ints)}` frequencies, keyed by the values of | ||||||
|         by the values of the given attribute ID. |         the given attribute ID. | ||||||
| 
 | 
 | ||||||
|         Example: |         attr_id (int): The attribute ID to key the counts. | ||||||
|             from spacy.en import English |         RETURNS (dict): A dictionary mapping attributes to integer counts. | ||||||
|             from spacy import attrs |  | ||||||
|             nlp = English() |  | ||||||
|             tokens = nlp(u'apple apple orange banana') |  | ||||||
|             tokens.count_by(attrs.ORTH) |  | ||||||
|             # {12800L: 1, 11880L: 2, 7561L: 1} |  | ||||||
|             tokens.to_array([attrs.ORTH]) |  | ||||||
|             # array([[11880], |  | ||||||
|             #   [11880], |  | ||||||
|             #   [ 7561], |  | ||||||
|             #   [12800]]) |  | ||||||
| 
 | 
 | ||||||
|         Arguments: |         EXAMPLE: | ||||||
|             attr_id |             >>> from spacy import attrs | ||||||
|                 int |             >>> doc = nlp(u'apple apple orange banana') | ||||||
|                 The attribute ID to key the counts. |             >>> doc.count_by(attrs.ORTH) | ||||||
|  |             {12800L: 1, 11880L: 2, 7561L: 1} | ||||||
|  |             >>> doc.to_array([attrs.ORTH]) | ||||||
|  |             array([[11880], [11880], [7561], [12800]]) | ||||||
|         """ |         """ | ||||||
|         cdef int i |         cdef int i | ||||||
|         cdef attr_t attr |         cdef attr_t attr | ||||||
|  | @ -567,8 +551,12 @@ cdef class Doc: | ||||||
|             self.c[i] = parsed[i] |             self.c[i] = parsed[i] | ||||||
| 
 | 
 | ||||||
|     def from_array(self, attrs, int[:, :] array): |     def from_array(self, attrs, int[:, :] array): | ||||||
|         """ |         """Load attributes from a numpy array. Write to a `Doc` object, from an | ||||||
|         Write to a `Doc` object, from an `(M, N)` array of attributes. |         `(M, N)` array of attributes. | ||||||
|  | 
 | ||||||
|  |         attrs (ints): A list of attribute ID ints. | ||||||
|  |         array (numpy.ndarray[ndim=2, dtype='int32']): The attribute values to load. | ||||||
|  |         RETURNS (Doc): Itself. | ||||||
|         """ |         """ | ||||||
|         cdef int i, col |         cdef int i, col | ||||||
|         cdef attr_id_t attr_id |         cdef attr_id_t attr_id | ||||||
|  | @ -597,8 +585,10 @@ cdef class Doc: | ||||||
|         return self |         return self | ||||||
| 
 | 
 | ||||||
|     def to_bytes(self): |     def to_bytes(self): | ||||||
|         """ |         """Serialize, i.e. export the document contents to a binary string. | ||||||
|         Serialize, producing a byte string. | 
 | ||||||
|  |         RETURNS (bytes): A losslessly serialized copy of the `Doc`, including | ||||||
|  |             all annotations. | ||||||
|         """ |         """ | ||||||
|         return dill.dumps( |         return dill.dumps( | ||||||
|             (self.text, |             (self.text, | ||||||
|  | @ -611,8 +601,10 @@ cdef class Doc: | ||||||
|             protocol=-1) |             protocol=-1) | ||||||
| 
 | 
 | ||||||
|     def from_bytes(self, data): |     def from_bytes(self, data): | ||||||
|         """ |         """Deserialize, i.e. import the document contents from a binary string. | ||||||
|         Deserialize, loading from bytes. | 
 | ||||||
|  |         data (bytes): The string to load from. | ||||||
|  |         RETURNS (Doc): Itself. | ||||||
|         """ |         """ | ||||||
|         if self.length != 0: |         if self.length != 0: | ||||||
|             raise ValueError("Cannot load into non-empty Doc") |             raise ValueError("Cannot load into non-empty Doc") | ||||||
|  | @ -640,21 +632,16 @@ cdef class Doc: | ||||||
|         return self |         return self | ||||||
| 
 | 
 | ||||||
|     def merge(self, int start_idx, int end_idx, *args, **attributes): |     def merge(self, int start_idx, int end_idx, *args, **attributes): | ||||||
|         """ |         """Retokenize the document, such that the span at `doc.text[start_idx : end_idx]` | ||||||
|         Retokenize the document, such that the span at doc.text[start_idx : end_idx] |         is merged into a single token. If `start_idx` and `end_idx` do not mark | ||||||
|         is merged into a single token. If start_idx and end_idx do not mark start |         start and end token boundaries, the document remains unchanged. | ||||||
|         and end token boundaries, the document remains unchanged. |  | ||||||
| 
 | 
 | ||||||
|         Arguments: |         start_idx (int): The character index of the start of the slice to merge. | ||||||
|             start_idx (int): The character index of the start of the slice to merge. |         end_idx (int): The character index after the end of the slice to merge. | ||||||
|             end_idx (int): The character index after the end of the slice to merge. |         **attributes: Attributes to assign to the merged token. By default, | ||||||
|             **attributes: |             attributes are inherited from the syntactic root token of the span. | ||||||
|                 Attributes to assign to the merged token. By default, attributes |         RETURNS (Token): The newly merged token, or `None` if the start and end | ||||||
|                 are inherited from the syntactic root token of the span. |             indices did not fall at token boundaries. | ||||||
|         Returns: |  | ||||||
|             token (Token): |  | ||||||
|                 The newly merged token, or None if the start and end indices did |  | ||||||
|                 not fall at token boundaries. |  | ||||||
|         """ |         """ | ||||||
|         cdef unicode tag, lemma, ent_type |         cdef unicode tag, lemma, ent_type | ||||||
|         if len(args) == 3: |         if len(args) == 3: | ||||||
|  | @ -758,7 +745,29 @@ cdef class Doc: | ||||||
|         return self[start] |         return self[start] | ||||||
| 
 | 
 | ||||||
|     def print_tree(self, light=False, flat=False): |     def print_tree(self, light=False, flat=False): | ||||||
|         """Returns the parse trees in the JSON (Dict) format.""" |         """Returns the parse trees in JSON (dict) format. | ||||||
|  | 
 | ||||||
|  |         light (bool): Don't include lemmas or entities. | ||||||
|  |         flat (bool): Don't include arcs or modifiers. | ||||||
|  |         RETURNS (dict): Parse tree as dict. | ||||||
|  | 
 | ||||||
|  |         EXAMPLE: | ||||||
|  |             >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.') | ||||||
|  |             >>> trees = doc.print_tree() | ||||||
|  |             >>> trees[1] | ||||||
|  |             {'modifiers': [ | ||||||
|  |                 {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', | ||||||
|  |                 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, | ||||||
|  |                 {'modifiers': [ | ||||||
|  |                     {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', | ||||||
|  |                     'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], | ||||||
|  |                 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', | ||||||
|  |                 'POS_fine': 'NN', 'lemma': 'pizza'}, | ||||||
|  |                 {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', | ||||||
|  |                 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], | ||||||
|  |                 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', | ||||||
|  |                 'POS_fine': 'VBD', 'lemma': 'eat'} | ||||||
|  |         """ | ||||||
|         return parse_tree(self, light=light, flat=flat) |         return parse_tree(self, light=light, flat=flat) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -6,18 +6,14 @@ from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def merge_ents(doc): | def merge_ents(doc): | ||||||
|     """ |     """Helper: merge adjacent entities into single tokens; modifies the doc.""" | ||||||
|     Helper: merge adjacent entities into single tokens; modifies the doc. |  | ||||||
|     """ |  | ||||||
|     for ent in doc.ents: |     for ent in doc.ents: | ||||||
|         ent.merge(ent.root.tag_, ent.text, ent.label_) |         ent.merge(ent.root.tag_, ent.text, ent.label_) | ||||||
|     return doc |     return doc | ||||||
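|  | # Usage sketch (hypothetical; assumes an `nlp` pipeline with an entity | ||||||
|  | # recognizer loaded, so the exact merges are model-dependent): | ||||||
|  | # | ||||||
|  | #     doc = merge_ents(nlp(u'Mr. Best flew to New York on Saturday morning.')) | ||||||
|  | #     assert u'New York' in [w.text for w in doc] | ||||||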
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def format_POS(token, light, flat): | def format_POS(token, light, flat): | ||||||
|     """ |     """Helper: form the POS output for a token.""" | ||||||
|     Helper: form the POS output for a token. |  | ||||||
|     """ |  | ||||||
|     subtree = dict([ |     subtree = dict([ | ||||||
|         ("word", token.text), |         ("word", token.text), | ||||||
|         ("lemma", token.lemma_),  # trigger |         ("lemma", token.lemma_),  # trigger | ||||||
|  | @ -37,9 +33,8 @@ def format_POS(token, light, flat): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def POS_tree(root, light=False, flat=False): | def POS_tree(root, light=False, flat=False): | ||||||
|     """ |     """Helper: generate a POS tree for a root token. The doc must have | ||||||
|     Helper: generate a POS tree for a root token. The doc must have |     had `merge_ents(doc)` run on it. | ||||||
|     merge_ents(doc) ran on it. |  | ||||||
|     """ |     """ | ||||||
|     subtree = format_POS(root, light=light, flat=flat) |     subtree = format_POS(root, light=light, flat=flat) | ||||||
|     for c in root.children: |     for c in root.children: | ||||||
|  | @ -48,21 +43,28 @@ def POS_tree(root, light=False, flat=False): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def parse_tree(doc, light=False, flat=False): | def parse_tree(doc, light=False, flat=False): | ||||||
|     """ |     """Makes a copy of the doc, then construct a syntactic parse tree, similar to | ||||||
|     Makes a copy of the doc, then construct a syntactic parse tree, similar to |  | ||||||
|     the one used in displaCy. Generates the POS tree for all sentences in a doc. |     the one used in displaCy. Generates the POS tree for all sentences in a doc. | ||||||
| 
 | 
 | ||||||
|     Args: |     doc (Doc): The doc for parsing. | ||||||
|         doc: The doc for parsing. |     RETURNS (dict): The parse tree. | ||||||
| 
 | 
 | ||||||
|     Returns: |     EXAMPLE: | ||||||
|         [parse_trees (Dict)]: |         >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.') | ||||||
| 
 |         >>> trees = doc.print_tree() | ||||||
|     >>> from spacy.en import English |         >>> trees[1] | ||||||
|     >>> nlp = English() |         {'modifiers': [ | ||||||
|     >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.') |             {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', | ||||||
|     >>> trees = doc.print_tree() |              'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, | ||||||
|     [{'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Bob', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Bob'}, {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'dobj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'brought', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'bring'}, {'modifiers': [{'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'}] |             {'modifiers': [ | ||||||
|  |                 {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', | ||||||
|  |                  'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], | ||||||
|  |              'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', | ||||||
|  |              'POS_fine': 'NN', 'lemma': 'pizza'}, | ||||||
|  |             {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', | ||||||
|  |              'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}], | ||||||
|  |             'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', | ||||||
|  |             'POS_fine': 'VBD', 'lemma': 'eat'} | ||||||
|     """ |     """ | ||||||
|     doc_clone  = Doc(doc.vocab, words=[w.text for w in doc]) |     doc_clone  = Doc(doc.vocab, words=[w.text for w in doc]) | ||||||
|     doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], |     doc_clone.from_array([HEAD, TAG, DEP, ENT_IOB, ENT_TYPE], | ||||||
|  |  | ||||||
|  | @ -4,6 +4,503 @@ include ../../_includes/_mixins | ||||||
| 
 | 
 | ||||||
| p A container for accessing linguistic annotations. | p A container for accessing linguistic annotations. | ||||||
| 
 | 
 | ||||||
|  | p | ||||||
|  |     |  A #[code Doc] is a sequence of #[+api("token") #[code Token]] objects. | ||||||
|  |     |  Access sentences and named entities, export annotations to numpy arrays, | ||||||
|  |     |  losslessly serialize to compressed binary strings. The #[code Doc] object | ||||||
|  |     |  holds an array of #[code TokenC] structs. The Python-level #[code Token] | ||||||
|  |     |  and #[+api("span") #[code Span]] objects are views of this array, i.e. | ||||||
|  |     |  they don't own the data themselves. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     # Construction 1 | ||||||
|  |     doc = nlp(u'Some text') | ||||||
|  | 
 | ||||||
|  |     # Construction 2 | ||||||
|  |     from spacy.tokens import Doc | ||||||
|  |     doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], | ||||||
|  |               spaces=[True, False, False]) | ||||||
|  | 
 | ||||||
|  | +h(2, "init") Doc.__init__ | ||||||
|  |     +tag method | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Construct a #[code Doc] object. The most common way to get a #[code Doc] | ||||||
|  |     |  object is via the #[code nlp] object. | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code vocab] | ||||||
|  |         +cell #[code Vocab] | ||||||
|  |         +cell A storage container for lexical types. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code words] | ||||||
|  |         +cell - | ||||||
|  |         +cell A list of strings to add to the container. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code spaces] | ||||||
|  |         +cell - | ||||||
|  |         +cell | ||||||
|  |             |  A list of boolean values indicating whether each word has a | ||||||
|  |             |  subsequent space. Must have the same length as #[code words], if | ||||||
|  |             |  specified. Defaults to a sequence of #[code True]. | ||||||
|  | 
 | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell #[code Doc] | ||||||
|  |         +cell The newly constructed object. | ||||||
|  | 
 | ||||||
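|  | p | ||||||
|  |     |  A sketch of the #[code spaces] default (based on the docstring above, | ||||||
|  |     |  omitting #[code spaces] is equivalent to passing #[code True] for | ||||||
|  |     |  every word): | ||||||
|  |  | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.tokens import Doc | ||||||
|  |     # every word gets a trailing space, including the last one | ||||||
|  |     doc = Doc(nlp.vocab, words=[u'hello', u'world']) | ||||||
|  |     assert doc.text == u'hello world ' | ||||||
|  |  | ||||||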
|  | +h(2, "getitem") Doc.__getitem__ | ||||||
|  |     +tag method | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Get a #[+api("token") #[code Token]] object at position #[code i], where | ||||||
|  |     |  #[code i] is an integer. Negative indexing is supported, and follows the | ||||||
|  |     |  usual Python semantics, i.e. #[code doc[-2]] is #[code doc[len(doc) - 2]]. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     doc = nlp(u'Give it back! He pleaded.') | ||||||
|  |     assert doc[0].text == 'Give' | ||||||
|  |     assert doc[-1].text == '.' | ||||||
|  |     span = doc[1:3] | ||||||
|  |     assert span.text == 'it back' | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code i] | ||||||
|  |         +cell int | ||||||
|  |         +cell The index of the token. | ||||||
|  | 
 | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell #[code Token] | ||||||
|  |         +cell The token at #[code doc[i]]. | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Get a #[+api("span") #[code Span]] object, starting at position | ||||||
|  |     |  #[code start] (token index) and ending at position #[code end] (token | ||||||
|  |     |  index). | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  For instance, #[code doc[2:5]] produces a span consisting of tokens 2, 3 | ||||||
|  |     |  and 4. Stepped slices (e.g. #[code doc[start : end : step]]) are not | ||||||
|  |     |  supported, as #[code Span] objects must be contiguous (cannot have gaps). | ||||||
|  |     |  You can use negative indices and open-ended ranges, which have their | ||||||
|  |     |  normal Python semantics. | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code start_end] | ||||||
|  |         +cell tuple | ||||||
|  |         +cell The slice of the document to get. | ||||||
|  | 
 | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell #[code Span] | ||||||
|  |         +cell The span at #[code doc[start : end]]. | ||||||
|  | 
 | ||||||
|  | +h(2, "iter") Doc.__iter__ | ||||||
|  |     +tag method | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Iterate over #[code Token] objects, from which the annotations can be | ||||||
|  |     |  easily accessed. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     doc = nlp(u'Give it back! He pleaded.') | ||||||
|  |     for token in doc: | ||||||
|  |         print(token.text, token.tag_) | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  This is the main way of accessing #[+api("token") #[code Token]] objects, | ||||||
|  |     |  which are the main way annotations are accessed from Python. If | ||||||
|  |     |  faster-than-Python speeds are required, you can instead access the | ||||||
|  |     |  annotations as a numpy array, or access the underlying C data directly | ||||||
|  |     |  from Cython. | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +footrow | ||||||
|  |         +cell yield | ||||||
|  |         +cell #[code Token] | ||||||
|  |         +cell A #[code Token] object. | ||||||
|  | 
 | ||||||
|  | +h(2, "len") Doc.__len__ | ||||||
|  |     +tag method | ||||||
|  | 
 | ||||||
|  | p Get the number of tokens in the document. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     doc = nlp(u'Give it back! He pleaded.') | ||||||
|  |     assert len(doc) == 7 | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell int | ||||||
|  |         +cell The number of tokens in the document. | ||||||
|  | 
 | ||||||
|  | +h(2, "similarity") Doc.similarity | ||||||
|  |     +tag method | ||||||
|  |     +tag requires model | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Make a semantic similarity estimate. The default estimate is cosine | ||||||
|  |     |  similarity using an average of word vectors. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     apples, _, oranges = nlp(u'apples and oranges') | ||||||
|  |     apples_oranges = apples.similarity(oranges) | ||||||
|  |     oranges_apples = oranges.similarity(apples) | ||||||
|  |     assert apples_oranges == oranges_apples | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code other] | ||||||
|  |         +cell - | ||||||
|  |         +cell | ||||||
|  |             |  The object to compare with. By default, accepts #[code Doc], | ||||||
|  |             |  #[code Span], #[code Token] and #[code Lexeme] objects. | ||||||
|  | 
 | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell float | ||||||
|  |         +cell A scalar similarity score. Higher is more similar. | ||||||
|  | 
 | ||||||
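|  | p | ||||||
|  |     |  A sketch of the default estimate (mirrors the implementation shown | ||||||
|  |     |  above; assumes vectors are available and no #[code similarity] user | ||||||
|  |     |  hook is set): | ||||||
|  |  | ||||||
|  | +aside-code("Example"). | ||||||
|  |     import numpy | ||||||
|  |     doc1 = nlp(u'apples') | ||||||
|  |     doc2 = nlp(u'oranges') | ||||||
|  |     # cosine similarity over the averaged word vectors | ||||||
|  |     score = numpy.dot(doc1.vector, doc2.vector) / (doc1.vector_norm * doc2.vector_norm) | ||||||
|  |     assert abs(score - doc1.similarity(doc2)) < 1e-6 | ||||||
|  |  | ||||||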
|  | +h(2, "count_by") Doc.count_by | ||||||
|  |     +tag method | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Count the frequencies of a given attribute. Produces a dict of | ||||||
|  |     |  #[code {attr (int): count (ints)}] frequencies, keyed by the values | ||||||
|  |     |  of the given attribute ID. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy import attrs | ||||||
|  |     doc = nlp(u'apple apple orange banana') | ||||||
|  |     doc.count_by(attrs.ORTH) | ||||||
|  |     # {12800L: 1, 11880L: 2, 7561L: 1} | ||||||
|  |     doc.to_array([attrs.ORTH]) | ||||||
|  |     # array([[11880], [11880], [7561], [12800]]) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code attr_id] | ||||||
|  |         +cell int | ||||||
|  |         +cell The attribute ID to key the counts. | ||||||
|  | 
 | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell dict | ||||||
|  |         +cell A dictionary mapping attributes to integer counts. | ||||||
|  | 
 | ||||||
|  | +h(2, "to_array") Doc.to_array | ||||||
|  |     +tag method | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Export the document annotations to a numpy array of shape #[code N*M] | ||||||
|  |     |  where #[code N] is the length of the document and #[code M] is the number | ||||||
|  |     |  of attribute IDs to export. The values will be 32-bit integers. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA | ||||||
|  |     doc = nlp(text) | ||||||
|  |     # All strings mapped to integers, for easy export to numpy | ||||||
|  |     np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code attr_ids] | ||||||
|  |         +cell ints | ||||||
|  |         +cell A list of attribute ID ints. | ||||||
|  | 
 | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell #[code numpy.ndarray[ndim=2, dtype='int32']] | ||||||
|  |         +cell | ||||||
|  |             |  The exported attributes as a 2D numpy array, with one row per | ||||||
|  |             |  token and one column per attribute. | ||||||
|  | 
 | ||||||
|  | +h(2, "from_array") Doc.from_array | ||||||
|  |     +tag method | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Load attributes from a numpy array. Write to a #[code Doc] object, from | ||||||
|  |     |  an #[code (M, N)] array of attributes. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA | ||||||
|  |     from spacy.tokens import Doc | ||||||
|  |     doc = nlp(text) | ||||||
|  |     np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) | ||||||
|  |     doc2 = Doc(doc.vocab, words=[t.text for t in doc]) | ||||||
|  |     doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code attrs] | ||||||
|  |         +cell ints | ||||||
|  |         +cell A list of attribute ID ints. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code array] | ||||||
|  |         +cell #[code numpy.ndarray[ndim=2, dtype='int32']] | ||||||
|  |         +cell The attribute values to load. | ||||||
|  | 
 | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell #[code Doc] | ||||||
|  |         +cell Itself. | ||||||
|  | 
 | ||||||
|  | +h(2, "to_bytes") Doc.to_bytes | ||||||
|  |     +tag method | ||||||
|  | 
 | ||||||
|  | p Serialize, i.e. export the document contents to a binary string. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     doc = nlp(u'Give it back! He pleaded.') | ||||||
|  |     doc_bytes = doc.to_bytes() | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell bytes | ||||||
|  |         +cell | ||||||
|  |             |  A losslessly serialized copy of the #[code Doc], including all | ||||||
|  |             |  annotations. | ||||||
|  | 
 | ||||||
|  | +h(2, "from_bytes") Doc.from_bytes | ||||||
|  |     +tag method | ||||||
|  | 
 | ||||||
|  | p Deserialize, i.e. import the document contents from a binary string. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     from spacy.tokens import Doc | ||||||
|  |     text = u'Give it back! He pleaded.' | ||||||
|  |     doc = nlp(text) | ||||||
|  |     doc_bytes = doc.to_bytes() | ||||||
|  |     doc2 = Doc(doc.vocab).from_bytes(doc_bytes) | ||||||
|  |     assert doc.text == doc2.text | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code data] | ||||||
|  |         +cell bytes | ||||||
|  |         +cell The string to load from. | ||||||
|  | 
 | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell #[code Doc] | ||||||
|  |         +cell Itself. | ||||||
|  | 
 | ||||||
|  | +h(2, "merge") Doc.merge | ||||||
|  |     +tag method | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Retokenize the document, such that the span at | ||||||
|  |     |  #[code doc.text[start_idx : end_idx]] is merged into a single token. If | ||||||
|  |     |  #[code start_idx] and #[code end_idx] do not mark start and end token | ||||||
|  |     |  boundaries, the document remains unchanged. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     doc = nlp(u'Los Angeles start.') | ||||||
|  |     doc.merge(0, len('Los Angeles'), 'NNP', 'Los Angeles', 'GPE') | ||||||
|  |     print([token.text for token in doc]) | ||||||
|  |     # ['Los Angeles', 'start', '.'] | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code start_idx] | ||||||
|  |         +cell int | ||||||
|  |         +cell The character index of the start of the slice to merge. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code end_idx] | ||||||
|  |         +cell int | ||||||
|  |         +cell The character index after the end of the slice to merge. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code **attributes] | ||||||
|  |         +cell - | ||||||
|  |         +cell | ||||||
|  |             |  Attributes to assign to the merged token. By default, | ||||||
|  |             |  attributes are inherited from the syntactic root token of | ||||||
|  |             |  the span. | ||||||
|  | 
 | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell #[code Token] | ||||||
|  |         +cell | ||||||
|  |             |  The newly merged token, or #[code None] if the start and end | ||||||
|  |             |  indices did not fall at token boundaries. | ||||||
|  | 
 | ||||||
|  | +h(2, "print_tree") Doc.print_tree | ||||||
|  |     +tag method | ||||||
|  |     +tag requires model | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Returns the parse trees in JSON (dict) format. Especially useful for | ||||||
|  |     |  web applications. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     doc = nlp('Alice ate the pizza.') | ||||||
|  |     trees = doc.print_tree() | ||||||
|  |     # {'modifiers': [ | ||||||
|  |     #   {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj', 'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'}, | ||||||
|  |     #   {'modifiers': [{'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det', 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}], 'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN', 'POS_fine': 'NN', 'lemma': 'pizza'}, | ||||||
|  |     #   {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct', 'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'} | ||||||
|  |     # ], 'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB', 'POS_fine': 'VBD', 'lemma': 'eat'} | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +row | ||||||
|  |         +cell #[code light] | ||||||
|  |         +cell bool | ||||||
|  |         +cell Don't include lemmas or entities. | ||||||
|  | 
 | ||||||
|  |     +row | ||||||
|  |         +cell #[code flat] | ||||||
|  |         +cell bool | ||||||
|  |         +cell Don't include arcs or modifiers. | ||||||
|  | 
 | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell dict | ||||||
|  |         +cell Parse tree as dict. | ||||||
|  | 
 | ||||||
|  | +h(2, "text") Doc.text | ||||||
|  |     +tag property | ||||||
|  | 
 | ||||||
|  | p A unicode representation of the document text. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     text = u'Give it back! He pleaded.' | ||||||
|  |     doc = nlp(text) | ||||||
|  |     assert doc.text == text | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell unicode | ||||||
|  |         +cell The original verbatim text of the document. | ||||||
|  | 
 | ||||||
|  | +h(2, "text_with_ws") Doc.text_with_ws | ||||||
|  |     +tag property | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  An alias of #[code Doc.text], provided for duck-type compatibility with | ||||||
|  |     |  #[code Span] and #[code Token]. | ||||||
|  | 
 | ||||||
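|  | p | ||||||
|  |     |  A minimal usage sketch (hypothetical; assumes an #[code nlp] pipeline | ||||||
|  |     |  is loaded): | ||||||
|  |  | ||||||
|  | +aside-code("Example"). | ||||||
|  |     doc = nlp(u'Give it back! He pleaded.') | ||||||
|  |     # text_with_ws is an alias, so the two attributes always agree | ||||||
|  |     assert doc.text_with_ws == doc.text | ||||||
|  |  | ||||||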
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell unicode | ||||||
|  |         +cell The original verbatim text of the document. | ||||||
|  | 
 | ||||||
|  | +h(2, "ents") Doc.ents | ||||||
|  |     +tag property | ||||||
|  |     +tag requires model | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Iterate over the entities in the document. Yields named-entity | ||||||
|  |     |  #[code Span] objects, if the entity recognizer has been applied to the | ||||||
|  |     |  document. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') | ||||||
|  |     ents = list(tokens.ents) | ||||||
|  |     assert ents[0].label == 346 | ||||||
|  |     assert ents[0].label_ == 'PERSON' | ||||||
|  |     assert ents[0].text == 'Mr. Best' | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +footrow | ||||||
|  |         +cell yield | ||||||
|  |         +cell #[code Span] | ||||||
|  |         +cell Entities in the document. | ||||||
|  | 
 | ||||||
|  | +h(2, "noun_chunks") Doc.noun_chunks | ||||||
|  |     +tag property | ||||||
|  |     +tag requires model | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Iterate over the base noun phrases in the document. Yields base | ||||||
|  |     |  noun-phrase #[code Span] objects, if the document has been syntactically | ||||||
|  |     |  parsed. A base noun phrase, or "NP chunk", is a noun phrase that does not | ||||||
|  |     |  permit other NPs to be nested within it – so no NP-level coordination, no | ||||||
|  |     |  prepositional phrases, and no relative clauses. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     doc = nlp(u'A phrase with another phrase occurs.') | ||||||
|  |     chunks = list(doc.noun_chunks) | ||||||
|  |     assert chunks[0].text == "A phrase" | ||||||
|  |     assert chunks[1].text == "another phrase" | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +footrow | ||||||
|  |         +cell yield | ||||||
|  |         +cell #[code Span] | ||||||
|  |         +cell Noun chunks in the document. | ||||||
|  | 
 | ||||||
|  | +h(2, "sents") Doc.sents | ||||||
|  |     +tag property | ||||||
|  |     +tag requires model | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  Iterate over the sentences in the document. Sentence spans have no label. | ||||||
|  |     |  To improve accuracy on informal texts, spaCy calculates sentence boundaries | ||||||
|  |     |  from the syntactic dependency parse. If the parser is disabled, | ||||||
|  |     |  the #[code sents] iterator will be unavailable. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     doc = nlp(u"This is a sentence. Here's another...") | ||||||
|  |     sents = list(doc.sents) | ||||||
|  |     assert len(sents) == 2 | ||||||
|  |     assert [s.root.text for s in sents] == ["is", "'s"] | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +footrow | ||||||
|  |         +cell yield | ||||||
|  |         +cell #[code Span] | ||||||
|  |         +cell Sentences in the document. | ||||||
|  | 
 | ||||||
|  | +h(2, "has_vector") Doc.has_vector | ||||||
|  |     +tag property | ||||||
|  |     +tag requires model | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  A boolean value indicating whether a word vector is associated with the | ||||||
|  |     |  object. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     apple = nlp(u'apple') | ||||||
|  |     assert apple.has_vector | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell bool | ||||||
|  |         +cell Whether the document has vector data attached. | ||||||
|  | 
 | ||||||
|  | +h(2, "vector") Doc.vector | ||||||
|  |     +tag property | ||||||
|  |     +tag requires model | ||||||
|  | 
 | ||||||
|  | p | ||||||
|  |     |  A real-valued meaning representation. Defaults to an average of the | ||||||
|  |     |  token vectors. | ||||||
|  | 
 | ||||||
|  | +aside-code("Example"). | ||||||
|  |     apple = nlp(u'apple') | ||||||
|  |     (apple.vector.dtype, apple.vector.shape) | ||||||
|  |     # (dtype('float32'), (300,)) | ||||||
|  | 
 | ||||||
|  | +table(["Name", "Type", "Description"]) | ||||||
|  |     +footrow | ||||||
|  |         +cell return | ||||||
|  |         +cell #[code numpy.ndarray[ndim=1, dtype='float32']] | ||||||
|  |         +cell A 1D numpy array representing the document's semantics. | ||||||
|  | 
 | ||||||
| +h(2, "attributes") Attributes | +h(2, "attributes") Attributes | ||||||
| 
 | 
 | ||||||
| +table(["Name", "Type", "Description"]) | +table(["Name", "Type", "Description"]) | ||||||
|  | @ -59,358 +556,3 @@ p A container for accessing linguistic annotations. | ||||||
|         +cell |         +cell | ||||||
|             |  A dictionary that allows customisation of properties of |             |  A dictionary that allows customisation of properties of | ||||||
|             |  #[code Span] children. |             |  #[code Span] children. | ||||||
| 
 |  | ||||||
| +h(2, "init") Doc.__init__ |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p Construct a #[code Doc] object. |  | ||||||
| 
 |  | ||||||
| +aside("Note") |  | ||||||
|     |  The most common way to get a #[code Doc] object is via the #[code nlp] |  | ||||||
|     |  object. This method is usually only used for deserialization or preset |  | ||||||
|     |  tokenization. |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code vocab] |  | ||||||
|         +cell #[code Vocab] |  | ||||||
|         +cell A storage container for lexical types. |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code words] |  | ||||||
|         +cell - |  | ||||||
|         +cell A list of strings to add to the container. |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code spaces] |  | ||||||
|         +cell - |  | ||||||
|         +cell |  | ||||||
|             |  A list of boolean values indicating whether each word has a |  | ||||||
|             |  subsequent space. Must have the same length as #[code words], if |  | ||||||
|             |  specified. Defaults to a sequence of #[code True]. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell #[code Doc] |  | ||||||
|         +cell The newly constructed object. |  | ||||||
| 
 |  | ||||||
| +h(2, "getitem") Doc.__getitem__ |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p Get a #[code Token] object. |  | ||||||
| 
 |  | ||||||
| +aside-code("Example"). |  | ||||||
|     doc = nlp(u'Give it back! He pleaded.') |  | ||||||
|     assert doc[0].text == 'Give' |  | ||||||
|     assert doc[-1].text == '.' |  | ||||||
|     span = doc[1:3] |  | ||||||
|     assert span.text == 'it back' |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code i] |  | ||||||
|         +cell int |  | ||||||
|         +cell The index of the token. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell #[code Token] |  | ||||||
|         +cell The token at #[code doc[i]]. |  | ||||||
| 
 |  | ||||||
| p Get a #[code Span] object. |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code start_end] |  | ||||||
|         +cell tuple |  | ||||||
|         +cell The slice of the document to get. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell #[code Span] |  | ||||||
|         +cell The span at #[code doc[start : end]]. |  | ||||||
| 
 |  | ||||||
| +h(2, "iter") Doc.__iter__ |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p Iterate over #[code Token] objects. |  | ||||||
| 
 |  | ||||||
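| p For example, collecting the token texts of a short document: |  | ||||||
|  |  | ||||||
| +aside-code("Example"). |  | ||||||
|     doc = nlp(u'Give it back') |  | ||||||
|     assert [t.text for t in doc] == [u'Give', u'it', u'back'] |  | ||||||
|  |  | ||||||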
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +footrow |  | ||||||
|         +cell yield |  | ||||||
|         +cell #[code Token] |  | ||||||
|         +cell A #[code Token] object. |  | ||||||
| 
 |  | ||||||
| +h(2, "len") Doc.__len__ |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p Get the number of tokens in the document. |  | ||||||
| 
 |  | ||||||
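| p For example: |  | ||||||
|  |  | ||||||
| +aside-code("Example"). |  | ||||||
|     doc = nlp(u'Give it back! He pleaded.') |  | ||||||
|     assert len(doc) == 7 |  | ||||||
|  |  | ||||||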
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell int |  | ||||||
|         +cell The number of tokens in the document. |  | ||||||
| 
 |  | ||||||
| +h(2, "similarity") Doc.similarity |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  Make a semantic similarity estimate. The default estimate is cosine |  | ||||||
|     |  similarity using an average of word vectors. |  | ||||||
| 
 |  | ||||||
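| p |  | ||||||
|     |  A minimal sketch; the exact score depends on the model's word |  | ||||||
|     |  vectors. |  | ||||||
|  |  | ||||||
| +aside-code("Example"). |  | ||||||
|     apples = nlp(u'I like apples') |  | ||||||
|     oranges = nlp(u'I like oranges') |  | ||||||
|     # a float, where higher values mean more similar |  | ||||||
|     score = apples.similarity(oranges) |  | ||||||
|  |  | ||||||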
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code other] |  | ||||||
|         +cell - |  | ||||||
|         +cell |  | ||||||
|             |  The object to compare with. By default, accepts #[code Doc], |  | ||||||
|             |  #[code Span], #[code Token] and #[code Lexeme] objects. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell float |  | ||||||
|         +cell A scalar similarity score. Higher is more similar. |  | ||||||
| 
 |  | ||||||
| +h(2, "to_array") Doc.to_array |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  Export the document annotations to a numpy array of shape #[code N*M] |  | ||||||
|     |  where #[code N] is the length of the document and #[code M] is the number |  | ||||||
|     |  of attribute IDs to export. The values will be 32-bit integers. |  | ||||||
| 
 |  | ||||||
| +aside-code("Example"). |  | ||||||
|     from spacy import attrs |  | ||||||
|     doc = nlp(text) |  | ||||||
|     # All strings mapped to integers, for easy export to numpy |  | ||||||
|     np_array = doc.to_array([attrs.LOWER, attrs.POS, |  | ||||||
|                              attrs.ENT_TYPE, attrs.IS_ALPHA]) |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code attr_ids] |  | ||||||
|         +cell ints |  | ||||||
|         +cell A list of attribute ID ints. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell #[code numpy.ndarray[ndim=2, dtype='int32']] |  | ||||||
|         +cell |  | ||||||
|             |  The exported attributes as a 2D numpy array, with one row per |  | ||||||
|             |  token and one column per attribute. |  | ||||||
| 
 |  | ||||||
| +h(2, "count_by") Doc.count_by |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p Count the frequencies of a given attribute. |  | ||||||
| 
 |  | ||||||
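| p |  | ||||||
|     |  For example, counting orth values. The keys are integer IDs, which |  | ||||||
|     |  can be looked up in #[code nlp.vocab.strings]. |  | ||||||
|  |  | ||||||
| +aside-code("Example"). |  | ||||||
|     from spacy.attrs import ORTH |  | ||||||
|     doc = nlp(u'apple apple orange banana') |  | ||||||
|     # e.g. {ID of 'apple': 2, ID of 'orange': 1, ID of 'banana': 1} |  | ||||||
|     counts = doc.count_by(ORTH) |  | ||||||
|  |  | ||||||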
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code attr_id] |  | ||||||
|         +cell int |  | ||||||
|         +cell The attribute ID to count. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell dict |  | ||||||
|         +cell A dictionary mapping attribute values to integer counts. |  | ||||||
| 
 |  | ||||||
| +h(2, "from_array") Doc.from_array |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p Load attributes from a numpy array. |  | ||||||
| 
 |  | ||||||
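| p |  | ||||||
|     |  A sketch of a round trip: export annotations with #[code to_array], |  | ||||||
|     |  then load them into a fresh #[code Doc]. |  | ||||||
|  |  | ||||||
| +aside-code("Example"). |  | ||||||
|     from spacy.tokens.doc import Doc |  | ||||||
|     from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA |  | ||||||
|     doc = nlp(text) |  | ||||||
|     np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) |  | ||||||
|     doc2 = Doc(doc.vocab, words=[t.text for t in doc]) |  | ||||||
|     doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array) |  | ||||||
|  |  | ||||||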
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code attr_ids] |  | ||||||
|         +cell ints |  | ||||||
|         +cell A list of attribute ID ints. |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code values] |  | ||||||
|         +cell #[code numpy.ndarray[ndim=2, dtype='int32']] |  | ||||||
|         +cell The attribute values to load. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell #[code None] |  | ||||||
|         +cell - |  | ||||||
| 
 |  | ||||||
| +h(2, "to_bytes") Doc.to_bytes |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p Export the document contents to a binary string. |  | ||||||
| 
 |  | ||||||
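| p For example: |  | ||||||
|  |  | ||||||
| +aside-code("Example"). |  | ||||||
|     doc = nlp(u'Give it back! He pleaded.') |  | ||||||
|     doc_bytes = doc.to_bytes() |  | ||||||
|  |  | ||||||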
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell bytes |  | ||||||
|         +cell |  | ||||||
|             |  A losslessly serialized copy of the #[code Doc] including all |  | ||||||
|             |  annotations. |  | ||||||
| 
 |  | ||||||
| +h(2, "from_bytes") Doc.from_bytes |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p Import the document contents from a binary string. |  | ||||||
| 
 |  | ||||||
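| p A sketch of a serialization round trip: |  | ||||||
|  |  | ||||||
| +aside-code("Example"). |  | ||||||
|     from spacy.tokens.doc import Doc |  | ||||||
|     data = nlp(u'Give it back! He pleaded.').to_bytes() |  | ||||||
|     doc = Doc(nlp.vocab).from_bytes(data) |  | ||||||
|     assert doc.text == u'Give it back! He pleaded.' |  | ||||||
|  |  | ||||||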
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code byte_string] |  | ||||||
|         +cell bytes |  | ||||||
|         +cell The string to load from. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell #[code Doc] |  | ||||||
|         +cell The #[code Doc] itself. |  | ||||||
| 
 |  | ||||||
| +h(2, "merge") Doc.merge |  | ||||||
|     +tag method |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  Retokenize the document, such that the span at |  | ||||||
|     |  #[code doc.text[start_idx : end_idx]] is merged into a single token. If |  | ||||||
|     |  #[code start_idx] and #[code end_idx] do not mark start and end token |  | ||||||
|     |  boundaries, the document remains unchanged. |  | ||||||
| 
 |  | ||||||
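| p |  | ||||||
|     |  A sketch, assuming the model tokenizes #[code u'Los Angeles'] into |  | ||||||
|     |  two tokens: |  | ||||||
|  |  | ||||||
| +aside-code("Example"). |  | ||||||
|     doc = nlp(u'Los Angeles started snowing.') |  | ||||||
|     # merge the character slice covering 'Los Angeles' |  | ||||||
|     doc.merge(0, len(u'Los Angeles'), ent_type=u'GPE') |  | ||||||
|     assert doc[0].text == u'Los Angeles' |  | ||||||
|  |  | ||||||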
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code start_idx] |  | ||||||
|         +cell int |  | ||||||
|         +cell The character index of the start of the slice to merge. |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code end_idx] |  | ||||||
|         +cell int |  | ||||||
|         +cell The character index after the end of the slice to merge. |  | ||||||
| 
 |  | ||||||
|     +row |  | ||||||
|         +cell #[code **attributes] |  | ||||||
|         +cell - |  | ||||||
|         +cell |  | ||||||
|             |  Attributes to assign to the merged token. By default, |  | ||||||
|             |  attributes are inherited from the syntactic root token of |  | ||||||
|             |  the span. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell #[code Token] |  | ||||||
|         +cell |  | ||||||
|             |  The newly merged token, or #[code None] if the start and end |  | ||||||
|             |  indices did not fall at token boundaries. |  | ||||||
| 
 |  | ||||||
| +h(2, "read_bytes") Doc.read_bytes |  | ||||||
|     +tag staticmethod |  | ||||||
| 
 |  | ||||||
| p A static method used to read serialized #[code Doc] objects from a file. |  | ||||||
| 
 |  | ||||||
| +aside-code("Example"). |  | ||||||
|     from spacy.tokens.doc import Doc |  | ||||||
|     loc = 'test_serialize.bin' |  | ||||||
|     with open(loc, 'wb') as file_: |  | ||||||
|         file_.write(nlp(u'This is a document.').to_bytes()) |  | ||||||
|         file_.write(nlp(u'This is another.').to_bytes()) |  | ||||||
|     docs = [] |  | ||||||
|     with open(loc, 'rb') as file_: |  | ||||||
|         for byte_string in Doc.read_bytes(file_): |  | ||||||
|             docs.append(Doc(nlp.vocab).from_bytes(byte_string)) |  | ||||||
|     assert len(docs) == 2 |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +row |  | ||||||
|         +cell #[code file] |  | ||||||
|         +cell buffer |  | ||||||
|         +cell A binary buffer to read the serialized annotations from. |  | ||||||
| 
 |  | ||||||
|     +footrow |  | ||||||
|         +cell yield |  | ||||||
|         +cell bytes |  | ||||||
|         +cell Binary strings from which documents can be loaded. |  | ||||||
| 
 |  | ||||||
| +h(2, "text") Doc.text |  | ||||||
|     +tag property |  | ||||||
| 
 |  | ||||||
| p A unicode representation of the document text. |  | ||||||
| 
 |  | ||||||
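| p For example: |  | ||||||
|  |  | ||||||
| +aside-code("Example"). |  | ||||||
|     doc = nlp(u'Give it back! He pleaded.') |  | ||||||
|     assert doc.text == u'Give it back! He pleaded.' |  | ||||||
|  |  | ||||||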
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell unicode |  | ||||||
|         +cell The original verbatim text of the document. |  | ||||||
| 
 |  | ||||||
| +h(2, "text_with_ws") Doc.text_with_ws |  | ||||||
|     +tag property |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  An alias of #[code Doc.text], provided for duck-type compatibility with |  | ||||||
|     |  #[code Span] and #[code Token]. |  | ||||||
| 
 |  | ||||||
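| p For a #[code Doc], this is the same value as #[code Doc.text]: |  | ||||||
|  |  | ||||||
| +aside-code("Example"). |  | ||||||
|     doc = nlp(u'Give it back!') |  | ||||||
|     assert doc.text_with_ws == doc.text |  | ||||||
|  |  | ||||||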
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell unicode |  | ||||||
|         +cell The original verbatim text of the document. |  | ||||||
| 
 |  | ||||||
| +h(2, "sents") Doc.sents |  | ||||||
|     +tag property |  | ||||||
| 
 |  | ||||||
| p Iterate over the sentences in the document. |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +footrow |  | ||||||
|         +cell yield |  | ||||||
|         +cell #[code Span] |  | ||||||
|         +cell Sentences in the document. |  | ||||||
| 
 |  | ||||||
| +h(2, "ents") Doc.ents |  | ||||||
|     +tag property |  | ||||||
| 
 |  | ||||||
| p Iterate over the entities in the document. |  | ||||||
| 
 |  | ||||||
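| p A sketch; the entities found depend on the model: |  | ||||||
|  |  | ||||||
| +aside-code("Example"). |  | ||||||
|     doc = nlp(u'Mr. Best flew to New York on Saturday morning.') |  | ||||||
|     ents = list(doc.ents) |  | ||||||
|     # each entity is a Span, e.g. ents[0].text and ents[0].label_ |  | ||||||
|  |  | ||||||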
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +footrow |  | ||||||
|         +cell yield |  | ||||||
|         +cell #[code Span] |  | ||||||
|         +cell Entities in the document. |  | ||||||
| 
 |  | ||||||
| +h(2, "noun_chunks") Doc.noun_chunks |  | ||||||
|     +tag property |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  Iterate over the base noun phrases in the document. A base noun phrase, |  | ||||||
|     |  or "NP chunk", is a noun phrase that does not permit other NPs to be |  | ||||||
|     |  nested within it. |  | ||||||
| 
 |  | ||||||
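| p A sketch, assuming a model with a syntactic parser: |  | ||||||
|  |  | ||||||
| +aside-code("Example"). |  | ||||||
|     doc = nlp(u'A phrase with another phrase occurs.') |  | ||||||
|     chunks = list(doc.noun_chunks) |  | ||||||
|     # e.g. chunks[0].text == u'A phrase' |  | ||||||
|  |  | ||||||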
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +footrow |  | ||||||
|         +cell yield |  | ||||||
|         +cell #[code Span] |  | ||||||
|         +cell Noun chunks in the document. |  | ||||||
| 
 |  | ||||||
| +h(2, "vector") Doc.vector |  | ||||||
|     +tag property |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  A real-valued meaning representation. Defaults to an average of the |  | ||||||
|     |  token vectors. |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell #[code numpy.ndarray[ndim=1, dtype='float32']] |  | ||||||
|         +cell A 1D numpy array representing the document's semantics. |  | ||||||
| 
 |  | ||||||
| +h(2, "has_vector") Doc.has_vector |  | ||||||
|     +tag property |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|     |  A boolean value indicating whether a word vector is associated with the |  | ||||||
|     |  object. |  | ||||||
| 
 |  | ||||||
| +table(["Name", "Type", "Description"]) |  | ||||||
|     +footrow |  | ||||||
|         +cell return |  | ||||||
|         +cell bool |  | ||||||
|         +cell Whether the document has vector data attached. |  | ||||||
|  |  | ||||||