mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 13:41:21 +03:00 
			
		
		
		
	* Work on reorganization of docs
This commit is contained in:
		
							parent
							
								
									63f86efa8b
								
							
						
					
					
						commit
						67979a8008
					
				|  | @ -54,11 +54,12 @@ and a small usage snippet. | ||||||
| .. toctree:: | .. toctree:: | ||||||
|     :maxdepth: 4 |     :maxdepth: 4 | ||||||
| 
 | 
 | ||||||
|     loading.rst |  | ||||||
|     processing.rst |     processing.rst | ||||||
|     using/document.rst |     using/document.rst | ||||||
|     using/span.rst |     using/span.rst | ||||||
|     using/token.rst |     using/token.rst | ||||||
|  |     using/lexeme.rst | ||||||
|  |     lookup.rst | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| .. _English: processing.html | .. _English: processing.html | ||||||
|  |  | ||||||
|  | @ -1,27 +1,6 @@ | ||||||
| ================= | ================= | ||||||
| Loading Resources | Loading Resources | ||||||
| ================= | ================= | ||||||
| 
 |  | ||||||
| 99\% of the time, you will load spaCy's resources using a language pipeline class, |  | ||||||
| e.g. `spacy.en.English`. The pipeline class reads the data from disk, from a |  | ||||||
| specified directory.  By default, spaCy installs data into each language's |  | ||||||
| package directory, and loads it from there. |  | ||||||
| 
 |  | ||||||
| Usually, this is all you will need: |  | ||||||
| 
 |  | ||||||
|     >>> from spacy.en import English |  | ||||||
|     >>> nlp = English() |  | ||||||
| 
 |  | ||||||
| If you need to replace some of the components, you may want to just make your |  | ||||||
| own pipeline class --- the English class itself does almost no work; it just |  | ||||||
| applies the modules in order. You can also provide a function or class that |  | ||||||
| produces a tokenizer, tagger, parser or entity recognizer to :code:`English.__init__`, |  | ||||||
| to customize the pipeline: |  | ||||||
| 
 |  | ||||||
|     >>> from spacy.en import English |  | ||||||
|     >>> from my_module import MyTagger |  | ||||||
|     >>> nlp = English(Tagger=MyTagger) |  | ||||||
| 
 |  | ||||||
| In more detail: | In more detail: | ||||||
| 
 | 
 | ||||||
| .. code:: | .. code:: | ||||||
|  | @ -44,12 +23,12 @@ In more detail: | ||||||
| 
 | 
 | ||||||
| :code:`Tokenizer` | :code:`Tokenizer` | ||||||
|   :code:`(Vocab vocab, unicode data_dir)(unicode) --> Doc` |   :code:`(Vocab vocab, unicode data_dir)(unicode) --> Doc` | ||||||
|    | 
 | ||||||
|   A class/function that creates the tokenizer. |   A class/function that creates the tokenizer. | ||||||
| 
 | 
 | ||||||
| :code:`Tagger` / :code:`Parser` / :code:`Entity` | :code:`Tagger` / :code:`Parser` / :code:`Entity` | ||||||
|   :code:`(Vocab vocab, unicode data_dir)(Doc) --> None` |   :code:`(Vocab vocab, unicode data_dir)(Doc) --> None` | ||||||
|    | 
 | ||||||
|   A class/function that creates the part-of-speech tagger / |   A class/function that creates the part-of-speech tagger / | ||||||
|   syntactic dependency parser / named entity recogniser. |   syntactic dependency parser / named entity recogniser. | ||||||
|   May be None or False, to disable tagging. |   May be None or False, to disable tagging. | ||||||
|  |  | ||||||
|  | @ -17,33 +17,95 @@ up in the vocabulary directly: | ||||||
| 
 | 
 | ||||||
| .. py:class:: vocab.Vocab(self, data_dir=None, lex_props_getter=None) | .. py:class:: vocab.Vocab(self, data_dir=None, lex_props_getter=None) | ||||||
| 
 | 
 | ||||||
|   .. py:method:: __len__(self) --> int |   .. py:method:: __len__(self) | ||||||
| 
 | 
 | ||||||
|   .. py:method:: __getitem__(self, id: int) --> unicode |     :returns: number of words in the vocabulary | ||||||
|  |     :rtype: int | ||||||
| 
 | 
 | ||||||
|   .. py:method:: __getitem__(self, string: unicode) --> int |   .. py:method:: __getitem__(self, key_int) | ||||||
| 
 | 
 | ||||||
|   .. py:method:: __setitem__(self, py_str: unicode, props: Dict[str, int[float]) --> None |     :param int key_int: | ||||||
|  |       Integer ID | ||||||
| 
 | 
 | ||||||
|   .. py:method:: dump(self, loc: unicode) --> None |     :returns: A Lexeme object | ||||||
| 
 | 
 | ||||||
|   .. py:method:: load_lexemes(self, loc: unicode) --> None |   .. py:method:: __getitem__(self, key_str) | ||||||
| 
 | 
 | ||||||
|   .. py:method:: load_vectors(self, loc: unicode) --> None |     :param unicode key_str: | ||||||
|  |       A string in the vocabulary | ||||||
|  | 
 | ||||||
|  |     :rtype: Lexeme | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |   .. py:method:: __setitem__(self, orth_str, props) | ||||||
|  | 
 | ||||||
|  |     :param unicode orth_str: | ||||||
|  |       The orth key | ||||||
|  | 
 | ||||||
|  |     :param dict props: | ||||||
|  |       A props dictionary | ||||||
|  | 
 | ||||||
|  |     :returns: None | ||||||
|  | 
 | ||||||
|  |   .. py:method:: dump(self, loc) | ||||||
|  | 
 | ||||||
|  |     :param unicode loc: | ||||||
|  |       Path where the vocabulary should be saved | ||||||
|  | 
 | ||||||
|  |   .. py:method:: load_lexemes(self, loc) | ||||||
|  | 
 | ||||||
|  |     :param unicode loc: | ||||||
|  |       Path to load the lexemes.bin file from | ||||||
|  | 
 | ||||||
|  |   .. py:method:: load_vectors(self, loc) | ||||||
|  | 
 | ||||||
|  |     :param unicode loc: | ||||||
|  |       Path to load the vectors.bin from | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| .. py:class:: strings.StringStore(self) | .. py:class:: strings.StringStore(self) | ||||||
| 
 | 
 | ||||||
|   .. py:method:: __len__(self) --> int |   .. py:method:: __len__(self) | ||||||
| 
 | 
 | ||||||
|   .. py:method:: __getitem__(self, id: int) --> unicode |     :returns: | ||||||
|  |       Number of strings in the string-store | ||||||
| 
 | 
 | ||||||
|   .. py:method:: __getitem__(self, string: bytes) --> id |   .. py:method:: __getitem__(self, key_int) | ||||||
| 
 | 
 | ||||||
|   .. py:method:: __getitem__(self, string: unicode) --> id |     :param int key_int: An integer key | ||||||
| 
 | 
 | ||||||
|   .. py:method:: dump(self, loc: unicode) --> None |     :returns: | ||||||
|  |       The string that the integer key maps to | ||||||
| 
 | 
 | ||||||
|   .. py:method:: load(self, loc: unicode) --> None |     :rtype: unicode | ||||||
| 
 | 
 | ||||||
|  |   .. py:method:: __getitem__(self, key_unicode) | ||||||
| 
 | 
 | ||||||
|  |     :param unicode key_unicode: | ||||||
|  |       A key, as a unicode string | ||||||
|  | 
 | ||||||
|  |     :returns: | ||||||
|  |       The integer ID of the string. | ||||||
|  | 
 | ||||||
|  |     :rtype: int | ||||||
|  | 
 | ||||||
|  |   .. py:method:: __getitem__(self, key_utf8_bytes) | ||||||
|  | 
 | ||||||
|  |     :param bytes key_utf8_bytes: | ||||||
|  |       A key, as a UTF-8 encoded byte-string | ||||||
|  | 
 | ||||||
|  |     :returns: | ||||||
|  |       The integer ID of the string. | ||||||
|  | 
 | ||||||
|  |     :rtype: | ||||||
|  |       int | ||||||
|  | 
 | ||||||
|  |   .. py:method:: dump(self, loc) | ||||||
|  | 
 | ||||||
|  |     :param loc: | ||||||
|  |       File path to save the strings.txt to. | ||||||
|  | 
 | ||||||
|  |   .. py:method:: load(self, loc) | ||||||
|  | 
 | ||||||
|  |     :param loc: | ||||||
|  |       File path to load the strings.txt from. | ||||||
|  |  | ||||||
|  | @ -1,33 +1,76 @@ | ||||||
| =============== | ================ | ||||||
| Processing Text | spacy.en.English | ||||||
| =============== | ================ | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | 99\% of the time, you will load spaCy's resources using a language pipeline class, | ||||||
|  | e.g. `spacy.en.English`. The pipeline class reads the data from disk, from a | ||||||
|  | specified directory.  By default, spaCy installs data into each language's | ||||||
|  | package directory, and loads it from there. | ||||||
|  | 
 | ||||||
|  | Usually, this is all you will need: | ||||||
|  | 
 | ||||||
|  |     >>> from spacy.en import English | ||||||
|  |     >>> nlp = English() | ||||||
|  | 
 | ||||||
|  | If you need to replace some of the components, you may want to just make your | ||||||
|  | own pipeline class --- the English class itself does almost no work; it just | ||||||
|  | applies the modules in order. You can also provide a function or class that | ||||||
|  | produces a tokenizer, tagger, parser or entity recognizer to :code:`English.__init__`, | ||||||
|  | to customize the pipeline: | ||||||
|  | 
 | ||||||
|  |     >>> from spacy.en import English | ||||||
|  |     >>> from my_module import MyTagger | ||||||
|  |     >>> nlp = English(Tagger=MyTagger) | ||||||
| 
 | 
 | ||||||
| The text processing API is very small and simple. Everything is a callable object, | The text processing API is very small and simple. Everything is a callable object, | ||||||
| and you will almost always apply the pipeline all at once. | and you will almost always apply the pipeline all at once. | ||||||
| 
 | 
 | ||||||
| Applying a pipeline |  | ||||||
| ------------------- |  | ||||||
| 
 | 
 | ||||||
|  | .. py:class:: spacy.en.English | ||||||
|  |    | ||||||
|  |   .. py:method:: __init__(self, data_dir=..., Tokenizer=..., Tagger=..., Parser=..., Entity=..., Matcher=..., Packer=None, load_vectors=True) | ||||||
| 
 | 
 | ||||||
| .. py:method:: English.__call__(text, tag=True, parse=True, entity=True) --> Doc |     :param unicode data_dir: | ||||||
|  |       The data directory.  May be None, to disable any data loading (including | ||||||
|  |       the vocabulary). | ||||||
| 
 | 
 | ||||||
|  |     :param Tokenizer: | ||||||
|  |       A class/function that creates the tokenizer. | ||||||
| 
 | 
 | ||||||
| text (unicode) |     :param Tagger: | ||||||
|   The text to be processed.  No pre-processing needs to be applied, and any |       A class/function that creates the part-of-speech tagger. | ||||||
|   length of text can be submitted.  Usually you will submit a whole document. |  | ||||||
|   Text may be zero-length. An exception is raised if byte strings are supplied. |  | ||||||
| 
 | 
 | ||||||
| tag (bool) |     :param Parser: | ||||||
|   Whether to apply the part-of-speech tagger. Required for parsing and entity recognition. |       A class/function that creates the dependency parser. | ||||||
| 
 | 
 | ||||||
| parse (bool) |     :param Entity: | ||||||
|   Whether to apply the syntactic dependency parser. |       A class/function that creates the named entity recogniser. | ||||||
| 
 | 
 | ||||||
| entity (bool) |     :param bool load_vectors: | ||||||
|   Whether to apply the named entity recognizer. |       A boolean value to control whether the word vectors are loaded. | ||||||
| 
 | 
 | ||||||
|  |   .. py:method:: __call__(text, tag=True, parse=True, entity=True) --> Doc | ||||||
| 
 | 
 | ||||||
| **Examples** |     :param unicode text: | ||||||
|  |       The text to be processed.  No pre-processing needs to be applied, and any | ||||||
|  |       length of text can be submitted.  Usually you will submit a whole document. | ||||||
|  |       Text may be zero-length. An exception is raised if byte strings are supplied. | ||||||
|  | 
 | ||||||
|  |     :param bool tag: | ||||||
|  |       Whether to apply the part-of-speech tagger. Required for parsing and entity | ||||||
|  |       recognition. | ||||||
|  | 
 | ||||||
|  |     :param bool parse: | ||||||
|  |       Whether to apply the syntactic dependency parser. | ||||||
|  | 
 | ||||||
|  |     :param bool entity: | ||||||
|  |       Whether to apply the named entity recognizer. | ||||||
|  | 
 | ||||||
|  |     :return: A document | ||||||
|  |     :rtype: :py:class:`spacy.tokens.Doc` | ||||||
|  | 
 | ||||||
|  |     :Example: | ||||||
| 
 | 
 | ||||||
|     >>> from spacy.en import English |     >>> from spacy.en import English | ||||||
|     >>> nlp = English() |     >>> nlp = English() | ||||||
|  | @ -44,24 +87,3 @@ entity (bool) | ||||||
|     TypeError: Argument 'string' has incorrect type (expected unicode, got str) |     TypeError: Argument 'string' has incorrect type (expected unicode, got str) | ||||||
|     >>> doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first. |     >>> doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first. | ||||||
|     >>> |     >>> | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| Tokenizer |  | ||||||
| --------- |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| .. autoclass:: spacy.tokenizer.Tokenizer |  | ||||||
|   :members: |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| Tagger |  | ||||||
| ------ |  | ||||||
| 
 |  | ||||||
| .. autoclass:: spacy.en.pos.EnPosTagger |  | ||||||
|   :members: |  | ||||||
| 
 |  | ||||||
| Parser and Entity Recognizer |  | ||||||
| ---------------------------- |  | ||||||
| 
 |  | ||||||
| .. autoclass:: spacy.syntax.parser.Parser |  | ||||||
|   :members: |  | ||||||
|  |  | ||||||
|  | @ -2,69 +2,93 @@ | ||||||
| The Doc Object | The Doc Object | ||||||
| ============== | ============== | ||||||
| 
 | 
 | ||||||
| .. autoclass:: spacy.tokens.Tokens |  | ||||||
| 
 | 
 | ||||||
| :code:`__getitem__`, :code:`__iter__`, :code:`__len__` | .. py:class:: spacy.tokens.doc.Doc | ||||||
|   The Tokens class behaves as a Python sequence, supporting the usual operators, |  | ||||||
|   len(), etc.  Negative indexing is supported. Slices are not yet. |  | ||||||
| 
 | 
 | ||||||
|   .. code:: |   .. py:method:: __init__(self, Vocab vocab, orths_and_spaces=None) | ||||||
| 
 | 
 | ||||||
|     >>> tokens = nlp(u'Zero one two three four five six') |     :param Vocab vocab: A vocabulary object. | ||||||
|     >>> tokens[0].orth_ |  | ||||||
|     u'Zero' |  | ||||||
|     >>> tokens[-1].orth_ |  | ||||||
|     u'six' |  | ||||||
|     >>> tokens[0:4] |  | ||||||
|     Error |  | ||||||
| 
 | 
 | ||||||
| :code:`sents` |     :param list orths_and_spaces: Defaults to None. | ||||||
|   Iterate over sentences in the document. |  | ||||||
| 
 | 
 | ||||||
| :code:`ents` |   .. py:method:: __getitem__(self, int i) | ||||||
|   Iterate over entities in the document. |      | ||||||
|  |     :returns: Token | ||||||
| 
 | 
 | ||||||
| :code:`to_array` |   .. py:method:: __getitem__(self, slice start_colon_end) | ||||||
|   Given a list of M attribute IDs, export the tokens to a numpy ndarray |  | ||||||
|   of shape N*M, where N is the length of the sentence. |  | ||||||
| 
 | 
 | ||||||
|     Arguments: |     :returns: Span | ||||||
|         attr_ids (list[int]): A list of attribute ID ints. |  | ||||||
| 
 | 
 | ||||||
|     Returns: |   .. py:method:: __iter__(self) | ||||||
|         feat_array (numpy.ndarray[long, ndim=2]): |  | ||||||
|         A feature matrix, with one row per word, and one column per attribute |  | ||||||
|         indicated in the input attr_ids. |  | ||||||
|   |  | ||||||
| :code:`count_by` |  | ||||||
|   Produce a dict of {attribute (int): count (ints)} frequencies, keyed |  | ||||||
|   by the values of the given attribute ID. |  | ||||||
| 
 | 
 | ||||||
|     >>> from spacy.en import English, attrs |     Iterate over tokens | ||||||
|     >>> nlp = English() |      | ||||||
|     >>> tokens = nlp(u'apple apple orange banana') |     .. code:: | ||||||
|     >>> tokens.count_by(attrs.ORTH) |  | ||||||
|     {12800L: 1, 11880L: 2, 7561L: 1} |  | ||||||
|     >>> tokens.to_array([attrs.ORTH]) |  | ||||||
|     array([[11880], |  | ||||||
|           [11880], |  | ||||||
|           [ 7561], |  | ||||||
|           [12800]]) |  | ||||||
| 
 | 
 | ||||||
| :code:`merge` |       >>> tokens = nlp(u'Zero one two three four five six') | ||||||
|   Merge a multi-word expression into a single token.  Currently |       >>> tokens[0].orth_ | ||||||
|   experimental; API is likely to change. |       u'Zero' | ||||||
|  |       >>> tokens[-1].orth_ | ||||||
|  |       u'six' | ||||||
| 
 | 
 | ||||||
|  |   .. py:method:: __len__(self) | ||||||
| 
 | 
 | ||||||
|  |     Number of tokens | ||||||
| 
 | 
 | ||||||
| Internals |   .. py:attribute:: sents | ||||||
|   A Tokens instance stores the annotations in a C-array of `TokenC` structs. |    | ||||||
|   Each TokenC struct holds a const pointer to a LexemeC struct, which describes |     Iterate over sentences in the document. | ||||||
|   a vocabulary item. |  | ||||||
| 
 | 
 | ||||||
|   The Token objects are built lazily, from this underlying C-data. |     :returns generator: Sentences | ||||||
| 
 | 
 | ||||||
|   For faster access, the underlying C data can be accessed from Cython.  You |   .. py:attribute:: ents | ||||||
|   can also export the data to a numpy array, via `Tokens.to_array`, if pure Python |      | ||||||
|   access is required, and you need slightly better performance.  However, this |     Iterate over named entities in the document. | ||||||
|   is both slower and has a worse API than Cython access. | 
 | ||||||
|  |     :returns tuple: Named Entities | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: noun_chunks | ||||||
|  | 
 | ||||||
|  |     :returns generator: | ||||||
|  | 
 | ||||||
|  |   .. py:method:: to_array(self, list attr_ids) | ||||||
|  | 
 | ||||||
|  |     Given a list of M attribute IDs, export the tokens to a numpy ndarray | ||||||
|  |     of shape N*M, where N is the length of the sentence. | ||||||
|  | 
 | ||||||
|  |     :param list[int] attr_ids: A list of attribute ID ints. | ||||||
|  | 
 | ||||||
|  |     :returns feat_array: | ||||||
|  |       A feature matrix, with one row per word, and one column per attribute | ||||||
|  |       indicated in the input attr_ids. | ||||||
|  | 
 | ||||||
|  |   .. py:method:: count_by(self, attr_id) | ||||||
|  | 
 | ||||||
|  |     Produce a dict of {attribute (int): count (ints)} frequencies, keyed | ||||||
|  |     by the values of the given attribute ID. | ||||||
|  | 
 | ||||||
|  |     .. code:: | ||||||
|  |      | ||||||
|  |       >>> from spacy.en import English, attrs | ||||||
|  |       >>> nlp = English() | ||||||
|  |       >>> tokens = nlp(u'apple apple orange banana') | ||||||
|  |       >>> tokens.count_by(attrs.ORTH) | ||||||
|  |       {12800L: 1, 11880L: 2, 7561L: 1} | ||||||
|  |       >>> tokens.to_array([attrs.ORTH]) | ||||||
|  |       array([[11880], | ||||||
|  |             [11880], | ||||||
|  |             [ 7561], | ||||||
|  |             [12800]]) | ||||||
|  | 
 | ||||||
|  |   .. py:method:: from_array(self, attrs, array) | ||||||
|  | 
 | ||||||
|  |   .. py:method:: to_bytes(self) | ||||||
|  | 
 | ||||||
|  |   .. py:method:: from_bytes(self) | ||||||
|  | 
 | ||||||
|  |   .. py:method:: read_bytes(self) | ||||||
|  | 
 | ||||||
|  |   .. py:method:: merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type) | ||||||
|  | 
 | ||||||
|  |     Merge a multi-word expression into a single token.  Currently | ||||||
|  |     experimental; API is likely to change. | ||||||
|  |  | ||||||
|  | @ -4,29 +4,55 @@ The Span Object | ||||||
| 
 | 
 | ||||||
| .. autoclass:: spacy.spans.Span | .. autoclass:: spacy.spans.Span | ||||||
| 
 | 
 | ||||||
| :code:`__getitem__`, :code:`__iter__`, :code:`__len__` | .. py:class:: Span | ||||||
|   Sequence API |  | ||||||
| 
 | 
 | ||||||
| :code:`head` |  | ||||||
|   Syntactic head, or None |  | ||||||
| 
 | 
 | ||||||
| :code:`left` |   .. py:method:: __getitem__ | ||||||
|   Tokens to the left of the span |  | ||||||
| 
 | 
 | ||||||
| :code:`rights` |   .. py:method:: __iter__ | ||||||
|   Tokens to the left of the span |  | ||||||
| 
 | 
 | ||||||
| :code:`orth` / :code:`orth_` |   .. py:method:: __len__ | ||||||
|   Orth string |  | ||||||
| 
 | 
 | ||||||
| :code:`lemma` / :code:`lemma_` |   .. py:attribute:: root | ||||||
|   Lemma string |  | ||||||
| 
 | 
 | ||||||
| :code:`string` |     Syntactic head | ||||||
|   String |  | ||||||
| 
 | 
 | ||||||
| :code:`label` / :code:`label_` |   .. py:attribute:: lefts | ||||||
|   Label |  | ||||||
| 
 | 
 | ||||||
| :code:`subtree` |     Tokens that are: | ||||||
|   Lefts + [self] + Rights | 
 | ||||||
|  |     1. To the left of the span; | ||||||
|  |     2. Syntactic children of words within the span | ||||||
|  | 
 | ||||||
|  |     i.e. | ||||||
|  | 
 | ||||||
|  |     .. code:: | ||||||
|  | 
 | ||||||
|  |       lefts = [span.doc[i] for i in range(0, span.start) if span.doc[i].head in span] | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: rights | ||||||
|  | 
 | ||||||
|  |     Tokens that are: | ||||||
|  | 
 | ||||||
|  |     1. To the right of the span; | ||||||
|  |     2. Syntactic children of words within the span | ||||||
|  | 
 | ||||||
|  |     i.e. | ||||||
|  | 
 | ||||||
|  |     .. code:: | ||||||
|  | 
 | ||||||
|  |       rights = [span.doc[i] for i in range(span.end, len(span.doc)) if span.doc[i].head in span] | ||||||
|  | 
 | ||||||
|  |     Tokens that are: | ||||||
|  | 
 | ||||||
|  |     1. To the right of the span; | ||||||
|  |     2. Syntactic children of words within the span | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: string | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: lemma / lemma\_ | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: label / label\_ | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: subtree | ||||||
|  |  | ||||||
|  | @ -11,115 +11,185 @@ token.orth is an integer ID, token.orth\_ is the unicode value. | ||||||
| The only exception is the Token.string attribute, which is (unicode) | The only exception is the Token.string attribute, which is (unicode) | ||||||
| string-typed. | string-typed. | ||||||
| 
 | 
 | ||||||
| **String Features** |  | ||||||
| 
 | 
 | ||||||
| :code:`orth` / :code:`orth_` | .. py:class:: Token | ||||||
|   The form of the word with no string normalization or processing, as it |  | ||||||
|   appears in the string, without trailing whitespace. |  | ||||||
| 
 | 
 | ||||||
| :code:`lemma` / :code:`lemma_` |   .. py:method:: __init__(self, Vocab vocab, Doc doc, int offset) | ||||||
|   The "base" of the word, with no inflectional suffixes, e.g. the lemma of |  | ||||||
|   "developing" is "develop", the lemma of "geese" is "goose", etc.  Note that |  | ||||||
|   *derivational* suffixes are not stripped, e.g. the lemma of "institutions" |  | ||||||
|   is "institution", not "institute".  Lemmatization is performed using the |  | ||||||
|   WordNet data, but extended to also cover closed-class words such as |  | ||||||
|   pronouns.  By default, the WN lemmatizer returns "hi" as the lemma of "his". |  | ||||||
|   We assign pronouns the lemma -PRON-. |  | ||||||
| 
 | 
 | ||||||
| :code:`lower` / :code:`lower_` |   **String Views** | ||||||
|   The form of the word, but forced to lower-case, i.e. lower = word.orth\_.lower() |  | ||||||
| 
 | 
 | ||||||
| :code:`norm` / :code:`norm_` |   .. py:attribute:: orth / orth\_ | ||||||
|   The form of the word, after language-specific normalizations have been |  | ||||||
|   applied. |  | ||||||
| 
 | 
 | ||||||
| :code:`shape` / :code:`shape_` |     The form of the word with no string normalization or processing, as it | ||||||
|   A transform of the word's string, to show orthographic features.  The |     appears in the string, without trailing whitespace. | ||||||
|   characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. |  | ||||||
|   After these mappings, sequences of 4 or more of the same character are |  | ||||||
|   truncated to length 4.  Examples: C3Po --> XdXx, favorite --> xxxx, |  | ||||||
|   :) --> :) |  | ||||||
| 
 | 
 | ||||||
| :code:`prefix` / :code:`prefix_` |   .. py:attribute:: lemma / lemma\_ | ||||||
|   A length-N substring from the start of the word.  Length may vary by |  | ||||||
|   language; currently for English n=1, i.e. prefix = word.orth\_[:1] |  | ||||||
| 
 | 
 | ||||||
| :code:`suffix` / :code:`suffix_` |     The "base" of the word, with no inflectional suffixes, e.g. the lemma of | ||||||
|   A length-N substring from the end of the word.  Length may vary by |     "developing" is "develop", the lemma of "geese" is "goose", etc.  Note that | ||||||
|   language; currently for English n=3, i.e. suffix = word.orth\_[-3:] |     *derivational* suffixes are not stripped, e.g. the lemma of "institutions" | ||||||
|  |     is "institution", not "institute".  Lemmatization is performed using the | ||||||
|  |     WordNet data, but extended to also cover closed-class words such as | ||||||
|  |     pronouns.  By default, the WN lemmatizer returns "hi" as the lemma of "his". | ||||||
|  |     We assign pronouns the lemma -PRON-. | ||||||
| 
 | 
 | ||||||
| :code:`string` |   .. py:attribute:: lower / lower\_ | ||||||
|   The form of the word as it appears in the string, **including trailing |  | ||||||
|   whitespace**.  This is useful when you need to use linguistic features to |  | ||||||
|   add inline mark-up to the string. |  | ||||||
| 
 | 
 | ||||||
|  |     The form of the word, but forced to lower-case, i.e. lower = word.orth\_.lower() | ||||||
| 
 | 
 | ||||||
| **Distributional Features** |   .. py:attribute:: norm / norm\_ | ||||||
| 
 | 
 | ||||||
| :code:`prob` |     The form of the word, after language-specific normalizations have been | ||||||
|   The unigram log-probability of the word, estimated from counts from a |     applied. | ||||||
|   large corpus, smoothed using Simple Good Turing estimation. |  | ||||||
| 
 | 
 | ||||||
| :code:`cluster` |   .. py:attribute:: shape / shape\_ | ||||||
|   The Brown cluster ID of the word.  These are often useful features for |  | ||||||
|   linear models.  If you're using a non-linear model, particularly |  | ||||||
|   a neural net or random forest, consider using the real-valued word |  | ||||||
|   representation vector, in Token.repvec, instead. |  | ||||||
| 
 | 
 | ||||||
| :code:`repvec` |     A transform of the word's string, to show orthographic features.  The | ||||||
|   A "word embedding" representation: a dense real-valued vector that supports |     characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. | ||||||
|   similarity queries between words.  By default, spaCy currently loads |     After these mappings, sequences of 4 or more of the same character are | ||||||
|   vectors produced by the Levy and Goldberg (2014) dependency-based word2vec |     truncated to length 4.  Examples: C3Po --> XdXx, favorite --> xxxx, | ||||||
|   model. |     :) --> :) | ||||||
| 
 | 
 | ||||||
| **Syntactic Features** |   .. py:attribute:: prefix / prefix\_ | ||||||
| 
 | 
 | ||||||
| :code:`tag` |     A length-N substring from the start of the word.  Length may vary by | ||||||
|   A morphosyntactic tag, e.g. NN, VBZ, DT, etc.  These tags are |     language; currently for English n=1, i.e. prefix = word.orth\_[:1] | ||||||
|   language/corpus specific, and typically describe part-of-speech and some |  | ||||||
|   amount of morphological information.  For instance, in the Penn Treebank |  | ||||||
|   tag set, VBZ is assigned to a present-tense singular verb. |  | ||||||
| 
 | 
 | ||||||
| :code:`pos` |   .. py:attribute:: suffix / suffix\_ | ||||||
|   A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB, |  | ||||||
|   ADV.  Constants for the 17 tag values are provided in spacy.parts\_of\_speech. |  | ||||||
| 
 | 
 | ||||||
| :code:`dep` |     A length-N substring from the end of the word.  Length may vary by | ||||||
|   The type of syntactic dependency relation between the word and its |     language; currently for English n=3, i.e. suffix = word.orth\_[-3:] | ||||||
|   syntactic head. |  | ||||||
| 
 | 
 | ||||||
| :code:`n_lefts` |   .. py:attribute:: lex_id | ||||||
|   The number of immediate syntactic children preceding the word in the |  | ||||||
|   string. |  | ||||||
| 
 | 
 | ||||||
| :code:`n_rights` |   **Alignment and Output** | ||||||
|   The number of immediate syntactic children following the word in the |  | ||||||
|   string. |  | ||||||
| 
 | 
 | ||||||
| **Navigating the Dependency Tree** |   .. py:attribute:: idx | ||||||
| 
 | 
 | ||||||
| :code:`head` |   .. py:method:: __len__(self) | ||||||
|   The Token that is the immediate syntactic head of the word.  If the word is |  | ||||||
|   the root of the dependency tree, the same word is returned. |  | ||||||
| 
 | 
 | ||||||
| :code:`lefts` |   .. py:method:: __unicode__(self) | ||||||
|   An iterator for the immediate leftward syntactic children of the word. |  | ||||||
| 
 | 
 | ||||||
| :code:`rights` |   .. py:method:: __str__(self) | ||||||
|   An iterator for the immediate rightward syntactic children of the word. |  | ||||||
| 
 | 
 | ||||||
| :code:`children` |   .. py:attribute:: string | ||||||
|   An iterator that yields from lefts, and then yields from rights. |  | ||||||
| 
 | 
 | ||||||
| :code:`subtree` |     The form of the word as it appears in the string, **including trailing | ||||||
|   An iterator for the part of the sentence syntactically governed by the |     whitespace**.  This is useful when you need to use linguistic features to | ||||||
|   word, including the word itself. |     add inline mark-up to the string. | ||||||
| 
 | 
 | ||||||
|  |   .. py:method:: nbor(self, int i=1) | ||||||
| 
 | 
 | ||||||
| **Named Entities** |   **Distributional Features** | ||||||
| 
 | 
 | ||||||
| :code:`ent_type` |   .. py:attribute:: repvec | ||||||
|   If the token is part of an entity, its entity type |  | ||||||
| 
 | 
 | ||||||
| :code:`ent_iob` |     A "word embedding" representation: a dense real-valued vector that supports | ||||||
|   The IOB (inside, outside, begin) entity recognition tag for the token |     similarity queries between words.  By default, spaCy currently loads | ||||||
|  |     vectors produced by the Levy and Goldberg (2014) dependency-based word2vec | ||||||
|  |     model. | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: cluster | ||||||
|  | 
 | ||||||
|  |     The Brown cluster ID of the word.  These are often useful features for | ||||||
|  |     linear models.  If you're using a non-linear model, particularly | ||||||
|  |     a neural net or random forest, consider using the real-valued word | ||||||
|  |     representation vector, in Token.repvec, instead. | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: prob | ||||||
|  | 
 | ||||||
|  |     The unigram log-probability of the word, estimated from counts from a | ||||||
|  |     large corpus, smoothed using Simple Good Turing estimation. | ||||||
|  | 
 | ||||||
|  |   **Navigating the Dependency Tree** | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: pos / pos\_ | ||||||
|  | 
 | ||||||
|  |     A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB, | ||||||
|  |     ADV.  Constants for the 17 tag values are provided in spacy.parts\_of\_speech. | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: tag / tag\_ | ||||||
|  | 
 | ||||||
|  |     A morphosyntactic tag, e.g. NN, VBZ, DT, etc.  These tags are | ||||||
|  |     language/corpus specific, and typically describe part-of-speech and some | ||||||
|  |     amount of morphological information.  For instance, in the Penn Treebank | ||||||
|  |     tag set, VBZ is assigned to a present-tense singular verb. | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: dep / dep\_ | ||||||
|  | 
 | ||||||
|  |     The type of syntactic dependency relation between the word and its | ||||||
|  |     syntactic head. | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: head | ||||||
|  | 
 | ||||||
|  |     The Token that is the immediate syntactic head of the word.  If the word is | ||||||
|  |     the root of the dependency tree, the same word is returned. | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: lefts | ||||||
|  | 
 | ||||||
|  |     An iterator for the immediate leftward syntactic children of the word. | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: rights | ||||||
|  | 
 | ||||||
|  |     An iterator for the immediate rightward syntactic children of the word. | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: n_lefts | ||||||
|  | 
 | ||||||
|  |     The number of immediate syntactic children preceding the word in the | ||||||
|  |     string. | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: n_rights | ||||||
|  | 
 | ||||||
|  |     The number of immediate syntactic children following the word in the | ||||||
|  |     string. | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: children | ||||||
|  | 
 | ||||||
|  |     An iterator that yields from lefts, and then yields from rights. | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: subtree | ||||||
|  | 
 | ||||||
|  |     An iterator for the part of the sentence syntactically governed by the | ||||||
|  |     word, including the word itself. | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: left_edge | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: right_edge | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: conjuncts | ||||||
|  | 
 | ||||||
|  |   **Named Entities** | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: ent_type | ||||||
|  | 
 | ||||||
|  |     If the token is part of an entity, its entity type | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: ent_iob | ||||||
|  | 
 | ||||||
|  |     The IOB (inside, outside, begin) entity recognition tag for the token | ||||||
|  | 
 | ||||||
|  |   **Lexeme Flags** | ||||||
|  | 
 | ||||||
|  |   .. py:method:: check_flag(self, attr_id_t flag_id) | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: is_oov | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: is_alpha | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: is_ascii | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: is_digit | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: is_lower | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: is_title | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: is_punct | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: is_space | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: like_url | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: like_num | ||||||
|  | 
 | ||||||
|  |   .. py:attribute:: like_email | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user