	Merge branch 'gaz' of https://github.com/honnibal/spaCy into gaz

commit 4f765eee79

docs/redesign/docs.jade (new file, 705 lines)
@@ -0,0 +1,705 @@
- var py_docs = '<a class="reference" href="http://docs.python.org/library/'

-
  var types = {
   'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
   'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
   'int': py_docs + 'functions.html#int"><em>int</em></a>',
   'bytes': "",
   'slice': "",
   'dict': "",
   'generator': "",
   'Token': "",
   'Vocab': "",
   'Span': "",
   'Doc': ""
  }


mixin declare_class(name)
  details
    summary
      span.declaration
        span.label class
        code #{name}
    block

mixin method(name, parameters)
  details(open=attributes.open)
    summary
      span.declaration
        span.label #{name}
        span.parameters
          | self, #{parameters}
    block


mixin params
  ul
    block


mixin param(name, type, value)
  li
    if type
      <strong>#{name}</strong> (!{type}) –
    else
      <strong>#{name}</strong> –
    block


mixin attribute(name, type, value)
  details(open=attributes.open)
    summary
      span.declaration
        span.label #{name}
    block


mixin returns(name, type, value)
  li
    if type
      <strong>#{name}</strong> (!{type}) –
    else
      <strong>#{name}</strong> –
    block

mixin init
  details
    summary: h4 Init

    block


mixin callable
  details
    summary: h4 Callable

    block


mixin sequence
  details
    summary: h4 Sequence

    block


mixin maptype
  details
    summary: h4 Map

    block


mixin summary
  block

mixin en_example
  pre.language-python
    code
      | from spacy.en import English
      | from spacy._doc_examples import download_war_and_peace
      |
      | unprocessed_unicode = download_war_and_peace()
      |
      | nlp = English()
      | doc = nlp(unprocessed_unicode)


doctype html
html(lang="en")
  head
    meta(charset="utf-8")
    title spaCy – Industrial-strength NLP
    meta(name="description" content="")
    meta(name="author" content="Matthew Honnibal")
    link(rel="stylesheet" href="css/style.css")
    <!--[if lt IE 9]>
    script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js")
    <![endif]-->

  body(id="docs")
    header(role="banner")
      h1.logo spaCy – Industrial-strength NLP
      div.slogan API


    nav(role="navigation")
      ul
        li: a(href="#") Home
        li.active: a(href="#") Docs
        li: a(href="#") License
        li: a(href="#") Blog

    main.docs#content

      article
        +declare_class("English")
          p Load models into a callable object to process English text.

          +summary
            +en_example

          +init
            p
              | Load the resources.  Loading takes 20 seconds, and the instance
              | consumes 2 to 3 gigabytes of memory.

            p
              | Intended use is for one instance to be created per process.
              | You can create more if you're doing something unusual.
            p
              | You may wish to make the instance a global variable or "singleton".
              | We usually instantiate the object in the <code>main()</code>
              | function and pass it around as an explicit argument.
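            p
              | A minimal sketch of that pattern (<code>count_nouns</code> is an
              | illustrative helper, not part of the library):
            pre.language-python: code
              | def count_nouns(nlp, text):
              |     doc = nlp(text)
              |     return sum(1 for t in doc if t.pos_ == u'NOUN')
              |
              | def main():
              |     nlp = English()  # build once per process
              |     n = count_nouns(nlp, u'The cat sat on the mat.')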
|  |             +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true") | ||||||
|  | 
 | ||||||
              +params
                +param("data_dir")
                  | The data directory.  May be <code>None</code>, to disable any data loading
                  | (including the vocabulary).

                +param("Tokenizer")
                  | A class/function that creates the tokenizer.

                +param("Tagger")
                  | A class/function that creates the part-of-speech tagger.

                +param("Parser")
                  | A class/function that creates the dependency parser.

                +param("Entity")
                  | A class/function that creates the named entity recogniser.

                +param("load_vectors")
                  | A boolean value to control whether the word vectors are loaded.

          +callable
            +method("__call__", "text, tag=True, parse=True, entity=True")

              +params
                +param("text", types.unicode)
                  | The text to be processed.  No pre-processing needs to be applied,
                  | and any length of text can be submitted.  Usually you will submit
                  | a whole document. Text may be zero-length. An exception is raised
                  | if byte strings are supplied.

                +param("tag", types.bool)
                  | Whether to apply the part-of-speech tagger. Required for parsing
                  | and entity recognition.

                +param("parse", types.bool)
                  | Whether to apply the syntactic dependency parser.

                +param("entity", types.bool)
                  | Whether to apply the named entity recognizer.

                pre.language-python
                  code
                    | from spacy.en import English
                    | nlp = English()
                    | doc = nlp(u'Some text.') # Applies tagger, parser, entity
                    | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
                    | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
                    | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
                    | doc = nlp(u'') # Zero-length tokens, not an error
                    | # doc = nlp(b'Some text') <-- Error: need unicode
                    | doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.


        +declare_class("Doc")
          p A container for accessing the annotations of a processed text, as a sequence of tokens.

          +init
            +method("__init__", "vocab")
              +params
                +param("vocab", types.Vocab)
                  | A vocabulary object

          +sequence
            +method("__getitem__", "i", types.int)
              +returns(types.Token)

            +method("__getitem__", "start_end", types.slice)
              +returns(types.Span)

            +method("__iter__")
              | Iterate over tokens

            +method("__len__")
              | Number of tokens in the document.

          details
            summary: h4 Spans

            +attribute("sents", types.generator)
              | Iterate over sentences in the document.

            +attribute("ents", types.generator)
              | Iterate over named entities in the document.

            +attribute("noun_chunks", types.generator)
              | Iterate over base noun phrases in the document.

          details
            summary: h4 Export/Import

            +method("to_array", "attr_ids")

              | Given a list of M attribute IDs, export the tokens to a numpy ndarray
              | of shape N*M, where N is the length of the document.

              +params
                +param("attr_ids", "list[int]")
                  | A list of attribute ID ints.

              +returns("feat_array")
                | A feature matrix, with one row per word, and one column per attribute
                | indicated in the input attr_ids.

            +method("count_by", "attr_id")
              | Produce a dict of {attribute (int): count (int)} frequencies, keyed
              | by the values of the given attribute ID.

              pre.language-python
                code
                  | >>> from spacy.en import English, attrs
                  | >>> nlp = English()
                  | >>> tokens = nlp(u'apple apple orange banana')
                  | >>> tokens.count_by(attrs.ORTH)
                  | {12800L: 1, 11880L: 2, 7561L: 1}
                  | >>> tokens.to_array([attrs.ORTH])
                  | array([[11880],
                  |         [11880],
                  |         [7561],
                  |         [12800]])

            +method("from_array", "attrs, array")
              | Load from array

            +method("from_bytes")
              | Deserialize, loading from bytes

            +method("read_bytes")
              | classmethod

            //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")

            //  | Merge a multi-word expression into a single token.  Currently
            //  | experimental; API is likely to change.


|  |         +declare_class("Token") | ||||||
|  |           +init | ||||||
|  |             +method("__init__", "vocab, doc, offset") | ||||||
|  |               +params | ||||||
|  |                 +param("vocab", types.Vocab) | ||||||
|  |                   p A Vocab object | ||||||
|  |    | ||||||
|  |                 +param("doc", types.Doc) | ||||||
|  |                   p The parent sequence | ||||||
|  |    | ||||||
|  |               +param("offset", types.int) | ||||||
|  |                 p The index of the token within the document | ||||||
|  |    | ||||||
|  |           details | ||||||
|  |             summary: h4 String Views | ||||||
|  |    | ||||||
|  |             +attribute("orth / orth_") | ||||||
|  |               | The form of the word with no string normalization or processing, as | ||||||
|  |               | it appears in the string, without trailing whitespace. | ||||||
|  |    | ||||||
|  |             +attribute("lemma / lemma_") | ||||||
|  |               | The "base" of the word, with no inflectional suffixes, e.g. the lemma of | ||||||
|  |               | "developing" is "develop", the lemma of "geese" is "goose", etc.  Note that | ||||||
|  |               | <em>derivational</em> suffixes are not stripped, e.g. the lemma of | ||||||
|  |               | "instutitions" is "institution", not "institute".  Lemmatization is | ||||||
|  |               | performed using the WordNet data, but extended to also cover closed-class | ||||||
|  |               | words such as pronouns.  By default, the WN lemmatizer returns "hi" | ||||||
|  |               | as the lemma of "his". We assign pronouns the lemma -PRON-. | ||||||
|  |    | ||||||
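              p
                | A quick sketch of the behaviour described above:
              pre.language-python: code
                | doc = nlp(u'The geese are developing')
                | assert doc[1].lemma_ == u'goose'
                | assert doc[3].lemma_ == u'develop'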
|  |             +attribute("lower / lower_") | ||||||
|  |               | The form of the word, but forced to lower-case, i.e. | ||||||
|  |               pre.language-python: code lower = word.orth\_.lower() | ||||||
|  |    | ||||||
|  |             //+attribute("norm / norm_") | ||||||
|  |             //  | The form of the word, after language-specific normalizations has been | ||||||
|  |             //  | applied. | ||||||
|  |    | ||||||
|  |             +attribute("shape / shape_") | ||||||
|  |               | A transform of the word's string, to show orthographic features. | ||||||
|  |               | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped | ||||||
|  |               | to d. After these mappings, sequences of 4 or more of the same character | ||||||
|  |               | are truncated to length 4.  Examples: C3Po --> XdXx, favorite --> xxxx, | ||||||
|  |               | :) --> :) | ||||||
|  |    | ||||||
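              p
                | For instance, a sketch using the mapping above:
              pre.language-python: code
                | doc = nlp(u'C3Po is favorite')
                | assert doc[0].shape_ == u'XdXx'
                | assert doc[2].shape_ == u'xxxx'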
|  |             +attribute("prefix / prefix_") | ||||||
|  |               | A length-N substring from the start of the word.  Length may vary by | ||||||
|  |               | language; currently for English n=1, i.e. | ||||||
|  |               pre.language-python: code prefix = word.orth\_[:1] | ||||||
|  |    | ||||||
|  |             +attribute("suffix / suffix_") | ||||||
|  |               | A length-N substring from the end of the word.  Length may vary by | ||||||
|  |               | language; currently for English n=3, i.e. | ||||||
|  |               pre.language-python: code suffix = word.orth\_[-3:] | ||||||
|  |    | ||||||
|  |             //+attribute("lex_id") | ||||||
|  |             //  | lex_id | ||||||
|  |    | ||||||

          details
            summary: h4 Alignment and Output

            +attribute("idx")
              p Start index of the token in the string

            +method("__len__", "")
              p Length of the token's orth string, in unicode code-points.

            +method("__unicode__", "")
              p Same as token.orth_

            +method("__str__", "")
              p Varies between Python 2 and Python 3

            +attribute("string")
              p
                | The form of the word as it appears in the string, <strong>including
                | trailing whitespace</strong>.  This is useful when you need to use
                | linguistic features to add inline mark-up to the string.
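            p
              | A minimal sketch of that use: because each token keeps its trailing
              | whitespace, joining <code>token.string</code> reproduces the input exactly.
            pre.language-python: code
              | text = u'Hello, world.'
              | doc = nlp(text)
              | assert u''.join(token.string for token in doc) == text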
|  |             +method("nbor, i=1") | ||||||
|  |               +params | ||||||
|  |                 +param("i") | ||||||
|  |                   p Offset relative to token | ||||||
|  |      | ||||||
          details
            summary: h4 Distributional Features

            +attribute("repvec")
              p
                | A "word embedding" representation: a dense real-valued vector that supports
                | similarity queries between words.  By default, spaCy currently loads
                | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
                | model.

            +attribute("cluster")
              p
                | The Brown cluster ID of the word.  These are often useful features for
                | linear models.  If you're using a non-linear model, particularly a
                | neural net or random forest, consider using the real-valued word
                | representation vector, in Token.repvec, instead.

            +attribute("prob")
              p
                | The unigram log-probability of the word, estimated from counts from a
                | large corpus, smoothed using Simple Good Turing estimation.
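            p
              | For instance, a sketch comparing a common and a rare word
              | (log-probabilities are negative; higher means more frequent):
            pre.language-python: code
              | doc = nlp(u'the platypus')
              | assert doc[0].prob > doc[1].prob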
          details
            summary: h4 Syntactic Tags

            +attribute("pos / pos_")
              p
                | A part-of-speech tag, from the Google Universal Tag Set, e.g.
                | <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>.  Constants for
                | the 17 tag values are provided in <code>spacy.parts_of_speech</code>.

            +attribute("tag / tag_")
              p
                | A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
                | <code>DT</code>, etc.  These tags are language/corpus specific, and
                | typically describe part-of-speech and some amount of morphological
                | information.  For instance, in the Penn Treebank tag set, <code>VBZ</code>
                | is assigned to a present-tense singular verb.
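            p
              | A sketch of the distinction (the exact tags depend on the model):
            pre.language-python: code
              | token = nlp(u'She sings')[1]
              | assert token.pos_ == u'VERB'  # coarse-grained, universal tag
              | assert token.tag_ == u'VBZ'   # fine-grained, Penn Treebank tag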
|  |             +attribute("dep / dep_") | ||||||
|  |               p | ||||||
|  |                 | The type of syntactic dependency relation between the word and its | ||||||
|  |                 | syntactic head. | ||||||
|  |      | ||||||
|  |           details | ||||||
|  |             summary: h4 Navigating the Parse Tree | ||||||
|  |            | ||||||
|  |             +attribute("head") | ||||||
|  |               p | ||||||
|  |                 | The Token that is the immediate syntactic head of the word.  If the | ||||||
|  |                 | word is the root of the dependency tree, the same word is returned. | ||||||
|  |      | ||||||
|  |             +attribute("lefts") | ||||||
|  |               p | ||||||
|  |                 | An iterator for the immediate leftward syntactic children of the | ||||||
|  |                 | word. | ||||||
|  |      | ||||||
|  |             +attribute("rights") | ||||||
|  |               p | ||||||
|  |                 | An iterator for the immediate rightward syntactic children of the | ||||||
|  |                 | word. | ||||||
|  |      | ||||||
|  |             +attribute("n_lefts") | ||||||
|  |               p | ||||||
|  |                 | The number of immediate syntactic children preceding the word in  | ||||||
|  |                 | the string. | ||||||
|  |      | ||||||
|  |             +attribute("n_rights") | ||||||
|  |               p | ||||||
|  |                 | The number of immediate syntactic children following the word in | ||||||
|  |                 | the string. | ||||||
|  |      | ||||||
|  |             +attribute("children") | ||||||
|  |               p | ||||||
|  |                 | An iterator that yields from lefts, and then yields from rights. | ||||||
|  |      | ||||||
|  |             +attribute("subtree") | ||||||
|  |               p | ||||||
|  |                 | An iterator for the part of the sentence syntactically governed by | ||||||
|  |                 | the word, including the word itself. | ||||||
|  |      | ||||||
|  |             +attribute("left_edge") | ||||||
|  |               p The leftmost edge of the token's subtree | ||||||
|  |      | ||||||
|  |             +attribute("right_edge") | ||||||
|  |               p The rightmost edge of the token's subtree | ||||||
|  |      | ||||||
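            p
              | A sketch of walking the tree with these attributes, using the
              | same sentence as the usage examples:
            pre.language-python: code
              | doc = nlp(u'The four wheels on the bus turned quickly.')
              | wheels = doc[2]
              | assert wheels.head.orth_ == u'turned'
              | assert [t.orth_ for t in wheels.lefts] == [u'The', u'four']
              | assert len(list(wheels.subtree)) == 6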
          details
            summary: h4 Named Entities

            +attribute("ent_type")
              p If the token is part of an entity, its entity type.

            +attribute("ent_iob")
              p The IOB (inside, outside, begin) entity recognition tag for the token.

          details
            summary: h4 Lexeme Flags

            +method("check_flag", "flag_id")
              +params
                +param("flag_id")
                  | The ID of the boolean flag to check.
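            p
              | A sketch of how a flag relates to the boolean attributes below
              | (this assumes <code>IS_ALPHA</code> is exposed in
              | <code>spacy.en.attrs</code>, as <code>IS_OOV</code> is in the
              | usage examples):
            pre.language-python: code
              | from spacy.en.attrs import IS_ALPHA  # assumed constant
              | token = nlp(u'apple')[0]
              | assert token.check_flag(IS_ALPHA) == token.is_alpha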
|  |             +attribute("is_oov") | ||||||
|  |             +attribute("is_alpha") | ||||||
|  |             +attribute("is_ascii") | ||||||
|  |             +attribute("is_digit") | ||||||
|  |             +attribute("is_lower") | ||||||
|  |             +attribute("is_title") | ||||||
|  |             +attribute("is_punct") | ||||||
|  |             +attribute("is_space") | ||||||
|  |             +attribute("like_url") | ||||||
|  |             +attribute("like_num") | ||||||
|  |             +attribute("like_email") | ||||||
|  |      | ||||||
|  |             //+attribute("conjuncts") | ||||||
|  |             //  | Conjuncts | ||||||
|  |      | ||||||
|  |         +declare_class("Span") | ||||||
|  |           +init | ||||||
|  |             +method("__init__") | ||||||
|  |               Temp | ||||||
|  |    | ||||||
|  |             <code>span = doc[0:4]</code> | ||||||
|  |    | ||||||
|  |           +sequence | ||||||
|  |             +method("__getitem__") | ||||||
|  |               p Get item | ||||||
|  |    | ||||||
|  |             +method("__iter__") | ||||||
|  |               p Iter | ||||||
|  |                  | ||||||
|  |             +method("__len__") | ||||||
|  |               p Len | ||||||
|  |    | ||||||
|  |           details | ||||||
|  |             summary: h4 Parse | ||||||
|  |    | ||||||
|  |             +attribute("root") | ||||||
|  |               p Syntactic head | ||||||
|  |    | ||||||
|  |             +attribute("lefts") | ||||||
|  |               p Tokens that are: | ||||||
|  |               ol | ||||||
|  |                 li To the left of the span; | ||||||
|  |                 li Syntactic children of words within the span | ||||||
|  |    | ||||||
|  |               p i.e. | ||||||
|  |    | ||||||
|  |               pre.language-python | ||||||
|  |                 code | ||||||
|  |                   | lefts = [span.doc[i] for i in range(0, span.start) | ||||||
|  |                   |          if span.doc[i].head in span] | ||||||
|  |    | ||||||
|  |             +attribute("rights") | ||||||
|  |               p Tokens that are: | ||||||
|  |                 ol  | ||||||
|  |                   li To the right of the span; | ||||||
|  |                   li Syntactic children of words within the span | ||||||
|  |               p i.e. | ||||||
|  |               pre.language-python | ||||||
|  |                 code | ||||||
|  |                   | rights = [span.doc[i] for i in range(span.end, len(span.doc)) | ||||||
|  |                   |           if span.doc[i].head in span] | ||||||
|  |    | ||||||
|  |    | ||||||
|  |             +attribute("subtree") | ||||||
|  |               p String | ||||||
|  |    | ||||||
|  |           details | ||||||
|  |             summary: h4 String Views | ||||||
|  |    | ||||||
|  |             +attribute("string") | ||||||
|  |               p String | ||||||
|  |      | ||||||
|  |             +attribute("lemma / lemma_") | ||||||
|  |               p String | ||||||
|  |    | ||||||
|  |             +attribute("label / label_") | ||||||
|  |               p String | ||||||
|  |    | ||||||
|  |         +declare_class("Lexeme") | ||||||
|  |           p | ||||||
|  |             | The Lexeme object represents a lexical type, stored in the vocabulary | ||||||
|  |             | – as opposed to a token, occurring in a document. | ||||||
|  |           p | ||||||
|  |             | Lexemes store various features, so that these features can be computed | ||||||
|  |             | once per type, rather than once per token. As job sizes grow, this | ||||||
|  |             | can amount to a substantial efficiency improvement. | ||||||
|  |    | ||||||
|  |           p | ||||||
|  |             | All Lexeme attributes are therefore context independent, as a single | ||||||
|  |             | lexeme is reused for all usages of that word. Lexemes are keyed by | ||||||
|  |             | the “orth” attribute. | ||||||
|  |    | ||||||
|  |           p | ||||||
|  |             All Lexeme attributes are accessible directly on the Token object. | ||||||
|  |    | ||||||
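          p
            | A short sketch of the type/occurrence distinction:
          pre.language-python: code
            | doc = nlp(u'An apple a day. An apple a week.')
            | # Two Token occurrences of "apple", one Lexeme in the vocabulary
            | apple_lexeme = nlp.vocab[u'apple']
            | assert doc[1].orth == apple_lexeme.orth
            | assert doc[6].orth == apple_lexeme.orth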
          +init
            +method("__init__")
              p Init

            details
              summary: h4 String Features

                +attribute("orth / orth_")
                  p
                    | The form of the word with no string normalization or processing,
                    | as it appears in the string, without trailing whitespace.

                +attribute("lower / lower_")
                  p The lower-cased form of the word.

                +attribute("norm / norm_")
                  p The form of the word, after language-specific normalizations have been applied.

                +attribute("shape / shape_")
                  p The orthographic transform of the word's string, as described for Token above.

                +attribute("prefix / prefix_")
                  p A length-N substring from the start of the word (currently n=1 for English).

                +attribute("suffix / suffix_")
                  p A length-N substring from the end of the word (currently n=3 for English).

        +declare_class("Vocab", "data_dir=None, lex_props_getter=None")
          +sequence
            +method("__len__")
              +returns
                p Number of words in the vocabulary.

            +method("__iter__")
              +returns
                p Lexeme

          +maptype
            +method("__getitem__", "key_int")
              +params
                +param("key_int")
                  p Integer ID

              +returns: p A Lexeme object

            +method("__getitem__", "key_str")
              +params
                +param("key_str", types.unicode)
                  p A string in the vocabulary

              +returns("Lexeme")

            +method("__setitem__", "orth_str, props")
              +params
                +param("orth_str", types.unicode)
                  p The orth key

                +param("props", types.dict)
                  p A props dictionary

              +returns("None")
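          p
            | A sketch of looking the same word up by string and by integer ID:
          pre.language-python: code
            | lexeme = nlp.vocab[u'apple']   # unicode key
            | same = nlp.vocab[lexeme.orth]  # integer key
            | assert lexeme.orth_ == same.orth_ == u'apple'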
          details
            summary: h4 Import/Export

            +method("dump", "loc")
              +params
                +param("loc", types.unicode)
                  p Path where the vocabulary should be saved

            +method("load_lexemes", "loc")
              +params
                +param("loc", types.unicode)
                  p Path to load the lexemes.bin file from

            +method("load_vectors", "loc")
              +params
                +param("loc", types.unicode)
                  p Path to load the vectors.bin from
|  |         +declare_class("StringStore") | ||||||
|  |           +init | ||||||
|  |             Tmp | ||||||
|  |    | ||||||
|  |           +sequence | ||||||
|  |             +method("__len__") | ||||||
|  |               +returns("int") | ||||||
|  |                 p Number of strings in the string-store | ||||||
|  |    | ||||||
|  |             +method("__iter__") | ||||||
|  |               +returns | ||||||
|  |                 p Lexeme | ||||||
|  |    | ||||||
|  |           +maptype | ||||||
|  |             +method("__getitem__", "key_int") | ||||||
|  |               +params | ||||||
|  |                 +param("key_int") | ||||||
|  |                   p An integer key | ||||||
|  |      | ||||||
|  |               +returns(types.unicode) | ||||||
|  |                 p The string that the integer key maps to | ||||||
|  |      | ||||||
|  |             +method("__getitem__", "key_unicode") | ||||||
|  |               +params | ||||||
|  |                 +param("key_unicode") | ||||||
|  |                   p A key, as a unicode string | ||||||
|  |      | ||||||
|  |               +returns(types.int) | ||||||
|  |                 p The integer ID of the string. | ||||||
|  |      | ||||||
|  |             +method("__getitem__", "key_utf8_bytes") | ||||||
|  |               +params | ||||||
|  |                 +param("key_utf8_bytes", types.bytes) | ||||||
|  |                   p p A key, as a UTF-8 encoded byte-string | ||||||
|  |      | ||||||
|  |               +returns(types.int) | ||||||
|  |                 p The integer ID of the string. | ||||||
|  |    | ||||||
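          p
            | A sketch of the round-trip between strings and integer IDs,
            | via the store at <code>nlp.vocab.strings</code>:
          pre.language-python: code
            | hello_id = nlp.vocab.strings[u'Hello']
            | assert nlp.vocab.strings[hello_id] == u'Hello'
            | assert nlp.vocab.strings[b'Hello'] == hello_id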
          details
            summary: h4 Import/Export

            +method("dump", "loc")
              +params
                +param("loc")
                  p File path to save the strings.txt to.

            +method("load")
              +params
                +param("loc")
                  p File path to load the strings.txt from.

    script(src="js/prism.js")

docs/redesign/home.jade (new file, 106 lines)

@@ -0,0 +1,106 @@
extends ./outline.jade

// Notes
//
// 1. Where to put version notice? Should say something like
//   2015-08-12: v0.89
//   and be a link
//
//   Only needs to appear on home page.


- var slogan = "Build Tomorrow's Language Technologies"
- var tag_line = "spaCy – " + slogan

mixin lede
  - var state_of_the_art = '<a href="#">state-of-the-art</a>'
  - var a_minor_miracle = '<a href="">a minor miracle</a>'
  - var great_documentation = '<a href="">great documentation</a>'

  p.
    <a href="https://github.com/honnibal/spaCy"><strong>spaCy</strong></a> is a
    library for industrial-strength NLP in Python and Cython.  It features
    !{state_of_the_art} speed and accuracy, a concise API, and !{great_documentation}.
    If you're a small company doing NLP, we want <strong>spaCy</strong> to seem
    like !{a_minor_miracle}.

mixin overview()
  p.
    Overview text

mixin benchmarks()
  p.
    Benchmarks

mixin get_started()
  p.
    Get Started


mixin comparison(name)
  details
    summary
      h4= name

    block

mixin columns(...names)
  tr
    each name in names
      th= name


mixin row(...cells)
  tr
    each cell in cells
      td= cell


mixin social
  footer(role="contentinfo")
    a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter

    div.discuss
      a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn")
        | Discuss on Hacker News

      a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit")
        | Discuss on Reddit


mixin Section(title_text, link_name, include_file)
  a(name=link_name): h3 #{title_text}

  if (link_name == "example-use")
    include ./usage_examples.jade
  else if (link_name == "online-demo")
    include ./online_demo.jade
  else if (link_name == "comparisons")
    include ./comparisons.jade
  else if (link_name == "install")
    include ./installation.jade


block intro_block
  section(class="intro")
    +lede

    nav(role="navigation")
      ul
        li: a(href="#example-use" class="button") Examples
        li: a(href="#online-demo" class="button") Demo
        li: a(href="#comparisons" class="button") Comparisons
        li: a(href="#install" class="button") Install v0.89


block body_block
  article(class="page landing-page")

    +Section("Usage by Example", "example-use", "./usage_examples.jade")

    +Section("Online Demo", "online-demo", "./online_demo.jade")

    +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade")

    +Section("Install", "install", "./installation.jade")


docs/redesign/installation.jade (new file, 40 lines)

@@ -0,0 +1,40 @@
p With Python 2.7 or Python 3, using Linux or OSX, run:

pre.language-bash: code
  | $ pip install spacy
  | $ python -m spacy.en.download

p
  | The download command fetches and installs about 300 MB of data, for
  | the parser model and word vectors, which it installs within the spacy.en
  | package directory.

p
  | If you're stuck using a server with an old version of Python, and you
  | don't have root access, I've prepared a bootstrap script to help you
  | compile a local Python install.  Run:

pre.language-bash: code
  | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate

p
  | The other way to install the package is to clone the github repository,
  | and build it from source.  This installs an additional dependency,
  | Cython.  If you're using Python 2, I also recommend installing fabric
  | and fabtools – this is how I build the project.

pre.language-bash: code
  | $ git clone https://github.com/honnibal/spaCy.git
  | $ cd spaCy
  | $ virtualenv .env && source .env/bin/activate
  | $ export PYTHONPATH=`pwd`
  | $ pip install -r requirements.txt
  | $ python setup.py build_ext --inplace
  | $ python -m spacy.en.download
  | $ pip install pytest
  | $ py.test tests/

p
  | Python packaging is awkward at the best of times, and it's particularly tricky
  | with C extensions, built via Cython, requiring large data files.  So,
  | please report issues as you encounter them.

docs/redesign/online_demo.jade (new file, 0 lines, empty)

docs/redesign/outline.jade (new file, 37 lines)

@@ -0,0 +1,37 @@
- var slogan = "Build Tomorrow's Language Technologies"
- var tag_line = "spaCy – " + slogan


doctype html
html(lang="en")
  head
    meta(charset="utf-8")
    title!= tag_line
    meta(name="description" content="")
    meta(name="author" content="Matthew Honnibal")
    link(rel="stylesheet" href="css/style.css")
    <!--[if lt IE 9]>
    script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js")
    <![endif]-->

  body(id="home" role="document")
    header(role="banner")
      h1(class="logo")!= tag_line
      div(class="slogan")!= slogan

    nav(role="navigation")
      ul
        li: a(href="#") Home
        li: a(href="#") Docs
        li: a(href="#") License
        li: a(href="#") More

    main(id="content" role="main")
      block intro_block

      block body_block

  footer(role="contentinfo")

  script(src="js/prism.js")
  script(src="js/details_polyfill.js")

docs/redesign/usage_examples.jade (new file, 109 lines)

@@ -0,0 +1,109 @@
mixin example(name)
  details
    summary
      h4= name
    block


+example("Load resources and process text")
  pre.language-python: code
    | from __future__ import unicode_literals, print_function
    | from spacy.en import English
    | nlp = English()
    | doc = nlp('Hello, world. Here are two sentences.')

|  | +example("Get tokens and sentences") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | token = doc[0] | ||||||
|  |     | sentence = doc.sents[0] | ||||||
|  |     | assert token[0] is sentence[0] | ||||||
|  | 
 | ||||||
|  | +example("Use integer IDs for any string") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | hello_id = nlp.vocab.strings['Hello'] | ||||||
|  |     | hello_str = nlp.vocab.strings[hello_id] | ||||||
|  |     |  | ||||||
|  |     | assert token.orth == hello_id == 52 | ||||||
|  |     | assert token.orth_ == hello_str == 'Hello' | ||||||
|  | 
 | ||||||
|  | +example("Get and set string views and flags") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | assert token.shape_ == 'Xxxx' | ||||||
|  |     | for lexeme in nlp.vocab: | ||||||
|  |     |     if lexeme.is_alpha: | ||||||
|  |     |         lexeme.shape_ = 'W' | ||||||
|  |     |     elif lexeme.is_digit: | ||||||
|  |     |         lexeme.shape_ = 'D' | ||||||
|  |     |     elif lexeme.is_punct: | ||||||
|  |     |         lexeme.shape_ = 'P' | ||||||
|  |     |     else: | ||||||
|  |     |         lexeme.shape_ = 'M' | ||||||
|  |     | assert token.shape_ == 'W' | ||||||
|  | 
 | ||||||
|  | +example("Export to numpy arrays") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV | ||||||
|  |     |  | ||||||
|  |     | attr_ids = [ORTH, LIKE_URL, IS_OOV] | ||||||
|  |     | doc_array = doc.to_array(attr_ids) | ||||||
|  |     | assert doc_array.shape == (len(doc), len(attrs) | ||||||
|  |     | assert doc[0].orth == doc_array[0, 0] | ||||||
|  |     | assert doc[1].orth == doc_array[1, 0] | ||||||
|  |     | assert doc[0].like_url == doc_array[0, 1] | ||||||
|  |     | assert doc_array[, 1] == [t.like_url for t in doc] | ||||||
|  | 
 | ||||||
|  | +example("Word vectors") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") | ||||||
|  |     |  | ||||||
|  |     | apples = doc[0] | ||||||
|  |     | oranges = doc[1] | ||||||
|  |     | boots = doc[6] | ||||||
|  |     | hippos = doc[8] | ||||||
|  |     |  | ||||||
|  |     | assert apples.similarity(oranges) > boots.similarity(hippos) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +example("Part-of-speech tags") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | doc[0].pos | ||||||
|  |     | doc[0].tag | ||||||
|  | 
 | ||||||
|  | +example("Syntactic dependencies") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | for head in tokens: | ||||||
|  |     |     for child in head.lefts: | ||||||
|  |     |         assert child.head is head | ||||||
|  |     |     for child in head.rights: | ||||||
|  |     |         assert child.head is head | ||||||
|  |     | sent = nlp('The four wheels on the bus turned quickly.') | ||||||
|  |     | wheels = sent[2] | ||||||
|  |     | bus = sent[5] | ||||||
|  |     | assert len(list(wheels.lefts)) == 2 | ||||||
|  |     | assert len(list(wheels.rights)) == 1 | ||||||
|  |     | assert len(list(wheels.children)) == 3 | ||||||
|  |     | assert len(list(bus.lefts)) == 1 | ||||||
|  |     | assert len(list(bus.rights)) == 0 | ||||||
|  |     | assert len(list(bus.children)) == 1 | ||||||
|  |     |  | ||||||
|  |     | assert len(list(wheels.subtree)) == 6  | ||||||
|  | 
 | ||||||
|  | +example("Named entities") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | doc.ents | ||||||
|  |     | token.ent_type | ||||||
|  |     | token.ent_iob | ||||||
|  | 
 | ||||||
|  | +example("Define custom NER rules") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | nlp.matcher | ||||||
|  | 
 | ||||||
|  | +example("Calculate inline mark-up on original string") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | token.string | ||||||
|  |     | token.spacy | ||||||
|  |     | token.whitespace_ | ||||||
|  | 
 | ||||||
|  | +example("Efficient binary serialization") | ||||||
|  |   pre.language-python: code | ||||||
|  |     |  | ||||||
@@ -14,8 +14,8 @@
 				{"orth": "9/11"}
 			],
 			[
-				{"lower": "Septmber"},
-				{"lower": "Eleven"}
+				{"lower": "septmber"},
+				{"lower": "eleven"}
 			],
 			[
 				{"lower": "september"},