Mirror of https://github.com/explosion/spaCy.git
	Merge branch 'gaz' of https://github.com/honnibal/spaCy into gaz
Commit 4f765eee79

docs/redesign/docs.jade (new file, 705 lines)
							|  | @ -0,0 +1,705 @@ | |||
| - var py_docs = '<a class="reference" href="http://docs.python.org/library/' | ||||
| 
 | ||||
| - | ||||
|   var types = { | ||||
|    'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>', | ||||
|    'bool': py_docs + 'functions.html#bool"><em>bool</em></a>', | ||||
|    'int': py_docs + 'functions.html#int"><em>int</em></a>', | ||||
|    'generator': "", | ||||
|    'Vocab': "", | ||||
|    'Span': "", | ||||
|    'Doc': "" | ||||
|   } | ||||
| 
 | ||||
| 
 | ||||
| mixin declare_class(name) | ||||
|   details | ||||
|     summary | ||||
|       span.declaration | ||||
|         span.label class | ||||
|         code #{name} | ||||
|     block | ||||
| 
 | ||||
| mixin method(name, parameters) | ||||
|   details(open=attributes.open) | ||||
|     summary | ||||
|       span.declaration | ||||
|         span.label #{name} | ||||
|         span.parameters | ||||
|           | self, #{parameters} | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin params | ||||
|   ul | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin param(name, type, value) | ||||
|   li | ||||
|     if type | ||||
|       <strong>#{name}</strong> (!{type}) – | ||||
|     else | ||||
|       <strong>#{name}</strong> – | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin attribute(name, type, value) | ||||
|   details(open=attributes.open) | ||||
|     summary | ||||
|       span.declaration | ||||
|         span.label #{name} | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin returns(name, type, value) | ||||
|   li | ||||
|     if type | ||||
|       <strong>#{name}</strong> (!{type}) – | ||||
|     else | ||||
|       <strong>#{name}</strong> – | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin returns(type) | ||||
|   | tmp | ||||
| 
 | ||||
| mixin init | ||||
|   details | ||||
|     summary: h4 Init | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin callable | ||||
|   details | ||||
|     summary: h4 Callable | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin sequence | ||||
|   details | ||||
|     summary: h4 Sequence | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin maptype | ||||
|   details | ||||
|     summary: h4 Map | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin summary | ||||
|   block | ||||
| 
 | ||||
| mixin en_example | ||||
|   pre.language-python | ||||
|     code | ||||
|       | from spacy.en import English | ||||
|       | from spacy._doc_examples import download_war_and_peace | ||||
|       |  | ||||
|       | unprocessed_unicode = download_war_and_peace() | ||||
|       |  | ||||
|       | nlp = English() | ||||
|       | doc = nlp(unprocessed_unicode) | ||||
| 
 | ||||
| 
 | ||||
| doctype html | ||||
| html(lang="en") | ||||
|   head | ||||
|     meta(charset="utf-8") | ||||
|     title spaCy – Industrial-strength NLP | ||||
|     meta(name="description" content="") | ||||
|     meta(name="author" content="Matthew Honnibal") | ||||
|     link(rel="stylesheet" href="css/style.css") | ||||
|     <!--[if lt IE 9]> | ||||
|     script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js") | ||||
|     <![endif]--> | ||||
| 
 | ||||
|   body(id="docs") | ||||
|     header(role="banner") | ||||
|       h1.logo spaCy – Industrial-strength NLP | ||||
|       div.slogan API | ||||
| 
 | ||||
| 
 | ||||
|     nav(role="navigation") | ||||
|       ul | ||||
|         li: a(href="#") Home | ||||
|         li.active: a(href="#") Docs | ||||
|         li: a(href="#") License | ||||
|         li: a(href="#") Blog | ||||
| 
 | ||||
|     main.docs#content | ||||
| 
 | ||||
|       article | ||||
|         +declare_class("English") | ||||
|           p Load models into a callable object to process English text. | ||||
| 
 | ||||
|           +summary | ||||
|             +en_example | ||||
| 
 | ||||
|           +init | ||||
|             p | ||||
|               | Load the resources.  Loading takes 20 seconds, and the instance | ||||
|               | consumes 2 to 3 gigabytes of memory. | ||||
|              | ||||
|             p  | ||||
|               | Intended use is for one instance to be created per process. | ||||
|               | You can create more if you're doing something unusual. | ||||
|             p | ||||
|               | You may wish to make the instance a global variable or "singleton". | ||||
|               | We usually instantiate the object in the <code>main()</code> | ||||
|               | function and pass it around as an explicit argument.  | ||||
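For illustration, a minimal sketch of that pattern, assuming the English data has been downloaded; count_nouns is just a hypothetical helper:

from spacy.en import English

def count_nouns(nlp, text):
    # The shared English instance is passed in explicitly, never re-created.
    doc = nlp(text)
    return sum(1 for token in doc if token.pos_ == 'NOUN')

def main():
    # Load once per process: roughly 20 seconds and 2 to 3 gigabytes of memory.
    nlp = English()
    for text in [u'Dogs chase cats.', u'Ships sail on the sea.']:
        print(count_nouns(nlp, text))

if __name__ == '__main__':
    main()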
|             +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true") | ||||
| 
 | ||||
|               +params | ||||
|                 +param("data_dir") | ||||
|                   | The data directory.  May be #{None}, to disable any data loading | ||||
|                   | (including the vocabulary). | ||||
| 
 | ||||
|                 +param("Tokenizer") | ||||
|                   | A class/function that creates the tokenizer. | ||||
| 
 | ||||
|                 +param("Tagger") | ||||
|                   | A class/function that creates the part-of-speech tagger. | ||||
| 
 | ||||
|                 +param("Parser") | ||||
|                   | A class/function that creates the dependency parser. | ||||
| 
 | ||||
|                 +param("Entity") | ||||
|                   | A class/function that creates the named entity recogniser. | ||||
| 
 | ||||
|                 +param("load_vectors") | ||||
|                   | A boolean value to control whether the word vectors are loaded. | ||||
|            | ||||
|           +callable | ||||
|             +method("__call__", "text, tag=True, parse=True, entity=True") | ||||
| 
 | ||||
|               +params | ||||
|                 +param("text", types.unicode) | ||||
|                   | The text to be processed.  No pre-processing needs to be applied, | ||||
|                   | and any length of text can be submitted.  Usually you will submit | ||||
|                   | a whole document. Text may be zero-length. An exception is raised | ||||
|                   | if byte strings are supplied. | ||||
| 
 | ||||
|                 +param("tag", types.bool) | ||||
|                   | Whether to apply the part-of-speech tagger. Required for parsing | ||||
|                   | and entity recognition. | ||||
| 
 | ||||
|                 +param("parse", types.bool) | ||||
|                   | Whether to apply the syntactic dependency parser. | ||||
| 
 | ||||
|                 +param("entity", types.bool) | ||||
|                   | Whether to apply the named entity recognizer. | ||||
| 
 | ||||
|                 pre.language-python | ||||
|                   code | ||||
|                     | from spacy.en import English | ||||
|                     | nlp = English() | ||||
|                     | doc = nlp(u'Some text.') # Applies tagger, parser, entity | ||||
|                     | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser | ||||
|                     | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity | ||||
|                     | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser | ||||
|                     | doc = nlp(u'') # Zero-length tokens, not an error | ||||
|                     | # doc = nlp(b'Some text') <-- Error: need unicode | ||||
|                     | doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first. | ||||
|      | ||||
| 
 | ||||
|         +declare_class("Doc") | ||||
|           p I'm a doc | ||||
| 
 | ||||
|           +init | ||||
|             +method("__init__", "vocab") | ||||
|               +params | ||||
|                 +param("vocab", types.Vocab) | ||||
|                   | A vocabulary object | ||||
| 
 | ||||
|           +sequence | ||||
|             +method("__getitem__", "i", types.int) | ||||
|               +returns(types.Token) | ||||
| 
 | ||||
|             +method("__getitem__", "start_end", types.slice) | ||||
|               +returns(types.Span) | ||||
|    | ||||
|             +method("__iter__") | ||||
|               | Iterate over tokens | ||||
|    | ||||
|             +method("__len__") | ||||
|               | Number of tokens in the document. | ||||
|    | ||||
|           details | ||||
|             summary: h4 Spans | ||||
|              | ||||
|             +attribute("sents", types.generator) | ||||
|               | Iterate over sentences in the document. | ||||
|            | ||||
|             +attribute("ents", types.generator) | ||||
|               | Iterate over named entities in the document. | ||||
|      | ||||
|             +attribute("noun_chunks", types.generator) | ||||
|            | ||||
|           details | ||||
|             summary: h4 Export/Import | ||||
|              | ||||
|             +method("to_array", "attr_ids") | ||||
|    | ||||
|               | Given a list of M attribute IDs, export the tokens to a numpy ndarray | ||||
|               | of shape N*M, where N is the length of the document. | ||||
|    | ||||
|               +params | ||||
|                 +param("attr_ids", "list[int]") | ||||
|                   | A list of attribute ID ints. | ||||
|    | ||||
|               +returns("feat_array") | ||||
|                 | A feature matrix, with one row per word, and one column per attribute | ||||
|                 | indicated in the input attr_ids. | ||||
|    | ||||
|             +method("count_by", "attr_id") | ||||
|               | Produce a dict of {attribute (int): count (ints)} frequencies, keyed | ||||
|               | by the values of the given attribute ID. | ||||
|              | ||||
|               pre.language-python | ||||
|                 code | ||||
|                   | >>> from spacy.en import English, attrs | ||||
|                   | >>> nlp = English() | ||||
|                   | >>> tokens = nlp(u'apple apple orange banana') | ||||
|                   | >>> tokens.count_by(attrs.ORTH) | ||||
|                   | {12800L: 1, 11880L: 2, 7561L: 1} | ||||
|                   | >>> tokens.to_array([attrs.ORTH]) | ||||
|                   | array([[11880], | ||||
|                   |         [11880], | ||||
|                   |         [7561], | ||||
|                   |         [12800]]) | ||||
|    | ||||
|             +method("from_array", "attrs, array") | ||||
|               | Load from array | ||||
|            | ||||
|             +method("from_bytes") | ||||
|               | Deserialize, loading from bytes | ||||
|    | ||||
|             +method("read_bytes") | ||||
|               | classmethod | ||||
|    | ||||
|             //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type") | ||||
|    | ||||
|             //  | Merge a multi-word expression into a single token.  Currently | ||||
|             //  | experimental; API is likely to change. | ||||
|    | ||||
|    | ||||
|         +declare_class("Token") | ||||
|           +init | ||||
|             +method("__init__", "vocab, doc, offset") | ||||
|               +params | ||||
|                 +param("vocab", types.Vocab) | ||||
|                   p A Vocab object | ||||
|    | ||||
|                 +param("doc", types.Doc) | ||||
|                   p The parent sequence | ||||
|    | ||||
|               +param("offset", types.int) | ||||
|                 p The index of the token within the document | ||||
|    | ||||
|           details | ||||
|             summary: h4 String Views | ||||
|    | ||||
|             +attribute("orth / orth_") | ||||
|               | The form of the word with no string normalization or processing, as | ||||
|               | it appears in the string, without trailing whitespace. | ||||
|    | ||||
|             +attribute("lemma / lemma_") | ||||
|               | The "base" of the word, with no inflectional suffixes, e.g. the lemma of | ||||
|               | "developing" is "develop", the lemma of "geese" is "goose", etc.  Note that | ||||
|               | <em>derivational</em> suffixes are not stripped, e.g. the lemma of | ||||
|               | "institutions" is "institution", not "institute".  Lemmatization is | ||||
|               | performed using the WordNet data, but extended to also cover closed-class | ||||
|               | words such as pronouns.  By default, the WN lemmatizer returns "hi" | ||||
|               | as the lemma of "his". We assign pronouns the lemma -PRON-. | ||||
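A small illustration of the lemma view described above; the expected values in the comments follow the description rather than verified output:

from spacy.en import English

nlp = English()
doc = nlp(u'The geese were developing his plan.')
for token in doc:
    print('%s -> %s' % (token.orth_, token.lemma_))
# Per the description above, one would expect, e.g.:
#   geese      -> goose
#   developing -> develop
#   his        -> -PRON-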
|    | ||||
|             +attribute("lower / lower_") | ||||
|               | The form of the word, but forced to lower-case, i.e. | ||||
|               pre.language-python: code lower = word.orth\_.lower() | ||||
|    | ||||
|             //+attribute("norm / norm_") | ||||
|             //  | The form of the word, after language-specific normalizations has been | ||||
|             //  | applied. | ||||
|    | ||||
|             +attribute("shape / shape_") | ||||
|               | A transform of the word's string, to show orthographic features. | ||||
|               | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped | ||||
|               | to d. After these mappings, sequences of 4 or more of the same character | ||||
|               | are truncated to length 4.  Examples: C3Po --> XdXx, favorite --> xxxx, | ||||
|               | :) --> :) | ||||
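For illustration, a rough pure-Python restatement of that mapping; spaCy computes shape_ internally, and its exact rules may differ in corner cases:

import re

def word_shape(string):
    # Map a-z to x, A-Z to X, 0-9 to d; leave other characters alone.
    mapped = []
    for char in string:
        if char.islower():
            mapped.append('x')
        elif char.isupper():
            mapped.append('X')
        elif char.isdigit():
            mapped.append('d')
        else:
            mapped.append(char)
    # Truncate runs of 4 or more identical characters to length 4.
    return re.sub(r'(.)\1{4,}', r'\1\1\1\1', ''.join(mapped))

assert word_shape('C3Po') == 'XdXx'
assert word_shape('favorite') == 'xxxx'
assert word_shape(':)') == ':)'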
|    | ||||
|             +attribute("prefix / prefix_") | ||||
|               | A length-N substring from the start of the word.  Length may vary by | ||||
|               | language; currently for English n=1, i.e. | ||||
|               pre.language-python: code prefix = word.orth\_[:1] | ||||
|    | ||||
|             +attribute("suffix / suffix_") | ||||
|               | A length-N substring from the end of the word.  Length may vary by | ||||
|               | language; currently for English n=3, i.e. | ||||
|               pre.language-python: code suffix = word.orth\_[-3:] | ||||
|    | ||||
|             //+attribute("lex_id") | ||||
|             //  | lex_id | ||||
|    | ||||
|           details | ||||
|             summary: h4 Alignment and Output | ||||
|    | ||||
|             +attribute("idx") | ||||
|               p Start index of the token in the string | ||||
|    | ||||
|             +method("__len__", "") | ||||
|               p Length of the token's orth string, in unicode code-points. | ||||
|    | ||||
|             +method("__unicode__", "") | ||||
|               p Same as token.orth_ | ||||
|    | ||||
|             +method("__str__", "") | ||||
|               p Varies between Python 2 and Python 3 | ||||
|    | ||||
|             +attribute("string") | ||||
|               p | ||||
|                 | The form of the word as it appears in the string, <strong>including | ||||
|                 | trailing whitespace</strong>.  This is useful when you need to use | ||||
|                 | linguistic features to add inline mark-up to the string. | ||||
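A short sketch of that inline mark-up use case, assuming a loaded English instance; it also uses token.whitespace_, which appears in the usage examples added in this commit:

from spacy.en import English

nlp = English()
doc = nlp(u'I like fresh apples and ripe pears.')

# Rebuild the text, wrapping adjectives in <em> tags.  Because token.string
# keeps the trailing whitespace, the pieces concatenate back losslessly.
pieces = []
for token in doc:
    if token.pos_ == 'ADJ':
        pieces.append(u'<em>' + token.orth_ + u'</em>' + token.whitespace_)
    else:
        pieces.append(token.string)
print(u''.join(pieces))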
|    | ||||
|             +method("nbor", "i=1") | ||||
|               +params | ||||
|                 +param("i") | ||||
|                   p Offset relative to token | ||||
|      | ||||
|           details | ||||
|             summary: h4 Distributional Features | ||||
|      | ||||
|             +attribute("repvec") | ||||
|               p | ||||
|                 | A "word embedding" representation: a dense real-valued vector that supports | ||||
|                 | similarity queries between words.  By default, spaCy currently loads | ||||
|                 | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec | ||||
|                 | model. | ||||
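A sketch of a similarity query over repvec, using a hand-rolled cosine with numpy; the usage examples in this commit also show a similarity() method, but this version relies only on the raw vector:

import numpy
from spacy.en import English

nlp = English()
doc = nlp(u'apples oranges hippos')
apples, oranges, hippos = doc[0], doc[1], doc[2]

def cosine(a, b):
    # Guard against zero vectors, e.g. for out-of-vocabulary words.
    norm = numpy.linalg.norm(a) * numpy.linalg.norm(b)
    return numpy.dot(a, b) / norm if norm else 0.0

# The fruit pair should score higher than the unrelated pair.
print(cosine(apples.repvec, oranges.repvec))
print(cosine(apples.repvec, hippos.repvec))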
|      | ||||
|             +attribute("cluster") | ||||
|               p | ||||
|                 | The Brown cluster ID of the word.  These are often useful features for | ||||
|                 | linear models.  If you're using a non-linear model, particularly a | ||||
|                 | neural net or random forest, consider using the real-valued word | ||||
|                 | representation vector, in Token.repvec, instead. | ||||
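As a sketch of the linear-model use case: one common recipe is to take bit-string prefixes of the cluster ID as features at several granularities; the prefix lengths here are arbitrary choices for illustration:

from spacy.en import English

nlp = English()
doc = nlp(u'London is a large city in the United Kingdom.')

features = []
for token in doc:
    bits = '{0:b}'.format(token.cluster)
    # Shorter prefixes give coarser groupings of distributionally similar words.
    features.append((token.orth_, bits[:4], bits[:6]))
print(features)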
|      | ||||
|             +attribute("prob") | ||||
|               p | ||||
|                 | The unigram log-probability of the word, estimated from counts from a | ||||
|                 | large corpus, smoothed using Simple Good Turing estimation. | ||||
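A small sketch of using prob to flag rare words; the cutoff is an arbitrary value for illustration, not a recommendation:

from spacy.en import English

nlp = English()
doc = nlp(u'A quokka nibbled the sandwich.')

# Words with very low unigram log-probability are unusual in the corpus.
rare = [token.orth_ for token in doc if token.prob < -10.0]
print(rare)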
|      | ||||
|           details | ||||
|             summary: h4 Syntactic Tags | ||||
|      | ||||
|             +attribute("pos / pos_") | ||||
|               p | ||||
|                 | A part-of-speech tag, from the Google Universal Tag Set, e.g.  | ||||
|                 | <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>.  Constants for | ||||
|                 | the 17 tag values are provided in <code>spacy.parts_of_speech</code>. | ||||
|      | ||||
|             +attribute("tag / tag_") | ||||
|               p | ||||
|                 | A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>, | ||||
|                 | <code>DT</code>, etc.  These tags are language/corpus specific, and | ||||
|                 | typically describe part-of-speech and some amount of morphological | ||||
|                 | information.  For instance, in the Penn Treebank tag set, <code>VBZ</code> | ||||
|                 | is assigned to a present-tense singular verb. | ||||
|      | ||||
|             +attribute("dep / dep_") | ||||
|               p | ||||
|                 | The type of syntactic dependency relation between the word and its | ||||
|                 | syntactic head. | ||||
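A minimal sketch printing the three views side by side, assuming a loaded English instance:

from spacy.en import English

nlp = English()
doc = nlp(u'Apples are growing on the tree.')
for token in doc:
    # pos_ is the coarse universal tag, tag_ the treebank tag, and dep_ the
    # label of the relation connecting the token to its syntactic head.
    print('%s\t%s\t%s\t%s' % (token.orth_, token.pos_, token.tag_, token.dep_))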
|      | ||||
|           details | ||||
|             summary: h4 Navigating the Parse Tree | ||||
|            | ||||
|             +attribute("head") | ||||
|               p | ||||
|                 | The Token that is the immediate syntactic head of the word.  If the | ||||
|                 | word is the root of the dependency tree, the same word is returned. | ||||
|      | ||||
|             +attribute("lefts") | ||||
|               p | ||||
|                 | An iterator for the immediate leftward syntactic children of the | ||||
|                 | word. | ||||
|      | ||||
|             +attribute("rights") | ||||
|               p | ||||
|                 | An iterator for the immediate rightward syntactic children of the | ||||
|                 | word. | ||||
|      | ||||
|             +attribute("n_lefts") | ||||
|               p | ||||
|                 | The number of immediate syntactic children preceding the word in  | ||||
|                 | the string. | ||||
|      | ||||
|             +attribute("n_rights") | ||||
|               p | ||||
|                 | The number of immediate syntactic children following the word in | ||||
|                 | the string. | ||||
|      | ||||
|             +attribute("children") | ||||
|               p | ||||
|                 | An iterator that yields from lefts, and then yields from rights. | ||||
|      | ||||
|             +attribute("subtree") | ||||
|               p | ||||
|                 | An iterator for the part of the sentence syntactically governed by | ||||
|                 | the word, including the word itself. | ||||
|      | ||||
|             +attribute("left_edge") | ||||
|               p The leftmost edge of the token's subtree | ||||
|      | ||||
|             +attribute("right_edge") | ||||
|               p The rightmost edge of the token's subtree | ||||
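A short sketch of walking the tree with these attributes, reusing the sentence from the usage examples added in this commit; the expected parse is the one those examples assert:

from spacy.en import English

nlp = English()
doc = nlp(u'The four wheels on the bus turned quickly.')
wheels = doc[2]

# subtree yields the phrase governed by the word, including the word itself.
print([w.orth_ for w in wheels.subtree])

# Climb head links until reaching the root, whose head is itself.
token = wheels
while token.head is not token:
    token = token.head
print(token.orth_)  # expected: the root verb 'turned'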
|      | ||||
|           details | ||||
|             summary: h4 Named Entities | ||||
|      | ||||
|             +attribute("ent_type") | ||||
|               p If the token is part of an entity, its entity type. | ||||
|      | ||||
|             +attribute("ent_iob") | ||||
|               p The IOB (inside, outside, begin) entity recognition tag for the token. | ||||
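A hedged sketch of reading the entity annotations; which entities are actually detected depends on the loaded model:

from spacy.en import English

nlp = English()
doc = nlp(u'Google acquired DeepMind in 2014.')

# Entity spans proposed by the recognizer, with their labels.
for ent in doc.ents:
    print('%s %s' % (ent.string, ent.label_))

# Per-token view: the entity type ID and the IOB position of each token.
for token in doc:
    print('%s %s %s' % (token.orth_, token.ent_type, token.ent_iob))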
|      | ||||
|           details | ||||
|             summary: h4 Lexeme Flags | ||||
|      | ||||
|             +method("check_flag", "flag_id") | ||||
|               +params | ||||
|                 +param("flag_id") | ||||
|                   | flag ID | ||||
|      | ||||
|             +attribute("is_oov") | ||||
|             +attribute("is_alpha") | ||||
|             +attribute("is_ascii") | ||||
|             +attribute("is_digit") | ||||
|             +attribute("is_lower") | ||||
|             +attribute("is_title") | ||||
|             +attribute("is_punct") | ||||
|             +attribute("is_space") | ||||
|             +attribute("like_url") | ||||
|             +attribute("like_num") | ||||
|             +attribute("like_email") | ||||
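A sketch of check_flag; treating the attribute constants in spacy.en.attrs as flag IDs is an assumption made here for illustration, not something this page states:

from spacy.en import English
from spacy.en.attrs import LIKE_URL

nlp = English()
doc = nlp(u'See http://example.com for details!')

for token in doc:
    # The boolean attribute and the flag lookup should agree.
    print('%s %s %s' % (token.orth_, token.like_url, token.check_flag(LIKE_URL)))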
|      | ||||
|             //+attribute("conjuncts") | ||||
|             //  | Conjuncts | ||||
|      | ||||
|         +declare_class("Span") | ||||
|           +init | ||||
|             +method("__init__") | ||||
|               Temp | ||||
|    | ||||
|             <code>span = doc[0:4]</code> | ||||
|    | ||||
|           +sequence | ||||
|             +method("__getitem__") | ||||
|               p Get item | ||||
|    | ||||
|             +method("__iter__") | ||||
|               p Iter | ||||
|                  | ||||
|             +method("__len__") | ||||
|               p Len | ||||
|    | ||||
|           details | ||||
|             summary: h4 Parse | ||||
|    | ||||
|             +attribute("root") | ||||
|               p Syntactic head | ||||
|    | ||||
|             +attribute("lefts") | ||||
|               p Tokens that are: | ||||
|               ol | ||||
|                 li To the left of the span; | ||||
|                 li Syntactic children of words within the span | ||||
|    | ||||
|               p i.e. | ||||
|    | ||||
|               pre.language-python | ||||
|                 code | ||||
|                   | lefts = [span.doc[i] for i in range(0, span.start) | ||||
|                   |          if span.doc[i].head in span] | ||||
|    | ||||
|             +attribute("rights") | ||||
|               p Tokens that are: | ||||
|                 ol  | ||||
|                   li To the right of the span; | ||||
|                   li Syntactic children of words within the span | ||||
|               p i.e. | ||||
|               pre.language-python | ||||
|                 code | ||||
|                   | rights = [span.doc[i] for i in range(span.end, len(span.doc)) | ||||
|                   |           if span.doc[i].head in span] | ||||
|    | ||||
|    | ||||
|             +attribute("subtree") | ||||
|               p String | ||||
|    | ||||
|           details | ||||
|             summary: h4 String Views | ||||
|    | ||||
|             +attribute("string") | ||||
|               p String | ||||
|      | ||||
|             +attribute("lemma / lemma_") | ||||
|               p String | ||||
|    | ||||
|             +attribute("label / label_") | ||||
|               p String | ||||
|    | ||||
|         +declare_class("Lexeme") | ||||
|           p | ||||
|             | The Lexeme object represents a lexical type, stored in the vocabulary | ||||
|             | – as opposed to a token, occurring in a document. | ||||
|           p | ||||
|             | Lexemes store various features, so that these features can be computed | ||||
|             | once per type, rather than once per token. As job sizes grow, this | ||||
|             | can amount to a substantial efficiency improvement. | ||||
|    | ||||
|           p | ||||
|             | All Lexeme attributes are therefore context independent, as a single | ||||
|             | lexeme is reused for all usages of that word. Lexemes are keyed by | ||||
|             | the “orth” attribute. | ||||
|    | ||||
|           p | ||||
|             | All Lexeme attributes are accessible directly on the Token object. | ||||
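A minimal sketch of looking a Lexeme up directly from the vocabulary, assuming the English data is loaded:

from spacy.en import English

nlp = English()

# Every occurrence of 'apples' in any document shares this Lexeme, so the
# attributes below are computed once per type rather than once per token.
apples = nlp.vocab[u'apples']
print('%s %s %s %s' % (apples.orth_, apples.lower_, apples.shape_, apples.is_alpha))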
|    | ||||
|           +init | ||||
|             +method("__init__") | ||||
|               p Init | ||||
|    | ||||
|             details | ||||
|               summary: h4 String Features | ||||
|    | ||||
|                 +attribute("orth / orth_") | ||||
|                   p | ||||
|                     | The form of the word with no string normalization or processing, | ||||
|                     | as it appears in the string, without trailing whitespace. | ||||
|                | ||||
|                 +attribute("lower / lower_") | ||||
|                   p Tmp | ||||
|                | ||||
|                 +attribute("norm / norm_") | ||||
|                   p Tmp | ||||
|                | ||||
|                 +attribute("shape / shape_") | ||||
|                   p Tmp | ||||
|                | ||||
|                 +attribute("prefix / prefix_") | ||||
|                   p Tmp | ||||
|                | ||||
|                 +attribute("suffix / suffix_") | ||||
|                   p TMP | ||||
|    | ||||
|         +declare_class("Vocab", "data_dir=None, lex_props_getter=None") | ||||
|           +sequence | ||||
|             +method("__len__") | ||||
|               +returns | ||||
|                 p Number of words in the vocabulary. | ||||
|    | ||||
|             +method("__iter__") | ||||
|               +returns | ||||
|                 p Lexeme | ||||
|      | ||||
|           +maptype | ||||
|             +method("__getitem__", "key_int") | ||||
|               +params | ||||
|                 +param("key") | ||||
|                   p Integer ID | ||||
|      | ||||
|               +returns: p A Lexeme object | ||||
|      | ||||
|             +method("__getitem__", "key_str") | ||||
|               +params | ||||
|                 +param("key_str", types.unicode) | ||||
|                   p A string in the vocabulary | ||||
|      | ||||
|               +returns("Lexeme") | ||||
|      | ||||
|             +method("__setitem__", "orth_str", "props") | ||||
|               +params | ||||
|                 +param("orth_str", types.unicode) | ||||
|                   p The orth key | ||||
|      | ||||
|                 +param("props", types.dict) | ||||
|                   p A props dictionary | ||||
|      | ||||
|               +returns("None") | ||||
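A short sketch of the two lookup directions, assuming a loaded English instance:

from spacy.en import English

nlp = English()

# The same entry can be retrieved by its string or by its integer orth ID.
lexeme = nlp.vocab[u'hello']
assert nlp.vocab[lexeme.orth].orth_ == u'hello'
print(len(nlp.vocab))  # number of lexical types currently in the vocabulary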
|    | ||||
|           details | ||||
|             summary: h4 Import/Export | ||||
|      | ||||
|             +method("dump", "loc") | ||||
|               +params | ||||
|                 +param("loc", types.unicode) | ||||
|                   p Path where the vocabulary should be saved | ||||
|      | ||||
|             +method("load_lexemes", "loc") | ||||
|             +params | ||||
|               +param("loc", types.unicode) | ||||
|                 p Path to load the lexemes.bin file from | ||||
|      | ||||
|             +method("load_vectors", "loc") | ||||
|               +params | ||||
|                 +param("loc", types.unicode) | ||||
|                   p Path to load the vectors.bin from | ||||
|    | ||||
|         +declare_class("StringStore") | ||||
|           +init | ||||
|             Tmp | ||||
|    | ||||
|           +sequence | ||||
|             +method("__len__") | ||||
|               +returns("int") | ||||
|                 p Number of strings in the string-store | ||||
|    | ||||
|             +method("__iter__") | ||||
|               +returns | ||||
|                 p Lexeme | ||||
|    | ||||
|           +maptype | ||||
|             +method("__getitem__", "key_int") | ||||
|               +params | ||||
|                 +param("key_int") | ||||
|                   p An integer key | ||||
|      | ||||
|               +returns(types.unicode) | ||||
|                 p The string that the integer key maps to | ||||
|      | ||||
|             +method("__getitem__", "key_unicode") | ||||
|               +params | ||||
|                 +param("key_unicode") | ||||
|                   p A key, as a unicode string | ||||
|      | ||||
|               +returns(types.int) | ||||
|                 p The integer ID of the string. | ||||
|      | ||||
|             +method("__getitem__", "key_utf8_bytes") | ||||
|               +params | ||||
|                 +param("key_utf8_bytes", types.bytes) | ||||
|                   p A key, as a UTF-8 encoded byte-string | ||||
|      | ||||
|               +returns(types.int) | ||||
|                 p The integer ID of the string. | ||||
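A small sketch of the three key types described above, via the string store attached to the vocabulary:

from spacy.en import English

nlp = English()
strings = nlp.vocab.strings

# The mapping works in both directions, and UTF-8 byte strings are accepted
# as keys as well as unicode strings.
hello_id = strings[u'Hello']
assert strings[hello_id] == u'Hello'
assert strings[u'Hello'.encode('utf8')] == hello_id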
|    | ||||
|           details | ||||
|             summary: h4 Import/Export | ||||
|      | ||||
|             +method("dump", "loc") | ||||
|               +params | ||||
|                 +param("loc") | ||||
|                   p File path to save the strings.txt to. | ||||
|      | ||||
|             +method("load") | ||||
|               +params | ||||
|                 +param("loc") | ||||
|                   p File path to load the strings.txt from. | ||||
|    | ||||
|     script(src="js/prism.js") | ||||
							
								
								
									
docs/redesign/home.jade (new file, 106 lines)
							|  | @ -0,0 +1,106 @@ | |||
| extends ./outline.jade | ||||
| 
 | ||||
| // Notes | ||||
| // | ||||
| // 1. Where to put version notice? Should say something like | ||||
| //   2015-08-12: v0.89 | ||||
| //   and be a link | ||||
| //    | ||||
| //   Only needs to appear on home page. | ||||
| 
 | ||||
| 
 | ||||
| - var slogan = "Build Tomorrow's Language Technologies" | ||||
| - var tag_line = "spaCy – " + slogan | ||||
| 
 | ||||
| mixin lede | ||||
|   - var state_of_the_art = '<a href="#">state-of-the-art</a>' | ||||
|   - var a_minor_miracle = '<a href="">a minor miracle</a>' | ||||
|   - var great_documentation = '<a href="">great documentation</a>' | ||||
|    | ||||
|   p. | ||||
|     <a href="https://github.com/honnibal/spaCy"><strong>spaCy</strong></a> is a | ||||
|     library for industrial-strength NLP in Python and Cython.  It features | ||||
|     !{state_of_the_art} speed and accuracy, a concise API, and great documentation. | ||||
|     If you're a small company doing NLP, we want <strong>spaCy</strong> to seem | ||||
|     like !{a_minor_miracle}. | ||||
| 
 | ||||
| mixin overview() | ||||
|   p. | ||||
|     Overview text | ||||
| 
 | ||||
| mixin benchmarks() | ||||
|   p. | ||||
|     Benchmarks | ||||
| 
 | ||||
| mixin get_started() | ||||
|   p. | ||||
|     Get Started | ||||
| 
 | ||||
| 
 | ||||
| mixin comparison(name) | ||||
|   details | ||||
|     summary | ||||
|       h4= name | ||||
| 
 | ||||
|     block | ||||
|   | ||||
| mixin columns(...names) | ||||
|   tr | ||||
|     each name in names | ||||
|       th= name | ||||
| 
 | ||||
| 
 | ||||
| mixin row(...cells) | ||||
|   tr | ||||
|     each cell in cells | ||||
|       td= cell | ||||
| 
 | ||||
| 
 | ||||
| mixin social       | ||||
|   footer(role="contentinfo") | ||||
|     a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter | ||||
| 
 | ||||
|     div.discuss | ||||
|       a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn") | ||||
|         | Discuss on Hacker News | ||||
| 
 | ||||
|       a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit") | ||||
|         | Discuss on Reddit | ||||
| 
 | ||||
| 
 | ||||
| mixin Section(title_text, link_name, include_file) | ||||
|   a(name=link_name): h3 #{title_text} | ||||
| 
 | ||||
|   if (link_name == "example-use") | ||||
|     include ./usage_examples.jade | ||||
|   else if (link_name == "online-demo") | ||||
|     include ./online_demo.jade | ||||
|   else if (link_name == "comparisons") | ||||
|     include ./comparisons.jade | ||||
|   else if (link_name == "install") | ||||
|     include ./installation.jade | ||||
| 
 | ||||
| 
 | ||||
| block intro_block | ||||
|   section(class="intro") | ||||
|     +lede | ||||
| 
 | ||||
|     nav(role="navigation") | ||||
|       ul | ||||
|         li: a(href="#example-use" class="button") Examples | ||||
|         li: a(href="#online-demo" class="button") Demo | ||||
|         li: a(href="#comparisons" class="button") Comparisons | ||||
|         li: a(href="#install" class="button") Install v0.89 | ||||
| 
 | ||||
| 
 | ||||
| block body_block | ||||
|   article(class="page landing-page") | ||||
| 
 | ||||
|     +Section("Usage by Example", "example-use", "./usage_examples.jade") | ||||
| 
 | ||||
|     +Section("Online Demo", "online-demo", "./online_demo.jade") | ||||
| 
 | ||||
|     +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade") | ||||
| 
 | ||||
|     +Section("Install", "install", "./install.jade") | ||||
| 
 | ||||
							
								
								
									
docs/redesign/installation.jade (new file, 40 lines)
							|  | @ -0,0 +1,40 @@ | |||
| p With Python 2.7 or Python 3, using Linux or OSX, run: | ||||
| 
 | ||||
| pre.language-bash: code | ||||
|   | $ pip install spacy | ||||
|   | $ python -m spacy.en.download | ||||
| 
 | ||||
| p | ||||
|   | The download command fetches and installs about 300mb of data, for | ||||
|   | the parser model and word vectors, which it installs within the spacy.en | ||||
|   | package directory. | ||||
| 
 | ||||
| p | ||||
|   | If you're stuck using a server with an old version of Python, and you | ||||
|   | don't have root access, I've prepared a bootstrap script to help you | ||||
|   | compile a local Python install.  Run: | ||||
| 
 | ||||
| pre.language-bash: code | ||||
|   | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate | ||||
| 
 | ||||
| p | ||||
|   | The other way to install the package is to clone the github repository, | ||||
|   | and build it from source.  This installs an additional dependency, | ||||
|   | Cython.  If you're using Python 2, I also recommend installing fabric | ||||
|   | and fabtools – this is how I build the project. | ||||
| 
 | ||||
| pre.language-bash: code | ||||
|   | $ git clone https://github.com/honnibal/spaCy.git | ||||
|   | $ cd spaCy | ||||
|   | $ virtualenv .env && source .env/bin/activate | ||||
|   | $ export PYTHONPATH=`pwd` | ||||
|   | $ pip install -r requirements.txt | ||||
|   | $ python setup.py build_ext --inplace | ||||
|   | $ python -m spacy.en.download | ||||
|   | $ pip install pytest | ||||
|   | $ py.test tests/ | ||||
| 
 | ||||
| p | ||||
|   | Python packaging is awkward at the best of times, and it's particularly tricky | ||||
|   | with C extensions, built via Cython, requiring large data files.  So, | ||||
|   | please report issues as you encounter them. | ||||
							
								
								
									
docs/redesign/online_demo.jade (new file, 0 lines)

docs/redesign/outline.jade (new file, 37 lines)
							|  | @ -0,0 +1,37 @@ | |||
| - var slogan = "Build Tomorrow's Language Technologies" | ||||
| - var tag_line = "spaCy – " + slogan | ||||
| 
 | ||||
| 
 | ||||
| doctype html | ||||
| html(lang="en") | ||||
|   head | ||||
|     meta(charset="utf-8") | ||||
|     title!= tag_line | ||||
|     meta(name="description" content="") | ||||
|     meta(name="author" content="Matthew Honnibal") | ||||
|     link(rel="stylesheet" href="css/style.css") | ||||
|     <!--[if lt IE 9]> | ||||
|     script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js") | ||||
|     <![endif]--> | ||||
| 
 | ||||
|   body(id="home" role="document") | ||||
|     header(role="banner") | ||||
|       h1(class="logo")!= tag_line | ||||
|       div(class="slogan")!= slogan | ||||
| 
 | ||||
|     nav(role="navigation") | ||||
|       ul | ||||
|         li: a(href="#") Home | ||||
|         li: a(href="#") Docs | ||||
|         li: a(href="#") License | ||||
|         li: a(href="#") More | ||||
| 
 | ||||
|     main(id="content" role="main") | ||||
|       block intro_block | ||||
| 
 | ||||
|       block body_block | ||||
|   | ||||
|   footer(role="contentinfo") | ||||
| 
 | ||||
|   script(src="js/prism.js") | ||||
|   script(src="js/details_polyfill.js") | ||||
							
								
								
									
docs/redesign/usage_examples.jade (new file, 109 lines)
							|  | @ -0,0 +1,109 @@ | |||
| mixin example(name) | ||||
|   details | ||||
|     summary | ||||
|       h4= name | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| +example("Load resources and process text") | ||||
|   pre.language-python: code | ||||
|     | from __future__ import unicode_literals, print_function | ||||
|     | from spacy.en import English | ||||
|     | nlp = English() | ||||
|     | doc = nlp('Hello, world. Here are two sentences.') | ||||
| 
 | ||||
| +example("Get tokens and sentences") | ||||
|   pre.language-python: code | ||||
|     | token = doc[0] | ||||
|     | sentence = next(doc.sents) | ||||
|     | assert token is sentence[0] | ||||
| 
 | ||||
| +example("Use integer IDs for any string") | ||||
|   pre.language-python: code | ||||
|     | hello_id = nlp.vocab.strings['Hello'] | ||||
|     | hello_str = nlp.vocab.strings[hello_id] | ||||
|     |  | ||||
|     | assert token.orth == hello_id == 52 | ||||
|     | assert token.orth_ == hello_str == 'Hello' | ||||
| 
 | ||||
| +example("Get and set string views and flags") | ||||
|   pre.language-python: code | ||||
|     | assert token.shape_ == 'Xxxx' | ||||
|     | for lexeme in nlp.vocab: | ||||
|     |     if lexeme.is_alpha: | ||||
|     |         lexeme.shape_ = 'W' | ||||
|     |     elif lexeme.is_digit: | ||||
|     |         lexeme.shape_ = 'D' | ||||
|     |     elif lexeme.is_punct: | ||||
|     |         lexeme.shape_ = 'P' | ||||
|     |     else: | ||||
|     |         lexeme.shape_ = 'M' | ||||
|     | assert token.shape_ == 'W' | ||||
| 
 | ||||
| +example("Export to numpy arrays") | ||||
|   pre.language-python: code | ||||
|     | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV | ||||
|     |  | ||||
|     | attr_ids = [ORTH, LIKE_URL, IS_OOV] | ||||
|     | doc_array = doc.to_array(attr_ids) | ||||
|     | assert doc_array.shape == (len(doc), len(attr_ids)) | ||||
|     | assert doc[0].orth == doc_array[0, 0] | ||||
|     | assert doc[1].orth == doc_array[1, 0] | ||||
|     | assert doc[0].like_url == doc_array[0, 1] | ||||
|     | assert list(doc_array[:, 1]) == [t.like_url for t in doc] | ||||
| 
 | ||||
| +example("Word vectors") | ||||
|   pre.language-python: code | ||||
|     | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") | ||||
|     |  | ||||
|     | apples = doc[0] | ||||
|     | oranges = doc[2] | ||||
|     | boots = doc[6] | ||||
|     | hippos = doc[8] | ||||
|     |  | ||||
|     | assert apples.similarity(oranges) > boots.similarity(hippos) | ||||
| 
 | ||||
| 
 | ||||
| +example("Part-of-speech tags") | ||||
|   pre.language-python: code | ||||
|     | doc[0].pos | ||||
|     | doc[0].tag | ||||
| 
 | ||||
| +example("Syntactic dependencies") | ||||
|   pre.language-python: code | ||||
|     | for head in doc: | ||||
|     |     for child in head.lefts: | ||||
|     |         assert child.head is head | ||||
|     |     for child in head.rights: | ||||
|     |         assert child.head is head | ||||
|     | sent = nlp('The four wheels on the bus turned quickly.') | ||||
|     | wheels = sent[2] | ||||
|     | bus = sent[5] | ||||
|     | assert len(list(wheels.lefts)) == 2 | ||||
|     | assert len(list(wheels.rights)) == 1 | ||||
|     | assert len(list(wheels.children)) == 3 | ||||
|     | assert len(list(bus.lefts)) == 1 | ||||
|     | assert len(list(bus.rights)) == 0 | ||||
|     | assert len(list(bus.children)) == 1 | ||||
|     |  | ||||
|     | assert len(list(wheels.subtree)) == 6  | ||||
| 
 | ||||
| +example("Named entities") | ||||
|   pre.language-python: code | ||||
|     | doc.ents | ||||
|     | token.ent_type | ||||
|     | token.ent_iob | ||||
| 
 | ||||
| +example("Define custom NER rules") | ||||
|   pre.language-python: code | ||||
|     | nlp.matcher | ||||
| 
 | ||||
| +example("Calculate inline mark-up on original string") | ||||
|   pre.language-python: code | ||||
|     | token.string | ||||
|     | token.spacy | ||||
|     | token.whitespace_ | ||||
| 
 | ||||
| +example("Efficient binary serialization") | ||||
|   pre.language-python: code | ||||
|     |  | ||||
|  | @ -14,8 +14,8 @@ | |||
| 				{"orth": "9/11"} | ||||
| 			], | ||||
| 			[ | ||||
| -				{"lower": "Septmber"}, | ||||
| -				{"lower": "Eleven"} | ||||
| +				{"lower": "septmber"}, | ||||
| +				{"lower": "eleven"} | ||||
| 			], | ||||
| 			[ | ||||
| 				{"lower": "september"}, | ||||