mirror of https://github.com/explosion/spaCy.git
synced 2025-10-30 23:47:31 +03:00

* Remove old docs
This commit is contained in:
parent cad0cca4e3
commit 890d6aa216
				|  | @ -1,661 +0,0 @@ | |||
| mixin declare_class(name) | ||||
|   details | ||||
|     summary | ||||
|       span.declaration | ||||
|         span.label class | ||||
|         code #{name} | ||||
|     block | ||||
| 
 | ||||
| mixin method(name, parameters) | ||||
|   details(open=attributes.open) | ||||
|     summary | ||||
|       span.declaration | ||||
|         span.label #{name} | ||||
|         span.parameters | ||||
|           | self, #{parameters} | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin params | ||||
|   ul | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin param(name, type, value) | ||||
|   li | ||||
|     if type | ||||
|       <strong>#{name}</strong> (!{type}) – | ||||
|     else | ||||
|       <strong>#{name}</strong> – | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin attribute(name, type, value) | ||||
|   details(open=attributes.open) | ||||
|     summary | ||||
|       span.declaration | ||||
|         span.label #{name} | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin returns(name, type, value) | ||||
|   li | ||||
|     if type | ||||
|       <strong>#{name}</strong> (!{type}) – | ||||
|     else | ||||
|       <strong>#{name}</strong> – | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin init | ||||
|   details | ||||
|     summary: h4 Init | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin callable | ||||
|   details | ||||
|     summary: h4 Callable | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin sequence | ||||
|   details | ||||
|     summary: h4 Sequence | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin maptype | ||||
|   details | ||||
|     summary: h4 Map | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin summary | ||||
|   block | ||||
| 
 | ||||
| mixin en_example | ||||
|   pre.language-python | ||||
|     code | ||||
|       | from spacy.en import English | ||||
|       | from spacy._doc_examples import download_war_and_peace | ||||
|       |  | ||||
|       | unprocessed_unicode = download_war_and_peace() | ||||
|       |  | ||||
|       | nlp = English() | ||||
|       | doc = nlp(unprocessed_unicode) | ||||
| 
 | ||||
| 
 | ||||
| +declare_class("English") | ||||
|   p Load models into a callable object to process English text. | ||||
| 
 | ||||
|   +summary | ||||
|     +en_example | ||||
| 
 | ||||
|   +init | ||||
|     p | ||||
|       | Load the resources.  Loading takes 20 seconds, and the instance | ||||
|       | consumes 2 to 3 gigabytes of memory. | ||||
|      | ||||
|     p  | ||||
|       | Intended use is for one instance to be created per process. | ||||
|       | You can create more if you're doing something unusual. | ||||
|     p | ||||
|       | You may wish to make the instance a global variable or "singleton". | ||||
|       | We usually instantiate the object in the <code>main()</code> | ||||
|       | function and pass it around as an explicit argument.  | ||||
|     +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true") | ||||
| 
 | ||||
|       +params | ||||
|         +param("data_dir") | ||||
|           | The data directory.  May be <code>None</code>, to disable any data loading | ||||
|           | (including the vocabulary). | ||||
| 
 | ||||
|         +param("Tokenizer") | ||||
|           | A class/function that creates the tokenizer. | ||||
| 
 | ||||
|         +param("Tagger") | ||||
|           | A class/function that creates the part-of-speech tagger. | ||||
| 
 | ||||
|         +param("Parser") | ||||
|           | A class/function that creates the dependency parser. | ||||
| 
 | ||||
|         +param("Entity") | ||||
|           | A class/function that creates the named entity recogniser. | ||||
| 
 | ||||
|         +param("load_vectors") | ||||
|           | A boolean value to control whether the word vectors are loaded. | ||||
|    | ||||
|   +callable | ||||
|     +method("__call__", "text, tag=True, parse=True, entity=True") | ||||
| 
 | ||||
|       +params | ||||
|         +param("text", types.unicode) | ||||
|           | The text to be processed.  No pre-processing needs to be applied, | ||||
|           | and any length of text can be submitted.  Usually you will submit | ||||
|           | a whole document. Text may be zero-length. An exception is raised | ||||
|           | if byte strings are supplied. | ||||
| 
 | ||||
|         +param("tag", types.bool) | ||||
|           | Whether to apply the part-of-speech tagger. Required for parsing | ||||
|           | and entity recognition. | ||||
| 
 | ||||
|         +param("parse", types.bool) | ||||
|           | Whether to apply the syntactic dependency parser. | ||||
| 
 | ||||
|         +param("entity", types.bool) | ||||
|           | Whether to apply the named entity recognizer. | ||||
| 
 | ||||
|       pre.language-python | ||||
|         code | ||||
|           | from spacy.en import English | ||||
|           | nlp = English() | ||||
|           | doc = nlp(u'Some text.') # Applies tagger, parser, entity | ||||
|           | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser | ||||
|           | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity | ||||
|           | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser | ||||
|           | doc = nlp(u'') # Zero-length tokens, not an error | ||||
|           | # doc = nlp(b'Some text') <-- Error: need unicode | ||||
|           | doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first. | ||||
| 
 | ||||
| 
 | ||||
| +declare_class("Doc") | ||||
|   p A processed document: a sequence of Token objects carrying the document's annotations. | ||||
| 
 | ||||
|   +init | ||||
|     +method("__init__", "vocab") | ||||
|       +params | ||||
|         +param("vocab", vocab_type) | ||||
|           | A vocabulary object | ||||
| 
 | ||||
|   +sequence | ||||
|     +method("__getitem__", "i", types.int) | ||||
|       +returns(types.Token) | ||||
| 
 | ||||
|     +method("__getitem__", "start_end", types.slice) | ||||
|       +returns(types.Span) | ||||
| 
 | ||||
|     +method("__iter__") | ||||
|       | Iterate over tokens | ||||
| 
 | ||||
|     +method("__len__") | ||||
|       | Number of tokens in the document. | ||||
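| 
 | ||||
|     p | ||||
|       | A minimal usage sketch of the sequence API (assuming the | ||||
|       | <code>nlp</code> instance from the <code>English</code> examples above): | ||||
|     pre.language-python | ||||
|       code | ||||
|         | doc = nlp(u'Hello, world. Here are two sentences.') | ||||
|         | token = doc[0]                   # Token, by integer index | ||||
|         | span = doc[0:2]                  # Span, by slice | ||||
|         | words = [t.orth_ for t in doc]   # iterate over tokens | ||||
|         | assert len(doc) == len(words) | ||||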
| 
 | ||||
|   details | ||||
|     summary: h4 Spans | ||||
|      | ||||
|     +attribute("sents", types.generator) | ||||
|       | Iterate over sentences in the document. | ||||
|    | ||||
|     +attribute("ents", types.generator) | ||||
|       | Iterate over named entities in the document. | ||||
| 
 | ||||
|     +attribute("noun_chunks", types.generator) | ||||
|    | ||||
|   details | ||||
|     summary: h4 Export/Import | ||||
|      | ||||
|     +method("to_array", "attr_ids") | ||||
| 
 | ||||
|       | Given a list of M attribute IDs, export the tokens to a numpy ndarray | ||||
|       | of shape N*M, where N is the length of the sentence. | ||||
| 
 | ||||
|       +params | ||||
|         +param("attr_ids", "list[int]") | ||||
|           | A list of attribute ID ints. | ||||
| 
 | ||||
|       +returns("feat_array") | ||||
|         | A feature matrix, with one row per word, and one column per attribute | ||||
|         | indicated in the input attr_ids. | ||||
| 
 | ||||
|     +method("count_by", "attr_id") | ||||
|       | Produce a dict of {attribute (int): count (ints)} frequencies, keyed | ||||
|       | by the values of the given attribute ID. | ||||
|      | ||||
|       pre.language-python | ||||
|         code | ||||
|           | >>> from spacy.en import English, attrs | ||||
|           | >>> nlp = English() | ||||
|           | >>> tokens = nlp(u'apple apple orange banana') | ||||
|           | >>> tokens.count_by(attrs.ORTH) | ||||
|           | {12800L: 1, 11880L: 2, 7561L: 1} | ||||
|           | >>> tokens.to_array([attrs.ORTH]) | ||||
|           | array([[11880], | ||||
|           |         [11880], | ||||
|           |         [7561], | ||||
|           |         [12800]]) | ||||
| 
 | ||||
|     +method("from_array", "attrs, array") | ||||
|       | Load from array | ||||
|    | ||||
|     +method("from_bytes") | ||||
|       | Deserialize, loading from bytes | ||||
| 
 | ||||
|     +method("read_bytes") | ||||
|       | classmethod | ||||
| 
 | ||||
|     //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type") | ||||
| 
 | ||||
|     //  | Merge a multi-word expression into a single token.  Currently | ||||
|     //  | experimental; API is likely to change. | ||||
| 
 | ||||
| 
 | ||||
| +declare_class("Token") | ||||
|   +init | ||||
|     +method("__init__", "vocab, doc, offset") | ||||
|       +params | ||||
|         +param("vocab", types.Vocab) | ||||
|           p A Vocab object | ||||
| 
 | ||||
|         +param("doc", types.Doc) | ||||
|           p The parent sequence | ||||
| 
 | ||||
|       +param("offset", types.int) | ||||
|         p The index of the token within the document | ||||
| 
 | ||||
|   details | ||||
|     summary: h4 String Views | ||||
| 
 | ||||
|     +attribute("orth / orth_") | ||||
|       | The form of the word with no string normalization or processing, as | ||||
|       | it appears in the string, without trailing whitespace. | ||||
| 
 | ||||
|     +attribute("lemma / lemma_") | ||||
|       | The "base" of the word, with no inflectional suffixes, e.g. the lemma of | ||||
|       | "developing" is "develop", the lemma of "geese" is "goose", etc.  Note that | ||||
|       | <em>derivational</em> suffixes are not stripped, e.g. the lemma of | ||||
|       | "instutitions" is "institution", not "institute".  Lemmatization is | ||||
|       | performed using the WordNet data, but extended to also cover closed-class | ||||
|       | words such as pronouns.  By default, the WN lemmatizer returns "hi" | ||||
|       | as the lemma of "his". We assign pronouns the lemma -PRON-. | ||||
| 
 | ||||
|     +attribute("lower / lower_") | ||||
|       | The form of the word, but forced to lower-case, i.e. | ||||
|       pre.language-python: code lower = word.orth\_.lower() | ||||
| 
 | ||||
|     //+attribute("norm / norm_") | ||||
|     //  | The form of the word, after language-specific normalizations has been | ||||
|     //  | applied. | ||||
| 
 | ||||
|     +attribute("shape / shape_") | ||||
|       | A transform of the word's string, to show orthographic features. | ||||
|       | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped | ||||
|       | to d. After these mappings, sequences of 4 or more of the same character | ||||
|       | are truncated to length 4.  Examples: C3Po --> XdXx, favorite --> xxxx, | ||||
|       | :) --> :) | ||||
| 
 | ||||
|     +attribute("prefix / prefix_") | ||||
|       | A length-N substring from the start of the word.  Length may vary by | ||||
|       | language; currently for English n=1, i.e. | ||||
|       pre.language-python: code prefix = word.orth\_[:1] | ||||
| 
 | ||||
|     +attribute("suffix / suffix_") | ||||
|       | A length-N substring from the end of the word.  Length may vary by | ||||
|       | language; currently for English n=3, i.e. | ||||
|       pre.language-python: code suffix = word.orth\_[-3:] | ||||
| 
 | ||||
|     //+attribute("lex_id") | ||||
|     //  | lex_id | ||||
| 
 | ||||
|   details | ||||
|     summary: h4 Alignment and Output | ||||
| 
 | ||||
|     +attribute("idx") | ||||
|       p Start index of the token in the string | ||||
| 
 | ||||
|     +method("__len__", "") | ||||
|       p Length of the token's orth string, in unicode code-points. | ||||
| 
 | ||||
|     +method("__unicode__", "") | ||||
|       p Same as token.orth_ | ||||
| 
 | ||||
|     +method("__str__", "") | ||||
|       p Varies between Python 2 and Python 3 | ||||
| 
 | ||||
|     +attribute("string") | ||||
|       p | ||||
|         | The form of the word as it appears in the string, <strong>including | ||||
|         | trailing whitespace</strong>.  This is useful when you need to use | ||||
|         | linguistic features to add inline mark-up to the string. | ||||
| 
 | ||||
|     +method("nbor, i=1") | ||||
|       +params | ||||
|         +param("i") | ||||
|           p Offset relative to token | ||||
| 
 | ||||
|   details | ||||
|     summary: h4 Distributional Features | ||||
| 
 | ||||
|     +attribute("repvec") | ||||
|       p | ||||
|         | A "word embedding" representation: a dense real-valued vector that supports | ||||
|         | similarity queries between words.  By default, spaCy currently loads | ||||
|         | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec | ||||
|         | model. | ||||
| 
 | ||||
|     +attribute("cluster") | ||||
|       p | ||||
|         | The Brown cluster ID of the word.  These are often useful features for | ||||
|         | linear models.  If you're using a non-linear model, particularly a | ||||
|         | neural net or random forest, consider using the real-valued word | ||||
|         | representation vector, in Token.repvec, instead. | ||||
| 
 | ||||
|     +attribute("prob") | ||||
|       p | ||||
|         | The unigram log-probability of the word, estimated from counts from a | ||||
|         | large corpus, smoothed using Simple Good Turing estimation. | ||||
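| 
 | ||||
|     p | ||||
|       | A quick sketch (assuming the <code>nlp</code> instance from above): | ||||
|     pre.language-python | ||||
|       code | ||||
|         | doc = nlp(u'Apples and oranges are similar.') | ||||
|         | apples = doc[0] | ||||
|         | vector = apples.repvec   # a dense numpy vector | ||||
|         | assert apples.prob < 0   # a log-probability, so negative | ||||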
| 
 | ||||
|   details | ||||
|     summary: h4 Syntactic Tags | ||||
| 
 | ||||
|     +attribute("pos / pos_") | ||||
|       p | ||||
|         | A part-of-speech tag, from the Google Universal Tag Set, e.g.  | ||||
|         | <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>.  Constants for | ||||
|         | the 17 tag values are provided in <code>spacy.parts_of_speech</code>. | ||||
| 
 | ||||
|     +attribute("tag / tag_") | ||||
|       p | ||||
|         | A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>, | ||||
|         | <code>DT</code>, etc.  These tags are language/corpus specific, and | ||||
|         | typically describe part-of-speech and some amount of morphological | ||||
|         | information.  For instance, in the Penn Treebank tag set, <code>VBZ</code> | ||||
|         | is assigned to a present-tense singular verb. | ||||
| 
 | ||||
|     +attribute("dep / dep_") | ||||
|       p | ||||
|         | The type of syntactic dependency relation between the word and its | ||||
|         | syntactic head. | ||||
| 
 | ||||
|   details | ||||
|     summary: h4 Navigating the Parse Tree | ||||
|    | ||||
|     +attribute("head") | ||||
|       p | ||||
|         | The Token that is the immediate syntactic head of the word.  If the | ||||
|         | word is the root of the dependency tree, the same word is returned. | ||||
| 
 | ||||
|     +attribute("lefts") | ||||
|       p | ||||
|         | An iterator for the immediate leftward syntactic children of the | ||||
|         | word. | ||||
| 
 | ||||
|     +attribute("rights") | ||||
|       p | ||||
|         | An iterator for the immediate rightward syntactic children of the | ||||
|         | word. | ||||
| 
 | ||||
|     +attribute("n_lefts") | ||||
|       p | ||||
|         | The number of immediate syntactic children preceding the word in  | ||||
|         | the string. | ||||
| 
 | ||||
|     +attribute("n_rights") | ||||
|       p | ||||
|         | The number of immediate syntactic children following the word in | ||||
|         | the string. | ||||
| 
 | ||||
|     +attribute("children") | ||||
|       p | ||||
|         | An iterator that yields from lefts, and then yields from rights. | ||||
| 
 | ||||
|     +attribute("subtree") | ||||
|       p | ||||
|         | An iterator for the part of the sentence syntactically governed by | ||||
|         | the word, including the word itself. | ||||
| 
 | ||||
|     +attribute("left_edge") | ||||
|       p The leftmost edge of the token's subtree | ||||
| 
 | ||||
|     +attribute("right_edge") | ||||
|       p The rightmost edge of the token's subtree | ||||
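| 
 | ||||
|     p | ||||
|       | A sketch of walking the tree (assuming the <code>nlp</code> instance | ||||
|       | from above; the sentence and its parse are illustrative): | ||||
|     pre.language-python | ||||
|       code | ||||
|         | doc = nlp(u'The quick fox jumped over the lazy dog.') | ||||
|         | jumped = doc[3]                # 'jumped', the root of this parse | ||||
|         | root = jumped.head             # the root's head is itself | ||||
|         | left_words = [w.orth_ for w in jumped.lefts] | ||||
|         | subtree_words = [w.orth_ for w in jumped.subtree] | ||||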
| 
 | ||||
|   details | ||||
|     summary: h4 Named Entities | ||||
| 
 | ||||
|     +attribute("ent_type") | ||||
|       p If the token is part of an entity, its entity type. | ||||
| 
 | ||||
|     +attribute("ent_iob") | ||||
|       p The IOB (inside, outside, begin) entity recognition tag for the token. | ||||
| 
 | ||||
|   details | ||||
|     summary: h4 Lexeme Flags | ||||
| 
 | ||||
|     +method("check_flag", "flag_id") | ||||
|       +params | ||||
|         +param("flag_id") | ||||
|           | flag ID | ||||
| 
 | ||||
|     +attribute("is_oov") | ||||
|     +attribute("is_alpha") | ||||
|     +attribute("is_ascii") | ||||
|     +attribute("is_digit") | ||||
|     +attribute("is_lower") | ||||
|     +attribute("is_title") | ||||
|     +attribute("is_punct") | ||||
|     +attribute("is_space") | ||||
|     +attribute("like_url") | ||||
|     +attribute("like_num") | ||||
|     +attribute("like_email") | ||||
| 
 | ||||
|     //+attribute("conjuncts") | ||||
|     //  | Conjuncts | ||||
| 
 | ||||
| +declare_class("Span") | ||||
|   +init | ||||
|     +method("__init__") | ||||
|       p Usually created by slicing a Doc object, e.g. | ||||
| 
 | ||||
|     <code>span = doc[0:4]</code> | ||||
| 
 | ||||
|   +sequence | ||||
|     +method("__getitem__") | ||||
|       p Get a Token from the span, by integer index. | ||||
| 
 | ||||
|     +method("__iter__") | ||||
|       p Iterate over the span's tokens. | ||||
|          | ||||
|     +method("__len__") | ||||
|       p The number of tokens in the span. | ||||
| 
 | ||||
|   details | ||||
|     summary: h4 Parse | ||||
| 
 | ||||
|     +attribute("root") | ||||
|       p Syntactic head | ||||
| 
 | ||||
|     +attribute("lefts") | ||||
|       p Tokens that are: | ||||
|       ol | ||||
|         li To the left of the span; | ||||
|         li Syntactic children of words within the span | ||||
| 
 | ||||
|       p i.e. | ||||
| 
 | ||||
|       pre.language-python | ||||
|         code | ||||
|           | lefts = [span.doc[i] for i in range(0, span.start) | ||||
|           |          if span.doc[i].head in span] | ||||
| 
 | ||||
|     +attribute("rights") | ||||
|       p Tokens that are: | ||||
|         ol  | ||||
|           li To the right of the span; | ||||
|           li Syntactic children of words within the span | ||||
|       p i.e. | ||||
|       pre.language-python | ||||
|         code | ||||
|           | rights = [span.doc[i] for i in range(span.end, len(span.doc)) | ||||
|           |           if span.doc[i].head in span] | ||||
| 
 | ||||
| 
 | ||||
|     +attribute("subtree") | ||||
|       p String | ||||
| 
 | ||||
|   details | ||||
|     summary: h4 String Views | ||||
| 
 | ||||
|     +attribute("string") | ||||
|       p String | ||||
| 
 | ||||
|     +attribute("lemma / lemma_") | ||||
|       p String | ||||
| 
 | ||||
|     +attribute("label / label_") | ||||
|       p String | ||||
| 
 | ||||
| +declare_class("Lexeme") | ||||
|   p | ||||
|     | The Lexeme object represents a lexical type, stored in the vocabulary | ||||
|     | – as opposed to a token, occurring in a document. | ||||
|   p | ||||
|     | Lexemes store various features, so that these features can be computed | ||||
|     | once per type, rather than once per token. As job sizes grow, this | ||||
|     | can amount to a substantial efficiency improvement. | ||||
| 
 | ||||
|   p | ||||
|     | All Lexeme attributes are therefore context independent, as a single | ||||
|     | lexeme is reused for all usages of that word. Lexemes are keyed by | ||||
|     | the “orth” attribute. | ||||
| 
 | ||||
|   p | ||||
|     All Lexeme attributes are accessible directly on the Token object. | ||||
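| 
 | ||||
|   p | ||||
|     | A minimal sketch of the distinction (assuming the <code>nlp</code> | ||||
|     | instance from above): | ||||
|   pre.language-python | ||||
|     code | ||||
|       | apple = nlp.vocab[u'apple']      # Lexeme: context-independent | ||||
|       | doc = nlp(u'I like apple pie.') | ||||
|       | token = doc[2]                   # Token: one occurrence, in context | ||||
|       | assert token.orth == apple.orth  # both keyed by the "orth" ID | ||||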
| 
 | ||||
|   +init | ||||
|     +method("__init__") | ||||
|       p Init | ||||
| 
 | ||||
|     details | ||||
|       summary: h4 String Features | ||||
| 
 | ||||
|         +attribute("orth / orth_") | ||||
|           p | ||||
|             | The form of the word with no string normalization or processing, | ||||
|             | as it appears in the string, without trailing whitespace. | ||||
|        | ||||
|         +attribute("lower / lower_") | ||||
|           p Tmp | ||||
|        | ||||
|         +attribute("norm / norm_") | ||||
|           p Tmp | ||||
|        | ||||
|         +attribute("shape / shape_") | ||||
|           p Tmp | ||||
|        | ||||
|         +attribute("prefix / prefix_") | ||||
|           p Tmp | ||||
|        | ||||
|         +attribute("suffix / suffix_") | ||||
|           p TMP | ||||
| 
 | ||||
| +declare_class("Vocab", "data_dir=None, lex_props_getter=None") | ||||
|   +sequence | ||||
|     +method("__len__") | ||||
|       +returns | ||||
|         p Number of words in the vocabulary. | ||||
| 
 | ||||
|     +method("__iter__") | ||||
|       +returns | ||||
|         p Lexeme | ||||
| 
 | ||||
|   +maptype | ||||
|     +method("__getitem__", "key_int") | ||||
|       +params | ||||
|         +param("key") | ||||
|           p Integer ID | ||||
| 
 | ||||
|       +returns: p A Lexeme object | ||||
| 
 | ||||
|     +method("__getitem__", "key_str") | ||||
|       +params | ||||
|         +param("key_str", types.unicode) | ||||
|           p A string in the vocabulary | ||||
| 
 | ||||
|       +returns("Lexeme") | ||||
| 
 | ||||
|     +method("__setitem__", "orth_str", "props") | ||||
|       +params | ||||
|         +param("orth_str", types.unicode) | ||||
|           p The orth key | ||||
| 
 | ||||
|         +param("props", types.dict) | ||||
|           p A props dictionary | ||||
| 
 | ||||
|       +returns("None") | ||||
| 
 | ||||
|   details | ||||
|     summary: h4 Import/Export | ||||
| 
 | ||||
|     +method("dump", "loc") | ||||
|       +params | ||||
|         +param("loc", types.unicode) | ||||
|           p Path where the vocabulary should be saved | ||||
| 
 | ||||
|     +method("load_lexemes", "loc") | ||||
|     +params | ||||
|       +param("loc", types.unicode) | ||||
|         p Path to load the lexemes.bin file from | ||||
| 
 | ||||
|     +method("load_vectors", "loc") | ||||
|       +params | ||||
|         +param("loc", types.unicode) | ||||
|           p Path to load the vectors.bin from | ||||
| 
 | ||||
| +declare_class("StringStore") | ||||
|   +init | ||||
|     p A two-way mapping between unicode strings and integer IDs. | ||||
| 
 | ||||
|   +sequence | ||||
|     +method("__len__") | ||||
|       +returns("int") | ||||
|         p Number of strings in the string-store | ||||
| 
 | ||||
|     +method("__iter__") | ||||
|       +returns | ||||
|         p A unicode string in the store. | ||||
| 
 | ||||
|   +maptype | ||||
|     +method("__getitem__", "key_int") | ||||
|       +params | ||||
|         +param("key_int") | ||||
|           p An integer key | ||||
| 
 | ||||
|       +returns(types.unicode) | ||||
|         p The string that the integer key maps to | ||||
| 
 | ||||
|     +method("__getitem__", "key_unicode") | ||||
|       +params | ||||
|         +param("key_unicode") | ||||
|           p A key, as a unicode string | ||||
| 
 | ||||
|       +returns(types.int) | ||||
|         p The integer ID of the string. | ||||
| 
 | ||||
|     +method("__getitem__", "key_utf8_bytes") | ||||
|       +params | ||||
|         +param("key_utf8_bytes", types.bytes) | ||||
|           p A key, as a UTF-8 encoded byte-string | ||||
| 
 | ||||
|       +returns(types.int) | ||||
|         p The integer ID of the string. | ||||
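| 
 | ||||
|     p | ||||
|       | A usage sketch (assuming an <code>English</code> instance | ||||
|       | <code>nlp</code>, with the string-store exposed as the vocabulary's | ||||
|       | <code>strings</code> attribute): | ||||
|     pre.language-python | ||||
|       code | ||||
|         | strings = nlp.vocab.strings | ||||
|         | orth_id = strings[u'apple']          # unicode key --> integer ID | ||||
|         | assert strings[orth_id] == u'apple'  # integer key --> string | ||||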
| 
 | ||||
|   details | ||||
|     summary: h4 Import/Export | ||||
| 
 | ||||
|     +method("dump", "loc") | ||||
|       +params | ||||
|         +param("loc") | ||||
|           p File path to save the strings.txt to. | ||||
| 
 | ||||
|     +method("load") | ||||
|       +params | ||||
|         +param("loc") | ||||
|           p File path to load the strings.txt from. | ||||
|  | @ -1,95 +0,0 @@ | |||
| mixin Teaser(title, url, date_long, date_short, author, lede) | ||||
|   article.post | ||||
|     header | ||||
|       h2 | ||||
|         a(href=url)= title | ||||
|       .subhead | ||||
|         | by  | ||||
|         a(href='#', rel='author')= author | ||||
|         |  on  | ||||
|         time(datetime=date_short)= date_long | ||||
|     p!= lede | ||||
|         | ||||
|       a.readmore(href='#') ► | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| doctype html | ||||
| html(lang='en') | ||||
|   head | ||||
|     meta(charset='utf-8') | ||||
|     title spaCy Blog | ||||
|     meta(name='description', content='') | ||||
|     meta(name='author', content='Matthew Honnibal') | ||||
|     link(rel='stylesheet', href='css/style.css') | ||||
|     //if lt IE 9 | ||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') | ||||
|   body#blog | ||||
|     header(role='banner') | ||||
|       h1.logo spaCy Blog | ||||
|       .slogan Blog | ||||
| 
 | ||||
|     nav(role="navigation") | ||||
|       ul | ||||
|         li: a(href="home.html")        Home | ||||
|         li: a(href="docs.html")        Docs | ||||
|         li.active: a(href="blog.html") Blog | ||||
|         li: a(href="license.html")     License | ||||
| 
 | ||||
|     main#content(role='main') | ||||
|       section.intro.profile | ||||
|         p | ||||
|           img(src='img/matt.png') | ||||
|           | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. | ||||
|           span.social | ||||
|             a(href='#') Follow me on Twitter | ||||
|         nav(role='navigation') | ||||
|           ul | ||||
|             li | ||||
|               a.button(href='#') Blog | ||||
|             li | ||||
|               a.button(href='#tutorials') Tutorials | ||||
|       section.blogs | ||||
|         +Teaser( | ||||
|           "Introducing spaCy", | ||||
|           "blog_intro.html", | ||||
|           "February 2015", | ||||
|           "2015-02-18", | ||||
|           "Matthew Honnibal", | ||||
|           "<strong>spaCy</strong> is a new library for text processing in Python " + | ||||
|           "and Cython. I wrote it because I think small companies are terrible at " + | ||||
|           "natural language processing (NLP).  Or rather: small companies are using " + | ||||
|           "terrible NLP technology." | ||||
|         ) | ||||
| 
 | ||||
|         +Teaser( | ||||
|           "Parsing English with 500 lines of Python", | ||||
|           "blog_parser.html", | ||||
|           "December 18, 2013", | ||||
|           "2013-12-18", | ||||
|           "Matthew Hannibal", | ||||
|           "The Natural Language Processing (NLP) community has made big progress" + | ||||
|           "in syntactic parsing over the last few years. It’s now possible for a" + | ||||
|           "tiny Python implementation to perform better than the widely-used Stanford " + | ||||
|           "PCFG parser.") | ||||
|         +Teaser( | ||||
|           "A good Part-of-Speech tagger in about 200 lines of Python", | ||||
|           "blog_tagger.html", | ||||
|           "October 11, 2013", | ||||
|           "2013-09-11", | ||||
|           "Matthew Honnibal", | ||||
|           "There are a tonne of “best known techniques” for POS tagging, and you " + | ||||
|           "should ignore the others and just use greedy Averaged Perceptron." | ||||
|         ) | ||||
| 
 | ||||
|       section.intro | ||||
|         h2 | ||||
|           a.permalink(href='#tutorials', name='tutorials') Tutorials | ||||
| 
 | ||||
|       section.tutorials | ||||
|         include ./tutorials.jade | ||||
| 
 | ||||
|     footer(role="contentinfo") | ||||
|       span.slogan.copyright © 2015 Syllogism Co. | ||||
| 
 | ||||
|     script(src='js/prism.js') | ||||
|  | @ -1,81 +0,0 @@ | |||
| extends ./template_post.jade | ||||
| 
 | ||||
| - | ||||
|   var urls = { | ||||
|     'pos_post': 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/', | ||||
|     'google_ngrams': "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html", | ||||
|     'implementation': 'https://gist.github.com/syllog1sm/10343947', | ||||
|     'redshift': 'http://github.com/syllog1sm/redshift', | ||||
|     'tasker': 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm', | ||||
|     'acl_anthology': 'http://aclweb.org/anthology/', | ||||
|     'share_twitter': 'http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal' | ||||
|     } | ||||
| 
 | ||||
| - var my_research_software = '<a href="https://github.com/syllog1sm/redshift/tree/develop">my research software</a>' | ||||
| 
 | ||||
| - var how_to_write_a_POS_tagger = '<a href="https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/">how to write a part-of-speech tagger</a>' | ||||
| 
 | ||||
| - var parser_lnk = '<a href="https://honnibal.wordpress.com/2013/12/18/a-simple-fast-algorithm-for-natural-language-dependency-parsing/">parser</a>' | ||||
| 
 | ||||
| - var buy_a_commercial_license = '<a href="license.html">buy a commercial license</a>' | ||||
| 
 | ||||
| 
 | ||||
| block body_block | ||||
|   article.post | ||||
|     p. | ||||
|       <strong>spaCy</strong> is a new library for text processing in Python | ||||
|       and Cython. I wrote it because I think small companies are terrible at | ||||
|       natural language processing (NLP).  Or rather: small companies are using | ||||
|       terrible NLP technology. | ||||
| 
 | ||||
|     p. | ||||
|       To do great NLP, you have to know a little about linguistics, a lot | ||||
|       about machine learning, and almost everything about the latest research. | ||||
|       The people who fit this description seldom join small companies. | ||||
|       Most are broke – they've just finished grad school. | ||||
|       If they don't want to stay in academia, they join Google, IBM, etc. | ||||
| 
 | ||||
|     p. | ||||
|       The net result is that outside of the tech giants, commercial NLP has | ||||
|       changed little in the last ten years.  In academia, it's changed entirely. | ||||
|       Amazing improvements in quality.  Orders of magnitude faster.  But the | ||||
|       academic code is always GPL, undocumented, unusable, or all three.  | ||||
|       You could implement the ideas yourself, but the papers are hard to read, | ||||
|       and training data is exorbitantly expensive.  So what are you left with? | ||||
|       A common answer is NLTK, which was written primarily as an educational resource. | ||||
|       Nothing past the tokenizer is suitable for production use. | ||||
| 
 | ||||
|     p. | ||||
|       I used to think that the NLP community just needed to do more to communicate | ||||
|       its findings to software engineers.  So I wrote two blog posts, explaining | ||||
|       !{how_to_write_a_POS_tagger} and !{parser_lnk}.  Both were well | ||||
|       received, and there's been a bit of interest in !{my_research_software} | ||||
|       – even though it's entirely undocumented, and mostly unusable to | ||||
|       anyone but me. | ||||
|     p. | ||||
|       So six months ago I quit my post-doc, and I've been working day and night | ||||
|       on spaCy since.  I'm now pleased to announce an alpha release. | ||||
|    | ||||
|     p. | ||||
|       If you're a small company doing NLP, I think spaCy will seem like a minor | ||||
|       miracle.  It's by far the fastest NLP software ever released.  The | ||||
|       full processing pipeline completes in 20ms per document, including accurate | ||||
|       tagging and parsing.  All strings are mapped to integer IDs, tokens are | ||||
|       linked to embedded word representations, and a range of useful features | ||||
|       are pre-calculated and cached. | ||||
| 
 | ||||
|     p. | ||||
|       If none of that made any sense to you, here's the gist of it.  Computers | ||||
|       don't understand text.  This is unfortunate, because that's what the | ||||
|       web almost entirely consists of.  We want to recommend people text based | ||||
|       on other text they liked.  We want to shorten text to display it on a | ||||
|       mobile screen.  We want to aggregate it, link it, filter it, categorise | ||||
|       it, generate it and correct it. | ||||
| 
 | ||||
|     p.  | ||||
|       spaCy provides a library of utility functions that help programmers | ||||
|       build such products.  It's commercial open source software: you can | ||||
|       either use it under the AGPL, or you can !{buy_a_commercial_license} | ||||
|       under generous terms. | ||||
| 
 | ||||
|   footer(role='contentinfo') | ||||
|  | @ -1,938 +0,0 @@ | |||
| extends ./template_post.jade | ||||
| 
 | ||||
| 
 | ||||
| block body_block | ||||
|   - var urls = {} | ||||
|   - urls.pos_post = 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/' | ||||
|   - urls.google_ngrams = "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html" | ||||
|   - urls.implementation = 'https://gist.github.com/syllog1sm/10343947' | ||||
|   - urls.redshift = 'http://github.com/syllog1sm/redshift' | ||||
|   - urls.tasker = 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm' | ||||
|   - urls.acl_anthology = 'http://aclweb.org/anthology/' | ||||
|   - urls.share_twitter = "http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" | ||||
| 
 | ||||
|   // A comment | ||||
| 
 | ||||
|   article.post | ||||
|     header | ||||
|       h2 Parsing English in 500 lines of Python | ||||
|       .subhead | ||||
|         | by  | ||||
|         a(href='#', rel='author') Matthew Honnibal | ||||
|         |  on  | ||||
|         time(datetime='2013-12-18') December 18, 2013 | ||||
|     p | ||||
|       | A   | ||||
|       a(href=urls.google_ngrams) syntactic parser  | ||||
|       | describes a sentence’s grammatical structure, to help another | ||||
|       | application reason about it. Natural languages introduce many unexpected | ||||
|       | ambiguities, which our world-knowledge immediately filters out. A | ||||
|       | favourite example: | ||||
| 
 | ||||
|     p.example They ate the pizza with anchovies | ||||
| 
 | ||||
|     p | ||||
|       img(src='img/blog01.png', alt='Eat-with pizza-with ambiguity') | ||||
|     p | ||||
|       | A correct parse links “with” to “pizza”, while an incorrect parse | ||||
|       | links “with” to “eat”: | ||||
| 
 | ||||
|     .displacy | ||||
|       iframe(src='displacy/anchovies_bad.html', height='275') | ||||
| 
 | ||||
|     .displacy | ||||
|       iframe.displacy(src='displacy/anchovies_good.html', height='275') | ||||
|       a.view-displacy(href='#') View on displaCy | ||||
| 
 | ||||
|     p | ||||
|       | The Natural Language Processing (NLP) community has made big progress | ||||
|       | in syntactic parsing over the last few years. It’s now possible for | ||||
|       | a tiny Python implementation to perform better than the widely-used | ||||
|       | Stanford PCFG parser. | ||||
| 
 | ||||
|     p | ||||
|       strong Update! | ||||
|       |  The Stanford CoreNLP library now includes a greedy transition-based | ||||
|       | dependency parser, similar to the one described in this post, but with | ||||
|       | an improved learning strategy. It is much faster and more accurate | ||||
|       | than this simple Python implementation. | ||||
| 
 | ||||
|     table | ||||
|       thead | ||||
|         tr | ||||
|           th Parser | ||||
|           th Accuracy | ||||
|           th Speed (w/s) | ||||
|           th Language | ||||
|           th LOC | ||||
|       tbody | ||||
|         tr | ||||
|           td Stanford | ||||
|           td 89.6% | ||||
|           td 19 | ||||
|           td Java | ||||
|           td | ||||
|             | > 4,000 | ||||
|             sup | ||||
|               a(href='#note-1') [1] | ||||
|         tr | ||||
|           td | ||||
|             strong parser.py | ||||
|           td 89.8% | ||||
|           td 2,020 | ||||
|           td Python | ||||
|           td | ||||
|             strong ~500 | ||||
|         tr | ||||
|           td Redshift | ||||
|           td | ||||
|             strong 93.6% | ||||
|           td | ||||
|             strong 2,580 | ||||
|           td Cython | ||||
|           td ~4,000 | ||||
|     p | ||||
|       | The rest of the post sets up the problem, and then takes you through  | ||||
|       a(href=urls.implementation) a concise implementation | ||||
|       | , prepared for this post. The first 200 lines of parser.py, the | ||||
|       | part-of-speech tagger and learner, are described  | ||||
|       a(href=urls.pos_post) here. | ||||
|       | You should probably at least skim that | ||||
|       | post before reading this one, unless you’re very familiar with NLP | ||||
|       | research. | ||||
|     p | ||||
|       | The Cython system, Redshift, was written for my current research. I | ||||
|       | plan to improve it for general use in June, after my contract ends | ||||
|       | at Macquarie University. The current version is  | ||||
|       a(href=urls.redshift) hosted on GitHub | ||||
|       | . | ||||
|     h3 Problem Description | ||||
| 
 | ||||
|     p It’d be nice to type an instruction like this into your phone: | ||||
| 
 | ||||
|     p.example. | ||||
|       Set volume to zero when I’m in a meeting, unless John’s school calls. | ||||
|     p | ||||
|       | And have it set the appropriate policy. On Android you can do this | ||||
|       | sort of thing with  | ||||
|       a(href=urls.tasker) Tasker | ||||
|       | , but an NL interface would be much better. It’d be especially nice | ||||
|       | to receive a meaning representation you could edit, so you could see | ||||
|       | what it thinks you said, and correct it. | ||||
|     p | ||||
|       | There are lots of problems to solve to make that work, but some sort | ||||
|       | of syntactic representation is definitely necessary. We need to know that: | ||||
| 
 | ||||
|     p.example. | ||||
|       Unless John’s school calls, when I’m in a meeting, set volume to zero | ||||
| 
 | ||||
|     p is another way of phrasing the first instruction, while: | ||||
| 
 | ||||
|     p.example. | ||||
|       Unless John’s school, call when I’m in a meeting | ||||
| 
 | ||||
|     p means something completely different. | ||||
| 
 | ||||
|     p | ||||
|       | A dependency parser returns a graph of word-word relationships, | ||||
|       | intended to make such reasoning easier. Our graphs will be trees – | ||||
|       | edges will be directed, and every node (word) will have exactly one | ||||
|       | incoming arc (one dependency, with its head), except one. | ||||
| 
 | ||||
|     h4 Example usage | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | >>> parser = parser.Parser() | ||||
|         | >>> tokens = "Set the volume to zero when I 'm in a meeting unless John 's school calls".split() | ||||
|         | >>> tags, heads = parser.parse(tokens) | ||||
|         | >>> heads | ||||
|         | [-1, 2, 0, 0, 3, 0, 7, 5, 7, 10, 8, 0, 13, 15, 15, 11] | ||||
|         | >>> for i, h in enumerate(heads): | ||||
|         | ...     head = tokens[h] if h >= 0 else 'None' | ||||
|         | ...     print(tokens[i] + ' <-- ' + head) | ||||
|         | Set <-- None | ||||
|         | the <-- volume | ||||
|         | volume <-- Set | ||||
|         | to <-- Set | ||||
|         | zero <-- to | ||||
|         | when <-- Set | ||||
|         | I <-- 'm | ||||
|         | 'm <-- when | ||||
|         | in <-- 'm | ||||
|         | a <-- meeting | ||||
|         | meeting <-- in | ||||
|         | unless <-- Set | ||||
|         | John <-- 's | ||||
|         | 's   <-- calls | ||||
|         | school <-- calls | ||||
|         | calls <-- unless | ||||
| 
 | ||||
|     p. | ||||
|       The idea is that it should be slightly easier to reason from the parse, | ||||
|       than it was from the string. The parse-to-meaning mapping is hopefully | ||||
|       simpler than the string-to-meaning mapping. | ||||
| 
 | ||||
|     p. | ||||
|       The most confusing thing about this problem area is that “correctness” | ||||
|       is defined by convention — by annotation guidelines. If you haven’t | ||||
|       read the guidelines and you’re not a linguist, you can’t tell whether | ||||
|       the parse is “wrong” or “right”, which makes the whole task feel weird | ||||
|       and artificial. | ||||
|      | ||||
|     p. | ||||
|       For instance, there’s a mistake in the parse above: “John’s school | ||||
|       calls” is structured wrongly, according to the Stanford annotation | ||||
|       guidelines. The structure of that part of the sentence is how the | ||||
|       annotators were instructed to parse an example like “John’s school | ||||
|       clothes”. | ||||
|      | ||||
|     p | ||||
|       | It’s worth dwelling on this point a bit. We could, in theory, have | ||||
|       | written our guidelines so that the “correct” parses were reversed. | ||||
|       | There’s good reason to believe the parsing task will be harder if we | ||||
|       | reversed our convention, as it’d be less consistent with the rest of | ||||
|       | the grammar.  | ||||
|       sup: a(href='#note-2') [2] | ||||
|       | But we could test that empirically, and we’d be pleased to gain an | ||||
|       | advantage by reversing the policy. | ||||
| 
 | ||||
|     p | ||||
|       | We definitely do want that distinction in the guidelines — we don’t | ||||
|       | want both to receive the same structure, or our output will be less | ||||
|       | useful. The annotation guidelines strike a balance between what | ||||
|       | distinctions downstream applications will find useful, and what | ||||
|       | parsers will be able to predict easily. | ||||
| 
 | ||||
|     h4 Projective trees | ||||
| 
 | ||||
|     p | ||||
|       | There’s a particularly useful simplification that we can make, when | ||||
|       | deciding what we want the graph to look like: we can restrict the | ||||
|       | graph structures we’ll be dealing with. This doesn’t just give us a | ||||
|       | likely advantage in learnability; it can have deep algorithmic | ||||
|       | implications. We follow most work on English in constraining the | ||||
|       | dependency graphs to be  | ||||
|       em projective trees | ||||
|       | : | ||||
| 
 | ||||
|     ol | ||||
|       li Tree. Every word has exactly one head, except for the dummy ROOT symbol. | ||||
|       li | ||||
|         | Projective. For every pair of dependencies (a1, a2) and (b1, b2), | ||||
|         | if a1 < b1 < a2, then b2 <= a2 as well. In other words, dependencies | ||||
|         | cannot “cross”: you can’t have a pair of dependencies that goes | ||||
|         | a1 b1 a2 b2, or b1 a1 b2 a2 (see the sketch after this list). | ||||
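| 
 | ||||
|     p. | ||||
|       As an illustration (this helper is not part of the parser code below), | ||||
|       here is a small check of the crossing condition, where heads[i] gives | ||||
|       the head index of word i and -1 marks the root: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def is_projective(heads): | ||||
|         |     # Collect each arc as a (left, right) position pair. | ||||
|         |     arcs = [(min(i, h), max(i, h)) for i, h in enumerate(heads) if h >= 0] | ||||
|         |     for a1, a2 in arcs: | ||||
|         |         for b1, b2 in arcs: | ||||
|         |             if a1 < b1 < a2 < b2:  # the forbidden pattern a1 b1 a2 b2 | ||||
|         |                 return False | ||||
|         |     return True | ||||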
| 
 | ||||
|     p | ||||
|       | There’s a rich literature on parsing non-projective trees, and a | ||||
|       | smaller literature on parsing DAGs. But the parsing algorithm I’ll | ||||
|       | be explaining deals with projective trees. | ||||
| 
 | ||||
|     h3 Greedy transition-based parsing | ||||
| 
 | ||||
|     p | ||||
|       | Our parser takes as input a list of string tokens, and outputs a | ||||
|       | list of head indices, representing edges in the graph. If the  | ||||
| 
 | ||||
|       em i | ||||
| 
 | ||||
|       | th member of heads is  | ||||
| 
 | ||||
|       em j | ||||
| 
 | ||||
|       | , the dependency parse contains an edge (j, i). A transition-based | ||||
|       | parser is a finite-state transducer; it maps an array of N words | ||||
|       | onto an output array of N head indices: | ||||
| 
 | ||||
|     table.center | ||||
|       tbody | ||||
|         tr | ||||
|           td | ||||
|             em start | ||||
|           td MSNBC | ||||
|           td reported | ||||
|           td that | ||||
|           td Facebook | ||||
|           td bought | ||||
|           td WhatsApp | ||||
|           td for | ||||
|           td $16bn | ||||
|           td | ||||
|             em root | ||||
|         tr | ||||
|           td 0 | ||||
|           td 2 | ||||
|           td 9 | ||||
|           td 2 | ||||
|           td 4 | ||||
|           td 2 | ||||
|           td 4 | ||||
|           td 4 | ||||
|           td 7 | ||||
|           td 0 | ||||
|     p | ||||
|       | The heads array denotes that the head of  | ||||
|       em MSNBC | ||||
|       |  is  | ||||
|       em reported | ||||
|       | :  | ||||
|       em MSNBC | ||||
|       |  is word 1, and  | ||||
|       em reported | ||||
|       |  is word 2, and  | ||||
|       code.language-python heads[1] == 2 | ||||
|       | . You can already see why parsing a tree is handy — this data structure | ||||
|       | wouldn’t work if we had to output a DAG, where words may have multiple | ||||
|       | heads. | ||||
| 
 | ||||
|     p | ||||
|       | Although  | ||||
|       code.language-python heads | ||||
|       | can be represented as an array, we’d actually like to maintain some | ||||
|       | alternate ways to access the parse, to make it easy and efficient to | ||||
|       | extract features. Our  | ||||
| 
 | ||||
|       code.language-python Parse | ||||
|       | class looks like this: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | class Parse(object): | ||||
|         |     def __init__(self, n): | ||||
|         |         self.n = n | ||||
|         |         self.heads = [None] * (n-1) | ||||
|         |         self.lefts = [] | ||||
|         |         self.rights = [] | ||||
|         |         for i in range(n+1): | ||||
|         |             self.lefts.append(DefaultList(0)) | ||||
|         |             self.rights.append(DefaultList(0)) | ||||
|         |      | ||||
|         |     def add_arc(self, head, child): | ||||
|         |         self.heads[child] = head | ||||
|         |         if child < head: | ||||
|         |             self.lefts[head].append(child) | ||||
|         |         else: | ||||
|         |             self.rights[head].append(child) | ||||
| 
 | ||||
|     p | ||||
|       | As well as the parse, we also have to keep track of where we’re up | ||||
|       | to in the sentence. We’ll do this with an index into the  | ||||
|       code.language-python words | ||||
|       |  array, and a stack, to which we’ll push words, before popping them | ||||
|       | once their head is set. So our state data structure is fundamentally: | ||||
| 
 | ||||
|     ul | ||||
|       li An index, i, into the list of tokens; | ||||
|       li The dependencies added so far, in Parse | ||||
|       li | ||||
|         | A stack, containing words that occurred before i, for which we’re | ||||
|         | yet to assign a head. | ||||
| 
 | ||||
|     p Each step of the parsing process applies one of three actions to the state: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | SHIFT = 0; RIGHT = 1; LEFT = 2 | ||||
|         | MOVES = [SHIFT, RIGHT, LEFT] | ||||
|         |  | ||||
|         | def transition(move, i, stack, parse): | ||||
|         |     global SHIFT, RIGHT, LEFT | ||||
|         |     if move == SHIFT: | ||||
|         |         stack.append(i) | ||||
|         |         return i + 1 | ||||
|         |     elif move == RIGHT: | ||||
|         |         parse.add_arc(stack[-2], stack.pop()) | ||||
|         |         return i | ||||
|         |     elif move == LEFT: | ||||
|         |         parse.add_arc(i, stack.pop()) | ||||
|         |         return i | ||||
|         |     raise GrammarError("Unknown move: %d" % move) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     p | ||||
|       | The  | ||||
|       code.language-python LEFT | ||||
|       |  and  | ||||
|       code.language-python RIGHT | ||||
|       |  actions add dependencies and pop the stack, while  | ||||
|       code.language-python SHIFT | ||||
|       |  pushes the stack and advances i into the buffer. | ||||
|     p. | ||||
|       So, the parser starts with an empty stack, and a buffer index at 0, with | ||||
|       no dependencies recorded. It chooses one of the (valid) actions, and | ||||
|       applies it to the state. It continues choosing actions and applying | ||||
|       them until the stack is empty and the buffer index is at the end of | ||||
|       the input. (It’s hard to understand this sort of algorithm without | ||||
|       stepping through it. Try coming up with a sentence, drawing a projective | ||||
|       parse tree over it, and then try to reach the parse tree by choosing | ||||
|       the right sequence of transitions.) | ||||
| 
 | ||||
|     p Here’s what the parsing loop looks like in code: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | class Parser(object): | ||||
|         |     ... | ||||
|         |     def parse(self, words): | ||||
|         |         tags = self.tagger(words) | ||||
|         |         n = len(words) | ||||
|         |         idx = 1 | ||||
|         |         stack = [0] | ||||
|         |         deps = Parse(n) | ||||
|         |         while stack or idx < n: | ||||
|         |             features = extract_features(words, tags, idx, n, stack, deps) | ||||
|         |             scores = self.model.score(features) | ||||
|         |             valid_moves = get_valid_moves(idx, n, len(stack)) | ||||
|         |             next_move = max(valid_moves, key=lambda move: scores[move]) | ||||
|         |             idx = transition(next_move, idx, stack, deps) | ||||
|         |         return tags, deps | ||||
|         |  | ||||
|         | def get_valid_moves(i, n, stack_depth): | ||||
|         |     moves = [] | ||||
|         |     if i < n: | ||||
|         |         moves.append(SHIFT) | ||||
|         |     if stack_depth >= 2: | ||||
|         |         moves.append(RIGHT) | ||||
|         |     if stack_depth >= 1: | ||||
|         |         moves.append(LEFT) | ||||
|         |     return moves | ||||
|        | ||||
|     p. | ||||
|       We start by tagging the sentence, and initializing the state. We then | ||||
|       map the state to a set of features, which we score using a linear model. | ||||
|       We then find the best-scoring valid move, and apply it to the state. | ||||
| 
 | ||||
|     p | ||||
|       | The model scoring works the same as it did in  | ||||
|       a(href=urls.pos_post) the POS tagger. | ||||
|       | If you’re confused about the idea of extracting features and scoring | ||||
|       | them with a linear model, you should review that post. Here’s a reminder | ||||
|       | of how the model scoring works: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | class Perceptron(object): | ||||
|         |     ... | ||||
|         |     def score(self, features): | ||||
|         |         all_weights = self.weights | ||||
|         |         scores = dict((clas, 0) for clas in self.classes) | ||||
|         |         for feat, value in features.items(): | ||||
|         |             if value == 0: | ||||
|         |                 continue | ||||
|         |             if feat not in all_weights: | ||||
|         |                 continue | ||||
|         |             weights = all_weights[feat] | ||||
|         |             for clas, weight in weights.items(): | ||||
|         |                 scores[clas] += value * weight | ||||
|         |         return scores | ||||
| 
 | ||||
|     p. | ||||
|       It’s just summing the class-weights for each feature. This is often | ||||
|       expressed as a dot-product, but when you’re dealing with multiple | ||||
|       classes, that gets awkward, I find. | ||||
|      | ||||
|     p. | ||||
|       The beam parser (RedShift) tracks multiple candidates, and only decides | ||||
|       on the best one at the very end. We’re going to trade away accuracy | ||||
|       in favour of efficiency and simplicity. We’ll only follow a single | ||||
|       analysis. Our search strategy will be entirely greedy, as it was with | ||||
|       the POS tagger. We’ll lock-in our choices at every step. | ||||
| 
 | ||||
|     p. | ||||
|       If you read the POS tagger post carefully, you might see the underlying | ||||
|       similarity. What we’ve done is mapped the parsing problem onto a | ||||
|       sequence-labelling problem, which we address using a “flat”, or unstructured, | ||||
|       learning algorithm (by doing greedy search). | ||||
| 
 | ||||
|     h3 Features | ||||
|     p. | ||||
|       Feature extraction code is always pretty ugly. The features for the parser | ||||
|       refer to a few tokens from the context: | ||||
| 
 | ||||
|     ul | ||||
|       li The first three words of the buffer (n0, n1, n2) | ||||
|       li The top three words of the stack (s0, s1, s2) | ||||
|       li The two leftmost children of s0 (s0b1, s0b2); | ||||
|       li The two rightmost children of s0 (s0f1, s0f2); | ||||
|       li The two leftmost children of n0 (n0b1, n0b2) | ||||
| 
 | ||||
|     p. | ||||
|       For these 12 tokens, we refer to the word-form, the part-of-speech tag, | ||||
|       and the number of left and right children attached to the token. | ||||
| 
 | ||||
|     p. | ||||
|       Because we’re using a linear model, we have our features refer to pairs | ||||
|       and triples of these atomic properties. | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def extract_features(words, tags, n0, n, stack, parse): | ||||
|         |     def get_stack_context(depth, stack, data): | ||||
|         |         if depth >= 3: | ||||
|         |             return data[stack[-1]], data[stack[-2]], data[stack[-3]] | ||||
|         |         elif depth >= 2: | ||||
|         |             return data[stack[-1]], data[stack[-2]], '' | ||||
|         |         elif depth == 1: | ||||
|         |             return data[stack[-1]], '', '' | ||||
|         |         else: | ||||
|         |             return '', '', '' | ||||
|         |  | ||||
|         |     def get_buffer_context(i, n, data): | ||||
|         |         if i + 1 >= n: | ||||
|         |             return data[i], '', '' | ||||
|         |         elif i + 2 >= n: | ||||
|         |             return data[i], data[i + 1], '' | ||||
|         |         else: | ||||
|         |             return data[i], data[i + 1], data[i + 2] | ||||
|         |  | ||||
|         |     def get_parse_context(word, deps, data): | ||||
|         |         if word == -1: | ||||
|         |             return 0, '', '' | ||||
|         |         deps = deps[word] | ||||
|         |         valency = len(deps) | ||||
|         |         if not valency: | ||||
|         |             return 0, '', '' | ||||
|         |         elif valency == 1: | ||||
|         |             return 1, data[deps[-1]], '' | ||||
|         |         else: | ||||
|         |             return valency, data[deps[-1]], data[deps[-2]] | ||||
|         |  | ||||
|         |     features = {} | ||||
|         |     # Set up the context pieces --- the word, W, and tag, T, of: | ||||
|         |     # S0-2: Top three words on the stack | ||||
|         |     # N0-2: First three words of the buffer | ||||
|         |     # n0b1, n0b2: Two leftmost children of the first word of the buffer | ||||
|         |     # s0b1, s0b2: Two leftmost children of the top word of the stack | ||||
|         |     # s0f1, s0f2: Two rightmost children of the top word of the stack | ||||
|         |  | ||||
|         |     depth = len(stack) | ||||
|         |     s0 = stack[-1] if depth else -1 | ||||
|         |  | ||||
|         |     Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words) | ||||
|         |     Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags) | ||||
|         |  | ||||
|         |     Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words) | ||||
|         |     Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags) | ||||
|         |  | ||||
|         |     Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words) | ||||
|         |     Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags) | ||||
|         |  | ||||
|         |     Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words) | ||||
|         |     _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags) | ||||
|         |  | ||||
|         |     Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words) | ||||
|         |     _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags) | ||||
|         |  | ||||
|         |     Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words) | ||||
|         |     _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags) | ||||
|         |  | ||||
|         |     # String-distance between s0 and n0, capped at 5 | ||||
|         |     # (0 when the stack is empty) | ||||
|         |     Ds0n0 = min((n0 - s0, 5)) if s0 >= 0 else 0 | ||||
|         |  | ||||
|         |     features['bias'] = 1 | ||||
|         |     # Add word and tag unigrams | ||||
|         |     for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2): | ||||
|         |         if w: | ||||
|         |             features['w=%s' % w] = 1 | ||||
|         |     for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2): | ||||
|         |         if t: | ||||
|         |             features['t=%s' % t] = 1 | ||||
|         |  | ||||
|         |     # Add word/tag pairs | ||||
|         |     for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))): | ||||
|         |         if w or t: | ||||
|         |             features['%d w=%s, t=%s' % (i, w, t)] = 1 | ||||
|         |  | ||||
|         |     # Add some bigrams | ||||
|         |     features['s0w=%s,  n0w=%s' % (Ws0, Wn0)] = 1 | ||||
|         |     features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1 | ||||
|         |     features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1 | ||||
|         |     features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1 | ||||
|         |     features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1 | ||||
|         |     features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1 | ||||
|         |     features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1 | ||||
|         |     features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1 | ||||
|         |  | ||||
|         |     # Add some tag trigrams | ||||
|         |     trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),  | ||||
|         |                 (Ts0, Ts0f1, Tn0), (Ts0, Ts0b1, Tn0), (Ts0, Tn0, Tn0b1), | ||||
|         |                 (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2), | ||||
|         |                 (Ts0, Ts1, Ts2)) | ||||
|         |     for i, (t1, t2, t3) in enumerate(trigrams): | ||||
|         |         if t1 or t2 or t3: | ||||
|         |             features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1 | ||||
|         |  | ||||
|         |     # Add some valency and distance features | ||||
|         |     vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b)) | ||||
|         |     vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b)) | ||||
|         |     d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0), | ||||
|         |         ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0)) | ||||
|         |     for i, (w_t, v_d) in enumerate(vw + vt + d): | ||||
|         |         if w_t or v_d: | ||||
|         |             features['val/d-%d %s %d' % (i, w_t, v_d)] = 1 | ||||
|         |     return features | ||||
| 
 | ||||
| 
 | ||||
|     h3 Training | ||||
|      | ||||
|     p. | ||||
|       Weights are learned using the same algorithm, the averaged perceptron, | ||||
|       that we used for part-of-speech tagging. Its key strength is that it’s an | ||||
|       online learning algorithm: examples stream in one-by-one, we make our | ||||
|       prediction, check the actual answer, and adjust our beliefs (weights) | ||||
|       if we were wrong. | ||||
|          | ||||
|     p The training loop looks like this: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|           | class Parser(object): | ||||
|           |     ... | ||||
|           |     def train_one(self, itn, words, gold_tags, gold_heads): | ||||
|           |         n = len(words) | ||||
|           |         i = 2; stack = [1]; parse = Parse(n) | ||||
|           |         tags = self.tagger.tag(words) | ||||
|           |         while stack or (i + 1) < n: | ||||
|           |             features = extract_features(words, tags, i, n, stack, parse) | ||||
|           |             scores = self.model.score(features) | ||||
|           |             valid_moves = get_valid_moves(i, n, len(stack)) | ||||
|           |             guess = max(valid_moves, key=lambda move: scores[move]) | ||||
|           |             gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads) | ||||
|           |             best = max(gold_moves, key=lambda move: scores[move]) | ||||
|           |             self.model.update(best, guess, features) | ||||
|           |             i = transition(guess, i, stack, parse) | ||||
|           |         # Return number correct | ||||
|           |         return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]]) | ||||
| 
 | ||||
| 
 | ||||
|        | ||||
|     p  | ||||
|       | The most interesting part of the training process is in  | ||||
|       code.language-python get_gold_moves. | ||||
|       | The performance of our parser is made possible by an advance from | ||||
|       | Goldberg and Nivre (2012), who showed that we’d been doing this wrong for years. | ||||
|      | ||||
|     p | ||||
|       | In the POS-tagging post, I cautioned that during training you need to | ||||
|       | make sure you pass in the last two | ||||
|       em predicted | ||||
|       | tags as features for the current tag, not the last two  | ||||
|       em gold | ||||
|       | tags. At test time you’ll only have the predicted tags, so if you | ||||
|       | base your features on the gold sequence during training, your training | ||||
|       | contexts won’t resemble your test-time contexts, so you’ll learn the | ||||
|       | wrong weights. | ||||
| 
 | ||||
|     p | ||||
|       | In parsing, the problem was that we didn’t know  | ||||
|       em how | ||||
|       |  to pass in the predicted sequence! Training worked by taking the | ||||
|       | gold-standard tree, and finding a transition sequence that led to it. | ||||
|       | That is, you got back a sequence of moves, with the guarantee that if | ||||
|       | you followed those moves, you’d get the gold-standard dependencies. | ||||
|      | ||||
|     p | ||||
|       | The problem is, we didn’t know how to define the “correct” move to | ||||
|       | teach a parser to make if it was in any state that  | ||||
|       em wasn’t | ||||
|       |  along that gold-standard sequence. Once the parser had made a mistake, | ||||
|       | we didn’t know how to train from that example. | ||||
| 
 | ||||
|     p | ||||
|       | That was a big problem, because it meant that once the parser started | ||||
|       | making mistakes, it would end up in states unlike any in its training | ||||
|       | data – leading to yet more mistakes. The problem was specific | ||||
|       | to greedy parsers: once you use a beam, there’s a natural way to do | ||||
|       | structured prediction. | ||||
|     p | ||||
|       | The solution seems obvious once you know it, like all the best breakthroughs. | ||||
|       | What we do is define a function that asks “How many gold-standard | ||||
|       | dependencies can be recovered from this state?”. If you can define | ||||
|       | that function, then you can apply each move in turn, and ask, “How | ||||
|       | many gold-standard dependencies can be recovered from  | ||||
|       em this | ||||
|       | state?”. If the action you applied allows  | ||||
|       em fewer | ||||
|       | gold-standard dependencies to be reached, then it is sub-optimal. | ||||
| 
 | ||||
|     p That’s a lot to take in. | ||||
| 
 | ||||
|     p | ||||
|       | So we have this function  | ||||
|       code Oracle(state) | ||||
|       | : | ||||
|     pre | ||||
|       code | ||||
|         | Oracle(state) = | gold_arcs ∩ reachable_arcs(state) | | ||||
|     p | ||||
|       | We also have a set of actions, each of which returns a new state. | ||||
|       | We want to know: | ||||
| 
 | ||||
|     ul | ||||
|       li shift_cost = Oracle(state) – Oracle(shift(state)) | ||||
|       li right_cost = Oracle(state) – Oracle(right(state)) | ||||
|       li left_cost = Oracle(state) – Oracle(left(state)) | ||||
|      | ||||
|     p | ||||
|       | Now, at least one of those costs  | ||||
|       em has | ||||
|       | to be zero. Oracle(state) is asking, “what’s the cost of the best | ||||
|       | path forward?”, and the first action of that best path has to be | ||||
|       | shift, right, or left. | ||||
| 
 | ||||
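|     p. | ||||
|       To make that concrete, here is a naive sketch of how you could select | ||||
|       the zero-cost moves, given such an oracle. The helpers oracle and | ||||
|       apply_move are hypothetical stand-ins, and the implementation below | ||||
|       avoids this kind of state-copying entirely: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def zero_cost_moves(state, moves, oracle, apply_move): | ||||
|         |     # A move costs nothing if, after applying it, just as many | ||||
|         |     # gold-standard dependencies remain reachable. | ||||
|         |     best = oracle(state) | ||||
|         |     return [m for m in moves if oracle(apply_move(m, state)) == best] | ||||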
|     p | ||||
|       | It turns out that we can derive Oracle fairly simply for many transition | ||||
|       | systems. The derivation for the transition system we’re using, Arc | ||||
|       | Hybrid, is in Goldberg and Nivre (2013). | ||||
| 
 | ||||
|     p | ||||
|       | We’re going to implement the oracle as a function that returns the | ||||
|       | zero-cost moves, rather than implementing a function Oracle(state). | ||||
|       | This prevents us from doing a bunch of costly copy operations. | ||||
|       | Hopefully the reasoning in the code isn’t too hard to follow, but | ||||
|       | you can also consult Goldberg and Nivre’s papers if you’re confused | ||||
|       | and want to get to the bottom of this. | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def get_gold_moves(n0, n, stack, heads, gold): | ||||
|         |     def deps_between(target, others, gold): | ||||
|         |         for word in others: | ||||
|         |             if gold[word] == target or gold[target] == word: | ||||
|         |                 return True | ||||
|         |         return False | ||||
|         |  | ||||
|         |     valid = get_valid_moves(n0, n, len(stack)) | ||||
|         |     if not stack or (SHIFT in valid and gold[n0] == stack[-1]): | ||||
|         |         return [SHIFT] | ||||
|         |     if gold[stack[-1]] == n0: | ||||
|         |         return [LEFT] | ||||
|         |     costly = set([m for m in MOVES if m not in valid]) | ||||
|         |     # If the word behind s0 is its gold head, Left is incorrect | ||||
|         |     if len(stack) >= 2 and gold[stack[-1]] == stack[-2]: | ||||
|         |         costly.add(LEFT) | ||||
|         |     # If there are any dependencies between n0 and the stack, | ||||
|         |     # pushing n0 will lose them. | ||||
|         |     if SHIFT not in costly and deps_between(n0, stack, gold): | ||||
|         |         costly.add(SHIFT) | ||||
|         |     # If there are any dependencies between s0 and the buffer, popping | ||||
|         |     # s0 will lose them. | ||||
|         |     if deps_between(stack[-1], range(n0+1, n-1), gold): | ||||
|         |         costly.add(LEFT) | ||||
|         |         costly.add(RIGHT) | ||||
|         |     return [m for m in MOVES if m not in costly] | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     p | ||||
|       | Doing this “dynamic oracle” training procedure makes a big difference | ||||
|       | to accuracy — typically 1-2%, with no change to how the parser runs | ||||
|       | at test time. The old “static oracle” greedy training procedure is fully | ||||
|       | obsolete; there’s no reason to do it that way any more. | ||||
| 
 | ||||
|     h3 Conclusion | ||||
| 
 | ||||
|     p | ||||
|       | I have the sense that language technologies, particularly those relating | ||||
|       | to grammar, are particularly mysterious. I can imagine having no idea | ||||
|       | what the program might even do. | ||||
| 
 | ||||
|     p | ||||
|       | I think it therefore seems natural to people that the best solutions | ||||
|       | would be overwhelmingly complicated. A 200,000 line Java package | ||||
|       | feels appropriate. | ||||
|     p | ||||
|       | But algorithmic code is usually short when only a single algorithm | ||||
|       | is implemented. And when you only implement one algorithm, and you | ||||
|       | know exactly what you want to write before you write a line, you | ||||
|       | also don’t pay for any unnecessary abstractions, which can have a | ||||
|       | big performance impact. | ||||
| 
 | ||||
|     h3 Notes | ||||
|     p | ||||
|       a(name='note-1') | ||||
|         | [1] I wasn’t really sure how to count the lines of code in the Stanford | ||||
|         | parser. Its jar file ships over 200k, but there are a lot of different | ||||
|         | models in it. It’s not important, but it’s certainly over 4k. | ||||
| 
 | ||||
|     p | ||||
|       a(name='note-2') | ||||
|       | [2] For instance, how would you parse, “John’s school of music calls”? | ||||
|       | You want to make sure the phrase “John’s school” has a consistent | ||||
|       | structure in both “John’s school calls” and “John’s school of music | ||||
|       | calls”. Reasoning about the different “slots” you can put a phrase | ||||
|       | into is a key way we reason about what syntactic analyses look like. | ||||
|       | You can think of each phrase as having a differently shaped connector, | ||||
|       | which you need to plug into different slots — and each phrase also | ||||
|       | offers a certain number of slots, each of a different shape. We’re trying to | ||||
|       | figure out what connectors are where, so we can figure out how the | ||||
|       | sentences are put together. | ||||
| 
 | ||||
|     h3 Idle speculation | ||||
|     p | ||||
|       | For a long time, incremental language processing algorithms were | ||||
|       | primarily of scientific interest. If you want to write a parser to | ||||
|       | test a theory about how the human sentence processor might work, well, | ||||
|       | that parser needs to build partial interpretations. There’s a wealth | ||||
|       | of evidence, including commonsense introspection, that establishes | ||||
|       | that we don’t buffer input and analyse it once the speaker has finished. | ||||
| 
 | ||||
|     p | ||||
|       | But now algorithms with that neat scientific feature are winning! | ||||
|       | As best I can tell, the secret to that success is to be: | ||||
| 
 | ||||
|     ul | ||||
|       li Incremental. Earlier words constrain the search. | ||||
|       li | ||||
|         | Error-driven. Training involves a working hypothesis, which is | ||||
|         | updated as it makes mistakes. | ||||
| 
 | ||||
|     p | ||||
|       | The links to human sentence processing seem tantalising. I look | ||||
|       | forward to seeing whether these engineering breakthroughs lead to | ||||
|       | any psycholinguistic advances. | ||||
| 
 | ||||
|     h3 Bibliography | ||||
| 
 | ||||
|     p | ||||
|       | The NLP literature is almost entirely open access. All of the relevant | ||||
|       | papers can be found  | ||||
|       a(href=urls.acl_anthology, rel='nofollow') here | ||||
|       | . | ||||
|     p | ||||
|       | The parser I’ve described is an implementation of the dynamic-oracle | ||||
|       | Arc-Hybrid system here: | ||||
| 
 | ||||
|       span.bib-item | ||||
|         | Goldberg, Yoav; Nivre, Joakim.  | ||||
|         em Training Deterministic Parsers with Non-Deterministic Oracles | ||||
|         | . TACL 2013 | ||||
|     p | ||||
|       | However, I wrote my own features for it. The arc-hybrid system was | ||||
|       | originally described here: | ||||
| 
 | ||||
|       span.bib-item | ||||
|         | Kuhlmann, Marco; Gomez-Rodriguez, Carlos; Satta, Giorgio. Dynamic | ||||
|         | programming algorithms for transition-based dependency parsers. ACL 2011 | ||||
| 
 | ||||
|     p | ||||
|       | The dynamic oracle training method was first described here: | ||||
|       span.bib-item | ||||
|         | A Dynamic Oracle for Arc-Eager Dependency Parsing. Goldberg, Yoav; | ||||
|         | Nivre, Joakim. COLING 2012 | ||||
| 
 | ||||
|     p | ||||
|       | This work depended on a big break-through in accuracy for transition-based | ||||
|       | parsers, when beam-search was properly explored by Zhang and Clark. | ||||
|       | They have several papers, but the preferred citation is: | ||||
| 
 | ||||
|       span.bib-item | ||||
|         | Zhang, Yue; Clark, Stephen. Syntactic Processing Using the Generalized | ||||
|         | Perceptron and Beam Search. Computational Linguistics 2011 (1) | ||||
|     p | ||||
|       | Another important paper was this little feature engineering paper, | ||||
|       | which further improved the accuracy: | ||||
| 
 | ||||
|       span.bib-item | ||||
|         | Zhang, Yue;  Nivre, Joakim. Transition-based Dependency Parsing with | ||||
|         | Rich Non-local Features. ACL 2011 | ||||
| 
 | ||||
|     p | ||||
|       | The generalised perceptron, which is the learning framework for these | ||||
|       | beam parsers, is from this paper: | ||||
|       span.bib-item | ||||
|         | Collins, Michael. Discriminative Training Methods for Hidden Markov | ||||
|         | Models: Theory and Experiments with Perceptron Algorithms. EMNLP 2002 | ||||
| 
 | ||||
|     h3 Experimental details | ||||
|     p | ||||
|       | The results at the start of the post refer to Section 22 of the Wall | ||||
|       | Street Journal corpus. The Stanford parser was run as follows: | ||||
| 
 | ||||
|     pre.language-bash | ||||
|       code | ||||
|         | java -mx10000m -cp "$scriptdir/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser \ | ||||
|         | -outputFormat "penn" edu/stanford/nlp/models/lexparser/englishFactored.ser.gz $* | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|     p | ||||
|       | A small post-process was applied, to undo the fancy tokenisation | ||||
|       | Stanford adds for numbers, to make them match the PTB tokenisation: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | """Stanford parser retokenises numbers. Split them.""" | ||||
|         | import sys | ||||
|         | import re | ||||
|         |   | ||||
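|         | # \xc2\xa0 is the UTF-8 byte sequence for a non-breaking space, | ||||
|         | # which joins the pieces of the numbers we need to split | ||||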
|         | qp_re = re.compile('\xc2\xa0') | ||||
|         | for line in sys.stdin: | ||||
|         |     line = line.rstrip() | ||||
|         |     if qp_re.search(line): | ||||
|         |         line = line.replace('(CD', '(QP (CD', 1) + ')' | ||||
|         |         line = line.replace('\xc2\xa0', ') (CD ') | ||||
|         |     print line | ||||
| 
 | ||||
|     p | ||||
|       | The resulting PTB-format files were then converted into dependencies | ||||
|       | using the Stanford converter: | ||||
| 
 | ||||
|     pre.language-bash | ||||
|       code | ||||
|         | for f in $1/*.mrg; do | ||||
|         |   echo $f | ||||
|         |   grep -v CODE $f > "$f.2" | ||||
|         |   out="$f.dep" | ||||
|         |   java -mx800m -cp "$scriptdir/*:" edu.stanford.nlp.trees.EnglishGrammaticalStructure \ | ||||
|         |    -treeFile "$f.2" -basic -makeCopulaHead -conllx > $out | ||||
|         | done | ||||
|     p | ||||
|       | I can’t easily read that anymore, but it should just convert every | ||||
|       | .mrg file in a folder to a CoNLL-format Stanford basic dependencies | ||||
|       | file, using the settings common in the dependency literature. | ||||
| 
 | ||||
|     p | ||||
|       | I then converted the gold-standard trees from WSJ 22, for the evaluation. | ||||
|       | Accuracy scores refer to unlabelled attachment score (i.e. the head index) | ||||
|       | of all non-punctuation tokens. | ||||
| 
 | ||||
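|     p. | ||||
|       For clarity, here is a minimal sketch of that metric, assuming parallel | ||||
|       lists of predicted and gold head indices plus a punctuation mask: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def uas(pred_heads, gold_heads, is_punct): | ||||
|         |     '''Unlabelled attachment score over non-punctuation tokens.''' | ||||
|         |     pairs = [(p, g) for p, g, punct | ||||
|         |              in zip(pred_heads, gold_heads, is_punct) if not punct] | ||||
|         |     return sum(p == g for p, g in pairs) / float(len(pairs)) | ||||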
|     p | ||||
|       | To train parser.py, I fed the gold-standard PTB trees for WSJ 02-21 | ||||
|       | into the same conversion script. | ||||
| 
 | ||||
|     p | ||||
|       | In a nutshell: The Stanford model and parser.py are trained on the | ||||
|       | same set of sentences, and they each make their predictions on a | ||||
|       | held-out test set, for which we know the answers. Accuracy refers | ||||
|       | to how many of the words’ heads we got correct. | ||||
| 
 | ||||
|     p | ||||
|       | Speeds were measured on a 2.4GHz Xeon. I ran the experiments on a | ||||
|       | server, to give the Stanford parser more memory. The parser.py system | ||||
|       | runs fine on my MacBook Air. I used PyPy for the parser.py experiments; | ||||
|       | CPython was about half as fast on an early benchmark. | ||||
| 
 | ||||
|     p | ||||
|       | One of the reasons parser.py is so fast is that it does unlabelled | ||||
|       | parsing. Based on previous experiments, a labelled parser would likely | ||||
|       | be about 40x slower, and about 1% more accurate. Adapting the program | ||||
|       | to labelled parsing would be a good exercise for the reader, if you | ||||
|       | have access to the data. | ||||
| 
 | ||||
|     p | ||||
|       | The result from the Redshift parser was produced from commit  | ||||
|       code.language-python b6b624c9900f3bf | ||||
|       | , which was run as follows: | ||||
|     pre.language-bash | ||||
|       code | ||||
|         | ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp | ||||
|         | ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/ | ||||
|         | ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll | ||||
| 
 | ||||
|     footer.meta(role='contentinfo') | ||||
|       a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter | ||||
|       .discuss | ||||
|         a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News | ||||
|         |  | ||||
|         a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit | ||||
|  | @ -1,492 +0,0 @@ | |||
| extends ./template_post.jade | ||||
| 
 | ||||
| block body_block | ||||
|   - var urls = {} | ||||
|   - urls.share_twitter = "http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" | ||||
| 
 | ||||
| 
 | ||||
|   article.post | ||||
|     header | ||||
|       h2 A good Part-of-Speech tagger in about 200 lines of Python | ||||
|       .subhead | ||||
|         | by  | ||||
|         a(href="#" rel="author") Matthew Honnibal | ||||
|         | on  | ||||
|         time(datetime='2013-09-11') September 11, 2013 | ||||
| 
 | ||||
|     p. | ||||
|       Up-to-date knowledge about natural language processing is mostly locked away | ||||
|       in academia. And academics are mostly pretty self-conscious when we write. | ||||
|       We’re careful. We don’t want to stick our necks out too much. But under-confident | ||||
|       recommendations suck, so here’s how to write a good part-of-speech tagger. | ||||
|        | ||||
|     p. | ||||
|       There are a tonne of “best known techniques” for POS tagging, and you should | ||||
|       ignore the others and just use Averaged Perceptron. | ||||
|        | ||||
|     p. | ||||
|       You should use two tags of history, and features derived from the Brown word | ||||
|       clusters distributed here. | ||||
|        | ||||
|     p. | ||||
|       If you only need the tagger to work on carefully edited text, you should | ||||
|       use case-sensitive features, but if you want a more robust tagger you | ||||
|       should avoid them because they’ll make you over-fit to the conventions | ||||
|       of your training domain. Instead, features that ask “how frequently is | ||||
|       this word title-cased, in a large sample from the web?” work well. Then | ||||
|       you can lower-case your comparatively tiny training corpus. | ||||
|        | ||||
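|     p. | ||||
|       A sketch of what such a feature could look like. The frequency table | ||||
|       and the bin thresholds are illustrative assumptions, not part of the | ||||
|       tagger described below: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def titlecase_feature(word, titlecase_freqs): | ||||
|         |     # titlecase_freqs: a map from lower-cased word to the proportion | ||||
|         |     # of its occurrences, in a large web sample, that were title-cased | ||||
|         |     p = titlecase_freqs.get(word.lower(), 0.0) | ||||
|         |     if p >= 0.9: | ||||
|         |         return 'titlecase:high' | ||||
|         |     elif p >= 0.5: | ||||
|         |         return 'titlecase:mid' | ||||
|         |     else: | ||||
|         |         return 'titlecase:low' | ||||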
|     p. | ||||
|       For efficiency, you should figure out which frequent words in your training | ||||
|       data have unambiguous tags, so you don’t have to do anything but output | ||||
|       their tags when they come up. About 50% of the words can be tagged that way. | ||||
|        | ||||
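|     p. | ||||
|       A minimal sketch of how such a tag dictionary can be built; the | ||||
|       frequency and ambiguity thresholds here are illustrative: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | from collections import defaultdict | ||||
|         |  | ||||
|         | def make_tagdict(sentences, freq_thresh=20, ambiguity_thresh=0.97): | ||||
|         |     '''Map frequent, nearly unambiguous words straight to a tag.''' | ||||
|         |     counts = defaultdict(lambda: defaultdict(int)) | ||||
|         |     for words, tags in sentences: | ||||
|         |         for word, tag in zip(words, tags): | ||||
|         |             counts[word][tag] += 1 | ||||
|         |     tagdict = {} | ||||
|         |     for word, tag_freqs in counts.items(): | ||||
|         |         tag, mode = max(tag_freqs.items(), key=lambda it: it[1]) | ||||
|         |         n = sum(tag_freqs.values()) | ||||
|         |         # Only trust words seen often enough, with one dominant tag | ||||
|         |         if n >= freq_thresh and float(mode) / n >= ambiguity_thresh: | ||||
|         |             tagdict[word] = tag | ||||
|         |     return tagdict | ||||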
|     p. | ||||
|       And unless you really, really can’t do without an extra 0.1% of accuracy, | ||||
|       you probably shouldn’t bother with any kind of search strategy; you should | ||||
|       just use a greedy model. | ||||
|        | ||||
|     p. | ||||
|       If you do all that, you’ll find your tagger easy to write and understand, | ||||
|       and an efficient Cython implementation will perform as follows on the standard | ||||
|       evaluation, 130,000 words of text from the Wall Street Journal: | ||||
|        | ||||
|     table | ||||
|       thead | ||||
|         tr | ||||
|           th Tagger | ||||
|           th Accuracy | ||||
|           th Time (130k words) | ||||
|       tbody | ||||
|         tr | ||||
|           td CyGreedyAP | ||||
|           td 97.1% | ||||
|           td 4s | ||||
| 
 | ||||
|     p. | ||||
|       The 4s includes initialisation time — the actual per-token speed is high | ||||
|       enough to be irrelevant; it won’t be your bottleneck. | ||||
|        | ||||
|     p. | ||||
|       It’s tempting to look at 97% accuracy and conclude that tagging is all but | ||||
|       solved, but that’s not true. My parser is about 1% more accurate if the | ||||
|       input has hand-labelled POS tags, and the taggers all perform much worse | ||||
|       on out-of-domain data. | ||||
|       Unfortunately accuracies have been fairly flat for the last ten years. | ||||
|       That’s why my recommendation is to just use a simple and fast tagger that’s | ||||
|       roughly as good. | ||||
|        | ||||
|     p. | ||||
|       The thing is though, it’s very common to see people using taggers that | ||||
|       aren’t anywhere near that good!  For an example of what a non-expert is | ||||
|       likely to use, these were the two taggers wrapped by TextBlob, a new Python | ||||
|       API that I think is quite neat: | ||||
|        | ||||
|     table | ||||
|       thead | ||||
|         tr | ||||
|           th Tagger | ||||
|           th Accuracy | ||||
|           th Time (130k words) | ||||
|       tbody | ||||
|         tr | ||||
|           td NLTK | ||||
|           td 94.0% | ||||
|           td 3m56s | ||||
|         tr | ||||
|           td Pattern | ||||
|           td 93.5% | ||||
|           td 26s | ||||
| 
 | ||||
|     p. | ||||
|       Both Pattern and NLTK are very robust and beautifully well documented, so | ||||
|       the appeal of using them is obvious. But Pattern’s algorithms are pretty | ||||
|       crappy, and NLTK carries tremendous baggage around in its implementation | ||||
|       because of its massive framework and its double duty as a teaching tool. | ||||
| 
 | ||||
|     p.   | ||||
|       As a stand-alone tagger, my Cython implementation is needlessly complicated | ||||
|       – it was written for my parser. So today I wrote a 200 line version | ||||
|       of my recommended algorithm for TextBlob. It gets: | ||||
|        | ||||
|     table | ||||
|       thead | ||||
|         tr | ||||
|           th Tagger | ||||
|           th Accuracy | ||||
|           th Time (130k words) | ||||
|       tbody | ||||
|         tr | ||||
|           td PyGreedyAP | ||||
|           td 96.8% | ||||
|           td 12s | ||||
| 
 | ||||
|     p. | ||||
|       I traded some accuracy and a lot of efficiency to keep the implementation | ||||
|       simple. Here’s a far-too-brief description of how it works. | ||||
|        | ||||
|     h3 Averaged perceptron | ||||
| 
 | ||||
|     p. | ||||
|       POS tagging is a “supervised learning problem”. You’re given a table of data, | ||||
|       and you’re told that the values in the last column will be missing during | ||||
|       run-time. You have to find correlations from the other columns to predict | ||||
|       that value. | ||||
|        | ||||
|     p. | ||||
|       So for us, the missing column will be “part of speech at word i”. The predictor | ||||
|       columns (features) will be things like “part of speech at word i-1”, “last three | ||||
|       letters of word at i+1”, etc. | ||||
|        | ||||
|     p. | ||||
|       First, here’s what prediction looks like at run-time: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def predict(self, features): | ||||
|         |     '''Dot-product the features and current weights and return the best class.''' | ||||
|         |     scores = defaultdict(float) | ||||
|         |     for feat in features: | ||||
|         |         if feat not in self.weights: | ||||
|         |             continue | ||||
|         |         weights = self.weights[feat] | ||||
|         |         for clas, weight in weights.items(): | ||||
|         |             scores[clas] += weight | ||||
|         |     # Do a secondary alphabetic sort, for stability | ||||
|         |     return max(self.classes, key=lambda clas: (scores[clas], clas)) | ||||
| 
 | ||||
|     p. | ||||
|       Earlier I described the learning problem as a table, with one of the columns | ||||
|       marked as missing-at-runtime. For NLP, our tables are always exceedingly | ||||
|       sparse. You have columns like “word i-1=Parliament”, which is almost always | ||||
|       0. So our “weight vectors” can pretty much never be implemented as vectors. | ||||
|       Map-types are good though — here we use dictionaries. | ||||
|        | ||||
|     p. | ||||
|       The input data, features, is a set with a member for every non-zero “column” | ||||
|       in our “table” – every active feature. Usually this is actually a dictionary, | ||||
|       to let you set values for the features. But here all my features are binary | ||||
|       present-or-absent type deals. | ||||
|        | ||||
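|     p. | ||||
|       For example, the active features for the word “running” after a | ||||
|       determiner might look roughly like this, using the naming scheme of | ||||
|       the feature extractor shown later (values are illustrative): | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | features = set([ | ||||
|         |     'bias',           # always active | ||||
|         |     'i suffix+ing',   # last three letters | ||||
|         |     'i pref1+r',      # first letter | ||||
|         |     'i-1 tag+DT',     # previous predicted tag | ||||
|         |     'i word+running', # the word itself | ||||
|         | ]) | ||||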
|     p. | ||||
|       The weights data-structure is a dictionary of dictionaries, that ultimately | ||||
|       associates feature/class pairs with some weight. You want to structure it | ||||
|       this way instead of the reverse because of the way word frequencies are | ||||
|       distributed: most words are rare, frequent words are very frequent. | ||||
|        | ||||
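|     p. | ||||
|       Concretely, the structure looks something like this, with made-up | ||||
|       numbers for illustration: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | weights = { | ||||
|         |     'i word+the': {'DT': 4.2, 'NN': -0.9}, | ||||
|         |     'i suffix+ing': {'VBG': 3.1, 'NN': 1.4, 'JJ': 0.2}, | ||||
|         | } | ||||
|         | # weights['i suffix+ing']['VBG'] is the weight for that | ||||
|         | # feature/class pair | ||||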
|     h3 Learning the weights | ||||
| 
 | ||||
|     p. | ||||
|       Okay, so how do we get the values for the weights? We start with an empty | ||||
|       weights dictionary, and iteratively do the following: | ||||
| 
 | ||||
|     ol | ||||
|       li Receive a new (features, POS-tag) pair | ||||
|       li Guess the value of the POS tag given the current “weights” for the features | ||||
|       li If guess is wrong, add +1 to the weights associated with the correct class for these features, and -1 to the weights for the predicted class. | ||||
| 
 | ||||
| 
 | ||||
|     p. | ||||
|       It’s one of the simplest learning algorithms. Whenever you make a mistake, | ||||
|       increment the weights for the correct class, and penalise the weights that | ||||
|       led to your false prediction. In code: | ||||
|      | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def train(self, nr_iter, examples): | ||||
|         |     for i in range(nr_iter): | ||||
|         |         for features, true_tag in examples: | ||||
|         |             guess = self.predict(features) | ||||
|         |             if guess != true_tag: | ||||
|         |                 for f in features: | ||||
|         |                     self.weights[f][true_tag] += 1 | ||||
|         |                     self.weights[f][guess] -= 1 | ||||
|         |         random.shuffle(examples) | ||||
|     p. | ||||
|       If you iterate over the same example this way, the weights for the correct | ||||
|       class would have to come out ahead, and you’d get the example right. If | ||||
|       you think about what happens with two examples, you should be able to | ||||
|       see that it will get them both right unless the features are identical. | ||||
|       In general the algorithm will converge so long as the examples are | ||||
|       linearly separable, although that doesn’t matter for our purpose. | ||||
|        | ||||
|     h3 Averaging the weights | ||||
| 
 | ||||
|     p. | ||||
|       We need to do one more thing to make the perceptron algorithm competitive. | ||||
|       The problem with the algorithm so far is that if you train it twice on | ||||
|       slightly different sets of examples, you end up with really different models. | ||||
|       It doesn’t generalise that smartly. And the problem is really in the later | ||||
|       iterations — if you let it run to convergence, it’ll pay lots of attention | ||||
|       to the few examples it’s getting wrong, and mutate its whole model around | ||||
|       them. | ||||
| 
 | ||||
|     p. | ||||
|       So, what we’re going to do is make the weights more "sticky" – give | ||||
|       the model less chance to ruin all its hard work in the later rounds. And | ||||
|       we’re going to do that by returning the averaged weights, not the final | ||||
|       weights. | ||||
| 
 | ||||
|     p. | ||||
|       I doubt there are many people who are convinced that’s the most obvious | ||||
|       solution to the problem, but whatever. We’re not here to innovate, and this | ||||
|       way is time tested on lots of problems. If you have another idea, run the | ||||
|       experiments and tell us what you find. Actually I’d love to see more work | ||||
|       on this, now that the averaged perceptron has become such a prominent learning | ||||
|       algorithm in NLP. | ||||
|        | ||||
|     p. | ||||
|       Okay. So this averaging. How’s that going to work? Note that we don’t want | ||||
|       to just average after each outer-loop iteration. We want the average of all | ||||
|       the values — from the inner loop. So if we have 5,000 examples, and we train | ||||
|       for 10 iterations, we’ll average across 50,000 values for each weight. | ||||
|        | ||||
|     p. | ||||
|       Obviously we’re not going to store all those intermediate values. Instead, | ||||
|       we’ll track an accumulator for each weight, and divide it by the number of | ||||
|       iterations at the end. Again: we want the average weight assigned to a | ||||
|       feature/class pair during learning, so the key component we need is the total | ||||
|       weight it was assigned. But we also want to be careful about how we compute | ||||
|       that accumulator, too. On almost any instance, we’re going to see a tiny | ||||
|       fraction of active feature/class pairs. All the other feature/class weights | ||||
|       won’t change. So we shouldn’t have to go back and add the unchanged value | ||||
|       to our accumulators anyway, like chumps. | ||||
|        | ||||
|     p. | ||||
|       Since we’re not chumps, we’ll make the obvious improvement. We’ll maintain | ||||
|       another dictionary that tracks how long each weight has gone unchanged. Now | ||||
|       when we do change a weight, we can do a fast-forwarded update to the accumulator, | ||||
|       for all those iterations where it lay unchanged. | ||||
|        | ||||
|     p. | ||||
|       Here’s what a weight update looks like now that we have to maintain the | ||||
|       totals and the time-stamps: | ||||
|        | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def update(self, truth, guess, features): | ||||
|         |     def upd_feat(c, f, v): | ||||
|         |         nr_iters_at_this_weight = self.i - self._timestamps[f][c] | ||||
|         |         self._totals[f][c] += nr_iters_at_this_weight * self.weights[f][c] | ||||
|         |         self.weights[f][c] += v | ||||
|         |         self._timestamps[f][c] = self.i | ||||
|         | ||||
|         |     self.i += 1 | ||||
|         |     for f in features: | ||||
|         |         upd_feat(truth, f, 1.0) | ||||
|         |         upd_feat(guess, f, -1.0) | ||||
| 
 | ||||
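|     p. | ||||
|       At the end of training, we bring every accumulator up to date and divide | ||||
|       it by the number of updates. Here is a sketch of that averaging step, | ||||
|       assuming the _totals and _timestamps dictionaries maintained above: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def average_weights(self): | ||||
|         |     for feat, weights in self.weights.items(): | ||||
|         |         new_feat_weights = {} | ||||
|         |         for clas, weight in weights.items(): | ||||
|         |             # Fast-forward the accumulator to the current tick, | ||||
|         |             # then take the mean | ||||
|         |             total = self._totals[feat][clas] | ||||
|         |             total += (self.i - self._timestamps[feat][clas]) * weight | ||||
|         |             averaged = round(total / float(self.i), 3) | ||||
|         |             if averaged: | ||||
|         |                 new_feat_weights[clas] = averaged | ||||
|         |         self.weights[feat] = new_feat_weights | ||||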
|     h3 Features and pre-processing | ||||
|      | ||||
|     p. | ||||
|       The POS tagging literature has tonnes of intricate features sensitive to | ||||
|       case, punctuation, etc. They help on the standard test-set, which is from | ||||
|       Wall Street Journal articles from the 1980s, but I don’t see how they’ll | ||||
|       help us learn models that are useful on other text. | ||||
|        | ||||
|     p. | ||||
|       To help us learn a more general model, we’ll pre-process the data prior | ||||
|       to feature extraction, as follows (sketched in code after the list): | ||||
|        | ||||
|     ul | ||||
|       li All words are lower-cased; | ||||
|       li Digits in the range 1800-2100 are represented as !YEAR; | ||||
|       li Other digit strings are represented as !DIGITS; | ||||
|       li | ||||
|         | It would be better to have a module recognising dates, phone numbers, | ||||
|         | emails, hash-tags, etc. but that will have to be pushed back into the | ||||
|         | tokenization. | ||||
|        | ||||
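|     p. | ||||
|       A minimal sketch of that normalisation step, matching the _normalize | ||||
|       call in the training loop shown later: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def _normalize(self, word): | ||||
|         |     if word.isdigit() and len(word) == 4 and 1800 <= int(word) <= 2100: | ||||
|         |         return '!YEAR' | ||||
|         |     elif word[0].isdigit(): | ||||
|         |         return '!DIGITS' | ||||
|         |     else: | ||||
|         |         return word.lower() | ||||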
|     p. | ||||
|       I played around with the features a little, and this seems to be a reasonable | ||||
|       bang-for-buck configuration in terms of getting the development-data accuracy | ||||
|       to 97% (where it typically converges anyway), and having a smaller memory | ||||
|       foot-print: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def _get_features(self, i, word, context, prev, prev2): | ||||
|         |     '''Map tokens-in-contexts into a feature representation, implemented as a | ||||
|         |     set. If the features change, a new model must be trained.''' | ||||
|         |     def add(name, *args): | ||||
|         |         features.add('+'.join((name,) + tuple(args))) | ||||
|         | ||||
|         |     features = set() | ||||
|         |     add('bias') # This acts sort of like a prior | ||||
|         |     add('i suffix', word[-3:]) | ||||
|         |     add('i pref1', word[0]) | ||||
|         |     add('i-1 tag', prev) | ||||
|         |     add('i-2 tag', prev2) | ||||
|         |     add('i tag+i-2 tag', prev, prev2) | ||||
|         |     add('i word', context[i]) | ||||
|         |     add('i-1 tag+i word', prev, context[i]) | ||||
|         |     add('i-1 word', context[i-1]) | ||||
|         |     add('i-1 suffix', context[i-1][-3:]) | ||||
|         |     add('i-2 word', context[i-2]) | ||||
|         |     add('i+1 word', context[i+1]) | ||||
|         |     add('i+1 suffix', context[i+1][-3:]) | ||||
|         |     add('i+2 word', context[i+2]) | ||||
|         |     return features | ||||
| 
 | ||||
|     p. | ||||
|       I haven’t added any features from external data, such as case frequency | ||||
|       statistics from the Google Web 1T corpus. I might add those later, but for | ||||
|       now I figured I’d keep things simple. | ||||
|        | ||||
|     h3 What about search? | ||||
| 
 | ||||
|     p. | ||||
|       The model I’ve recommended commits to its predictions on each word, and | ||||
|       moves on to the next one. Those predictions are then used as features for | ||||
|       the next word. There’s a potential problem here, but it turns out it doesn’t | ||||
|       matter much. It’s easy to fix with beam-search, but I say it’s not really | ||||
|       worth bothering. And it definitely doesn’t matter enough to adopt a slow | ||||
|       and complicated algorithm like Conditional Random Fields. | ||||
|        | ||||
|     p. | ||||
|       Here’s the problem. The best indicator for the tag at position, say, 3 in | ||||
|       a sentence is the word at position 3. But the next-best indicators are the | ||||
|       tags at positions 2 and 4. So there’s a chicken-and-egg problem: we want | ||||
|       the predictions for the surrounding words in hand before we commit to a | ||||
|       prediction for the current word. Here’s an example where search might matter: | ||||
|        | ||||
|     p.example. | ||||
|       Their management plan reforms worked | ||||
|        | ||||
|     p. | ||||
|       Depending on just what you’ve learned from your training data, you can | ||||
|       imagine making a different decision if you started at the left and moved | ||||
|       right, conditioning on your previous decisions, than if you’d started at | ||||
|       the right and moved left. | ||||
|        | ||||
|     p. | ||||
|       If that’s not obvious to you, think about it this way: “worked” is almost | ||||
|       surely a verb, so if you tag “reforms” with that in hand, you’ll have a | ||||
|       different idea of its tag than if you’d just come from “plan“, which you | ||||
|       might have regarded as either a noun or a verb. | ||||
|        | ||||
|     p. | ||||
|       Search can only help you when you make a mistake. It can prevent that error | ||||
|       from throwing off your subsequent decisions, or sometimes your future choices | ||||
|       will correct the mistake. And that’s why for POS tagging, search hardly matters! | ||||
|       Your model is so good straight-up that your past predictions are almost always | ||||
|       true. So you really need the planets to align for search to matter at all. | ||||
|        | ||||
|     p. | ||||
|       And as we improve our taggers, search will matter less and less. Instead | ||||
|       of search, what we should be caring about is multi-tagging. If we let the | ||||
|       model be a bit uncertain, we can get over 99% accuracy assigning an average | ||||
|       of 1.05 tags per word (Vadas et al, ACL 2006). The averaged perceptron is | ||||
|       rubbish at multi-tagging though. That’s its big weakness. You really want | ||||
|       a probability distribution for that. | ||||
| 
 | ||||
|     p. | ||||
|       One caveat when doing greedy search, though. It’s very important that your | ||||
|       training data model the fact that the history will be imperfect at run-time. | ||||
|       Otherwise, it will be way over-reliant on the tag-history features. Because | ||||
|       the Perceptron is iterative, this is very easy. | ||||
|        | ||||
|     p. | ||||
|       Here’s the training loop for the tagger: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def train(self, sentences, save_loc=None, nr_iter=5, quiet=False): | ||||
|         |     '''Train a model from sentences, and save it at save_loc. nr_iter | ||||
|         |     controls the number of Perceptron training iterations.''' | ||||
|         |     self._make_tagdict(sentences, quiet=quiet) | ||||
|         |     self.model.classes = self.classes | ||||
|         |     for iter_ in range(nr_iter): | ||||
|         |         c = 0; n = 0 | ||||
|         |         for words, tags in sentences: | ||||
|         |             # Reset the tag history at the start of each sentence | ||||
|         |             prev, prev2 = START | ||||
|         |             context = START + [self._normalize(w) for w in words] + END | ||||
|         |             for i, word in enumerate(words): | ||||
|         |                 guess = self.tagdict.get(word) | ||||
|         |                 if not guess: | ||||
|         |                     feats = self._get_features( | ||||
|         |                               i, word, context, prev, prev2) | ||||
|         |                     guess = self.model.predict(feats) | ||||
|         |                     self.model.update(tags[i], guess, feats) | ||||
|         |                 # Set the history features from the guesses, not the | ||||
|         |                 # true tags | ||||
|         |                 prev2 = prev; prev = guess | ||||
|         |                 c += guess == tags[i]; n += 1 | ||||
|         |         random.shuffle(sentences) | ||||
|         |         if not quiet: | ||||
|         |             print("Iter %d: %d/%d=%.3f" % (iter_, c, n, _pc(c, n))) | ||||
|         |     self.model.average_weights() | ||||
|         |     # Pickle as a binary file | ||||
|         |     if save_loc is not None: | ||||
|         |         cPickle.dump((self.model.weights, self.tagdict, self.classes), | ||||
|         |                      open(save_loc, 'wb'), -1) | ||||
|     p. | ||||
|       Unlike the previous snippets, this one’s literal – I tended to edit the | ||||
|       previous ones to simplify. So if they have bugs, hopefully that’s why! | ||||
|        | ||||
|     p. | ||||
|       At the time of writing, I’m just finishing up the implementation before I | ||||
|       submit a pull request to TextBlob. You can see the rest of the source here: | ||||
|        | ||||
|     ul | ||||
|       li | ||||
|         a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/taggers.py") taggers.py | ||||
|       li | ||||
|         a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/_perceptron.py") _perceptron.py | ||||
|        | ||||
|     h3 A final comparison… | ||||
|      | ||||
|     p. | ||||
|       Over the years I’ve seen a lot of cynicism about the WSJ evaluation methodology. | ||||
|       The claim is that we’ve just been meticulously over-fitting our methods to this | ||||
|       data. Actually the evidence doesn’t really bear this out. Mostly, if a technique | ||||
|       is clearly better on one evaluation, it improves others as well. Still, it’s | ||||
|       very reasonable to want to know how these tools perform on other text. So I | ||||
|       ran the unchanged models over two other sections from the OntoNotes corpus: | ||||
|        | ||||
|     table | ||||
|       thead | ||||
|         tr | ||||
|           th Tagger | ||||
|           th WSJ | ||||
|           th ABC | ||||
|           th Web | ||||
|       tbody | ||||
|         tr | ||||
|           td Pattern | ||||
|           td 93.5 | ||||
|           td 90.7 | ||||
|           td 88.1 | ||||
|         tr | ||||
|           td NLTK | ||||
|           td 94.0 | ||||
|           td 91.5 | ||||
|           td 88.4 | ||||
|         tr | ||||
|           td PyGreedyAP | ||||
|           td 96.8 | ||||
|           td 94.8 | ||||
|           td 91.8 | ||||
| 
 | ||||
|     p. | ||||
|       The ABC section is broadcast news, Web is text from the web (blogs etc — I haven’t | ||||
|       looked at the data much). | ||||
|        | ||||
|     p. | ||||
|       As you can see, the order of the systems is stable across the three comparisons, | ||||
|       and the advantage of our Averaged Perceptron tagger over the other two is real | ||||
|       enough. Actually the Pattern tagger does very poorly on out-of-domain text. | ||||
|       It mostly just looks up the words, so it’s very domain dependent. I hadn’t | ||||
|       realised it before, but it’s obvious enough now that I think about it. | ||||
|        | ||||
|     p. | ||||
|       We can improve our score greatly by training on some of the foreign data. | ||||
|       The technique described in this paper (Daume III, 2007) is the first thing | ||||
|       I try when I have to do that. | ||||
| 
 | ||||
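|     p. | ||||
|       That technique, feature augmentation, is simple enough to sketch: every | ||||
|       feature gets a shared copy and a domain-specific copy, and the learner | ||||
|       works out which features transfer across domains. A minimal sketch, not | ||||
|       the paper’s code: | ||||
| 
 | ||||
|     pre.language-python | ||||
|       code | ||||
|         | def augment(features, domain): | ||||
|         |     '''Daume III (2007)-style feature augmentation.''' | ||||
|         |     out = set() | ||||
|         |     for f in features: | ||||
|         |         out.add('general+' + f) | ||||
|         |         out.add(domain + '+' + f) | ||||
|         |     return out | ||||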
| 
 | ||||
|     footer.meta(role='contentinfo') | ||||
|       a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter | ||||
|       .discuss | ||||
|         a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News | ||||
|         |  | ||||
|         a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit | ||||
|  | @ -1,139 +0,0 @@ | |||
| - var urls = {} | ||||
| - urls.choi_paper = "http://aclweb.org/anthology/P/P15/P15-1038.pdf" | ||||
| - urls.emnlp_paper = "honnibal_johnson_emnlp2015.pdf" | ||||
| 
 | ||||
| 
 | ||||
| +comparison("NLTK") | ||||
|   p spaCy is: | ||||
|   ul | ||||
|     li.pro 100x faster; | ||||
|     li.pro 50% more accurate; | ||||
|     li.pro Serializes TODO% smaller; | ||||
| 
 | ||||
|   p spaCy features: | ||||
|     ul  | ||||
|       li.pro Integrated word vectors; | ||||
|       li.pro Efficient binary serialization; | ||||
| 
 | ||||
|   p NLTK features: | ||||
|     ul | ||||
|       li.con Multiple languages;  | ||||
|       li.neutral Educational resources | ||||
| 
 | ||||
| 
 | ||||
| //+comparison("Pattern") | ||||
| +comparison("CoreNLP") | ||||
|   p spaCy is: | ||||
| 
 | ||||
|   ul | ||||
|     li.pro TODO% faster; | ||||
|     li.pro TODO% more accurate; | ||||
|     li.pro Not Java; | ||||
|     li.pro Well documented; | ||||
|     li.pro Cheaper to license commercially; | ||||
|     li.neutral | ||||
|       | Opinionated/Minimalist. spaCy avoids providing redundant or overlapping | ||||
|       | options.   | ||||
| 
 | ||||
|   p CoreNLP features: | ||||
| 
 | ||||
|   ul | ||||
|     li.con Multiple Languages; | ||||
|     li.con Sentiment analysis  | ||||
|     li.con Coreference resolution | ||||
| 
 | ||||
| 
 | ||||
| +comparison("ClearNLP") | ||||
|   p spaCy is: | ||||
| 
 | ||||
|   ul | ||||
|     li.pro Not Java; | ||||
|     li.pro TODO% faster; | ||||
|     li.pro Well documented; | ||||
|     li.neutral Slightly more accurate; | ||||
| 
 | ||||
|   p ClearNLP features: | ||||
| 
 | ||||
|   ul | ||||
|     li.con Semantic Role Labelling | ||||
|     li.con Multiple Languages | ||||
|     li.con Model for biology/life-science; | ||||
| 
 | ||||
| //+comparison("Accuracy Summary") | ||||
| 
 | ||||
| //+comparison("Speed Summary") | ||||
| //  table | ||||
| //    thead | ||||
| //      tr | ||||
| //        th. | ||||
| //        th(colspan=3) Absolute (ms per doc) | ||||
| //        th(colspan=3) Relative (to spaCy) | ||||
| // | ||||
| //    tbody | ||||
| //      tr | ||||
| //        td: strong System | ||||
| //        td: strong Split | ||||
| //        td: strong Tag | ||||
| //        td: strong Parse | ||||
| //        td: strong Split | ||||
| //        td: strong Tag | ||||
| //        td: strong Parse | ||||
| // | ||||
| //      +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") | ||||
| //      +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") | ||||
| //      +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x") | ||||
| //      +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x") | ||||
| //      +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a") | ||||
| // | ||||
| //  p | ||||
| //    | <strong>Set up</strong>: 100,000 plain-text documents were streamed | ||||
| //    | from an SQLite3 database, and processed with an NLP library, to one | ||||
| //    | of three levels of detail – tokenization, tagging, or parsing. | ||||
| //    | The tasks are additive: to parse the text you have to tokenize and | ||||
| //    | tag it.  The  pre-processing was not subtracted from the times – | ||||
| //    | I report the time required for the pipeline to complete.  I report | ||||
| //    | mean times per document, in milliseconds. | ||||
| // | ||||
| //  p | ||||
| //    | <strong>Hardware</strong>: Intel i7-3770 (2012) | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| +comparison("Peer-reviewed Evaluations") | ||||
|   p. | ||||
|     spaCy is committed to rigorous evaluation under standard methodology.  Two | ||||
|     papers in 2015 confirm that: | ||||
|   ol | ||||
|     li spaCy is the fastest syntactic parser in the world; | ||||
|     li Its accuracy is within 1% of the best available; | ||||
|     li The few systems that are more accurate are 20× slower or more. | ||||
| 
 | ||||
|   p | ||||
|     | spaCy v0.84 was evaluated by researchers at Yahoo! Labs and Emory University, | ||||
|     | as part of a survey paper benchmarking the current state-of-the-art dependency | ||||
|     | parsers  | ||||
|     a(href=urls.choi_paper) (Choi et al., 2015) | ||||
|     | . | ||||
| 
 | ||||
|   table | ||||
|     thead | ||||
|       +columns("System", "Language", "Accuracy", "Speed") | ||||
| 
 | ||||
|     tbody | ||||
|       +row("spaCy v0.84", "Cython", "90.6", "13,963") | ||||
|       +row("spaCy v0.89", "Cython", "91.8", "13,000 (est.)") | ||||
|       +row("ClearNLP", "Java", "91.7", "10,271") | ||||
|       +row("CoreNLP", "Java", "89.6", "8,602") | ||||
|       +row("MATE", "Java", "92.5", "550") | ||||
|       +row("Turbo", "C++", "92.4", "349") | ||||
|       +row("Yara", "Java", "92.3", "340") | ||||
| 
 | ||||
|   p | ||||
|     | Discussion with the authors led to accuracy improvements in spaCy, which | ||||
|     | have been accepted for publication in EMNLP, in joint work with Macquarie | ||||
|     | University | ||||
|     a(href=urls.emnlp_paper) (Honnibal and Johnson, 2015) | ||||
|     | .  | ||||
| 
 | ||||
|  | @ -1,129 +0,0 @@ | |||
| extends ./outline.jade | ||||
| 
 | ||||
| include ./mixins.jade | ||||
| 
 | ||||
| 
 | ||||
| mixin declare_class(name) | ||||
|   details | ||||
|     summary | ||||
|       span.declaration | ||||
|         span.label class | ||||
|         code #{name} | ||||
|     block | ||||
| 
 | ||||
| mixin method(name, parameters) | ||||
|   details(open=attributes.open) | ||||
|     summary | ||||
|       span.declaration | ||||
|         span.label #{name} | ||||
|         span.parameters | ||||
|           | self, #{parameters} | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin params | ||||
|   ul | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin param(name, type, value) | ||||
|   li | ||||
|     if type | ||||
|       <strong>#{name}</strong> (!{type}) – | ||||
|     else | ||||
|       <strong>#{name}</strong> – | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin attribute(name, type, value) | ||||
|   details(open=attributes.open) | ||||
|     summary | ||||
|       span.declaration | ||||
|         span.label #{name} | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin returns(name, type, value) | ||||
|   li | ||||
|     if type | ||||
|       <strong>#{name}</strong> (!{type}) – | ||||
|     else | ||||
|       <strong>#{name}</strong> – | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin returns(type) | ||||
|   | tmp | ||||
| 
 | ||||
| mixin init | ||||
|   details | ||||
|     summary: h4 Init | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin callable | ||||
|   details | ||||
|     summary: h4 Callable | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin sequence | ||||
|   details | ||||
|     summary: h4 Sequence | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin maptype | ||||
|   details | ||||
|     summary: h4 Map | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| mixin summary | ||||
|   block | ||||
| 
 | ||||
| mixin en_example | ||||
|   pre.language-python | ||||
|     code | ||||
|       | from spacy.en import English | ||||
|       | from spacy._doc_examples import download_war_and_peace | ||||
|       |  | ||||
|       | unprocessed_unicode = download_war_and_peace() | ||||
|       |  | ||||
|       | nlp = English() | ||||
|       | doc = nlp(unprocessed_unicode) | ||||
| 
 | ||||
| 
 | ||||
| block intro_block | ||||
|   section(class="intro") | ||||
| 
 | ||||
|     nav(role="navigation") | ||||
|       ul | ||||
|         li: a(href="#api" class="button") API | ||||
|         li: a(href="#tutorials" class="button") Tutorials | ||||
|         li: a(href="#spec" class="button") Spec | ||||
| 
 | ||||
| 
 | ||||
| block body_block | ||||
|   - var py_docs = '<a class="reference" href="http://docs.python.org/library/' | ||||
| 
 | ||||
|   - | ||||
|     var types = { | ||||
|       'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>', | ||||
|       'bool': py_docs + 'functions.html#bool"><em>bool</em></a>', | ||||
|       'int': py_docs + 'functions.html#int"><em>int</em></a>', | ||||
|       'generator': "", | ||||
|       'Vocab': "", | ||||
|       'Span': "", | ||||
|       'Doc': "" | ||||
|     } | ||||
| 
 | ||||
|   article | ||||
| 
 | ||||
|     +Section("API", "api", "api.jade") | ||||
|     +Section("Tutorials", "tutorials", "tutorials.jade") | ||||
|     +Section("Annotation Specifications", "spec", "spec.jade") | ||||
|  | @ -1,88 +0,0 @@ | |||
| extends ./outline.jade | ||||
| 
 | ||||
| include ./mixins.jade | ||||
| 
 | ||||
| // Notes | ||||
| // | ||||
| // 1. Where to put version notice? Should say something like | ||||
| //   2015-08-12: v0.89 | ||||
| //   and be a link | ||||
| //    | ||||
| //   Only needs to appear on home page. | ||||
| 
 | ||||
| 
 | ||||
| - var slogan = "Build Tomorrow's Language Technologies" | ||||
| - var tag_line = "spaCy – " + slogan | ||||
| 
 | ||||
| mixin lede | ||||
|   - var state_of_the_art = '<a href="#">state-of-the-art</a>' | ||||
|   - var a_minor_miracle = '<a href="">a minor miracle</a>' | ||||
|   - var great_documentation = '<a href="">great documentation</a>' | ||||
|   - var concise_API = '<a href="">concise API</a>' | ||||
|    | ||||
|   p. | ||||
|     <a href="https://github.com/honnibal/spaCy"><strong>spaCy</strong></a> is a | ||||
|     library for industrial-strength natural language processing in Python and | ||||
|     Cython.  It features !{state_of_the_art} speed and accuracy, a !{concise_API}, | ||||
|     and <a href="#license">license terms</a> designed to get out of your way. | ||||
|     If you're a small company doing NLP, we want <strong>spaCy</strong> to seem | ||||
|     like !{a_minor_miracle}. | ||||
| 
 | ||||
| 
 | ||||
| mixin comparison(name) | ||||
|   details | ||||
|     summary | ||||
|       h4= name | ||||
| 
 | ||||
|     block | ||||
|   | ||||
| mixin columns(...names) | ||||
|   tr | ||||
|     each name in names | ||||
|       th= name | ||||
| 
 | ||||
| 
 | ||||
| mixin row(...cells) | ||||
|   tr | ||||
|     each cell in cells | ||||
|       td= cell | ||||
| 
 | ||||
| 
 | ||||
| mixin social       | ||||
|   footer(role="contentinfo") | ||||
|     a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter | ||||
| 
 | ||||
|     div.discuss | ||||
|       a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn") | ||||
|         | Discuss on Hacker News | ||||
| 
 | ||||
|       a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit") | ||||
|         | Discuss on Reddit | ||||
| 
 | ||||
| 
 | ||||
| block intro_block | ||||
|   section(class="intro") | ||||
|     +lede | ||||
| 
 | ||||
|     nav(role="navigation") | ||||
|       ul | ||||
|         li: a(href="#example-use" class="button") Examples | ||||
|         li: a(href="#comparisons" class="button") Comparisons | ||||
|         li: a(href="#online-demo" class="button") Try Online | ||||
|         li: a(href="#install" class="button") | ||||
|           | Install | ||||
|           <span class="button-caption">v0.89</span> | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| block body_block | ||||
|   article(class="page landing-page") | ||||
| 
 | ||||
|     +Section("Usage by Example", "example-use", "./usage_examples.jade") | ||||
| 
 | ||||
|     +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade") | ||||
|        | ||||
|     +Section("Online Demo", "online-demo", "./online_demo.jade") | ||||
| 
 | ||||
| 
 | ||||
|     +Section("Install", "install", "./install.jade") | ||||
|  | @ -1,71 +0,0 @@ | |||
| mixin Option(name, open) | ||||
|   details(open=open) | ||||
|     summary | ||||
|       h4= name | ||||
|     block | ||||
| 
 | ||||
| +Option("conda", true) | ||||
|   pre.language-bash: code | ||||
|     | $ conda install spacy | ||||
|     | $ python -m spacy.en.download | ||||
| 
 | ||||
| +Option("pip and virtualenv", true) | ||||
|   p With Python 2.7 or Python 3, on Linux or OS X, run: | ||||
| 
 | ||||
|     pre.language-bash: code | ||||
|       | $ pip install spacy | ||||
|       | $ python -m spacy.en.download | ||||
| 
 | ||||
|   p | ||||
|     | The download command fetches about 300MB of data for the parser model | ||||
|     | and word vectors, and installs it within the spacy.en package | ||||
|     | directory. | ||||
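| 
 | ||||
|   p | ||||
|     | As a quick smoke test of the install (a minimal sketch – any short | ||||
|     | text will do): | ||||
| 
 | ||||
|   pre.language-python: code | ||||
|     | >>> from spacy.en import English | ||||
|     | >>> nlp = English() | ||||
|     | >>> print(nlp(u'Hello, world.')[0].orth_) | ||||
|     | Hello | ||||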
| 
 | ||||
| 
 | ||||
|   +Option("Workaround for obsolete system Python", false) | ||||
|     p | ||||
|       | If you're stuck using a server with an old version of Python, and you | ||||
|       | don't have root access, I've prepared a bootstrap script to help you | ||||
|       | compile a local Python install.  Run: | ||||
| 
 | ||||
|     pre.language-bash: code | ||||
|       | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| +Option("Compile from source", false) | ||||
|   p | ||||
|     | The other way to install the package is to clone the GitHub repository | ||||
|     | and build it from source.  This adds an additional dependency, | ||||
|     | Cython.  If you're using Python 2, I also recommend installing fabric | ||||
|     | and fabtools – this is how I build the project. | ||||
| 
 | ||||
|   pre.language-bash: code | ||||
|     | $ git clone https://github.com/honnibal/spaCy.git | ||||
|     | $ cd spaCy | ||||
|     | $ virtualenv .env && source .env/bin/activate | ||||
|     | $ export PYTHONPATH=`pwd` | ||||
|     | $ pip install -r requirements.txt | ||||
|     | $ python setup.py build_ext --inplace | ||||
|     | $ python -m spacy.en.download | ||||
|     | $ pip install pytest | ||||
|     | $ py.test tests/ | ||||
| 
 | ||||
|   p | ||||
|     | Python packaging is awkward at the best of times, and it's particularly | ||||
|     | tricky with C extensions built via Cython that require large data files. | ||||
|     | So, please report issues as you encounter them. | ||||
| 
 | ||||
| +Option("pypy (Unsupported)") | ||||
|   | If PyPy support is a priority for you, please get in touch.  We could likely | ||||
|   | fix the remaining issues, if necessary.  However, the library is likely to | ||||
|   | be much slower on PyPy, as it's written in Cython, which produces code tuned | ||||
|   | for the performance of CPython. | ||||
| 
 | ||||
| +Option("Windows (Unsupported)") | ||||
|   | Unfortunately we don't currently have access to a Windows machine, and have | ||||
|   | no experience developing on a Microsoft stack. In theory the only problems are | ||||
|   | with the installation and packaging – there should be no deep platform | ||||
|   | dependency. But we can't debug these issues at present, simply for lack of | ||||
|   | a development environment. | ||||
| 
 | ||||
|  | @ -1,179 +0,0 @@ | |||
| extends ./outline.jade | ||||
| 
 | ||||
| mixin columns(...names) | ||||
|   tr | ||||
|     each name in names | ||||
|       th= name | ||||
| 
 | ||||
| 
 | ||||
| mixin row(...cells) | ||||
|   tr | ||||
|     each cell in cells | ||||
|       td= cell | ||||
| 
 | ||||
| 
 | ||||
| mixin LicenseOption(name, period, price, audience) | ||||
|     .item | ||||
|       h4 #{name} | ||||
|          | ||||
|       .focus #{period} | ||||
| 
 | ||||
|       span #{price} | ||||
|          | ||||
|       h5 Suggested for: | ||||
|          | ||||
|       span #{audience} | ||||
|          | ||||
|       a.button(href="spacy_trial_free.docx") Download license | ||||
| 
 | ||||
|       span or  | ||||
|         a(href="#") get in touch | ||||
| 
 | ||||
|   | ||||
| block body_block | ||||
|   article.pricing | ||||
| 
 | ||||
|     .box.license | ||||
|       +LicenseOption("Trial", "90 days", "$0", "Evaluation") | ||||
|       +LicenseOption("Production", "1 year", "$5,000", "Production") | ||||
|       +LicenseOption("Certainty", "5 years", "$20,000", "Secure Planning") | ||||
| 
 | ||||
|     p.caption | ||||
|       | Researcher, hobbyist, or open-source developer? spaCy also offers  | ||||
|       a(href="http://www.gnu.org/licenses/agpl-3.0.en.html") AGPLv3  | ||||
|       | licenses. | ||||
| 
 | ||||
|     p. | ||||
|       What we offer is a rare, simple certainty: a long-term, permissive license | ||||
|       that comes with full access to the source, complete transparency, and almost | ||||
|       complete flexibility.  The difference between this and a black-box API is | ||||
|       night and day.  You cannot build a great product against a service you | ||||
|       don't understand, and you can't build a great business on a service you | ||||
|       don't control. | ||||
|        | ||||
|     p | ||||
|       | Let's face it: services disappear.  Constantly. The good start-ups get | ||||
|       | bought; the bad ones go bankrupt.  Open-source projects become abandoned | ||||
|       | or bloated.  Google's graveyard is overflowing – ditto for Yahoo!, | ||||
|       | Microsoft, etc. Sure, IBM won't go broke... but will BlueMix be sunset? | ||||
| 
 | ||||
|     p | ||||
|       | A 5-year license won't expire until 2020.  spaCy will be with you for | ||||
|       | longer than most of your current staff.  If that's still not enough, | ||||
|       | get in touch. I'm sure we can work something out. | ||||
| 
 | ||||
|     //p. | ||||
|     //  To make spaCy as valuable as possible, licenses to it are for life.  You get | ||||
|     //  complete transparency, certainty and control.  If you need to use spaCy | ||||
|     //  as an API, it's trivial to host it yourself – and you don't need to | ||||
|     //  worry about the service changing or disappearing.  And if you're ever in | ||||
|     //  acquisition or IPO talks, the story is simple. | ||||
| 
 | ||||
|     //p. | ||||
|     //  spaCy can also be used as free open-source software, under the Aferro GPL | ||||
|     //  license.  If you use it this way, you must comply with the AGPL license | ||||
|     //  terms.  When you distribute your project, or offer it as a network service, | ||||
|     //  you must distribute the source-code and grant users an AGPL license to it. | ||||
| 
 | ||||
| 
 | ||||
|     //h3 Examples | ||||
| 
 | ||||
|     //p. | ||||
|     //  In order to clarify how spaCy's license structure might apply to you, I've | ||||
|     //  written a few examples, in the form of user-stories. | ||||
| 
 | ||||
|     //details | ||||
|     //  summary: h4 Seed stage start-ups | ||||
| 
 | ||||
|     //  p. | ||||
|     //    Ashley and Casey have an idea for a start-up.  To explore their idea, they | ||||
|     //    want to build a minimum viable product they can put in front of potential | ||||
|     //    users and investors. | ||||
| 
 | ||||
|     //  p. They have two options. | ||||
| 
 | ||||
|     //  ol | ||||
|     //    li | ||||
|     //      p. | ||||
|     //        <strong>Trial commercial license.</strong> With a simple form, they can | ||||
|     //        use spaCy for 90 days, for a nominal fee of $1.  They are free to modify | ||||
|     //        spaCy, and they will own the copyright to their modifications for the | ||||
|     //        duration of the license.  After the trial period elapses, they can either | ||||
|     //        pay the license fee, stop using spaCy, release their project under the | ||||
|     //        AGPL. | ||||
|     // | ||||
|     //    li | ||||
|     //      p. | ||||
|     //        <strong>AGPL.</strong> Casey and Pat can instead use spaCy under the AGPL | ||||
|     //        license. However, they must then release any code that statically or | ||||
|     //        dynamically links to spaCy under the AGPL as well (e.g. if they import | ||||
|     //        the module, or import a module that imports it, etc).  They also cannot | ||||
|     //        use spaCy as a network resource, by running it as a service --- this is | ||||
|     //        the loophole that the "A" part of the AGPL is designed to close. | ||||
|     //   | ||||
|     //  p. | ||||
|     //    Ashley and Casey find the AGPL license unattractive for commercial use. | ||||
|     //    They decide to take up the trial commercial license.  However,  over the | ||||
|     //    next 90 days, Ashley has to move house twice, and Casey gets sick.  By | ||||
|     //    the time the trial expires, they still don't have a demo they can show | ||||
|     //    investors.  They send an email explaining the situation, and a 90 day extension | ||||
|     //    to their trial license is granted. | ||||
| 
 | ||||
|     //  p. | ||||
|     //    By the time the extension period has elapsed, spaCy has helped them secure | ||||
|     //    funding, and they even have a little revenue.  They are glad to pay the | ||||
|     //    $5,000 commercial license fee. | ||||
| 
 | ||||
|     //  p. | ||||
|     //    spaCy is now permanently licensed for the product Ashley and Casey are | ||||
|     //    developing.  They own the copyright to any modifications they make to spaCy, | ||||
|     //    but not to the original spaCy code. | ||||
| 
 | ||||
|     //  p. | ||||
|     //    No additional fees will be due when they hire new developers, run spaCy on | ||||
|     //    additional internal servers, etc.  If their company is acquired, the license | ||||
|     //    will be transferred to the company acquiring them.  However, to use spaCy | ||||
|     //    in another product, they will have to buy a second license. | ||||
| 
 | ||||
| 
 | ||||
|     // details | ||||
|     //  summary: h4 University academics | ||||
| 
 | ||||
|     //  p. | ||||
|     //    Alex and Sasha are post-doctoral researchers working for a university. | ||||
|     //    Part of their funding comes from a grant from Google, but Google will not | ||||
|     //    own any part of the work that they produce.  Their mission is just to write | ||||
|     //    papers. | ||||
| 
 | ||||
|     //  p. | ||||
|     //    Alex and Sasha find spaCy convenient, so they use it in their system under | ||||
|     //    the AGPL.  This means that their system must also be released under the | ||||
|     //    AGPL, but they're cool with that – they were going to release their | ||||
|     //    code anyway, as it's the only way to ensure their experiments are properly | ||||
|     //    repeatable. | ||||
| 
 | ||||
|     //  p. | ||||
|     //    Alex and Sasha find and fix a few bugs in spaCy.  They must release these | ||||
|     //    modifications, and they ask that they be accepted into the main spaCy repo. | ||||
|     //    In order to do this, they must sign a contributor agreement, ceding their | ||||
|     //    copyright.  When commercial licenses to spaCy are sold, Alex and Sasha will | ||||
|     //    not be able to claim any royalties from their contributions. | ||||
| 
 | ||||
|     //  p. | ||||
|     //    Later, Alex and Sasha implement new features into spaCy, for another paper. | ||||
|     //    The code was quite rushed, and they don't want to take the time to put | ||||
|     //    together a proper pull request.  They must release their modifications | ||||
|     //    under the AGPL, but they are not obliged to contribute it to the spaCy | ||||
|     //    repository, or concede their copyright. | ||||
| 
 | ||||
|     // details | ||||
|     //  summary: h4 Open Source developers | ||||
| 
 | ||||
|     //  p. | ||||
|     //    Phuong and Jessie use the open-source software Calibre to manage their | ||||
|     //    e-book libraries.  They have an idea for a search feature, and they want | ||||
|     //    to use spaCy to implement it.  Calibre is released under the GPLv3.  The | ||||
|     //    AGPL has additional restrictions for projects used as a network resource, | ||||
|     //    but they don't apply to this project, so Phuong and Jessie can use spaCy | ||||
|     //    to improve Calibre.  They'll have to release their code, but that was | ||||
|     //    always their intention anyway. | ||||
|  | @ -1,17 +0,0 @@ | |||
| mixin Section(title_text, link_name, include_file) | ||||
|   h3: a(name=link_name) #{title_text} | ||||
| 
 | ||||
|   if (link_name == "example-use") | ||||
|     include ./usage_examples.jade | ||||
|   else if (link_name == "online-demo") | ||||
|     include ./online_demo.jade | ||||
|   else if (link_name == "comparisons") | ||||
|     include ./comparisons.jade | ||||
|   else if (link_name == "install") | ||||
|     include ./installation.jade | ||||
|   else if (link_name == "api") | ||||
|     include ./api.jade | ||||
|   else if (link_name == "tutorials") | ||||
|     include ./tutorials.jade | ||||
|   else if (link_name == "spec") | ||||
|     include ./spec.jade | ||||
|  | @ -1,18 +0,0 @@ | |||
| mixin Displacy(sentence, caption_text, height) | ||||
|   - var url = "http://ines.io/displacy/?full=" + sentence.replace(/ /g, "%20") | ||||
| 
 | ||||
|   .displacy | ||||
|     iframe.displacy(src="displacy/displacy_demo.html" height=height) | ||||
|      | ||||
|     a.view-displacy(href=url) | ||||
|       | Interactive Visualizer | ||||
| 
 | ||||
|     p.caption. | ||||
|       #{caption_text} | ||||
| 
 | ||||
| 
 | ||||
| +Displacy( | ||||
|   "Click the button to see this sentence in displaCy.", | ||||
|   "The best parse-tree visualizer and annotation tool in all the land.", | ||||
|   275 | ||||
| ) | ||||
|  | @ -1,37 +0,0 @@ | |||
| - var slogan = "Build Tomorrow's Language Technologies" | ||||
| - var tag_line = "spaCy – " + slogan | ||||
| 
 | ||||
| 
 | ||||
| doctype html | ||||
| html(lang="en") | ||||
|   head | ||||
|     meta(charset="utf-8") | ||||
|     title!= tag_line | ||||
|     meta(name="description" content="") | ||||
|     meta(name="author" content="Matthew Honnibal") | ||||
|     link(rel="stylesheet" href="css/style.css") | ||||
|     <!--[if lt IE 9]> | ||||
|     script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js") | ||||
|     <![endif]--> | ||||
| 
 | ||||
|   body(id="home" role="document") | ||||
|     header(role="banner") | ||||
|       h1(class="logo")!= tag_line | ||||
|       div(class="slogan")!= slogan | ||||
| 
 | ||||
|     nav(role="navigation") | ||||
|       ul | ||||
|         li: a(href="home.html") Home | ||||
|         li: a(href="docs.html") Docs | ||||
|         li: a(href="license.html") License | ||||
|         li: a(href="blog.html") Blog | ||||
| 
 | ||||
|     main(id="content" role="main") | ||||
|       block intro_block | ||||
| 
 | ||||
|       block body_block | ||||
|   | ||||
|   footer(role="contentinfo") | ||||
| 
 | ||||
|   script(src="js/prism.js") | ||||
|   script(src="js/details_polyfill.js") | ||||
|  | @ -1,129 +0,0 @@ | |||
| mixin columns(...names) | ||||
|   tr | ||||
|     each name in names | ||||
|       th= name | ||||
| 
 | ||||
| 
 | ||||
| mixin row(...cells) | ||||
|   tr | ||||
|     each cell in cells | ||||
|       td= cell | ||||
| 
 | ||||
| 
 | ||||
| details | ||||
|   summary: h4 Overview | ||||
|    | ||||
|   p. | ||||
|     This document describes the target annotations spaCy is trained to predict. | ||||
|     This is currently a work in progress. Please ask questions on the issue tracker, | ||||
|     so that the answers can be integrated here to improve the documentation. | ||||
| 
 | ||||
| details | ||||
|   summary: h4 Tokenization | ||||
| 
 | ||||
|   p Tokenization standards are based on the OntoNotes 5 corpus. | ||||
| 
 | ||||
|   p. | ||||
|     The tokenizer differs from most by including tokens for significant | ||||
|     whitespace. Any sequence of whitespace characters beyond a single space | ||||
|     (' ') is included as a token. For instance: | ||||
| 
 | ||||
|   pre.language-python | ||||
|     code | ||||
|       | from spacy.en import English | ||||
|       | nlp = English(parse=False) | ||||
|       | tokens = nlp('Some\nspaces  and\ttab characters') | ||||
|       | print([t.orth_ for t in tokens]) | ||||
|          | ||||
|   p Which produces: | ||||
|      | ||||
|   pre.language-python | ||||
|     code | ||||
|       | ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters'] | ||||
| 
 | ||||
|   p. | ||||
|     The whitespace tokens are useful for much the same reason punctuation is | ||||
|     – it's often an important delimiter in the text.  By preserving | ||||
|     it in the token output, we are able to maintain a simple alignment | ||||
|     between the tokens and the original string, and we ensure that no | ||||
|     information is lost during processing. | ||||
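| 
 | ||||
|   p. | ||||
|     Because every character ends up in some token's string, the original | ||||
|     text can be reconstructed exactly.  A minimal sketch, continuing the | ||||
|     example above (token.string includes any trailing whitespace): | ||||
| 
 | ||||
|   pre.language-python | ||||
|     code | ||||
|       | text = 'Some\nspaces  and\ttab characters' | ||||
|       | tokens = nlp(text) | ||||
|       | assert ''.join(token.string for token in tokens) == text | ||||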
| 
 | ||||
| details | ||||
|   summary: h4 Sentence boundary detection | ||||
| 
 | ||||
|   p. | ||||
|     Sentence boundaries are calculated from the syntactic parse tree, so | ||||
|     features such as punctuation and capitalisation play an important but | ||||
|     non-decisive role in determining the sentence boundaries.  Usually this | ||||
|     means that the sentence boundaries will at least coincide with clause | ||||
|     boundaries, even given poorly punctuated text. | ||||
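| 
 | ||||
|   p. | ||||
|     A minimal sketch of iterating over the sentences (boundaries are | ||||
|     predicted, so the output depends on the parse): | ||||
| 
 | ||||
|   pre.language-python | ||||
|     code | ||||
|       | from spacy.en import English | ||||
|       | nlp = English() | ||||
|       | doc = nlp('Hello, world. Here are two sentences.') | ||||
|       | for sentence in doc.sents: | ||||
|       |     print(''.join(t.string for t in sentence).strip()) | ||||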
| 
 | ||||
| details | ||||
|   summary: h4 Part-of-speech Tagging | ||||
| 
 | ||||
|   p. | ||||
|     The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank | ||||
|     tag set.  We also map the tags to the simpler Google Universal POS Tag set. | ||||
| 
 | ||||
|   p. | ||||
|     Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124 | ||||
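| 
 | ||||
|   p. | ||||
|     A minimal sketch of reading both tag sets (the values shown depend on | ||||
|     the model): | ||||
| 
 | ||||
|   pre.language-python | ||||
|     code | ||||
|       | from __future__ import print_function | ||||
|       | from spacy.en import English | ||||
|       | nlp = English() | ||||
|       | tokens = nlp('They told us to duck.') | ||||
|       | for token in tokens: | ||||
|       |     # token.tag_ is the fine-grained Treebank tag; token.pos_ is | ||||
|       |     # the coarse Google Universal tag it maps to. | ||||
|       |     print(token.orth_, token.tag_, token.pos_) | ||||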
| 
 | ||||
| details | ||||
|   summary: h4 Lemmatization | ||||
| 
 | ||||
|   p. | ||||
|     A "lemma" is the uninflected form of a word. In English, this means: | ||||
| 
 | ||||
|   ul | ||||
|     li Adjectives: The form like "happy", not "happier" or "happiest" | ||||
|     li Adverbs: The form like "badly", not "worse" or "worst" | ||||
|     li Nouns: The form like "dog", not "dogs"; like "child", not "children" | ||||
|     li Verbs: The form like "write", not "writes", "writing", "wrote" or "written"  | ||||
| 
 | ||||
|   p. | ||||
|     The lemmatization data is taken from WordNet. However, we also add a | ||||
|     special case for pronouns: all pronouns are lemmatized to the special | ||||
|     token -PRON-. | ||||
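| 
 | ||||
|   p. | ||||
|     A minimal sketch (hypothetical sentence; lemma values come from the | ||||
|     WordNet data and the -PRON- special case): | ||||
| 
 | ||||
|   pre.language-python | ||||
|     code | ||||
|       | from __future__ import print_function | ||||
|       | from spacy.en import English | ||||
|       | nlp = English() | ||||
|       | tokens = nlp('We are writing to them') | ||||
|       | for token in tokens: | ||||
|       |     # e.g. 'writing' -> 'write', and pronouns like 'We' -> '-PRON-' | ||||
|       |     print(token.orth_, token.lemma_) | ||||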
| 
 | ||||
| 
 | ||||
| details | ||||
|   summary: h4 Syntactic Dependency Parsing | ||||
| 
 | ||||
|   p. | ||||
|     The parser is trained on data produced by the ClearNLP converter. Details | ||||
|     of the annotation scheme can be found here:  http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf | ||||
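| 
 | ||||
|   p. | ||||
|     A minimal sketch of walking the parse (arc labels follow the ClearNLP | ||||
|     scheme, and depend on the model): | ||||
| 
 | ||||
|   pre.language-python | ||||
|     code | ||||
|       | from __future__ import print_function | ||||
|       | from spacy.en import English | ||||
|       | nlp = English() | ||||
|       | tokens = nlp('Google bought the company.') | ||||
|       | for token in tokens: | ||||
|       |     # token.dep_ is the arc label; token.head is the parent token. | ||||
|       |     print(token.orth_, token.dep_, token.head.orth_) | ||||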
| 
 | ||||
| details | ||||
|   summary: h4 Named Entity Recognition | ||||
| 
 | ||||
|   table | ||||
|     thead | ||||
|       +columns("Entity Type", "Description") | ||||
|        | ||||
|     tbody | ||||
|       +row("PERSON", "People, including fictional.") | ||||
|       +row("NORP", "Nationalities or religious or political groups.") | ||||
|       +row("FACILITY", "Buildings, airports, highways, bridges, etc.") | ||||
|       +row("ORG", "Companies, agencies, institutions, etc.") | ||||
|       +row("GPE", "Countries, cities, states.") | ||||
|       +row("LOC", "Non-GPE locations, mountain ranges, bodies of water.") | ||||
|       +row("PRODUCT", "Vehicles, weapons, foods, etc. (Not services") | ||||
|       +row("EVENT", "Named hurricanes, battles, wars, sports events, etc.") | ||||
|       +row("WORK_OF_ART", "Titles of books, songs, etc.") | ||||
|       +row("LAW", "Named documents made into laws") | ||||
|       +row("LANGUAGE", "Any named language") | ||||
| 
 | ||||
|   p The following values are also annotated in a style similar to names: | ||||
| 
 | ||||
|   table | ||||
|     thead | ||||
|       +columns("Entity Type", "Description") | ||||
|        | ||||
|     tbody | ||||
|       +row("DATE", "Absolute or relative dates or periods") | ||||
|       +row("TIME", "Times smaller than a day") | ||||
|       +row("PERCENT", 'Percentage (including “%”)') | ||||
|       +row("MONEY", "Monetary values, including unit") | ||||
|       +row("QUANTITY", "Measurements, as of weight or distance") | ||||
|       +row("ORDINAL", 'first", "second"') | ||||
|       +row("CARDINAL", "Numerals that do not fall under another type") | ||||
|  | @ -1,31 +0,0 @@ | |||
| doctype html | ||||
| html(lang='en') | ||||
|   head | ||||
|     meta(charset='utf-8') | ||||
|     title spaCy Blog | ||||
|     meta(name='description', content='') | ||||
|     meta(name='author', content='Matthew Honnibal') | ||||
|     link(rel='stylesheet', href='css/style.css') | ||||
|     //if lt IE 9 | ||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') | ||||
|   body#blog(role="document") | ||||
|     header(role='banner') | ||||
|       h1.logo spaCy Blog | ||||
|       .slogan Blog | ||||
| 
 | ||||
|     nav(role="navigation") | ||||
|       ul | ||||
|         li: a(href="home.html")        Home | ||||
|         li: a(href="docs.html")        Docs | ||||
|         li.active: a(href="blog.html") Blog | ||||
|         li: a(href="license.html")     License | ||||
| 
 | ||||
|     main#content(role='main') | ||||
|       block intro_block | ||||
| 
 | ||||
|       block body_block | ||||
|   | ||||
|   footer(role='contentinfo') | ||||
| 
 | ||||
|   script(src="js/prism.js") | ||||
|   script(src="js/details_polyfill.js") | ||||
|  | @ -1,200 +0,0 @@ | |||
| doctype html | ||||
| html(lang='en') | ||||
|   head | ||||
|     meta(charset='utf-8') | ||||
|     title spaCy Blog | ||||
|     meta(name='description', content='') | ||||
|     meta(name='author', content='Matthew Honnibal') | ||||
|     link(rel='stylesheet', href='css/style.css') | ||||
|     //if lt IE 9 | ||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') | ||||
|   body#blog | ||||
|     header(role='banner') | ||||
|       h1.logo spaCy Blog | ||||
|       .slogan Blog | ||||
|     main#content(role='main') | ||||
|       article.post | ||||
|   | ||||
| 
 | ||||
|         :markdown-it | ||||
|           # Adverbs | ||||
|    | ||||
|           Let's say you're developing a proofreading tool, or possibly an IDE for | ||||
|           writers.  You're convinced by Stephen King's advice that | ||||
|           [adverbs are not your friend](http://www.brainpickings.org/2013/03/13/stephen-king-on-adverbs/), | ||||
|           so you want to **highlight all adverbs**.  We'll use one of the examples | ||||
|           he finds particularly egregious: | ||||
|      | ||||
|         pre.language-python | ||||
|           code | ||||
|             | >>> import spacy.en | ||||
|             | >>> from spacy.parts_of_speech import ADV | ||||
|             | >>> # Load the pipeline, and call it with some text. | ||||
|             | >>> nlp = spacy.en.English() | ||||
|             | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", tag=True, parse=False) | ||||
|             | >>> print(u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)) | ||||
|             | ‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’ | ||||
|    | ||||
|         :markdown-it | ||||
|           Easy enough --- but the problem is that we've also highlighted "back". | ||||
|           While "back" is undoubtedly an adverb, we probably don't want to highlight | ||||
|           it. If what we're trying to do is flag dubious stylistic choices, we'll | ||||
|           need to refine our logic.  It turns out only a certain type of adverb | ||||
|           is of interest to us. | ||||
| 
 | ||||
| 
 | ||||
|         :markdown-it | ||||
|           There are lots of ways we might do this, depending on just what words | ||||
|           we want to flag.  The simplest way to exclude adverbs like "back" and | ||||
|           "not" is by word frequency: these words are much more common than the | ||||
|           prototypical manner adverbs that the style guides are worried about. | ||||
|    | ||||
|         :markdown-it | ||||
|           The `Lexeme.prob` and `Token.prob` attributes give a | ||||
|           log probability estimate of the word: | ||||
|    | ||||
|         pre.language-python | ||||
|           code | ||||
|             | >>> nlp.vocab[u'back'].prob | ||||
|             | -7.403977394104004 | ||||
|             | >>> nlp.vocab[u'not'].prob | ||||
|             | -5.407193660736084 | ||||
|             | >>> nlp.vocab[u'quietly'].prob | ||||
|             | -11.07155704498291 | ||||
|    | ||||
|         :markdown-it | ||||
|           (The probability estimate is based on counts from a 3 billion word corpus, | ||||
|           smoothed using the Simple Good-Turing method.) | ||||
|    | ||||
|           So we can easily exclude the N most frequent words in English from our | ||||
|           adverb marker.  Let's try N=1000 for now: | ||||
|   | ||||
|         pre.language-python | ||||
|           code | ||||
|             | >>> import spacy.en | ||||
|             | >>> from spacy.parts_of_speech import ADV | ||||
|             | >>> nlp = spacy.en.English() | ||||
|             | >>> # Find log probability of Nth most frequent word | ||||
|             | >>> probs = [lex.prob for lex in nlp.vocab] | ||||
|             | >>> probs.sort() | ||||
|             | >>> is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] | ||||
|             | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") | ||||
|             | >>> print u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) | ||||
|             | ‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’ | ||||
|          | ||||
|         :markdown-it | ||||
|           There are lots of other ways we could refine the logic, depending on | ||||
|           just what words we want to flag.  Let's say we wanted to only flag | ||||
|           adverbs that modified words similar to "pleaded".  This is easy to do, | ||||
|           as spaCy loads a vector-space representation for every word (by default, | ||||
|           the vectors produced by Levy and Goldberg (2014)).  Naturally, the | ||||
|           vector is provided as a numpy array: | ||||
| 
 | ||||
|         pre.language-python | ||||
|           code | ||||
|             | >>> pleaded = tokens[7] | ||||
|             | >>> pleaded.repvec.shape | ||||
|             | (300,) | ||||
|             | >>> pleaded.repvec[:5] | ||||
|             | array([ 0.04229792,  0.07459262,  0.00820188, -0.02181299,  0.07519238], dtype=float32) | ||||
|    | ||||
|         :markdown-it | ||||
|           We want to sort the words in our vocabulary by their similarity to | ||||
|           "pleaded".  There are lots of ways to measure the similarity of two | ||||
|           vectors.  We'll use the cosine metric: | ||||
| 
 | ||||
|         pre.language-python | ||||
|           code  | ||||
|             | >>> from numpy import dot | ||||
|             | >>> from numpy.linalg import norm | ||||
|    | ||||
|             | >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) | ||||
|             | >>> words = [w for w in nlp.vocab if w.has_repvec] | ||||
|             | >>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) | ||||
|             | >>> words.reverse() | ||||
|             | >>> print('1-20', ', '.join(w.orth_ for w in words[0:20])) | ||||
|             | 1-20 pleaded, pled, plead, confessed, interceded, pleads, testified, conspired, motioned, demurred, countersued, remonstrated, begged, apologised, consented, acquiesced, petitioned, quarreled, appealed, pleading | ||||
|             | >>> print('50-60', ', '.join(w.orth_ for w in words[50:60])) | ||||
|             | 50-60 counselled, bragged, backtracked, caucused, refiled, dueled, mused, dissented, yearned, confesses | ||||
|             | >>> print('100-110', ', '.join(w.orth_ for w in words[100:110])) | ||||
|             | 100-110 cabled, ducked, sentenced, perjured, absconded, bargained, overstayed, clerked, confided, sympathizes | ||||
|             | >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010])) | ||||
|             | 1000-1010 scorned, baled, righted, requested, swindled, posited, firebombed, slimed, deferred, sagged | ||||
|             | >>> print('50000-50010', ', '.join(w.orth_ for w in words[50000:50010])) | ||||
|             | 50000-50010 fb, ford, systems, puck, anglers, ik, tabloid, dirty, rims, artists | ||||
|    | ||||
|         :markdown-it | ||||
|           As you can see, the similarity model that these vectors give us is excellent | ||||
|           --- we're still getting meaningful results at 1000 words, off a single | ||||
|           prototype!  The only problem is that the list really contains two clusters of | ||||
|           words: one associated with the legal meaning of "pleaded", and one for the more | ||||
|           general sense.  Sorting out these clusters is an area of active research. | ||||
|    | ||||
|           A simple work-around is to average the vectors of several words, and use that | ||||
|           as our target: | ||||
|    | ||||
|         pre.language-python | ||||
|           code | ||||
|             | >>> say_verbs = ['pleaded', 'confessed', 'remonstrated', 'begged', 'bragged', 'confided', 'requested'] | ||||
|             | >>> say_vector = sum(nlp.vocab[verb].repvec for verb in say_verbs) / len(say_verbs) | ||||
|             | >>> words.sort(key=lambda w: cosine(w.repvec, say_vector)) | ||||
|             | >>> words.reverse() | ||||
|             | >>> print('1-20', ', '.join(w.orth_ for w in words[0:20])) | ||||
|             | 1-20 bragged, remonstrated, enquired, demurred, sighed, mused, intimated, retorted, entreated, motioned, ranted, confided, countersued, gestured, implored, interceded, muttered, marvelled, bickered, despaired | ||||
|             | >>> print('50-60', ', '.join(w.orth_ for w in words[50:60])) | ||||
|             | 50-60 flaunted, quarrelled, ingratiated, vouched, agonized, apologised, lunched, joked, chafed, schemed | ||||
|             | >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010])) | ||||
|             | 1000-1010 hoarded, waded, ensnared, clamoring, abided, deploring, shriveled, endeared, rethought, berate | ||||
|    | ||||
|         :markdown-it | ||||
|           These definitely look like words that King might scold a writer for attaching | ||||
|           adverbs to.  Recall that our original adverb highlighting function looked like | ||||
|           this: | ||||
|    | ||||
|         pre.language-python | ||||
|           code | ||||
|             | >>> import spacy.en | ||||
|             | >>> from spacy.parts_of_speech import ADV | ||||
|             | >>> # Load the pipeline, and call it with some text. | ||||
|             | >>> nlp = spacy.en.English() | ||||
|             | >>> tokens = nlp("‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", | ||||
|             |                  tag=True, parse=False) | ||||
|             | >>> print(''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)) | ||||
|             | ‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’ | ||||
|    | ||||
|    | ||||
|         :markdown-it | ||||
|           We wanted to refine the logic so that only adverbs modifying evocative | ||||
|           verbs of communication, like "pleaded", were highlighted.  We've now | ||||
|           built a vector that represents that type of word, so now we can highlight | ||||
|           adverbs based on subtle logic, homing in on adverbs that seem the most | ||||
|           stylistically problematic, given our starting assumptions: | ||||
|    | ||||
|         pre.language-python | ||||
|           code | ||||
|             | >>> import numpy | ||||
|             | >>> from numpy import dot | ||||
|             | >>> from numpy.linalg import norm | ||||
|             | >>> import spacy.en | ||||
|             | >>> from spacy.parts_of_speech import ADV, VERB | ||||
|             | >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) | ||||
|             | >>> def is_bad_adverb(token, target_verb, tol): | ||||
|             | ...   if token.pos != ADV: | ||||
|             | ...     return False | ||||
|             | ...   elif token.head.pos != VERB: | ||||
|             | ...     return False | ||||
|             | ...   elif cosine(token.head.repvec, target_verb) < tol: | ||||
|             | ...     return False | ||||
|             | ...   else: | ||||
|             | ...     return True | ||||
|    | ||||
|         :markdown-it | ||||
|           This example was somewhat contrived --- and, truth be told, I've never | ||||
|           really bought the idea that adverbs were a grave stylistic sin.  But | ||||
|           hopefully it got the message across: the state-of-the-art NLP technologies | ||||
|           are very powerful. spaCy gives you easy and efficient access to them, | ||||
|           which lets you build all sorts of useful products and features that | ||||
|           were previously impossible. | ||||
| 
 | ||||
|   footer(role='contentinfo') | ||||
|   script(src='js/prism.js') | ||||
|  | @ -1,132 +0,0 @@ | |||
| doctype html | ||||
| html(lang='en') | ||||
|   head | ||||
|     meta(charset='utf-8') | ||||
|     title spaCy Blog | ||||
|     meta(name='description', content='') | ||||
|     meta(name='author', content='Matthew Honnibal') | ||||
|     link(rel='stylesheet', href='css/style.css') | ||||
|     //if lt IE 9 | ||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') | ||||
|   body#blog | ||||
|     header(role='banner') | ||||
|       h1.logo spaCy Blog | ||||
|       .slogan Blog | ||||
|     main#content(role='main') | ||||
|       section.intro | ||||
|         p | ||||
|           | Example use of the spaCy NLP tools for data exploration. | ||||
|           | Here we will look for reddit comments that describe Google doing something, | ||||
|           | i.e. discuss the company's actions. This is difficult, because other senses of | ||||
|           | "Google" now dominate usage of the word in conversation, particularly references to | ||||
|           | using Google products. | ||||
|          | ||||
|         p | ||||
|           | The heuristics used are quick and dirty – about 5 minutes' work. | ||||
|            | ||||
|         //| A better approach is to use the word vector of the verb. But, the | ||||
|         //  | demo here is just to show what's possible to build up quickly, to | ||||
|         //  | start to understand some data. | ||||
| 
 | ||||
|       article.post | ||||
|         header | ||||
|           h2 Syntax-specific Search | ||||
|           .subhead | ||||
|             | by  | ||||
|             a(href='#', rel='author') Matthew Honnibal | ||||
|             |  on  | ||||
|             time(datetime='2015-08-14') August | ||||
|            | ||||
|         details | ||||
|           summary: h4 Imports | ||||
| 
 | ||||
|           pre.language-python | ||||
|             code | ||||
|               | from __future__ import unicode_literals | ||||
|               | from __future__ import print_function | ||||
|               | import sys | ||||
|               |  | ||||
|               | import plac | ||||
|               | import bz2 | ||||
|               | import ujson | ||||
|               | import spacy.en | ||||
|            | ||||
|         details | ||||
|           summary: h4 Load the model and iterate over the data | ||||
| 
 | ||||
|           pre.language-python | ||||
|             code  | ||||
|               | def main(input_loc): | ||||
|               |     nlp = spacy.en.English()                 # Loading the model takes 10-20 seconds. | ||||
|               |     for line in bz2.BZ2File(input_loc):      # Iterate over the reddit comments from the dump. | ||||
|               |         comment_str = ujson.loads(line)['body']  # Parse the JSON object, and extract the 'body' attribute. | ||||
|               |          | ||||
|         details | ||||
|           summary: h4 Apply the spaCy NLP pipeline, and look for the cases we want | ||||
| 
 | ||||
|           pre.language-python | ||||
|             code | ||||
|               |         comment_parse = nlp(comment_str)  | ||||
|               |         for word in comment_parse:   | ||||
|               |             if google_doing_something(word): | ||||
|               |                 # Print the clause | ||||
|               |                 print(''.join(w.string for w in word.head.subtree).strip()) | ||||
|         details | ||||
|           summary: h4 Define the filter function | ||||
| 
 | ||||
|           pre.language-python | ||||
|             code | ||||
| 
 | ||||
|               |  | ||||
|               | def google_doing_something(w): | ||||
|               |     if w.lower_ != 'google': | ||||
|               |         return False | ||||
|               |     # Is it the subject of a verb? | ||||
|               |     elif w.dep_ != 'nsubj':  | ||||
|               |         return False | ||||
|               |     # And not 'is' | ||||
|               |     elif w.head.lemma_ == 'be' and w.head.dep_ != 'aux':  | ||||
|               |         return False | ||||
|               |     # Exclude e.g. "Google says..." | ||||
|               |     elif w.head.lemma_ in ('say', 'show'):  | ||||
|               |         return False | ||||
|               |     else: | ||||
|               |         return True | ||||
|               |  | ||||
|               |  | ||||
| 
 | ||||
|         details | ||||
|           summary: h4 Call main | ||||
| 
 | ||||
|           pre.language-python | ||||
|             code | ||||
|               | if __name__ == '__main__': | ||||
|               |     plac.call(main) | ||||
| 
 | ||||
|         details | ||||
|           summary: h4 Example output | ||||
| 
 | ||||
|           p. | ||||
|             Many false positives remain. Some are from incorrect interpretations | ||||
|             of the sentence by spaCy, some are flaws in our filtering logic. But | ||||
|             the results are vastly better than a string-based search, which returns | ||||
|             almost no examples of the pattern we're looking for. | ||||
| 
 | ||||
|           code | ||||
|             | Google dropped support for Android < 4.0 already | ||||
|             | google drive | ||||
|             | Google to enforce a little more uniformity in its hardware so that we can see a better 3rd party market for things like mounts, cases, etc | ||||
|             | When Google responds | ||||
|             | Google translate cyka pasterino. | ||||
|             | A quick google looks like Synology does have a sync'ing feature which does support block level so that should work  | ||||
|             | (google came up with some weird One Piece/FairyTail crossover stuff), and is their knowledge universally infallible? | ||||
|             | Until you have the gear, google some videos on best farming runs on each planet, you can get a lot REAL fast with the right loop. | ||||
|             | Google offers something like this already, but it is truly terrible. | ||||
|             | google isn't helping me | ||||
|             | Google tells me: 0 results, 250 pages removed from google. | ||||
|             | how did Google swoop in and eat our lunch | ||||
| 
 | ||||
|              | ||||
| 
 | ||||
|   script(src="js/prism.js") | ||||
|   script(src="js/details_polyfill.js") | ||||
|  | @ -1,204 +0,0 @@ | |||
| doctype html | ||||
| html(lang='en') | ||||
|   head | ||||
|     meta(charset='utf-8') | ||||
|     title spaCy Blog | ||||
|     meta(name='description', content='') | ||||
|     meta(name='author', content='Matthew Honnibal') | ||||
|     link(rel='stylesheet', href='css/style.css') | ||||
|     //if lt IE 9 | ||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') | ||||
|   body#blog | ||||
|     header(role='banner') | ||||
|       h1.logo spaCy Blog | ||||
|       .slogan Blog | ||||
|     main#content(role='main') | ||||
|       article.post | ||||
|         header | ||||
|           h2 Finding Relevant Tweets | ||||
|           .subhead | ||||
|             | by  | ||||
|             a(href='#', rel='author') Matthew Honnibal | ||||
|             |  on  | ||||
|             time(datetime='2015-08-14') December | ||||
|            | ||||
|         details | ||||
|           summary: h4 Imports | ||||
|           pre.language-python | ||||
| 
 | ||||
|             | from __future__ import unicode_literals, print_function | ||||
|             | import plac | ||||
|             | import codecs | ||||
|             | import sys | ||||
|             | import math | ||||
|             |  | ||||
|             | import spacy.en | ||||
|             | from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ | ||||
|             |  | ||||
|             | from termcolor import colored | ||||
|             | from twython import TwythonStreamer | ||||
|             |  | ||||
|             | from os import path | ||||
|             | from math import sqrt | ||||
|             |  | ||||
|             | from numpy import dot | ||||
|             | from numpy.linalg import norm | ||||
|             |  | ||||
|             |  | ||||
| 
 | ||||
|         details | ||||
|           summary: h4 Simple vector-averaging similarity | ||||
| 
 | ||||
|           pre.language-python: code | ||||
| 
 | ||||
|             | class Meaning(object): | ||||
|             |     def __init__(self, vectors): | ||||
|             |         if vectors: | ||||
|             |             self.vector = sum(vectors) / len(vectors) | ||||
|             |             self.norm = norm(self.vector) | ||||
|             |         else: | ||||
|             |             self.vector = None | ||||
|             |             self.norm = 0 | ||||
|             |  | ||||
|             |     @classmethod | ||||
|             |     def from_path(cls, nlp, loc): | ||||
|             |         with codecs.open(loc, 'r', 'utf8') as file_: | ||||
|             |             terms = file_.read().strip().split() | ||||
|             |         return cls.from_terms(nlp, terms) | ||||
|             |  | ||||
|             |     @classmethod | ||||
|             |     def from_tokens(cls, nlp, tokens): | ||||
|             |         vectors = [t.repvec for t in tokens] | ||||
|             |         return cls(vectors) | ||||
|             |  | ||||
|             |     @classmethod | ||||
|             |     def from_terms(cls, nlp, examples): | ||||
|             |         lexemes = [nlp.vocab[eg] for eg in examples] | ||||
|             |         vectors = [eg.repvec for eg in lexemes] | ||||
|             |         return cls(vectors) | ||||
|             |  | ||||
|             |     def similarity(self, other): | ||||
|             |         if not self.norm or not other.norm: | ||||
|             |             return -1 | ||||
|             |         return dot(self.vector, other.vector) / (self.norm * other.norm) | ||||
|             |  | ||||
| 
 | ||||
|         details | ||||
|           summary: h4 Print matches | ||||
|                | ||||
|           pre.language-python: code | ||||
| 
 | ||||
|             |  | ||||
|             | def print_colored(model, stream=sys.stdout): | ||||
|             |     if model['is_match']: | ||||
|             |         color = 'green' | ||||
|             |     elif model['is_reject']: | ||||
|             |         color = 'red' | ||||
|             |     else: | ||||
|             |         color = 'grey' | ||||
|             |      | ||||
|             |     if not model['is_rare'] and model['is_match'] and not model['is_reject']: | ||||
|             |         match_score = colored('%.3f' % model['match_score'], 'green') | ||||
|             |         reject_score = colored('%.3f' % model['reject_score'], 'red') | ||||
|             |         prob = '%.5f' % model['prob'] | ||||
|             |  | ||||
|             |         print(match_score, reject_score, prob) | ||||
|             |         print(repr(model['text']), color) | ||||
|             |         print('') | ||||
|             |  | ||||
|             |  | ||||
| 
 | ||||
|         details | ||||
|           summary: h4 TextMatcher: Process the tweets using spaCy | ||||
| 
 | ||||
|           pre.language-python: code | ||||
| 
 | ||||
|             | class TextMatcher(object): | ||||
|             |     def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject): | ||||
|             |         self.nlp = nlp | ||||
|             |         self.get_target = get_target | ||||
|             |         self.get_reject = get_reject | ||||
|             |         self.min_prob = min_prob | ||||
|             |         self.min_match = min_match | ||||
|             |         self.max_reject = max_reject | ||||
|             |  | ||||
|             |     def __call__(self, text): | ||||
|             |         tweet = self.nlp(text) | ||||
|             |         target = self.get_target() | ||||
|             |         reject = self.get_reject() | ||||
|             |  | ||||
|             |         prob = sum(math.exp(w.prob) for w in tweet) / len(tweet) | ||||
|             |         meaning = Meaning.from_tokens(self.nlp, tweet) | ||||
|             |  | ||||
|             |         match_score = meaning.similarity(target) | ||||
|             |         reject_score = meaning.similarity(reject) | ||||
|             |         return { | ||||
|             |             'text': tweet.string, | ||||
|             |             'prob': prob, | ||||
|             |             'match_score': match_score, | ||||
|             |             'reject_score': reject_score, | ||||
|             |             'is_rare': prob < self.min_prob, | ||||
|             |             'is_match': prob >= self.min_prob  and match_score  >= self.min_match, | ||||
|             |             'is_reject': prob >= self.min_prob and reject_score >= self.max_reject | ||||
|             |         } | ||||
|             |  | ||||
|             |  | ||||
| 
 | ||||
|         details | ||||
|           summary: h4 Connect to Twitter and stream tweets | ||||
| 
 | ||||
|           pre.language-python: code | ||||
| 
 | ||||
|             | class Connection(TwythonStreamer): | ||||
|             |     def __init__(self, keys_dir, handler, view): | ||||
|             |         keys = Secrets(keys_dir) | ||||
|             |         TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)  | ||||
|             |         self.handler = handler | ||||
|             |         self.view = view | ||||
|             |  | ||||
|             |     def on_success(self, data): | ||||
|             |         text = data.get('text', u'') | ||||
|             |         # Twython returns either bytes or unicode, depending on tweet. | ||||
|             |         # #APIshaming | ||||
|             |         try: | ||||
|             |             model = self.handler(text) | ||||
|             |         except TypeError: | ||||
|             |             model = self.handler(text.decode('utf8')) | ||||
|             |         status = self.view(model, sys.stdout) | ||||
|             |  | ||||
|             |     def on_error(self, status_code, data): | ||||
|             |         print(status_code) | ||||
|             |  | ||||
|             |  | ||||
|             | class Secrets(object): | ||||
|             |     def __init__(self, key_dir): | ||||
|             |         self.key = open(path.join(key_dir, 'key.txt')).read().strip() | ||||
|             |         self.secret = open(path.join(key_dir, 'secret.txt')).read().strip() | ||||
|             |         self.token = open(path.join(key_dir, 'token.txt')).read().strip() | ||||
|             |         self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip() | ||||
|             |  | ||||
|             |  | ||||
| 
 | ||||
|         details | ||||
|           summary: h4 Command-line interface | ||||
| 
 | ||||
|           pre.language-python: code | ||||
| 
 | ||||
|             | def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5): | ||||
|             |     # We don't need the parser for this demo, so may as well save the loading time | ||||
|             |     nlp = spacy.en.English(Parser=None) | ||||
|             |     get_target = lambda: Meaning.from_path(nlp, target_loc) | ||||
|             |     get_reject = lambda: Meaning.from_path(nlp, reject_loc) | ||||
|             |     matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject) | ||||
|             |  | ||||
|             |     twitter = Connection(keys_dir, matcher, print_colored) | ||||
|             |     twitter.statuses.filter(track=term) | ||||
|             |  | ||||
|             |  | ||||
|             | if __name__ == '__main__': | ||||
|             |     plac.call(main) | ||||
|             |    | ||||
| 
 | ||||
|   footer(role='contentinfo') | ||||
|   script(src='js/prism.js') | ||||
| 
 | ||||
|  | @ -1,29 +0,0 @@ | |||
| mixin Tutorial(title) | ||||
|   details | ||||
|     summary | ||||
|       h4= title  | ||||
| 
 | ||||
|     block | ||||
| 
 | ||||
| +Tutorial("Mark-up all manner adverbs, especially for verbs of speech") | ||||
|   | Let's say you're developing a proofreading tool, or possibly an IDE for | ||||
|   | writers.  You're convinced by Stephen King's advice that  | ||||
|   | adverbs are not your friend | ||||
|   | so you want to  | ||||
|   a.readmore(href='tute_adverbs.html')  | ||||
|     | highlight all adverbs. ► | ||||
| 
 | ||||
| +Tutorial("Search Reddit for comments about Google doing something") | ||||
|   | Example use of the spaCy NLP tools for data exploration. | ||||
|   | Here we will look for Reddit comments that describe Google doing something, | ||||
|   | i.e. discuss the company's actions. This is difficult, because other | ||||
|   | senses of "Google" now dominate usage of the word in conversation, | ||||
|   | particularly references to using Google products.  | ||||
|   a.readmore(href='tute_adverbs.html')  | ||||
|     | ► | ||||
| 
 | ||||
| +Tutorial("Use word vectors for semantic search of Twitter") | ||||
|   | Stream tweets from the Twitter API, and use spaCy's word vectors to find | ||||
|   | messages semantically similar to a set of example terms, using a simple | ||||
|   | vector-averaging similarity model. | ||||
|   a.readmore(href='tute_twitter.html')  | ||||
|     | ► | ||||
|  | @ -1,167 +0,0 @@ | |||
| mixin example(name) | ||||
|   details | ||||
|     summary | ||||
|       h4= name | ||||
|     block | ||||
| 
 | ||||
| 
 | ||||
| +example("Load resources and process text") | ||||
|   pre.language-python: code | ||||
|     | from __future__ import unicode_literals, print_function | ||||
|     | from spacy.en import English | ||||
|     | nlp = English() | ||||
|     | doc = nlp('Hello, world. Here are two sentences.') | ||||
| 
 | ||||
| +example("Get tokens and sentences") | ||||
|   pre.language-python: code | ||||
|     | token = doc[0] | ||||
|     | sentence = doc.sents[0] | ||||
|     | assert token is sentence[0] | ||||
| 
 | ||||
| +example("Use integer IDs for any string") | ||||
|   pre.language-python: code | ||||
|     | hello_id = nlp.vocab.strings['Hello'] | ||||
|     | hello_str = nlp.vocab.strings[hello_id] | ||||
|     |  | ||||
|     | assert token.orth  == hello_id  == 52 | ||||
|     | assert token.orth_ == hello_str == 'Hello' | ||||
| 
 | ||||
| +example("Get and set string views and flags") | ||||
|   pre.language-python: code | ||||
|     | assert token.shape_ == 'Xxxx' | ||||
|     | for lexeme in nlp.vocab: | ||||
|     |     if lexeme.is_alpha: | ||||
|     |         lexeme.shape_ = 'W' | ||||
|     |     elif lexeme.is_digit: | ||||
|     |         lexeme.shape_ = 'D' | ||||
|     |     elif lexeme.is_punct: | ||||
|     |         lexeme.shape_ = 'P' | ||||
|     |     else: | ||||
|     |         lexeme.shape_ = 'M' | ||||
|     | assert token.shape_ == 'W' | ||||
| 
 | ||||
| +example("Export to numpy arrays") | ||||
|   pre.language-python: code | ||||
|     | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV | ||||
|     |  | ||||
|     | attr_ids = [ORTH, LIKE_URL, IS_OOV] | ||||
|     | doc_array = doc.to_array(attr_ids) | ||||
|     | assert doc_array.shape == (len(doc), len(attr_ids)) | ||||
|     | assert doc[0].orth == doc_array[0, 0] | ||||
|     | assert doc[1].orth == doc_array[1, 0] | ||||
|     | assert doc[0].like_url == doc_array[0, 1] | ||||
|     | assert list(doc_array[:, 1]) == [t.like_url for t in doc] | ||||
| 
 | ||||
| +example("Word vectors") | ||||
|   pre.language-python: code | ||||
|     | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") | ||||
|     |  | ||||
|     | apples = doc[0] | ||||
|     | oranges = doc[2] | ||||
|     | boots = doc[6] | ||||
|     | hippos = doc[8] | ||||
|     |  | ||||
|     | assert apples.similarity(oranges) > boots.similarity(hippos) | ||||
| 
 | ||||
| 
 | ||||
| +example("Part-of-speech tags") | ||||
|   pre.language-python: code | ||||
|     | from spacy.parts_of_speech import ADV | ||||
|     |  | ||||
|     | def is_adverb(token): | ||||
|     |     return token.pos == spacy.parts_of_speech.ADV | ||||
|     |  | ||||
|     | # These are data-specific, so no constants are provided. You have to look | ||||
|     | # up the IDs from the StringStore. | ||||
|     | NNS = nlp.vocab.strings['NNS'] | ||||
|     | NNPS = nlp.vocab.strings['NNPS'] | ||||
|     | def is_plural_noun(token): | ||||
|     |     return token.tag == NNS or token.tag == NNPS | ||||
|     |  | ||||
|     | def print_coarse_pos(token): | ||||
|     |     print(token.pos_) | ||||
|     |  | ||||
|     | def print_fine_pos(token): | ||||
|     |     print(token.tag_) | ||||
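|   p | ||||
|     | A short usage sketch for the helpers above (the example sentence | ||||
|     | is hypothetical): | ||||
|   pre.language-python: code | ||||
|     | doc = nlp('The quick brown fox jumps quickly.') | ||||
|     | for token in doc: | ||||
|     |     if is_adverb(token): | ||||
|     |         print_fine_pos(token)  # e.g. RB, depending on the model | ||||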
| 
 | ||||
| +example("Syntactic dependencies") | ||||
|   pre.language-python: code | ||||
|     | def dependency_labels_to_root(token): | ||||
|     |     '''Walk up the syntactic tree, collecting the arc labels.''' | ||||
|     |     dep_labels = [] | ||||
|     |     while token.head is not token: | ||||
|     |         dep_labels.append(token.dep_) | ||||
|     |         token = token.head | ||||
|     |     return dep_labels | ||||
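|   p | ||||
|     | For example (a usage sketch, reusing <code>nlp</code> from the | ||||
|     | examples above; the exact labels depend on the model): | ||||
|   pre.language-python: code | ||||
|     | doc = nlp('I like green eggs and ham.') | ||||
|     | print(dependency_labels_to_root(doc[3]))  # arc labels from 'eggs' up to the root | ||||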
| 
 | ||||
| +example("Named entities") | ||||
|   pre.language-python: code | ||||
|     | from collections import defaultdict | ||||
|     | from spacy.parts_of_speech import VERB | ||||
|     |  | ||||
|     | def iter_products(docs): | ||||
|     |     for doc in docs: | ||||
|     |         for ent in doc.ents: | ||||
|     |             if ent.label_ == 'PRODUCT': | ||||
|     |                 yield ent | ||||
|     |  | ||||
|     | def word_is_in_entity(word): | ||||
|     |     return word.ent_type != 0 | ||||
|     |  | ||||
|     | def count_parent_verb_by_person(docs): | ||||
|     |     counts = defaultdict(lambda: defaultdict(int)) | ||||
|     |     for doc in docs: | ||||
|     |         for ent in doc.ents: | ||||
|     |             if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: | ||||
|     |                 counts[ent.orth_][ent.root.head.lemma_] += 1 | ||||
|     |     return counts | ||||
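|   p | ||||
|     | A usage sketch (the example document is hypothetical, and the | ||||
|     | resulting counts depend on the model): | ||||
|   pre.language-python: code | ||||
|     | docs = [nlp('Bill Gates founded Microsoft.')] | ||||
|     | print(count_parent_verb_by_person(docs)) | ||||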
| 
 | ||||
|   //+example("Define custom NER rules") | ||||
|   //  pre.language-python: code | ||||
|   //    | nlp.matcher | ||||
| 
 | ||||
| 
 | ||||
| +example("Calculate inline mark-up on original string") | ||||
|   pre.language-python: code | ||||
|     | def put_spans_around_tokens(doc, get_classes): | ||||
|     |     '''Given some function to compute class names, put each token in a | ||||
|     |     span element, with the appropriate classes computed. | ||||
|     |   | ||||
|     |     All whitespace is preserved, outside of the spans. (Yes, I know HTML | ||||
|     |     won't display it. But the point is no information is lost, so you can | ||||
|     |     calculate what you need, e.g. <br /> tags, <p> tags, etc.) | ||||
|     |     ''' | ||||
|     |     output = [] | ||||
|     |     template = '<span class="{classes}">{word}</span>{space}' | ||||
|     |     for token in doc: | ||||
|     |         if token.is_space: | ||||
|     |             output.append(token.orth_) | ||||
|     |         else: | ||||
|     |             output.append( | ||||
|     |               template.format( | ||||
|     |                 classes=' '.join(get_classes(token)), | ||||
|     |                 word=token.orth_, | ||||
|     |                 space=token.whitespace_)) | ||||
|     |     string = ''.join(output) | ||||
|     |     string = string.replace('\n', '<br />') | ||||
|     |     string = string.replace('\t', '    ') | ||||
|     |     return string | ||||
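|   p | ||||
|     | For instance, with a trivial class function (a sketch, not part of | ||||
|     | the API): | ||||
|   pre.language-python: code | ||||
|     | def get_classes(token): | ||||
|     |     return [token.pos_, token.ent_type_ or 'O'] | ||||
|     |  | ||||
|     | html = put_spans_around_tokens(doc, get_classes) | ||||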
| 
 | ||||
| 
 | ||||
| +example("Efficient binary serialization") | ||||
|   pre.language-python: code | ||||
|     | import spacy.en | ||||
|     | from spacy.tokens.doc import Doc | ||||
|     |  | ||||
|     | byte_string = doc.to_bytes() | ||||
|     | open('/tmp/moby_dick.bin', 'wb').write(byte_string) | ||||
|     |  | ||||
|     | nlp = spacy.en.English() | ||||
|     | for byte_string in Doc.read_bytes(open('/tmp/moby_dick.bin', 'rb')): | ||||
|     |     doc = Doc(nlp.vocab) | ||||
|     |     doc.from_bytes(byte_string) | ||||
| 
 | ||||
| 
 | ||||
| p | ||||
|   | See the  | ||||
|   a(href="docs.html") docs page  | ||||
|   | for  | ||||
|   a(href="docs.html#api") API documentation,  | ||||
|   a(href="docs.html#tutorials") tutorials,  | ||||
|   | and  | ||||
|   a(href="docs.html#spec") annotation specs. | ||||