mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Remove old docs
This commit is contained in:
		
							parent
							
								
									cad0cca4e3
								
							
						
					
					
						commit
						890d6aa216
					
				|  | @ -1,661 +0,0 @@ | ||||||
| mixin declare_class(name) |  | ||||||
|   details |  | ||||||
|     summary |  | ||||||
|       span.declaration |  | ||||||
|         span.label class |  | ||||||
|         code #{name} |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| mixin method(name, parameters) |  | ||||||
|   details(open=attributes.open) |  | ||||||
|     summary |  | ||||||
|       span.declaration |  | ||||||
|         span.label #{name} |  | ||||||
|         span.parameters |  | ||||||
|           | self, #{parameters} |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin params |  | ||||||
|   ul |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin param(name, type, value) |  | ||||||
|   li |  | ||||||
|     if type |  | ||||||
|       <strong>#{name}</strong> (!{type}) – |  | ||||||
|     else |  | ||||||
|       <strong>#{name}</strong> – |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin attribute(name, type, value) |  | ||||||
|   details(open=attributes.open) |  | ||||||
|     summary |  | ||||||
|       span.declaration |  | ||||||
|         span.label #{name} |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin returns(name, type, value) |  | ||||||
|   li |  | ||||||
|     if type |  | ||||||
|       <strong>#{name}</strong> (!{type}) – |  | ||||||
|     else |  | ||||||
|       <strong>#{name}</strong> – |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin returns(type) |  | ||||||
|   | tmp |  | ||||||
| 
 |  | ||||||
| mixin init |  | ||||||
|   details |  | ||||||
|     summary: h4 Init |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin callable |  | ||||||
|   details |  | ||||||
|     summary: h4 Callable |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin sequence |  | ||||||
|   details |  | ||||||
|     summary: h4 Sequence |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin maptype |  | ||||||
|   details |  | ||||||
|     summary: h4 Map |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin summary |  | ||||||
|   block |  | ||||||
| 
 |  | ||||||
| mixin en_example |  | ||||||
|   pre.language-python |  | ||||||
|     code |  | ||||||
|       | from spacy.en import English |  | ||||||
|       | from spacy._doc_examples import download_war_and_peace |  | ||||||
|       |  |  | ||||||
|       | unprocessed_unicode = download_war_and_peace() |  | ||||||
|       |  |  | ||||||
|       | nlp = English() |  | ||||||
|       | doc = nlp(unprocessed_unicode) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +declare_class("English") |  | ||||||
|   p Load models into a callable object to process English text. |  | ||||||
| 
 |  | ||||||
|   +summary |  | ||||||
|     +en_example |  | ||||||
| 
 |  | ||||||
|   +init |  | ||||||
|     p |  | ||||||
|       | Load the resources.  Loading takes 20 seconds, and the instance |  | ||||||
|       | consumes 2 to 3 gigabytes of memory. |  | ||||||
|      |  | ||||||
|     p  |  | ||||||
|       | Intended use is for one instance to be created per process. |  | ||||||
|       | You can create more if you're doing something unusual. |  | ||||||
|     p |  | ||||||
|       | You may wish to make the instance a global variable or "singleton". |  | ||||||
|       | We usually instantiate the object in the <code>main()</code> |  | ||||||
|       | function and pass it around as an explicit argument.  |  | ||||||
|     +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true") |  | ||||||
| 
 |  | ||||||
|       +params |  | ||||||
|         +param("data_dir") |  | ||||||
|           | The data directory.  May be #{None}, to disable any data loading |  | ||||||
|           | (including the vocabulary). |  | ||||||
| 
 |  | ||||||
|         +param("Tokenizer") |  | ||||||
|           | A class/function that creates the tokenizer. |  | ||||||
| 
 |  | ||||||
|         +param("Tagger") |  | ||||||
|           | A class/function that creates the part-of-speech tagger. |  | ||||||
| 
 |  | ||||||
|         +param("Parser") |  | ||||||
|           | A class/function that creates the dependency parser. |  | ||||||
| 
 |  | ||||||
|         +param("Entity") |  | ||||||
|           | A class/function that creates the named entity recogniser. |  | ||||||
| 
 |  | ||||||
|         +param("load_vectors") |  | ||||||
|           | A boolean value to control whether the word vectors are loaded. |  | ||||||
|    |  | ||||||
|   +callable |  | ||||||
|     +method("__call__", "text, tag=True, parse=True, entity=True") |  | ||||||
| 
 |  | ||||||
|       +params |  | ||||||
|         +param("text", types.unicode) |  | ||||||
|           | The text to be processed.  No pre-processing needs to be applied, |  | ||||||
|           | and any length of text can be submitted.  Usually you will submit |  | ||||||
|           | a whole document. Text may be zero-length. An exception is raised |  | ||||||
|           | if byte strings are supplied. |  | ||||||
| 
 |  | ||||||
|         +param("tag", types.bool) |  | ||||||
|           | Whether to apply the part-of-speech tagger. Required for parsing |  | ||||||
|           | and entity recognition. |  | ||||||
| 
 |  | ||||||
|         +param("parse", types.bool) |  | ||||||
|           | Whether to apply the syntactic dependency parser. |  | ||||||
| 
 |  | ||||||
|         +param("entity", types.bool) |  | ||||||
|           | Whether to apply the named entity recognizer. |  | ||||||
| 
 |  | ||||||
|       pre.language-python |  | ||||||
|         code |  | ||||||
|           | from spacy.en import English |  | ||||||
|           | nlp = English() |  | ||||||
|           | doc = nlp(u'Some text.) # Applies tagger, parser, entity |  | ||||||
|           | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser |  | ||||||
|           | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity |  | ||||||
|           | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser |  | ||||||
|           | doc = nlp(u'') # Zero-length tokens, not an error |  | ||||||
|           | # doc = nlp(b'Some text') <-- Error: need unicode |  | ||||||
|           | doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +declare_class("Doc") |  | ||||||
|   p I'm a doc |  | ||||||
| 
 |  | ||||||
|   +init |  | ||||||
|     +method("__init__", "vocab") |  | ||||||
|       +params |  | ||||||
|         +param("vocab", vocab_type) |  | ||||||
|           | A vocabulary object |  | ||||||
| 
 |  | ||||||
|   +sequence |  | ||||||
|     +method("__getitem__", "i", types.int) |  | ||||||
|       +returns(types.Token) |  | ||||||
| 
 |  | ||||||
|     +method("__getitem__", "start_end", types.slice) |  | ||||||
|       +returns(types.Span) |  | ||||||
| 
 |  | ||||||
|     +method("__iter__") |  | ||||||
|       | Iterate over tokens |  | ||||||
| 
 |  | ||||||
|     +method("__len__") |  | ||||||
|       | Number of tokens in the document. |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Spans |  | ||||||
|      |  | ||||||
|     +attribute("sents", types.generator) |  | ||||||
|       | Iterate over sentences in the document. |  | ||||||
|    |  | ||||||
|     +attribute("ents", types.generator) |  | ||||||
|       | Iterate over named entities in the document. |  | ||||||
| 
 |  | ||||||
|     +attribute("noun_chunks", types.generator) |  | ||||||
|    |  | ||||||
|   details |  | ||||||
|     summary: h4 Export/Import |  | ||||||
|      |  | ||||||
|     +method("to_array", "attr_ids") |  | ||||||
| 
 |  | ||||||
|       | Given a list of M attribute IDs, export the tokens to a numpy ndarray |  | ||||||
|       | of shape N*M, where N is the length of the sentence. |  | ||||||
| 
 |  | ||||||
|       +params |  | ||||||
|         +param("attr_ids", "list[int]") |  | ||||||
|           | A list of attribute ID ints. |  | ||||||
| 
 |  | ||||||
|       +returns("feat_array") |  | ||||||
|         | A feature matrix, with one row per word, and one column per attribute |  | ||||||
|         | indicated in the input attr_ids. |  | ||||||
| 
 |  | ||||||
|     +method("count_by", "attr_id") |  | ||||||
|       | Produce a dict of {attribute (int): count (ints)} frequencies, keyed |  | ||||||
|       | by the values of the given attribute ID. |  | ||||||
|      |  | ||||||
|       pre.language-python |  | ||||||
|         code |  | ||||||
|           | >>> from spacy.en import English, attrs |  | ||||||
|           | >>> nlp = English() |  | ||||||
|           | >>> tokens = nlp(u'apple apple orange banana') |  | ||||||
|           | >>> tokens.count_by(attrs.ORTH) |  | ||||||
|           | {12800L: 1, 11880L: 2, 7561L: 1} |  | ||||||
|           | >>> tokens.to_array([attrs.ORTH]) |  | ||||||
|           | array([[11880], |  | ||||||
|           |         [11880], |  | ||||||
|           |         [7561], |  | ||||||
|           |         [12800]]) |  | ||||||
| 
 |  | ||||||
|     +method("from_array", "attrs, array") |  | ||||||
|       | Load from array |  | ||||||
|    |  | ||||||
|     +method("from_bytes") |  | ||||||
|       | Deserialize, loading from bytes |  | ||||||
| 
 |  | ||||||
|     +method("read_bytes") |  | ||||||
|       | classmethod |  | ||||||
| 
 |  | ||||||
|     //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type") |  | ||||||
| 
 |  | ||||||
|     //  | Merge a multi-word expression into a single token.  Currently |  | ||||||
|     //  | experimental; API is likely to change. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +declare_class("Token") |  | ||||||
|   +init |  | ||||||
|     +method("__init__", "vocab, doc, offset") |  | ||||||
|       +params |  | ||||||
|         +param("vocab", types.Vocab) |  | ||||||
|           p A Vocab object |  | ||||||
| 
 |  | ||||||
|         +param("doc", types.Doc) |  | ||||||
|           p The parent sequence |  | ||||||
| 
 |  | ||||||
|       +param("offset", types.int) |  | ||||||
|         p The index of the token within the document |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 String Views |  | ||||||
| 
 |  | ||||||
|     +attribute("orth / orth_") |  | ||||||
|       | The form of the word with no string normalization or processing, as |  | ||||||
|       | it appears in the string, without trailing whitespace. |  | ||||||
| 
 |  | ||||||
|     +attribute("lemma / lemma_") |  | ||||||
|       | The "base" of the word, with no inflectional suffixes, e.g. the lemma of |  | ||||||
|       | "developing" is "develop", the lemma of "geese" is "goose", etc.  Note that |  | ||||||
|       | <em>derivational</em> suffixes are not stripped, e.g. the lemma of |  | ||||||
|       | "instutitions" is "institution", not "institute".  Lemmatization is |  | ||||||
|       | performed using the WordNet data, but extended to also cover closed-class |  | ||||||
|       | words such as pronouns.  By default, the WN lemmatizer returns "hi" |  | ||||||
|       | as the lemma of "his". We assign pronouns the lemma -PRON-. |  | ||||||
| 
 |  | ||||||
|     +attribute("lower / lower_") |  | ||||||
|       | The form of the word, but forced to lower-case, i.e. |  | ||||||
|       pre.language-python: code lower = word.orth\_.lower() |  | ||||||
| 
 |  | ||||||
|     //+attribute("norm / norm_") |  | ||||||
|     //  | The form of the word, after language-specific normalizations has been |  | ||||||
|     //  | applied. |  | ||||||
| 
 |  | ||||||
|     +attribute("shape / shape_") |  | ||||||
|       | A transform of the word's string, to show orthographic features. |  | ||||||
|       | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped |  | ||||||
|       | to d. After these mappings, sequences of 4 or more of the same character |  | ||||||
|       | are truncated to length 4.  Examples: C3Po --> XdXx, favorite --> xxxx, |  | ||||||
|       | :) --> :) |  | ||||||
| 
 |  | ||||||
|     +attribute("prefix / prefix_") |  | ||||||
|       | A length-N substring from the start of the word.  Length may vary by |  | ||||||
|       | language; currently for English n=1, i.e. |  | ||||||
|       pre.language-python: code prefix = word.orth\_[:1] |  | ||||||
| 
 |  | ||||||
|     +attribute("suffix / suffix_") |  | ||||||
|       | A length-N substring from the end of the word.  Length may vary by |  | ||||||
|       | language; currently for English n=3, i.e. |  | ||||||
|       pre.language-python: code suffix = word.orth\_[-3:] |  | ||||||
| 
 |  | ||||||
|     //+attribute("lex_id") |  | ||||||
|     //  | lex_id |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Alignment and Output |  | ||||||
| 
 |  | ||||||
|     +attribute("idx") |  | ||||||
|       p Start index of the token in the string |  | ||||||
| 
 |  | ||||||
|     +method("__len__", "") |  | ||||||
|       p Length of the token's orth string, in unicode code-points. |  | ||||||
| 
 |  | ||||||
|     +method("__unicode__", "") |  | ||||||
|       p Same as token.orth_ |  | ||||||
| 
 |  | ||||||
|     +method("__str__", "") |  | ||||||
|       p Varies between Python 2 and Python 3 |  | ||||||
| 
 |  | ||||||
|     +attribute("string") |  | ||||||
|       p |  | ||||||
|         | The form of the word as it appears in the string, <strong>including |  | ||||||
|         | trailing whitespace</strong>.  This is useful when you need to use |  | ||||||
|         | linguistic features to add inline mark-up to the string. |  | ||||||
| 
 |  | ||||||
|     +method("nbor, i=1") |  | ||||||
|       +params |  | ||||||
|         +param("i") |  | ||||||
|           p Offset relative to token |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Distributional Features |  | ||||||
| 
 |  | ||||||
|     +attribute("repvec") |  | ||||||
|       p |  | ||||||
|         | A "word embedding" representation: a dense real-valued vector that supports |  | ||||||
|         | similarity queries between words.  By default, spaCy currently loads |  | ||||||
|         | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec |  | ||||||
|         | model. |  | ||||||
| 
 |  | ||||||
|     +attribute("cluster") |  | ||||||
|       p |  | ||||||
|         | The Brown cluster ID of the word.  These are often useful features for |  | ||||||
|         | linear models.  If you're using a non-linear model, particularly a |  | ||||||
|         | neural net or random forest, consider using the real-valued word |  | ||||||
|         | representation vector, in Token.repvec, instead. |  | ||||||
| 
 |  | ||||||
|     +attribute("prob") |  | ||||||
|       p |  | ||||||
|         | The unigram log-probability of the word, estimated from counts from a |  | ||||||
|         | large corpus, smoothed using Simple Good Turing estimation. |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Syntactic Tags |  | ||||||
| 
 |  | ||||||
|     +attribute("pos / pos_") |  | ||||||
|       p |  | ||||||
|         | A part-of-speech tag, from the Google Universal Tag Set, e.g.  |  | ||||||
|         | code>NOUN</code>, <code>VERB</code>, <code>ADV</code>.  Constants for |  | ||||||
|         | the 17 tag values are provided in <code>spacy.parts_of_speech.</code> |  | ||||||
| 
 |  | ||||||
|     +attribute("tag / tag_") |  | ||||||
|       p |  | ||||||
|         | A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>, |  | ||||||
|         | <code>DT</code>, etc.  These tags are language/corpus specific, and |  | ||||||
|         | typically describe part-of-speech and some amount of morphological |  | ||||||
|         | information.  For instance, in the Penn Treebank tag set, <code>VBZ</code> |  | ||||||
|         | is assigned to a present-tense singular verb. |  | ||||||
| 
 |  | ||||||
|     +attribute("dep / dep_") |  | ||||||
|       p |  | ||||||
|         | The type of syntactic dependency relation between the word and its |  | ||||||
|         | syntactic head. |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Navigating the Parse Tree |  | ||||||
|    |  | ||||||
|     +attribute("head") |  | ||||||
|       p |  | ||||||
|         | The Token that is the immediate syntactic head of the word.  If the |  | ||||||
|         | word is the root of the dependency tree, the same word is returned. |  | ||||||
| 
 |  | ||||||
|     +attribute("lefts") |  | ||||||
|       p |  | ||||||
|         | An iterator for the immediate leftward syntactic children of the |  | ||||||
|         | word. |  | ||||||
| 
 |  | ||||||
|     +attribute("rights") |  | ||||||
|       p |  | ||||||
|         | An iterator for the immediate rightward syntactic children of the |  | ||||||
|         | word. |  | ||||||
| 
 |  | ||||||
|     +attribute("n_lefts") |  | ||||||
|       p |  | ||||||
|         | The number of immediate syntactic children preceding the word in  |  | ||||||
|         | the string. |  | ||||||
| 
 |  | ||||||
|     +attribute("n_rights") |  | ||||||
|       p |  | ||||||
|         | The number of immediate syntactic children following the word in |  | ||||||
|         | the string. |  | ||||||
| 
 |  | ||||||
|     +attribute("children") |  | ||||||
|       p |  | ||||||
|         | An iterator that yields from lefts, and then yields from rights. |  | ||||||
| 
 |  | ||||||
|     +attribute("subtree") |  | ||||||
|       p |  | ||||||
|         | An iterator for the part of the sentence syntactically governed by |  | ||||||
|         | the word, including the word itself. |  | ||||||
| 
 |  | ||||||
|     +attribute("left_edge") |  | ||||||
|       p The leftmost edge of the token's subtree |  | ||||||
| 
 |  | ||||||
|     +attribute("right_edge") |  | ||||||
|       p The rightmost edge of the token's subtree |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Named Entities |  | ||||||
| 
 |  | ||||||
|     +attribute("ent_type") |  | ||||||
|       p If the token is part of an entity, its entity type. |  | ||||||
| 
 |  | ||||||
|     +attribute("ent_iob") |  | ||||||
|       p The IOB (inside, outside, begin) entity recognition tag for the token. |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Lexeme Flags |  | ||||||
| 
 |  | ||||||
|     +method("check_flag", "flag_id") |  | ||||||
|       +params |  | ||||||
|         +param("flag_id") |  | ||||||
|           | flag ID |  | ||||||
| 
 |  | ||||||
|     +attribute("is_oov") |  | ||||||
|     +attribute("is_alpha") |  | ||||||
|     +attribute("is_ascii") |  | ||||||
|     +attribute("is_digit") |  | ||||||
|     +attribute("is_lower") |  | ||||||
|     +attribute("is_title") |  | ||||||
|     +attribute("is_punct") |  | ||||||
|     +attribute("is_space") |  | ||||||
|     +attribute("like_url") |  | ||||||
|     +attribute("like_num") |  | ||||||
|     +attribute("like_email") |  | ||||||
| 
 |  | ||||||
|     //+attribute("conjuncts") |  | ||||||
|     //  | Conjuncts |  | ||||||
| 
 |  | ||||||
| +declare_class("Span") |  | ||||||
|   +init |  | ||||||
|     +method("__init__") |  | ||||||
|       Temp |  | ||||||
| 
 |  | ||||||
|     <code>span = doc[0:4]</code> |  | ||||||
| 
 |  | ||||||
|   +sequence |  | ||||||
|     +method("__getitem__") |  | ||||||
|       p Get item |  | ||||||
| 
 |  | ||||||
|     +method("__iter__") |  | ||||||
|       p Iter |  | ||||||
|          |  | ||||||
|     +method("__len__") |  | ||||||
|       p Len |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Parse |  | ||||||
| 
 |  | ||||||
|     +attribute("root") |  | ||||||
|       p Syntactic head |  | ||||||
| 
 |  | ||||||
|     +attribute("lefts") |  | ||||||
|       p Tokens that are: |  | ||||||
|       ol |  | ||||||
|         li To the left of the span; |  | ||||||
|         li Syntactic children of words within the span |  | ||||||
| 
 |  | ||||||
|       p i.e. |  | ||||||
| 
 |  | ||||||
|       pre.language-python |  | ||||||
|         code |  | ||||||
|           | lefts = [span.doc[i] for i in range(0, span.start) |  | ||||||
|           |          if span.doc[i].head in span] |  | ||||||
| 
 |  | ||||||
|     +attribute("rights") |  | ||||||
|       p Tokens that are: |  | ||||||
|         ol  |  | ||||||
|           li To the right of the span; |  | ||||||
|           li Syntactic children of words within the span |  | ||||||
|       p i.e. |  | ||||||
|       pre.language-python |  | ||||||
|         code |  | ||||||
|           | rights = [span.doc[i] for i in range(span.end, len(span.doc)) |  | ||||||
|           |           if span.doc[i].head in span] |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     +attribute("subtree") |  | ||||||
|       p String |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 String Views |  | ||||||
| 
 |  | ||||||
|     +attribute("string") |  | ||||||
|       p String |  | ||||||
| 
 |  | ||||||
|     +attribute("lemma / lemma_") |  | ||||||
|       p String |  | ||||||
| 
 |  | ||||||
|     +attribute("label / label_") |  | ||||||
|       p String |  | ||||||
| 
 |  | ||||||
| +declare_class("Lexeme") |  | ||||||
|   p |  | ||||||
|     | The Lexeme object represents a lexical type, stored in the vocabulary |  | ||||||
|     | – as opposed to a token, occurring in a document. |  | ||||||
|   p |  | ||||||
|     | Lexemes store various features, so that these features can be computed |  | ||||||
|     | once per type, rather than once per token. As job sizes grow, this |  | ||||||
|     | can amount to a substantial efficiency improvement. |  | ||||||
| 
 |  | ||||||
|   p |  | ||||||
|     | All Lexeme attributes are therefore context independent, as a single |  | ||||||
|     | lexeme is reused for all usages of that word. Lexemes are keyed by |  | ||||||
|     | the “orth” attribute. |  | ||||||
| 
 |  | ||||||
|   p |  | ||||||
|     All Lexeme attributes are accessible directly on the Token object. |  | ||||||
| 
 |  | ||||||
|   +init |  | ||||||
|     +method("__init__") |  | ||||||
|       p Init |  | ||||||
| 
 |  | ||||||
|     details |  | ||||||
|       summary: h4 String Features |  | ||||||
| 
 |  | ||||||
|         +attribute("orth / orth_") |  | ||||||
|           p |  | ||||||
|             | The form of the word with no string normalization or processing, |  | ||||||
|             | as it appears in the string, without trailing whitespace. |  | ||||||
|        |  | ||||||
|         +attribute("lower / lower_") |  | ||||||
|           p Tmp |  | ||||||
|        |  | ||||||
|         +attribute("norm / norm_") |  | ||||||
|           p Tmp |  | ||||||
|        |  | ||||||
|         +attribute("shape / shape_") |  | ||||||
|           p Tmp |  | ||||||
|        |  | ||||||
|         +attribute("prefix / prefix_") |  | ||||||
|           p Tmp |  | ||||||
|        |  | ||||||
|         +attribute("suffix / suffix_") |  | ||||||
|           p TMP |  | ||||||
| 
 |  | ||||||
| +declare_class("Vocab", "data_dir=None, lex_props_getter=None") |  | ||||||
|   +sequence |  | ||||||
|     +method("__len__") |  | ||||||
|       +returns |  | ||||||
|         p Number of words in the vocabulary. |  | ||||||
| 
 |  | ||||||
|     +method("__iter__") |  | ||||||
|       +returns |  | ||||||
|         p Lexeme |  | ||||||
| 
 |  | ||||||
|   +maptype |  | ||||||
|     +method("__getitem__", "key_int") |  | ||||||
|       +params |  | ||||||
|         +param("key") |  | ||||||
|           p Integer ID |  | ||||||
| 
 |  | ||||||
|       +returns: p A Lexeme object |  | ||||||
| 
 |  | ||||||
|     +method("__getitem__", "key_str") |  | ||||||
|       +params |  | ||||||
|         +param("key_str", types.unicode) |  | ||||||
|           p A string in the vocabulary |  | ||||||
| 
 |  | ||||||
|       +returns("Lexeme") |  | ||||||
| 
 |  | ||||||
|     +method("__setitem__", "orth_str", "props") |  | ||||||
|       +params |  | ||||||
|         +param("orth_str", types.unicode) |  | ||||||
|           p The orth key |  | ||||||
| 
 |  | ||||||
|         +param("props", types.dict) |  | ||||||
|           p A props dictionary |  | ||||||
| 
 |  | ||||||
|       +returns("None") |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Import/Export |  | ||||||
| 
 |  | ||||||
|     +method("dump", "loc") |  | ||||||
|       +params |  | ||||||
|         +param("loc", types.unicode) |  | ||||||
|           p Path where the vocabulary should be saved |  | ||||||
| 
 |  | ||||||
|     +method("load_lexemes", "loc") |  | ||||||
|     +params |  | ||||||
|       +param("loc", types.unicode) |  | ||||||
|         p Path to load the lexemes.bin file from |  | ||||||
| 
 |  | ||||||
|     +method("load_vectors", "loc") |  | ||||||
|       +params |  | ||||||
|         +param("loc", types.unicode) |  | ||||||
|           p Path to load the vectors.bin from |  | ||||||
| 
 |  | ||||||
| +declare_class("StringStore") |  | ||||||
|   +init |  | ||||||
|     Tmp |  | ||||||
| 
 |  | ||||||
|   +sequence |  | ||||||
|     +method("__len__") |  | ||||||
|       +returns("int") |  | ||||||
|         p Number of strings in the string-store |  | ||||||
| 
 |  | ||||||
|     +method("__iter__") |  | ||||||
|       +returns |  | ||||||
|         p Lexeme |  | ||||||
| 
 |  | ||||||
|   +maptype |  | ||||||
|     +method("__getitem__", "key_int") |  | ||||||
|       +params |  | ||||||
|         +param("key_int") |  | ||||||
|           p An integer key |  | ||||||
| 
 |  | ||||||
|       +returns(types.unicode) |  | ||||||
|         p The string that the integer key maps to |  | ||||||
| 
 |  | ||||||
|     +method("__getitem__", "key_unicode") |  | ||||||
|       +params |  | ||||||
|         +param("key_unicode") |  | ||||||
|           p A key, as a unicode string |  | ||||||
| 
 |  | ||||||
|       +returns(types.int) |  | ||||||
|         p The integer ID of the string. |  | ||||||
| 
 |  | ||||||
|     +method("__getitem__", "key_utf8_bytes") |  | ||||||
|       +params |  | ||||||
|         +param("key_utf8_bytes", types.bytes) |  | ||||||
|           p p A key, as a UTF-8 encoded byte-string |  | ||||||
| 
 |  | ||||||
|       +returns(types.int) |  | ||||||
|         p The integer ID of the string. |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Import/Export |  | ||||||
| 
 |  | ||||||
|     +method("dump", "loc") |  | ||||||
|       +params |  | ||||||
|         +param("loc") |  | ||||||
|           p File path to save the strings.txt to. |  | ||||||
| 
 |  | ||||||
|     +method("load") |  | ||||||
|       +params |  | ||||||
|         +param("loc") |  | ||||||
|           p File path to load the strings.txt from. |  | ||||||
|  | @ -1,95 +0,0 @@ | ||||||
| mixin Teaser(title, url, date_long, date_short, author, lede) |  | ||||||
|   article.post |  | ||||||
|     header |  | ||||||
|       h2 |  | ||||||
|         a(href=url)= title |  | ||||||
|       .subhead |  | ||||||
|         | by  |  | ||||||
|         a(href='#', rel='author')= author |  | ||||||
|         |  on  |  | ||||||
|         time(datetime=date_short)= date_long |  | ||||||
|     p!= lede |  | ||||||
|         |  | ||||||
|       a.readmore(href='#') ► |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| doctype html |  | ||||||
| html(lang='en') |  | ||||||
|   head |  | ||||||
|     meta(charset='utf-8') |  | ||||||
|     title spaCy Blog |  | ||||||
|     meta(name='description', content='') |  | ||||||
|     meta(name='author', content='Matthew Honnibal') |  | ||||||
|     link(rel='stylesheet', href='css/style.css') |  | ||||||
|     //if lt IE 9 |  | ||||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') |  | ||||||
|   body#blog |  | ||||||
|     header(role='banner') |  | ||||||
|       h1.logo spaCy Blog |  | ||||||
|       .slogan Blog |  | ||||||
| 
 |  | ||||||
|     nav(role="navigation") |  | ||||||
|       ul |  | ||||||
|         li: a(href="home.html")        Home |  | ||||||
|         li: a(href="docs.html")        Docs |  | ||||||
|         li.active: a(href="blog.html") Blog |  | ||||||
|         li: a(href="license.html")     License |  | ||||||
| 
 |  | ||||||
|     main#content(role='main') |  | ||||||
|       section.intro.profile |  | ||||||
|         p |  | ||||||
|           img(src='img/matt.png') |  | ||||||
|           | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. |  | ||||||
|           span.social |  | ||||||
|             a(href='#') Follow me on Twitter |  | ||||||
|         nav(role='navigation') |  | ||||||
|           ul |  | ||||||
|             li |  | ||||||
|               a.button(href='#') Blog |  | ||||||
|             li |  | ||||||
|               a.button(href='#tutorials') Tutorials |  | ||||||
|       section.blogs |  | ||||||
|         +Teaser( |  | ||||||
|           "Introducing spaCy", |  | ||||||
|           "blog_intro.html", |  | ||||||
|           "February 2015", |  | ||||||
|           "2015-02-18", |  | ||||||
|           "Matthew Honnibal", |  | ||||||
|           "<strong>spaCy</strong> is a new library for text processing in Python " + |  | ||||||
|           "and Cython. I wrote it because I think small companies are terrible at " + |  | ||||||
|           "natural language processing (NLP).  Or rather: small companies are using " + |  | ||||||
|           "terrible NLP technology." |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
|         +Teaser( |  | ||||||
|           "Parsing English with 500 lines of Python", |  | ||||||
|           "blog_parser.html", |  | ||||||
|           "December 18, 2013", |  | ||||||
|           "2013-12-18", |  | ||||||
|           "Matthew Hannibal", |  | ||||||
|           "The Natural Language Processing (NLP) community has made big progress" + |  | ||||||
|           "in syntactic parsing over the last few years. It’s now possible for a" + |  | ||||||
|           "tiny Python implementation to perform better than the widely-used Stanford " + |  | ||||||
|           "PCFG parser.") |  | ||||||
|         +Teaser( |  | ||||||
|           "A good Part-of-Speech tagger in about 200 lines of Python", |  | ||||||
|           "blog_tagger.html", |  | ||||||
|           "October 11, 2013", |  | ||||||
|           "2013-09-11", |  | ||||||
|           "Matthew Honnibal", |  | ||||||
|           "There are a tonne of “best known techniques” for POS tagging, and you " + |  | ||||||
|           "should ignore the others and just use greedy Averaged Perceptron." |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
|       section.intro |  | ||||||
|         h2 |  | ||||||
|           a.permalink(href='#tutorials', name='tutorials') Tutorials |  | ||||||
| 
 |  | ||||||
|       section.tutorials |  | ||||||
|         include ./tutorials.jade |  | ||||||
| 
 |  | ||||||
|     footer(role="contentinfo") |  | ||||||
|       span.slogan.copyright © 2015 Syllogism Co. |  | ||||||
| 
 |  | ||||||
|     script(src='js/prism.js') |  | ||||||
|  | @ -1,81 +0,0 @@ | ||||||
| extends ./template_post.jade |  | ||||||
| 
 |  | ||||||
| - |  | ||||||
|   var urls = { |  | ||||||
|     'pos_post': 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/', |  | ||||||
|     'google_ngrams': "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html", |  | ||||||
|     'implementation': 'https://gist.github.com/syllog1sm/10343947', |  | ||||||
|     'redshift': 'http://github.com/syllog1sm/redshift', |  | ||||||
|     'tasker': 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm', |  | ||||||
|     'acl_anthology': 'http://aclweb.org/anthology/', |  | ||||||
|     'share_twitter': 'http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal' |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
| - var my_research_software = '<a href="https://github.com/syllog1sm/redshift/tree/develop">my research software</a>' |  | ||||||
| 
 |  | ||||||
| - var how_to_write_a_POS_tagger = '<a href="https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/">how to write a part-of-speech tagger</a>' |  | ||||||
| 
 |  | ||||||
| - var parser_lnk = '<a href="https://honnibal.wordpress.com/2013/12/18/a-simple-fast-algorithm-for-natural-language-dependency-parsing/">parser</a>' |  | ||||||
| 
 |  | ||||||
| - var buy_a_commercial_license = '<a href="license.html">buy a commercial license</a>' |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| block body_block |  | ||||||
|   article.post |  | ||||||
|     p. |  | ||||||
|       <strong>spaCy</strong> is a new library for text processing in Python |  | ||||||
|       and Cython. I wrote it because I think small companies are terrible at |  | ||||||
|       natural language processing (NLP).  Or rather: small companies are using |  | ||||||
|       terrible NLP technology. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       To do great NLP, you have to know a little about linguistics, a lot |  | ||||||
|       about machine learning, and almost everything about the latest research. |  | ||||||
|       The people who fit this description seldom join small companies. |  | ||||||
|       Most are broke – they've just finished grad school. |  | ||||||
|       If they don't want to stay in academia, they join Google, IBM, etc. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       The net result is that outside of the tech giants, commercial NLP has |  | ||||||
|       changed little in the last ten years.  In academia, it's changed entirely. |  | ||||||
|       Amazing improvements in quality.  Orders of magnitude faster.  But the |  | ||||||
|       academic code is always GPL, undocumented, unuseable, or all three.  |  | ||||||
|       You could implement the ideas yourself, but the papers are hard to read, |  | ||||||
|       and training data is exorbitantly expensive.  So what are you left with? |  | ||||||
|       A common answer is NLTK, which was written primarily as an educational resource. |  | ||||||
|       Nothing past the tokenizer is suitable for production use. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       I used to think that the NLP community just needed to do more to communicate |  | ||||||
|       its findings to software engineers.  So I wrote two blog posts, explaining |  | ||||||
|       !{how_to_write_a_POS_tagger} and !{parser_lnk}.  Both were well |  | ||||||
|       received, and there's been a bit of interest in !{my_research_software} |  | ||||||
|       – even though it's entirely undocumented, and mostly unuseable to |  | ||||||
|       anyone but me. |  | ||||||
|     p. |  | ||||||
|       So six months ago I quit my post-doc, and I've been working day and night |  | ||||||
|       on spaCy since.  I'm now pleased to announce an alpha release. |  | ||||||
|    |  | ||||||
|     p. |  | ||||||
|       If you're a small company doing NLP, I think spaCy will seem like a minor |  | ||||||
|       miracle.  It's by far the fastest NLP software ever released.  The |  | ||||||
|       full processing pipeline completes in 20ms per document, including accurate |  | ||||||
|       tagging and parsing.  All strings are mapped to integer IDs, tokens are |  | ||||||
|       linked to embedded word representations, and a range of useful features |  | ||||||
|       are pre-calculated and cached. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       If none of that made any sense to you, here's the gist of it.  Computers |  | ||||||
|       don't understand text.  This is unfortunate, because that's what the |  | ||||||
|       web almost entirely consists of.  We want to recommend people text based |  | ||||||
|       on other text they liked.  We want to shorten text to display it on a |  | ||||||
|       mobile screen.  We want to aggregate it, link it, filter it, categorise |  | ||||||
|       it, generate it and correct it. |  | ||||||
| 
 |  | ||||||
|     p.  |  | ||||||
|       spaCy provides a library of utility functions that help programmers |  | ||||||
|       build such products.  It's commercial open source software: you can |  | ||||||
|       either use it under the AGPL, or you can !{buy_a_commercial_license} |  | ||||||
|       under generous terms. |  | ||||||
| 
 |  | ||||||
|   footer(role='contentinfo') |  | ||||||
|  | @ -1,938 +0,0 @@ | ||||||
| extends ./template_post.jade |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| block body_block |  | ||||||
|   - var urls = {} |  | ||||||
|   //- urls.pos_post = 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/' |  | ||||||
|   - urls.parser_post = "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html" |  | ||||||
|   - urls.implementation = 'https://gist.github.com/syllog1sm/10343947' |  | ||||||
|   - urls.redshift = 'http://github.com/syllog1sm/redshift' |  | ||||||
|   - urls.tasker = 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm' |  | ||||||
|   - urls.acl_anthology = 'http://aclweb.org/anthology/' |  | ||||||
|   - urls.share_twitter = "http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" |  | ||||||
| 
 |  | ||||||
|   // A comment |  | ||||||
| 
 |  | ||||||
|   article.post |  | ||||||
|     header |  | ||||||
|       h2 Parsing English in 500 lines of Python |  | ||||||
|       .subhead |  | ||||||
|         | by  |  | ||||||
|         a(href='#', rel='author') Matthew Honnibal |  | ||||||
|         |  on  |  | ||||||
|         time(datetime='2013-12-18') December 18, 2013 |  | ||||||
|     p |  | ||||||
|       | A   |  | ||||||
|       a(href=urls.parser_post) syntactic parser  |  | ||||||
|       | describes a sentence’s grammatical structure, to help another |  | ||||||
|       | application reason about it. Natural languages introduce many unexpected |  | ||||||
|       | ambiguities, which our world-knowledge immediately filters out. A |  | ||||||
|       | favourite example: |  | ||||||
| 
 |  | ||||||
|     p.example They ate the pizza with anchovies |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       img(src='img/blog01.png', alt='Eat-with pizza-with ambiguity') |  | ||||||
|     p |  | ||||||
|       | A correct parse links “with” to “pizza”, while an incorrect parse |  | ||||||
|       | links “with” to “eat”: |  | ||||||
| 
 |  | ||||||
|     .displacy |  | ||||||
|       iframe(src='displacy/anchovies_bad.html', height='275') |  | ||||||
| 
 |  | ||||||
|     .displacy |  | ||||||
|       iframe.displacy(src='displacy/anchovies_good.html', height='275') |  | ||||||
|       a.view-displacy(href='#') View on displaCy |  | ||||||
|       p.caption |  | ||||||
|         | The Natural Language Processing (NLP) community has made big progress |  | ||||||
|         | in syntactic parsing over the last few years. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The Natural Language Processing (NLP) community has made big progress |  | ||||||
|       | in syntactic parsing over the last few years. It’s now possible for |  | ||||||
|       | a tiny Python implementation to perform better than the widely-used |  | ||||||
|       | Stanford PCFG parser. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       strong Update! |  | ||||||
|       |  The Stanford CoreNLP library now includes a greedy transition-based |  | ||||||
|       | dependency parser, similar to the one described in this post, but with |  | ||||||
|       | an improved learning strategy. It is much faster and more accurate |  | ||||||
|       | than this simple Python implementation. |  | ||||||
| 
 |  | ||||||
|     table |  | ||||||
|       thead |  | ||||||
|         tr |  | ||||||
|           th Parser |  | ||||||
|           th Accuracy |  | ||||||
|           th Speed (w/s) |  | ||||||
|           th Language |  | ||||||
|           th LOC |  | ||||||
|       tbody |  | ||||||
|         tr |  | ||||||
|           td Stanford |  | ||||||
|           td 89.6% |  | ||||||
|           td 19 |  | ||||||
|           td Java |  | ||||||
|           td |  | ||||||
|             | > 4,000 |  | ||||||
|             sup |  | ||||||
|               a(href='#note-1') [1] |  | ||||||
|         tr |  | ||||||
|           td |  | ||||||
|             strong parser.py |  | ||||||
|           td 89.8% |  | ||||||
|           td 2,020 |  | ||||||
|           td Python |  | ||||||
|           td |  | ||||||
|             strong ~500 |  | ||||||
|         tr |  | ||||||
|           td Redshift |  | ||||||
|           td |  | ||||||
|             strong 93.6% |  | ||||||
|           td |  | ||||||
|             strong 2,580 |  | ||||||
|           td Cython |  | ||||||
|           td ~4,000 |  | ||||||
|     p |  | ||||||
|       | The rest of the post sets up the problem, and then takes you through  |  | ||||||
|       a(href=urls.implementation) a concise implementation |  | ||||||
|       | , prepared for this post. The first 200 lines of parser.py, the |  | ||||||
|       | part-of-speech tagger and learner, are described  |  | ||||||
|       a(href='https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/') here |  | ||||||
|       | . You should probably at least skim that |  | ||||||
|       | post before reading this one, unless you’re very familiar with NLP |  | ||||||
|       | research. |  | ||||||
|     p |  | ||||||
|       | The Cython system, Redshift, was written for my current research. I |  | ||||||
|       | plan to improve it for general use in June, after my contract ends |  | ||||||
|       | at Macquarie University. The current version is  |  | ||||||
|       a(href=urls.redshift) hosted on GitHub |  | ||||||
|       | . |  | ||||||
|     h3 Problem Description |  | ||||||
| 
 |  | ||||||
|     p It’d be nice to type an instruction like this into your phone: |  | ||||||
| 
 |  | ||||||
|     p.example |  | ||||||
|       Set volume to zero when I’m in a meeting, unless John’s school calls. |  | ||||||
|     p |  | ||||||
|       | And have it set the appropriate policy. On Android you can do this |  | ||||||
|       | sort of thing with  |  | ||||||
|       a(href=urls.tasker) Tasker |  | ||||||
|       | , but an NL interface would be much better. It’d be especially nice |  | ||||||
|       | to receive a meaning representation you could edit, so you could see |  | ||||||
|       | what it thinks you said, and correct it. |  | ||||||
|     p |  | ||||||
|       | There are lots of problems to solve to make that work, but some sort |  | ||||||
|       | of syntactic representation is definitely necessary. We need to know that: |  | ||||||
| 
 |  | ||||||
|     p.example |  | ||||||
|       Unless John’s school calls, when I’m in a meeting, set volume to zero |  | ||||||
| 
 |  | ||||||
|     p is another way of phrasing the first instruction, while: |  | ||||||
| 
 |  | ||||||
|     p.example |  | ||||||
|       Unless John’s school, call when I’m in a meeting |  | ||||||
| 
 |  | ||||||
|     p means something completely different. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | A dependency parser returns a graph of word-word relationships, |  | ||||||
|       | intended to make such reasoning easier. Our graphs will be trees – |  | ||||||
|       | edges will be directed, and every node (word) will have exactly one |  | ||||||
|       | incoming arc (one dependency, with its head), except one. |  | ||||||
| 
 |  | ||||||
|     h4 Example usage |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | >>> parser = parser.Parser() |  | ||||||
|         | >>> tokens = "Set the volume to zero when I 'm in a meeting unless John 's school calls".split() |  | ||||||
|         | >>> tags, heads = parser.parse(tokens) |  | ||||||
|         | >>> heads |  | ||||||
|         | [-1, 2, 0, 0, 3, 0, 7, 5, 7, 10, 8, 0, 13, 15, 15, 11] |  | ||||||
|         | >>> for i, h in enumerate(heads):  |  | ||||||
|         | ...   head = tokens[h] if h >= 0 else 'None' |  | ||||||
|         | ...   print(tokens[i] + ' <-- ' + head) |  | ||||||
|         | Set <-- None |  | ||||||
|         | the <-- volume |  | ||||||
|         | volume <-- Set |  | ||||||
|         | to <-- Set |  | ||||||
|         | zero <-- to |  | ||||||
|         | when <-- Set |  | ||||||
|         | I <-- 'm |  | ||||||
|         | 'm <-- when |  | ||||||
|         | in <-- 'm |  | ||||||
|         | a <-- meeting |  | ||||||
|         | meeting <-- in |  | ||||||
|         | unless <-- Set |  | ||||||
|         | John <-- 's |  | ||||||
|         | 's   <-- calls |  | ||||||
|         | school <-- calls |  | ||||||
|         | calls <-- unless |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       The idea is that it should be slightly easier to reason from the parse, |  | ||||||
|       than it was from the string. The parse-to-meaning mapping is hopefully |  | ||||||
|       simpler than the string-to-meaning mapping. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       The most confusing thing about this problem area is that “correctness” |  | ||||||
|       is defined by convention — by annotation guidelines. If you haven’t |  | ||||||
|       read the guidelines and you’re not a linguist, you can’t tell whether |  | ||||||
|       the parse is “wrong” or “right”, which makes the whole task feel weird |  | ||||||
|       and artificial. |  | ||||||
|      |  | ||||||
|     p. |  | ||||||
|       For instance, there’s a mistake in the parse above: “John’s school |  | ||||||
|       calls” is structured wrongly, according to the Stanford annotation |  | ||||||
|       guidelines. The structure of that part of the sentence is how the |  | ||||||
|       annotators were instructed to parse an example like “John’s school |  | ||||||
|       clothes”. |  | ||||||
|      |  | ||||||
|     p |  | ||||||
|       | It’s worth dwelling on this point a bit. We could, in theory, have |  | ||||||
|       | written our guidelines so that the “correct” parses were reversed. |  | ||||||
|       | There’s good reason to believe the parsing task will be harder if we |  | ||||||
|       | reversed our convention, as it’d be less consistent with the rest of |  | ||||||
|       | the grammar.  |  | ||||||
|       sup: a(href='#note-2') [2] |  | ||||||
|       | But we could test that empirically, and we’d be pleased to gain an |  | ||||||
|       | advantage by reversing the policy. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | We definitely do want that distinction in the guidelines — we don’t |  | ||||||
|       | want both to receive the same structure, or our output will be less |  | ||||||
|       | useful. The annotation guidelines strike a balance between what |  | ||||||
|       | distinctions downstream applications will find useful, and what |  | ||||||
|       | parsers will be able to predict easily. |  | ||||||
| 
 |  | ||||||
|     h4 Projective trees |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | There’s a particularly useful simplification that we can make, when |  | ||||||
|       | deciding what we want the graph to look like: we can restrict the |  | ||||||
|       | graph structures we’ll be dealing with. This doesn’t just give us a |  | ||||||
|       | likely advantage in learnability; it can have deep algorithmic |  | ||||||
|       | implications. We follow most work on English in constraining the |  | ||||||
|       | dependency graphs to be  |  | ||||||
|       em projective trees |  | ||||||
|       | : |  | ||||||
| 
 |  | ||||||
|     ol |  | ||||||
|       li Tree. Every word has exactly one head, except for the dummy ROOT symbol. |  | ||||||
|       li |  | ||||||
|         | Projective. For every pair of dependencies (a1, a2) and (b1, b2), |  | ||||||
|         | if a1 < b1 < a2, then b2 <= a2. In other words, dependencies cannot “cross”. |  | ||||||
|         | You can’t have a pair of dependencies that goes a1 b1 a2 b2, or |  | ||||||
|         | b1 a1 b2 a2. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | There’s a rich literature on parsing non-projective trees, and a |  | ||||||
|       | smaller literature on parsing DAGs. But the parsing algorithm I’ll |  | ||||||
|       | be explaining deals with projective trees. |  | ||||||
| 
 |  | ||||||
|     h3 Greedy transition-based parsing |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | Our parser takes as input a list of string tokens, and outputs a |  | ||||||
|       | list of head indices, representing edges in the graph. If the  |  | ||||||
| 
 |  | ||||||
|       em i |  | ||||||
| 
 |  | ||||||
|       | th member of heads is  |  | ||||||
| 
 |  | ||||||
|       em j |  | ||||||
| 
 |  | ||||||
|       | , the dependency parse contains an edge (j, i). A transition-based |  | ||||||
|       | parser is a finite-state transducer; it maps an array of N words |  | ||||||
|       | onto an output array of N head indices: |  | ||||||
| 
 |  | ||||||
|     table.center |  | ||||||
|       tbody |  | ||||||
|         tr |  | ||||||
|           td |  | ||||||
|             em start |  | ||||||
|           td MSNBC |  | ||||||
|           td reported |  | ||||||
|           td that |  | ||||||
|           td Facebook |  | ||||||
|           td bought |  | ||||||
|           td WhatsApp |  | ||||||
|           td for |  | ||||||
|           td $16bn |  | ||||||
|           td |  | ||||||
|             em root |  | ||||||
|         tr |  | ||||||
|           td 0 |  | ||||||
|           td 2 |  | ||||||
|           td 9 |  | ||||||
|           td 2 |  | ||||||
|           td 4 |  | ||||||
|           td 2 |  | ||||||
|           td 4 |  | ||||||
|           td 4 |  | ||||||
|           td 7 |  | ||||||
|           td 0 |  | ||||||
|     p |  | ||||||
|       | The heads array denotes that the head of  |  | ||||||
|       em MSNBC |  | ||||||
|       |  is  |  | ||||||
|       em reported |  | ||||||
|       | :  |  | ||||||
|       em MSNBC |  | ||||||
|       |  is word 1, and  |  | ||||||
|       em reported |  | ||||||
|       |  is word 2, and  |  | ||||||
|       code.language-python heads[1] == 2 |  | ||||||
|       | . You can already see why parsing a tree is handy — this data structure |  | ||||||
|       | wouldn’t work if we had to output a DAG, where words may have multiple |  | ||||||
|       | heads. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | Although  |  | ||||||
|       code.language-python heads |  | ||||||
|       | can be represented as an array, we’d actually like to maintain some |  | ||||||
|       | alternate ways to access the parse, to make it easy and efficient to |  | ||||||
|       | extract features. Our  |  | ||||||
| 
 |  | ||||||
|       code.language-python Parse |  | ||||||
|       | class looks like this: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | class Parse(object): |  | ||||||
|         |     def __init__(self, n): |  | ||||||
|         |         self.n = n |  | ||||||
|         |         self.heads = [None] * (n-1) |  | ||||||
|         |         self.lefts = [] |  | ||||||
|         |         self.rights = [] |  | ||||||
|         |         for i in range(n+1): |  | ||||||
|         |             self.lefts.append(DefaultList(0)) |  | ||||||
|         |             self.rights.append(DefaultList(0)) |  | ||||||
|         |      |  | ||||||
|         |     def add_arc(self, head, child): |  | ||||||
|         |         self.heads[child] = head |  | ||||||
|         |         if child < head: |  | ||||||
|         |             self.lefts[head].append(child) |  | ||||||
|         |         else: |  | ||||||
|         |             self.rights[head].append(child) |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | As well as the parse, we also have to keep track of where we’re up |  | ||||||
|       | to in the sentence. We’ll do this with an index into the  |  | ||||||
|       code.language-python words |  | ||||||
|       |  array, and a stack, to which we’ll push words, before popping them |  | ||||||
|       | once their head is set. So our state data structure is fundamentally: |  | ||||||
| 
 |  | ||||||
|     ul |  | ||||||
|       li An index, i, into the list of tokens; |  | ||||||
|       li The dependencies added so far, in Parse |  | ||||||
|       li |  | ||||||
|         | A stack, containing words that occurred before i, for which we’re |  | ||||||
|         | yet to assign a head. |  | ||||||
| 
 |  | ||||||
|     p Each step of the parsing process applies one of three actions to the state: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | SHIFT = 0; RIGHT = 1; LEFT = 2 |  | ||||||
|         | MOVES = [SHIFT, RIGHT, LEFT] |  | ||||||
|         |  |  | ||||||
|         | def transition(move, i, stack, parse): |  | ||||||
|         |     global SHIFT, RIGHT, LEFT |  | ||||||
|         |     if move == SHIFT: |  | ||||||
|         |         stack.append(i) |  | ||||||
|         |         return i + 1 |  | ||||||
|         |     elif move == RIGHT: |  | ||||||
|         |         parse.add_arc(stack[-2], stack.pop()) |  | ||||||
|         |         return i |  | ||||||
|         |     elif move == LEFT: |  | ||||||
|         |         parse.add_arc(i, stack.pop()) |  | ||||||
|         |         return i |  | ||||||
|         |     raise GrammarError("Unknown move: %d" % move) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The  |  | ||||||
|       code.language-python LEFT |  | ||||||
|       |  and  |  | ||||||
|       code.language-python RIGHT |  | ||||||
|       |  actions add dependencies and pop the stack, while  |  | ||||||
|       code.language-python SHIFT |  | ||||||
|       |  pushes the stack and advances i into the buffer. |  | ||||||
|     p. |  | ||||||
|       So, the parser starts with an empty stack, and a buffer index at 0, with |  | ||||||
|       no dependencies recorded. It chooses one of the (valid) actions, and |  | ||||||
|       applies it to the state. It continues choosing actions and applying |  | ||||||
|       them until the stack is empty and the buffer index is at the end of |  | ||||||
|       the input. (It’s hard to understand this sort of algorithm without |  | ||||||
|       stepping through it. Try coming up with a sentence, drawing a projective |  | ||||||
|       parse tree over it, and then try to reach the parse tree by choosing |  | ||||||
|       the right sequence of transitions.) |  | ||||||
| 
 |  | ||||||
|     p Here’s what the parsing loop looks like in code: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | class Parser(object): |  | ||||||
|         |     ... |  | ||||||
|         |     def parse(self, words): |  | ||||||
|         |         tags = self.tagger(words) |  | ||||||
|         |         n = len(words) |  | ||||||
|         |         idx = 1 |  | ||||||
|         |         stack = [0] |  | ||||||
|         |         deps = Parse(n) |  | ||||||
|         |         while stack or idx < n: |  | ||||||
|         |             features = extract_features(words, tags, idx, n, stack, deps) |  | ||||||
|         |             scores = self.model.score(features) |  | ||||||
|         |             valid_moves = get_valid_moves(idx, n, len(stack)) |  | ||||||
|         |             next_move = max(valid_moves, key=lambda move: scores[move]) |  | ||||||
|         |             idx = transition(next_move, idx, stack, deps) |  | ||||||
|         |         return tags, deps |  | ||||||
|         |  |  | ||||||
|         | def get_valid_moves(i, n, stack_depth): |  | ||||||
|         |     moves = [] |  | ||||||
|         |     if i < n: |  | ||||||
|         |         moves.append(SHIFT) |  | ||||||
|         |     if stack_depth >= 2: |  | ||||||
|         |         moves.append(RIGHT) |  | ||||||
|         |     if stack_depth >= 1: |  | ||||||
|         |         moves.append(LEFT) |  | ||||||
|         |     return moves |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       We start by tagging the sentence, and initializing the state. We then |  | ||||||
|       map the state to a set of features, which we score using a linear model. |  | ||||||
|       We then find the best-scoring valid move, and apply it to the state. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The model scoring works the same as it did in  |  | ||||||
|       a(href='https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/') the POS tagger. |  | ||||||
|       | If you’re confused about the idea of extracting features and scoring |  | ||||||
|       | them with a linear model, you should review that post. Here’s a reminder |  | ||||||
|       | of how the model scoring works: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | class Perceptron(object): |  | ||||||
|         |     ... |  | ||||||
|         |     def score(self, features): |  | ||||||
|         |         all_weights = self.weights |  | ||||||
|         |         scores = dict((clas, 0) for clas in self.classes) |  | ||||||
|         |         for feat, value in features.items(): |  | ||||||
|         |             if value == 0: |  | ||||||
|         |                 continue |  | ||||||
|         |             if feat not in all_weights: |  | ||||||
|         |                 continue |  | ||||||
|         |             weights = all_weights[feat] |  | ||||||
|         |             for clas, weight in weights.items(): |  | ||||||
|         |                 scores[clas] += value * weight |  | ||||||
|         |         return scores |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       It’s just summing the class-weights for each feature. This is often |  | ||||||
|       expressed as a dot-product, but when you’re dealing with multiple |  | ||||||
|       classes, that gets awkward, I find. |  | ||||||
|      |  | ||||||
|     p. |  | ||||||
|       The beam parser (RedShift) tracks multiple candidates, and only decides |  | ||||||
|       on the best one at the very end. We’re going to trade away accuracy |  | ||||||
|       in favour of efficiency and simplicity. We’ll only follow a single |  | ||||||
|       analysis. Our search strategy will be entirely greedy, as it was with |  | ||||||
|       the POS tagger. We’ll lock-in our choices at every step. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       If you read the POS tagger post carefully, you might see the underlying |  | ||||||
|       similarity. What we’ve done is mapped the parsing problem onto a |  | ||||||
|       sequence-labelling problem, which we address using a “flat”, or unstructured, |  | ||||||
|       learning algorithm (by doing greedy search). |  | ||||||
| 
 |  | ||||||
|     h3 Features |  | ||||||
|     p. |  | ||||||
|       Feature extraction code is always pretty ugly. The features for the parser |  | ||||||
|       refer to a few tokens from the context: |  | ||||||
| 
 |  | ||||||
|     ul |  | ||||||
|       li The first three words of the buffer (n0, n1, n2) |  | ||||||
|       li The top three words of the stack (s0, s1, s2) |  | ||||||
|       li The two leftmost children of s0 (s0b1, s0b2); |  | ||||||
|       li The two rightmost children of s0 (s0f1, s0f2); |  | ||||||
|       li The two leftmost children of n0 (n0b1, n0b2) |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       For these 12 tokens, we refer to the word-form, the part-of-speech tag, |  | ||||||
|       and the number of left and right children attached to the token. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       Because we’re using a linear model, we have our features refer to pairs |  | ||||||
|       and triples of these atomic properties. |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def extract_features(words, tags, n0, n, stack, parse): |  | ||||||
|         |     def get_stack_context(depth, stack, data): |  | ||||||
|         |         if depth >= 3: |  | ||||||
|         |             return data[stack[-1]], data[stack[-2]], data[stack[-3]] |  | ||||||
|         |         elif depth >= 2: |  | ||||||
|         |             return data[stack[-1]], data[stack[-2]], '' |  | ||||||
|         |         elif depth == 1: |  | ||||||
|         |             return data[stack[-1]], '', '' |  | ||||||
|         |         else: |  | ||||||
|         |             return '', '', '' |  | ||||||
|         |  |  | ||||||
|         |     def get_buffer_context(i, n, data): |  | ||||||
|         |         if i + 1 >= n: |  | ||||||
|         |             return data[i], '', '' |  | ||||||
|         |         elif i + 2 >= n: |  | ||||||
|         |             return data[i], data[i + 1], '' |  | ||||||
|         |         else: |  | ||||||
|         |             return data[i], data[i + 1], data[i + 2] |  | ||||||
|         |  |  | ||||||
|         |     def get_parse_context(word, deps, data): |  | ||||||
|         |         if word == -1: |  | ||||||
|         |             return 0, '', '' |  | ||||||
|         |         deps = deps[word] |  | ||||||
|         |         valency = len(deps) |  | ||||||
|         |         if not valency: |  | ||||||
|         |             return 0, '', '' |  | ||||||
|         |         elif valency == 1: |  | ||||||
|         |             return 1, data[deps[-1]], '' |  | ||||||
|         |         else: |  | ||||||
|         |             return valency, data[deps[-1]], data[deps[-2]] |  | ||||||
|         |  |  | ||||||
|         |     features = {} |  | ||||||
|         |     # Set up the context pieces --- the word, W, and tag, T, of: |  | ||||||
|         |     # S0-2: Top three words on the stack |  | ||||||
|         |     # N0-2: First three words of the buffer |  | ||||||
|         |     # n0b1, n0b2: Two leftmost children of the first word of the buffer |  | ||||||
|         |     # s0b1, s0b2: Two leftmost children of the top word of the stack |  | ||||||
|         |     # s0f1, s0f2: Two rightmost children of the top word of the stack |  | ||||||
|         |  |  | ||||||
|         |     depth = len(stack) |  | ||||||
|         |     s0 = stack[-1] if depth else -1 |  | ||||||
|         |  |  | ||||||
|         |     Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words) |  | ||||||
|         |     Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags) |  | ||||||
|         |  |  | ||||||
|         |     Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words) |  | ||||||
|         |     Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags) |  | ||||||
|         |  |  | ||||||
|         |     Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words) |  | ||||||
|         |     Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags) |  | ||||||
|         |  |  | ||||||
|         |     Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words) |  | ||||||
|         |     _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags) |  | ||||||
|         |  |  | ||||||
|         |     Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words) |  | ||||||
|         |     _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags) |  | ||||||
|         |  |  | ||||||
|         |     Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words) |  | ||||||
|         |     _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags) |  | ||||||
|         |  |  | ||||||
|         |     # Cap numeric features at 5?  |  | ||||||
|         |     # String-distance |  | ||||||
|         |     Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0 |  | ||||||
|         |  |  | ||||||
|         |     features['bias'] = 1 |  | ||||||
|         |     # Add word and tag unigrams |  | ||||||
|         |     for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2): |  | ||||||
|         |         if w: |  | ||||||
|         |             features['w=%s' % w] = 1 |  | ||||||
|         |     for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2): |  | ||||||
|         |         if t: |  | ||||||
|         |             features['t=%s' % t] = 1 |  | ||||||
|         |  |  | ||||||
|         |     # Add word/tag pairs |  | ||||||
|         |     for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))): |  | ||||||
|         |         if w or t: |  | ||||||
|         |             features['%d w=%s, t=%s' % (i, w, t)] = 1 |  | ||||||
|         |  |  | ||||||
|         |     # Add some bigrams |  | ||||||
|         |     features['s0w=%s,  n0w=%s' % (Ws0, Wn0)] = 1 |  | ||||||
|         |     features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1 |  | ||||||
|         |     features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1 |  | ||||||
|         |     features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1 |  | ||||||
|         |     features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1 |  | ||||||
|         |     features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1 |  | ||||||
|         |     features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1 |  | ||||||
|         |     features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1 |  | ||||||
|         |  |  | ||||||
|         |     # Add some tag trigrams |  | ||||||
|         |     trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),  |  | ||||||
|         |                 (Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1), |  | ||||||
|         |                 (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2), |  | ||||||
|         |                 (Ts0, Ts1, Ts1)) |  | ||||||
|         |     for i, (t1, t2, t3) in enumerate(trigrams): |  | ||||||
|         |         if t1 or t2 or t3: |  | ||||||
|         |             features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1 |  | ||||||
|         |  |  | ||||||
|         |     # Add some valency and distance features |  | ||||||
|         |     vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b)) |  | ||||||
|         |     vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b)) |  | ||||||
|         |     d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0), |  | ||||||
|         |         ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0)) |  | ||||||
|         |     for i, (w_t, v_d) in enumerate(vw + vt + d): |  | ||||||
|         |         if w_t or v_d: |  | ||||||
|         |             features['val/d-%d %s %d' % (i, w_t, v_d)] = 1 |  | ||||||
|         |     return features</code></pre> |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     h3 Training |  | ||||||
|      |  | ||||||
|     p. |  | ||||||
|       Weights are learned using the same algorithm, averaged perceptron, that |  | ||||||
|       we used for part-of-speech tagging. Its key strength is that it’s an |  | ||||||
|       online learning algorithm: examples stream in one-by-one, we make our |  | ||||||
|       prediction, check the actual answer, and adjust our beliefs (weights) |  | ||||||
|       if we were wrong. |  | ||||||
|          |  | ||||||
|     p The training loop looks like this: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|           | class Parser(object): |  | ||||||
|           |     ... |  | ||||||
|           |     def train_one(self, itn, words, gold_tags, gold_heads): |  | ||||||
|           |         n = len(words) |  | ||||||
|           |         i = 2; stack = [1]; parse = Parse(n) |  | ||||||
|           |         tags = self.tagger.tag(words) |  | ||||||
|           |         while stack or (i + 1) < n: |  | ||||||
|           |             features = extract_features(words, tags, i, n, stack, parse) |  | ||||||
|           |             scores = self.model.score(features) |  | ||||||
|           |             valid_moves = get_valid_moves(i, n, len(stack)) |  | ||||||
|           |             guess = max(valid_moves, key=lambda move: scores[move]) |  | ||||||
|           |             gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads) |  | ||||||
|           |             best = max(gold_moves, key=lambda move: scores[move]) |  | ||||||
|           |             self.model.update(best, guess, features) |  | ||||||
|           |             i = transition(guess, i, stack, parse) |  | ||||||
|           |         # Return number correct |  | ||||||
|           |         return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]]) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|        |  | ||||||
|     p  |  | ||||||
|       | The most interesting part of the training process is in  |  | ||||||
|       code.language-python get_gold_moves. |  | ||||||
|       | The performance of our parser is made possible by an advance by Goldberg |  | ||||||
|       | and Nivre (2012), who showed that we’d been doing this wrong for years. |  | ||||||
|      |  | ||||||
|     p |  | ||||||
|       | In the POS-tagging post, I cautioned that during training you need to |  | ||||||
|       | make sure you pass in the last two |  | ||||||
|       em predicted |  | ||||||
|       | tags as features for the current tag, not the last two  |  | ||||||
|       em gold |  | ||||||
|       | tags. At test time you’ll only have the predicted tags, so if you |  | ||||||
|       | base your features on the gold sequence during training, your training |  | ||||||
|       | contexts won’t resemble your test-time contexts, so you’ll learn the |  | ||||||
|       | wrong weights. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       In parsing, the problem was that we didn’t know  |  | ||||||
|       em how |  | ||||||
|       | to pass in the predicted sequence! Training worked by taking the |  | ||||||
|       | gold-standard tree, and finding a transition sequence that led to it. |  | ||||||
|       | i.e., you got back a sequence of moves, with the guarantee that if |  | ||||||
|       | you followed those moves, you’d get the gold-standard dependencies. |  | ||||||
|      |  | ||||||
|     p |  | ||||||
|       | The problem is, we didn’t know how to define the “correct” move to |  | ||||||
|       | teach a parser to make if it was in any state that  |  | ||||||
|       em wasn’t |  | ||||||
|       |  along that gold-standard sequence. Once the parser had made a mistake, |  | ||||||
|       | we didn’t know how to train from that example. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | That was a big problem, because it meant that once the parser started |  | ||||||
|       | making mistakes, it would end up in states unlike any in its training |  | ||||||
|       | data – leading to yet more mistakes. The problem was specific |  | ||||||
|       | to greedy parsers: once you use a beam, there’s a natural way to do |  | ||||||
|       | structured prediction. |  | ||||||
|     p |  | ||||||
|       | The solution seems obvious once you know it, like all the best breakthroughs. |  | ||||||
|       | What we do is define a function that asks “How many gold-standard |  | ||||||
|       | dependencies can be recovered from this state?”. If you can define |  | ||||||
|       | that function, then you can apply each move in turn, and ask, “How |  | ||||||
|       | many gold-standard dependencies can be recovered from  |  | ||||||
|       em this |  | ||||||
|       | state?”. If the action you applied allows  |  | ||||||
|       em fewer |  | ||||||
|       | gold-standard dependencies to be reached, then it is sub-optimal. |  | ||||||
| 
 |  | ||||||
|     p That’s a lot to take in. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | So we have this function  |  | ||||||
|       code Oracle(state) |  | ||||||
|       | : |  | ||||||
|       pre |  | ||||||
|         code |  | ||||||
|           | Oracle(state) = | gold_arcs ∩ reachable_arcs(state) | |  | ||||||
|     p |  | ||||||
|       | We also have a set of actions, each of which returns a new state. |  | ||||||
|       | We want to know: |  | ||||||
| 
 |  | ||||||
|     ul |  | ||||||
|       li shift_cost = Oracle(state) – Oracle(shift(state)) |  | ||||||
|       li right_cost = Oracle(state) – Oracle(right(state)) |  | ||||||
|       li left_cost = Oracle(state) – Oracle(left(state)) |  | ||||||
|      |  | ||||||
|     p |  | ||||||
|       | Now, at least one of those costs  |  | ||||||
|       em has |  | ||||||
|       | to be zero. Oracle(state) is asking, “what’s the cost of the best |  | ||||||
|       | path forward?”, and the first action of that best path has to be |  | ||||||
|       | shift, right, or left. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | It turns out that we can derive Oracle fairly simply for many transition |  | ||||||
|       | systems. The derivation for the transition system we’re using, Arc |  | ||||||
|       | Hybrid, is in Goldberg and Nivre (2013). |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | We’re going to implement the oracle as a function that returns the |  | ||||||
|       | zero-cost moves, rather than implementing a function Oracle(state). |  | ||||||
|       | This prevents us from doing a bunch of costly copy operations. |  | ||||||
|       | Hopefully the reasoning in the code isn’t too hard to follow, but |  | ||||||
|       | you can also consult Goldberg and Nivre’s papers if you’re confused |  | ||||||
|       | and want to get to the bottom of this. |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def get_gold_moves(n0, n, stack, heads, gold): |  | ||||||
|         |     def deps_between(target, others, gold): |  | ||||||
|         |         for word in others: |  | ||||||
|         |             if gold[word] == target or gold[target] == word: |  | ||||||
|         |                 return True |  | ||||||
|         |         return False |  | ||||||
|         |  |  | ||||||
|         |     valid = get_valid_moves(n0, n, len(stack)) |  | ||||||
|         |     if not stack or (SHIFT in valid and gold[n0] == stack[-1]): |  | ||||||
|         |         return [SHIFT] |  | ||||||
|         |     if gold[stack[-1]] == n0: |  | ||||||
|         |         return [LEFT] |  | ||||||
|         |     costly = set([m for m in MOVES if m not in valid]) |  | ||||||
|         |     # If the word behind s0 is its gold head, Left is incorrect |  | ||||||
|         |     if len(stack) >= 2 and gold[stack[-1]] == stack[-2]: |  | ||||||
|         |         costly.add(LEFT) |  | ||||||
|         |     # If there are any dependencies between n0 and the stack, |  | ||||||
|         |     # pushing n0 will lose them. |  | ||||||
|         |     if SHIFT not in costly and deps_between(n0, stack, gold): |  | ||||||
|         |         costly.add(SHIFT) |  | ||||||
|         |     # If there are any dependencies between s0 and the buffer, popping |  | ||||||
|         |     # s0 will lose them. |  | ||||||
|         |     if deps_between(stack[-1], range(n0+1, n-1), gold): |  | ||||||
|         |         costly.add(LEFT) |  | ||||||
|         |         costly.add(RIGHT) |  | ||||||
|         |     return [m for m in MOVES if m not in costly]</code></pre> |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | Doing this “dynamic oracle” training procedure makes a big difference |  | ||||||
|       | to accuracy — typically 1-2%, with no difference to the way the run-time |  | ||||||
|       | works. The old “static oracle” greedy training procedure is fully |  | ||||||
|       | obsolete; there’s no reason to do it that way any more. |  | ||||||
| 
 |  | ||||||
|     h3 Conclusion |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | I have the sense that language technologies, particularly those relating |  | ||||||
|       | to grammar, are particularly mysterious. I can imagine having no idea |  | ||||||
|       | what the program might even do. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | I think it therefore seems natural to people that the best solutions |  | ||||||
|       | would be over-whelmingly complicated. A 200,000 line Java package |  | ||||||
|       | feels appropriate. |  | ||||||
|     p |  | ||||||
|       | But, algorithmic code is usually short, when only a single algorithm |  | ||||||
|       | is implemented. And when you only implement one algorithm, and you |  | ||||||
|       | know exactly what you want to write before you write a line, you |  | ||||||
|       | also don’t pay for any unnecessary abstractions, which can have a |  | ||||||
|       | big performance impact. |  | ||||||
| 
 |  | ||||||
|     h3 Notes |  | ||||||
|     p |  | ||||||
|       a(name='note-1') |  | ||||||
|         | [1] I wasn’t really sure how to count the lines of code in the Stanford |  | ||||||
|         | parser. Its jar file ships over 200k, but there are a lot of different |  | ||||||
|         | models in it. It’s not important, but it's certainly over 4k. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       a(name='note-2') |  | ||||||
|       | [2] For instance, how would you parse, “John’s school of music calls”? |  | ||||||
|       | You want to make sure the phrase “John’s school” has a consistent |  | ||||||
|       | structure in both “John’s school calls” and “John’s school of music |  | ||||||
|       | calls”. Reasoning about the different “slots” you can put a phrase |  | ||||||
|       | into is a key way we reason about what syntactic analyses look like. |  | ||||||
|       | You can think of each phrase as having a different shaped connector, |  | ||||||
|       | which you need to plug into different slots — which each phrase also |  | ||||||
|       | has a certain number of, each of a different shape. We’re trying to |  | ||||||
|       | figure out what connectors are where, so we can figure out how the |  | ||||||
|       | sentences are put together. |  | ||||||
| 
 |  | ||||||
|     h3 Idle speculation |  | ||||||
|     p |  | ||||||
|       | For a long time, incremental language processing algorithms were |  | ||||||
|       | primarily of scientific interest. If you want to write a parser to |  | ||||||
|       | test a theory about how the human sentence processor might work, well, |  | ||||||
|       | that parser needs to build partial interpretations. There’s a wealth |  | ||||||
|       | of evidence, including commonsense introspection, that establishes |  | ||||||
|       | that we don’t buffer input and analyse it once the speaker has finished. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | But now algorithms with that neat scientific feature are winning! |  | ||||||
|       | As best as I can tell, the secret to that success is to be: |  | ||||||
| 
 |  | ||||||
|     ul |  | ||||||
|       li Incremental. Earlier words constrain the search. |  | ||||||
|       li |  | ||||||
|         | Error-driven. Training involves a working hypothesis, which is |  | ||||||
|         | updated as it makes mistakes. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The links to human sentence processing seem tantalising. I look |  | ||||||
|       | forward to seeing whether these engineering breakthroughs lead to |  | ||||||
|       | any psycholinguistic advances. |  | ||||||
| 
 |  | ||||||
|     h3 Bibliography |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The NLP literature is almost entirely open access. All of the relevant |  | ||||||
|       | papers can be found  |  | ||||||
|       a(href=urls.acl_anthology, rel='nofollow') here |  | ||||||
|       | . |  | ||||||
|     p |  | ||||||
|       | The parser I’ve described is an implementation of the dynamic-oracle |  | ||||||
|       | Arc-Hybrid system here: |  | ||||||
| 
 |  | ||||||
|       span.bib-item |  | ||||||
|         | Goldberg, Yoav; Nivre, Joakim.  |  | ||||||
|         em Training Deterministic Parsers with Non-Deterministic Oracles |  | ||||||
|         | . TACL 2013 |  | ||||||
|     p |  | ||||||
|       | However, I wrote my own features for it. The arc-hybrid system was |  | ||||||
|       | originally described here: |  | ||||||
| 
 |  | ||||||
|       span.bib-item |  | ||||||
|         | Kuhlmann, Marco; Gomez-Rodriguez, Carlos; Satta, Giorgio. Dynamic |  | ||||||
|         | programming algorithms for transition-based dependency parsers. ACL 2011 |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The dynamic oracle training method was first described here: |  | ||||||
|       span.bib-item |  | ||||||
|         | A Dynamic Oracle for Arc-Eager Dependency Parsing. Goldberg, Yoav; |  | ||||||
|         | Nivre, Joakim. COLING 2012 |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | This work depended on a big break-through in accuracy for transition-based |  | ||||||
|       | parsers, when beam-search was properly explored by Zhang and Clark. |  | ||||||
|       | They have several papers, but the preferred citation is: |  | ||||||
| 
 |  | ||||||
|       span.bib-item |  | ||||||
|         | Zhang, Yue; Clark, Stephen. Syntactic Processing Using the Generalized |  | ||||||
|         | Perceptron and Beam Search. Computational Linguistics 2011 (1) |  | ||||||
|     p |  | ||||||
|       | Another important paper was this little feature engineering paper, |  | ||||||
|       | which further improved the accuracy: |  | ||||||
| 
 |  | ||||||
|       span.bib-item |  | ||||||
|         | Zhang, Yue;  Nivre, Joakim. Transition-based Dependency Parsing with |  | ||||||
|         | Rich Non-local Features. ACL 2011 |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The generalised perceptron, which is the learning framework for these |  | ||||||
|       | beam parsers, is from this paper: |  | ||||||
|       span.bib-item |  | ||||||
|         | Collins, Michael. Discriminative Training Methods for Hidden Markov |  | ||||||
|         | Models: Theory and Experiments with Perceptron Algorithms. EMNLP 2002 |  | ||||||
| 
 |  | ||||||
|     h3 Experimental details |  | ||||||
|     p |  | ||||||
|       | The results at the start of the post refer to Section 22 of the Wall |  | ||||||
|       | Street Journal corpus. The Stanford parser was run as follows: |  | ||||||
| 
 |  | ||||||
|     pre.language-bash |  | ||||||
|       code |  | ||||||
|         | java -mx10000m -cp "$scriptdir/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser \ |  | ||||||
|         | -outputFormat "penn" edu/stanford/nlp/models/lexparser/englishFactored.ser.gz $* |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | A small post-process was applied, to undo the fancy tokenisation |  | ||||||
|       | Stanford adds for numbers, to make them match the PTB tokenisation: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | """Stanford parser retokenises numbers. Split them.""" |  | ||||||
|         | import sys |  | ||||||
|         | import re |  | ||||||
|         |   |  | ||||||
|         | qp_re = re.compile('\xc2\xa0') |  | ||||||
|         | for line in sys.stdin: |  | ||||||
|         |     line = line.rstrip() |  | ||||||
|         |     if qp_re.search(line): |  | ||||||
|         |         line = line.replace('(CD', '(QP (CD', 1) + ')' |  | ||||||
|         |         line = line.replace('\xc2\xa0', ') (CD ') |  | ||||||
|         |     print line |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The resulting PTB-format files were then converted into dependencies |  | ||||||
|       | using the Stanford converter: |  | ||||||
| 
 |  | ||||||
|     pre.language-bash |  | ||||||
|       code |  | ||||||
|         | ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp |  | ||||||
|         | ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/ |  | ||||||
|         | ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll |  | ||||||
|     p |  | ||||||
|       | I can’t easily read that anymore, but it should just convert every |  | ||||||
|       | .mrg file in a folder to a CoNLL-format Stanford basic dependencies |  | ||||||
|       | file, using the settings common in the dependency literature. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | I then converted the gold-standard trees from WSJ 22, for the evaluation. |  | ||||||
|       | Accuracy scores refer to unlabelled attachment score (i.e. the head index) |  | ||||||
|       | of all non-punctuation tokens. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | To train parser.py, I fed the gold-standard PTB trees for WSJ 02-21 |  | ||||||
|       | into the same conversion script. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | In a nutshell: The Stanford model and parser.py are trained on the |  | ||||||
|       | same set of sentences, and they each make their predictions on a |  | ||||||
|       | held-out test set, for which we know the answers. Accuracy refers |  | ||||||
|       | to how many of the words’ heads we got correct. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | Speeds were measured on a 2.4Ghz Xeon. I ran the experiments on a |  | ||||||
|       | server, to give the Stanford parser more memory. The parser.py system |  | ||||||
|       | runs fine on my MacBook Air. I used PyPy for the parser.py experiments; |  | ||||||
|       | CPython was about half as fast on an early benchmark. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | One of the reasons parser.py is so fast is that it does unlabelled |  | ||||||
|       | parsing. Based on previous experiments, a labelled parser would likely |  | ||||||
|       | be about 40x slower, and about 1% more accurate. Adapting the program |  | ||||||
|       | to labelled parsing would be a good exercise for the reader, if you |  | ||||||
|       | have access to the data. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The result from the Redshift parser was produced from commit  |  | ||||||
|       code.language-python b6b624c9900f3bf |  | ||||||
|       | , which was run as follows: |  | ||||||
|     pre.language-bash |  | ||||||
|       code |  | ||||||
|         | ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp |  | ||||||
|         | ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/ |  | ||||||
|         | ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll |  | ||||||
| 
 |  | ||||||
|     footer.meta(role='contentinfo') |  | ||||||
|       a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter |  | ||||||
|       .discuss |  | ||||||
|         a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News |  | ||||||
|         |  |  | ||||||
|         a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit |  | ||||||
|  | @ -1,492 +0,0 @@ | ||||||
| extends ./template_post.jade |  | ||||||
| 
 |  | ||||||
| block body_block |  | ||||||
|   - var urls = {} |  | ||||||
|   - urls.share_twitter = "http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|   article.post |  | ||||||
|     header |  | ||||||
|       h2 A good Part-of-Speech tagger in about 200 lines of Python |  | ||||||
|       .subhead |  | ||||||
|         | by  |  | ||||||
|         a(href="#" rel="author") Matthew Honnibal |  | ||||||
|         | on  |  | ||||||
|         time(datetime='2013-09-11') October 11, 2013 |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       Up-to-date knowledge about natural language processing is mostly locked away |  | ||||||
|       in academia. And academics are mostly pretty self-conscious when we write. |  | ||||||
|       We’re careful. We don’t want to stick our necks out too much. But under-confident |  | ||||||
|       recommendations suck, so here’s how to write a good part-of-speech tagger. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       There are a tonne of “best known techniques” for POS tagging, and you should |  | ||||||
|       ignore the others and just use Averaged Perceptron. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       You should use two tags of history, and features derived from the Brown word |  | ||||||
|       clusters distributed here. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       If you only need the tagger to work on carefully edited text, you should |  | ||||||
|       use case-sensitive features, but if you want a more robust tagger you |  | ||||||
|       should avoid them because they’ll make you over-fit to the conventions |  | ||||||
|       of your training domain. Instead, features that ask “how frequently is |  | ||||||
|       this word title-cased, in a large sample from the web?” work well. Then |  | ||||||
|       you can lower-case your comparatively tiny training corpus. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       For efficiency, you should figure out which frequent words in your training |  | ||||||
|       data have unambiguous tags, so you don’t have to do anything but output |  | ||||||
|       their tags when they come up. About 50% of the words can be tagged that way. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       And unless you really, really can’t do without an extra 0.1% of accuracy, |  | ||||||
|       you probably shouldn’t bother with any kind of search strategy — you should |  | ||||||
|       just use a greedy model. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       If you do all that, you’ll find your tagger easy to write and understand, |  | ||||||
|       and an efficient Cython implementation will perform as follows on the standard |  | ||||||
|       evaluation, 130,000 words of text from the Wall Street Journal: |  | ||||||
|        |  | ||||||
|     table |  | ||||||
|       thead |  | ||||||
|         tr |  | ||||||
|           th Tagger |  | ||||||
|           th Accuracy |  | ||||||
|           th Time (130k words) |  | ||||||
|       tbody |  | ||||||
|         tr |  | ||||||
|           td CyGreedyAP |  | ||||||
|           td 97.1% |  | ||||||
|           td 4s |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       The 4s includes initialisation time — the actual per-token speed is high |  | ||||||
|       enough to be irrelevant; it won’t be your bottleneck. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       It’s tempting to look at 97% accuracy and say something similar, but that’s |  | ||||||
|       not true. My parser is about 1% more accurate if the input has hand-labelled |  | ||||||
|       POS tags, and the taggers all perform much worse on out-of-domain data. |  | ||||||
|       Unfortunately accuracies have been fairly flat for the last ten years. |  | ||||||
|       That’s why my recommendation is to just use a simple and fast tagger that’s |  | ||||||
|       roughly as good. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       The thing is though, it’s very common to see people using taggers that |  | ||||||
|       aren’t anywhere near that good!  For an example of what a non-expert is |  | ||||||
|       likely to use, these were the two taggers wrapped by TextBlob, a new Python |  | ||||||
|       api that I think is quite neat: |  | ||||||
|        |  | ||||||
|     table |  | ||||||
|       thead |  | ||||||
|         tr |  | ||||||
|           th Tagger |  | ||||||
|           th Accuracy |  | ||||||
|           th Time (130k words) |  | ||||||
|       tbody |  | ||||||
|         tr |  | ||||||
|           td NLTK |  | ||||||
|           td 94.0% |  | ||||||
|           td 3m56s |  | ||||||
|         tr |  | ||||||
|           td Pattern |  | ||||||
|           td 93.5% |  | ||||||
|           td 26s |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       Both Pattern and NLTK are very robust and beautifully well documented, so |  | ||||||
|       the appeal of using them is obvious. But Pattern’s algorithms are pretty |  | ||||||
|       crappy, and NLTK carries tremendous baggage around in its implementation |  | ||||||
|       because of its massive framework, and double-duty as a teaching tool. |  | ||||||
| 
 |  | ||||||
|     p.   |  | ||||||
|       As a stand-alone tagger, my Cython implementation is needlessly complicated |  | ||||||
|       – it was written for my parser. So today I wrote a 200 line version |  | ||||||
|       of my recommended algorithm for TextBlob. It gets: |  | ||||||
|        |  | ||||||
|     table |  | ||||||
|       thead |  | ||||||
|         tr |  | ||||||
|           th Tagger |  | ||||||
|           th Accuracy |  | ||||||
|           th Time (130k words) |  | ||||||
|       tbody |  | ||||||
|         tr |  | ||||||
|           td PyGreedyAP |  | ||||||
|           td 96.8% |  | ||||||
|           td 12s |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       I traded some accuracy and a lot of efficiency to keep the implementation |  | ||||||
|       simple. Here’s a far-too-brief description of how it works. |  | ||||||
|        |  | ||||||
|     h3 Averaged perceptron |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       POS tagging is a “supervised learning problem”. You’re given a table of data, |  | ||||||
|       and you’re told that the values in the last column will be missing during |  | ||||||
|       run-time. You have to find correlations from the other columns to predict |  | ||||||
|       that value. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       So for us, the missing column will be “part of speech at word i”. The predictor |  | ||||||
|       columns (features) will be things like “part of speech at word i-1”, “last three |  | ||||||
|       letters of word at i+1”, etc. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       First, here’s what prediction looks like at run-time: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def predict(self, features): |  | ||||||
|         |     '''Dot-product the features and current weights and return the best class.''' |  | ||||||
|         |     scores = defaultdict(float) |  | ||||||
|         |     for feat in features: |  | ||||||
|         |         if feat not in self.weights: |  | ||||||
|         |             continue |  | ||||||
|         |         weights = self.weights[feat] |  | ||||||
|         |         for clas, weight in weights.items(): |  | ||||||
|         |             scores[clas] += weight |  | ||||||
|         |     # Do a secondary alphabetic sort, for stability |  | ||||||
|         |     return max(self.classes, key=lambda clas: (scores[clas], clas)) |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       Earlier I described the learning problem as a table, with one of the columns |  | ||||||
|       marked as missing-at-runtime. For NLP, our tables are always exceedingly |  | ||||||
|       sparse. You have columns like “word i-1=Parliament”, which is almost always |  | ||||||
|       0. So our “weight vectors” can pretty much never be implemented as vectors. |  | ||||||
|       Map-types are good though — here we use dictionaries. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       The input data, features, is a set with a member for every non-zero “column” |  | ||||||
|       in our “table” – every active feature. Usually this is actually a dictionary, |  | ||||||
|       to let you set values for the features. But here all my features are binary |  | ||||||
|       present-or-absent type deals. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       The weights data-structure is a dictionary of dictionaries, that ultimately |  | ||||||
|       associates feature/class pairs with some weight. You want to structure it |  | ||||||
|       this way instead of the reverse because of the way word frequencies are |  | ||||||
|       distributed: most words are rare, frequent words are very frequent. |  | ||||||
|        |  | ||||||
|     h3 Learning the weights |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       Okay, so how do we get the values for the weights? We start with an empty |  | ||||||
|       weights dictionary, and iteratively do the following: |  | ||||||
| 
 |  | ||||||
|     ol |  | ||||||
|       li Receive a new (features, POS-tag) pair |  | ||||||
|       li Guess the value of the POS tag given the current “weights” for the features |  | ||||||
|       li If guess is wrong, add +1 to the weights associated with the correct class for these features, and -1 to the weights for the predicted class. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       It’s one of the simplest learning algorithms. Whenever you make a mistake, |  | ||||||
|       increment the weights for the correct class, and penalise the weights that |  | ||||||
|       led to your false prediction. In code: |  | ||||||
|      |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def train(self, nr_iter, examples): |  | ||||||
|         |     for i in range(nr_iter): |  | ||||||
|         |         for features, true_tag in examples: |  | ||||||
|         |             guess = self.predict(features) |  | ||||||
|         |             if guess != true_tag: |  | ||||||
|         |                 for f in features: |  | ||||||
|         |                     self.weights[f][true_tag] += 1 |  | ||||||
|         |                     self.weights[f][guess] -= 1 |  | ||||||
|         |         random.shuffle(examples) |  | ||||||
|     p. |  | ||||||
|       If you iterate over the same example this way, the weights for the correct |  | ||||||
|       class would have to come out ahead, and you’d get the example right. If |  | ||||||
|       you think about what happens with two examples, you should be able to |  | ||||||
|       see that it will get them both right unless the features are identical. |  | ||||||
|       In general the algorithm will converge so long as the examples are |  | ||||||
|       linearly separable, although that doesn’t matter for our purpose. |  | ||||||
|        |  | ||||||
|     h3 Averaging the weights |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       We need to do one more thing to make the perceptron algorithm competitive. |  | ||||||
|       The problem with the algorithm so far is that if you train it twice on |  | ||||||
|       slightly different sets of examples, you end up with really different models. |  | ||||||
|       It doesn’t generalise that smartly. And the problem is really in the later |  | ||||||
|       iterations — if you let it run to convergence, it’ll pay lots of attention |  | ||||||
|       to the few examples it’s getting wrong, and mutate its whole model around |  | ||||||
|       them. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       So, what we’re going to do is make the weights more "sticky" – give |  | ||||||
|       the model less chance to ruin all its hard work in the later rounds. And |  | ||||||
|       we’re going to do that by returning the averaged weights, not the final |  | ||||||
|       weights. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       I doubt there are many people who are convinced that’s the most obvious |  | ||||||
|       solution to the problem, but whatever. We’re not here to innovate, and this |  | ||||||
|       way is time tested on lots of problems. If you have another idea, run the |  | ||||||
|       experiments and tell us what you find. Actually I’d love to see more work |  | ||||||
|       on this, now that the averaged perceptron has become such a prominent learning |  | ||||||
|       algorithm in NLP. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Okay. So this averaging. How’s that going to work? Note that we don’t want |  | ||||||
|       to just average after each outer-loop iteration. We want the average of all |  | ||||||
|       the values — from the inner loop. So if we have 5,000 examples, and we train |  | ||||||
|       for 10 iterations, we’ll average across 50,000 values for each weight. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Obviously we’re not going to store all those intermediate values. Instead, |  | ||||||
|       we’ll track an accumulator for each weight, and divide it by the number of |  | ||||||
|       iterations at the end. Again: we want the average weight assigned to a |  | ||||||
|       feature/class pair during learning, so the key component we need is the total |  | ||||||
|       weight it was assigned. But we also want to be careful about how we compute |  | ||||||
|       that accumulator, too. On almost any instance, we’re going to see a tiny |  | ||||||
|       fraction of active feature/class pairs. All the other feature/class weights |  | ||||||
|       won’t change. So we shouldn’t have to go back and add the unchanged value |  | ||||||
|       to our accumulators anyway, like chumps. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Since we’re not chumps, we’ll make the obvious improvement. We’ll maintain |  | ||||||
|       another dictionary that tracks how long each weight has gone unchanged. Now |  | ||||||
|       when we do change a weight, we can do a fast-forwarded update to the accumulator, |  | ||||||
|       for all those iterations where it lay unchanged. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Here’s what a weight update looks like now that we have to maintain the |  | ||||||
|       totals and the time-stamps: |  | ||||||
|        |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def update(self, truth, guess, features): |  | ||||||
|         |     def upd_feat(c, f, v): |  | ||||||
|         |         nr_iters_at_this_weight = self.i - self._timestamps[f][c] |  | ||||||
|         |         self._totals[f][c] += nr_iters_at_this_weight * self.weights[f][c] |  | ||||||
|         |         self.weights[f][c] += v |  | ||||||
|         |         self._timestamps[f][c] = self.i |  | ||||||
|         |  | ||||||
|         |     self.i += 1 |  | ||||||
|         |     for f in features: |  | ||||||
|         |         upd_feat(truth, f, 1.0) |  | ||||||
|         |         upd_feat(guess, f, -1.0) |  | ||||||
| 
 |  | ||||||
|     h3 Features and pre-processing |  | ||||||
|      |  | ||||||
|     p. |  | ||||||
|       The POS tagging literature has tonnes of intricate features sensitive to |  | ||||||
|       case, punctuation, etc. They help on the standard test-set, which is from |  | ||||||
|       Wall Street Journal articles from the 1980s, but I don’t see how they’ll |  | ||||||
|       help us learn models that are useful on other text. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       To help us learn a more general model, we’ll pre-process the data prior |  | ||||||
|       to feature extraction, as follows: |  | ||||||
|        |  | ||||||
|     ul |  | ||||||
|       li All words are lower cased; |  | ||||||
|       li Digits in the range 1800-2100 are represented as !YEAR; |  | ||||||
|       li Other digit strings are represented as !DIGITS; |  | ||||||
|       li |  | ||||||
|         | It would be better to have a module recognising dates, phone numbers, |  | ||||||
|         | emails, hash-tags, etc. but that will have to be pushed back into the |  | ||||||
|         | tokenization. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       I played around with the features a little, and this seems to be a reasonable |  | ||||||
|       bang-for-buck configuration in terms of getting the development-data accuracy |  | ||||||
|       to 97% (where it typically converges anyway), and having a smaller memory |  | ||||||
|       foot-print: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def _get_features(self, i, word, context, prev, prev2): |  | ||||||
|         |     '''Map tokens-in-contexts into a feature representation, implemented as a |  | ||||||
|         |     set. If the features change, a new model must be trained.''' |  | ||||||
|         |     def add(name, *args): |  | ||||||
|         |         features.add('+'.join((name,) + tuple(args))) |  | ||||||
|         |  | ||||||
|         |     features = set() |  | ||||||
|         |     add('bias') # This acts sort of like a prior |  | ||||||
|         |     add('i suffix', word[-3:]) |  | ||||||
|         |     add('i pref1', word[0]) |  | ||||||
|         |     add('i-1 tag', prev) |  | ||||||
|         |     add('i-2 tag', prev2) |  | ||||||
|         |     add('i tag+i-2 tag', prev, prev2) |  | ||||||
|         |     add('i word', context[i]) |  | ||||||
|         |     add('i-1 tag+i word', prev, context[i]) |  | ||||||
|         |     add('i-1 word', context[i-1]) |  | ||||||
|         |     add('i-1 suffix', context[i-1][-3:]) |  | ||||||
|         |     add('i-2 word', context[i-2]) |  | ||||||
|         |     add('i+1 word', context[i+1]) |  | ||||||
|         |     add('i+1 suffix', context[i+1][-3:]) |  | ||||||
|         |     add('i+2 word', context[i+2]) |  | ||||||
|         |     return features |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       I haven’t added any features from external data, such as case frequency |  | ||||||
|       statistics from the Google Web 1T corpus. I might add those later, but for |  | ||||||
|       now I figured I’d keep things simple. |  | ||||||
|        |  | ||||||
|     h3 What about search? |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       The model I’ve recommended commits to its predictions on each word, and |  | ||||||
|       moves on to the next one. Those predictions are then used as features for |  | ||||||
|       the next word. There’s a potential problem here, but it turns out it doesn’t |  | ||||||
|       matter much. It’s easy to fix with beam-search, but I say it’s not really |  | ||||||
|       worth bothering. And it definitely doesn’t matter enough to adopt a slow |  | ||||||
|       and complicated algorithm like Conditional Random Fields. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Here’s the problem. The best indicator for the tag at position, say, 3 in |  | ||||||
|       a sentence is the word at position 3. But the next-best indicators are the |  | ||||||
|       tags at positions 2 and 4. So there’s a chicken-and-egg problem: we want |  | ||||||
|       the predictions for the surrounding words in hand before we commit to a |  | ||||||
|       prediction for the current word. Here’s an example where search might matter: |  | ||||||
|        |  | ||||||
|     p.example. |  | ||||||
|       Their management plan reforms worked |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Depending on just what you’ve learned from your training data, you can |  | ||||||
|       imagine making a different decision if you started at the left and moved |  | ||||||
|       right, conditioning on your previous decisions, than if you’d started at |  | ||||||
|       the right and moved left. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       If that’s not obvious to you, think about it this way: “worked” is almost |  | ||||||
|       surely a verb, so if you tag “reforms” with that in hand, you’ll have a |  | ||||||
|       different idea of its tag than if you’d just come from “plan”, which you |  | ||||||
|       might have regarded as either a noun or a verb. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Search can only help you when you make a mistake. It can prevent that error |  | ||||||
|       from throwing off your subsequent decisions, or sometimes your future choices |  | ||||||
|       will correct the mistake. And that’s why for POS tagging, search hardly matters! |  | ||||||
|       Your model is so good straight-up that your past predictions are almost always |  | ||||||
|       true. So you really need the planets to align for search to matter at all. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       And as we improve our taggers, search will matter less and less. Instead |  | ||||||
|       of search, what we should be caring about is multi-tagging. If we let the |  | ||||||
|       model be a bit uncertain, we can get over 99% accuracy assigning an average |  | ||||||
|       of 1.05 tags per word (Vadas et al, ACL 2006). The averaged perceptron is |  | ||||||
|       rubbish at multi-tagging though. That’s its big weakness. You really want |  | ||||||
|       a probability distribution for that. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       One caveat when doing greedy search, though. It’s very important that your |  | ||||||
|       training data model the fact that the history will be imperfect at run-time. |  | ||||||
|       Otherwise, it will be way over-reliant on the tag-history features. Because |  | ||||||
|       the Perceptron is iterative, this is very easy. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Here’s the training loop for the tagger: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def train(self, sentences, save_loc=None, nr_iter=5, quiet=False): |  | ||||||
|         |     '''Train a model from sentences, and save it at save_loc. nr_iter |  | ||||||
|         |     controls the number of Perceptron training iterations.''' |  | ||||||
|         |     self._make_tagdict(sentences, quiet=quiet) |  | ||||||
|         |     self.model.classes = self.classes |  | ||||||
|         |     prev, prev2 = START |  | ||||||
|         |     for iter_ in range(nr_iter): |  | ||||||
|         |         c = 0; n = 0 |  | ||||||
|         |         for words, tags in sentences: |  | ||||||
|         |             context = START + [self._normalize(w) for w in words] + END |  | ||||||
|         |             for i, word in enumerate(words): |  | ||||||
|         |                 guess = self.tagdict.get(word) |  | ||||||
|         |                 if not guess: |  | ||||||
|         |                     feats = self._get_features( |  | ||||||
|         |                               i, word, context, prev, prev2) |  | ||||||
|         |                     guess = self.model.predict(feats) |  | ||||||
|         |                     self.model.update(tags[i], guess, feats) |  | ||||||
|         |                 # Set the history features from the guesses, not the |  | ||||||
|         |                 # true tags |  | ||||||
|         |                 prev2 = prev; prev = guess |  | ||||||
|         |                 c += guess == tags[i]; n += 1 |  | ||||||
|         |         random.shuffle(sentences) |  | ||||||
|         |         if not quiet: |  | ||||||
|         |             print("Iter %d: %d/%d=%.3f" % (iter_, c, n, _pc(c, n))) |  | ||||||
|         |     self.model.average_weights() |  | ||||||
|         |     # Pickle as a binary file |  | ||||||
|         |     if save_loc is not None: |  | ||||||
|         |         cPickle.dump((self.model.weights, self.tagdict, self.classes), |  | ||||||
|         |                      open(save_loc, 'wb'), -1) |  | ||||||
|     p. |  | ||||||
|       Unlike the previous snippets, this one’s literal – I tended to edit the |  | ||||||
|       previous ones to simplify. So if they have bugs, hopefully that’s why! |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       At the time of writing, I’m just finishing up the implementation before I |  | ||||||
|       submit a pull request to TextBlob. You can see the rest of the source here: |  | ||||||
|        |  | ||||||
|     ul |  | ||||||
|       li |  | ||||||
|         a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/taggers.py") taggers.py |  | ||||||
|       li |  | ||||||
|         a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/_perceptron.py") _perceptron.py |  | ||||||
|        |  | ||||||
|     h3 A final comparison… |  | ||||||
|      |  | ||||||
|     p. |  | ||||||
|       Over the years I’ve seen a lot of cynicism about the WSJ evaluation methodology. |  | ||||||
|       The claim is that we’ve just been meticulously over-fitting our methods to this |  | ||||||
|       data. Actually the evidence doesn’t really bear this out. Mostly, if a technique |  | ||||||
|       is clearly better on one evaluation, it improves others as well. Still, it’s |  | ||||||
|       very reasonable to want to know how these tools perform on other text. So I |  | ||||||
|       ran the unchanged models over two other sections from the OntoNotes corpus: |  | ||||||
|        |  | ||||||
|     table |  | ||||||
|       thead |  | ||||||
|         tr |  | ||||||
|           th Tagger |  | ||||||
|           th WSJ |  | ||||||
|           th ABC |  | ||||||
|           th Web |  | ||||||
|       tbody |  | ||||||
|         tr |  | ||||||
|           td Pattern |  | ||||||
|           td 93.5 |  | ||||||
|           td 90.7 |  | ||||||
|           td 88.1 |  | ||||||
|         tr |  | ||||||
|           td NLTK |  | ||||||
|           td 94.0 |  | ||||||
|           td 91.5 |  | ||||||
|           td 88.4 |  | ||||||
|         tr |  | ||||||
|           td PyGreedyAP |  | ||||||
|           td 96.8 |  | ||||||
|           td 94.8 |  | ||||||
|           td 91.8 |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       The ABC section is broadcast news, Web is text from the web (blogs etc — I haven’t |  | ||||||
|       looked at the data much). |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       As you can see, the order of the systems is stable across the three comparisons, |  | ||||||
|       and the advantage of our Averaged Perceptron tagger over the other two is real |  | ||||||
|       enough. Actually the pattern tagger does very poorly on out-of-domain text. |  | ||||||
|       It mostly just looks up the words, so it’s very domain dependent. I hadn’t |  | ||||||
|       realised it before, but it’s obvious enough now that I think about it. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       We can improve our score greatly by training on some of the foreign data. |  | ||||||
|       The technique described in this paper (Daume III, 2007) is the first thing |  | ||||||
|       I try when I have to do that. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     footer.meta(role='contentinfo') |  | ||||||
|       a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter |  | ||||||
|       .discuss |  | ||||||
|         a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News |  | ||||||
|         |  |  | ||||||
|         a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit |  | ||||||
|  | @ -1,139 +0,0 @@ | ||||||
| - var urls = {} |  | ||||||
| - urls.choi_paper = "http://aclweb.org/anthology/P/P15/P15-1038.pdf" |  | ||||||
| - urls.emnlp_paper = "honnibal_johnson_emnlp2015.pdf" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +comparison("NLTK") |  | ||||||
|   p spaCy is: |  | ||||||
|   ul |  | ||||||
|     li.pro 100x faster; |  | ||||||
|     li.pro 50% more accurate; |  | ||||||
|     li.pro Serializes TODO% smaller; |  | ||||||
| 
 |  | ||||||
|   p spaCy features: |  | ||||||
|     ul  |  | ||||||
|       li.pro Integrated word vectors; |  | ||||||
|       li.pro Efficient binary serialization; |  | ||||||
| 
 |  | ||||||
|   p NLTK features: |  | ||||||
|     ul |  | ||||||
|       li.con Multiple languages;  |  | ||||||
|       li.neutral Educational resources |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| //+comparison("Pattern") |  | ||||||
| +comparison("CoreNLP") |  | ||||||
|   p spaCy is: |  | ||||||
| 
 |  | ||||||
|   ul |  | ||||||
|     li.pro TODO% faster; |  | ||||||
|     li.pro TODO% more accurate; |  | ||||||
|     li.pro Not Java; |  | ||||||
|     li.pro Well documented; |  | ||||||
|     li.pro Cheaper to license commercially; |  | ||||||
|     li.neutral |  | ||||||
|       | Opinionated/Minimalist. spaCy avoids providing redundant or overlapping |  | ||||||
|       | options.   |  | ||||||
| 
 |  | ||||||
|   p CoreNLP features: |  | ||||||
| 
 |  | ||||||
|   ul |  | ||||||
|     li.con Multiple Languages; |  | ||||||
|     li.con Sentiment analysis  |  | ||||||
|     li.con Coreference resolution |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +comparison("ClearNLP") |  | ||||||
|   p spaCy is: |  | ||||||
| 
 |  | ||||||
|   ul |  | ||||||
|     li.pro Not Java; |  | ||||||
|     li.pro TODO% faster; |  | ||||||
|     li.pro Well documented; |  | ||||||
|     li.neutral Slightly more accurate; |  | ||||||
| 
 |  | ||||||
|   p ClearNLP features: |  | ||||||
| 
 |  | ||||||
|   ul |  | ||||||
|     li.con Semantic Role Labelling |  | ||||||
|     li.con Multiple Languages |  | ||||||
|     li.con Model for biology/life-science; |  | ||||||
| 
 |  | ||||||
| //+comparison("Accuracy Summary") |  | ||||||
| 
 |  | ||||||
| //+comparison("Speed Summary") |  | ||||||
| //  table |  | ||||||
| //    thead |  | ||||||
| //      tr |  | ||||||
| //        th. |  | ||||||
| //        th(colspan=3) Absolute (ms per doc) |  | ||||||
| //        th(colspan=3) Relative (to spaCy) |  | ||||||
| // |  | ||||||
| //    tbody |  | ||||||
| //      tr |  | ||||||
| //        td: strong System |  | ||||||
| //        td: strong Split |  | ||||||
| //        td: strong Tag |  | ||||||
| //        td: strong Parse |  | ||||||
| //        td: strong Split |  | ||||||
| //        td: strong Tag |  | ||||||
| //        td: strong Parse |  | ||||||
| // |  | ||||||
| //      +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") |  | ||||||
| //      +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") |  | ||||||
| //      +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x") |  | ||||||
| //      +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x") |  | ||||||
| //      +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a") |  | ||||||
| // |  | ||||||
| //  p |  | ||||||
| //    | <strong>Set up</strong>: 100,000 plain-text documents were streamed |  | ||||||
| //    | from an SQLite3 database, and processed with an NLP library, to one |  | ||||||
| //    | of three levels of detail – tokenization, tagging, or parsing. |  | ||||||
| //    | The tasks are additive: to parse the text you have to tokenize and |  | ||||||
| //    | tag it.  The  pre-processing was not subtracted from the times – |  | ||||||
| //    | I report the time required for the pipeline to complete.  I report |  | ||||||
| //    | mean times per document, in milliseconds. |  | ||||||
| // |  | ||||||
| //  p |  | ||||||
| //    | <strong>Hardware</strong>: Intel i7-3770 (2012) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +comparison("Peer-reviewed Evaluations") |  | ||||||
|   p. |  | ||||||
|     spaCy is committed to rigorous evaluation under standard methodology.  Two |  | ||||||
|     papers in 2015 confirm that: |  | ||||||
|   ol |  | ||||||
|     li spaCy is the fastest syntactic parser in the world; |  | ||||||
|     li Its accuracy is within 1% of the best available; |  | ||||||
|     li The few systems that are more accurate are 20× slower or more. |  | ||||||
| 
 |  | ||||||
|   p |  | ||||||
|     | spaCy v0.84 was evaluated by researchers at Yahoo! Labs and Emory University, |  | ||||||
|     | as part of a survey paper benchmarking the current state-of-the-art dependency |  | ||||||
|     | parsers  |  | ||||||
|     a(href=urls.choi_paper) (Choi et al., 2015) |  | ||||||
|     | . |  | ||||||
| 
 |  | ||||||
|   table |  | ||||||
|     thead |  | ||||||
|       +columns("System", "Language", "Accuracy", "Speed") |  | ||||||
| 
 |  | ||||||
|     tbody |  | ||||||
|       +row("spaCy v0.84", "Cython", "90.6", "13,963") |  | ||||||
|       +row("spaCy v0.89", "Cython", "91.8", "13,000 (est.)") |  | ||||||
|       +row("ClearNLP", "Java", "91.7", "10,271") |  | ||||||
|       +row("CoreNLP", "Java", "89.6", "8,602") |  | ||||||
|       +row("MATE", "Java", "92.5", "550") |  | ||||||
|       +row("Turbo", "C++", "92.4", "349") |  | ||||||
|       +row("Yara", "Java", "92.3", "340") |  | ||||||
| 
 |  | ||||||
|   p |  | ||||||
|     | Discussion with the authors led to accuracy improvements in spaCy, which |  | ||||||
|     | have been accepted for publication in EMNLP, in joint work with Macquarie |  | ||||||
|     | University |  | ||||||
|     a(href=urls.emnlp_paper) (Honnibal and Johnson, 2015) |  | ||||||
|     | .  |  | ||||||
| 
 |  | ||||||
|  | @ -1,129 +0,0 @@ | ||||||
| extends ./outline.jade |  | ||||||
| 
 |  | ||||||
| include ./mixins.jade |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin declare_class(name) |  | ||||||
|   details |  | ||||||
|     summary |  | ||||||
|       span.declaration |  | ||||||
|         span.label class |  | ||||||
|         code #{name} |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| mixin method(name, parameters) |  | ||||||
|   details(open=attributes.open) |  | ||||||
|     summary |  | ||||||
|       span.declaration |  | ||||||
|         span.label #{name} |  | ||||||
|         span.parameters |  | ||||||
|           | self, #{parameters} |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin params |  | ||||||
|   ul |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin param(name, type, value) |  | ||||||
|   li |  | ||||||
|     if type |  | ||||||
|       <strong>#{name}</strong> (!{type}) – |  | ||||||
|     else |  | ||||||
|       <strong>#{name}</strong> – |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin attribute(name, type, value) |  | ||||||
|   details(open=attributes.open) |  | ||||||
|     summary |  | ||||||
|       span.declaration |  | ||||||
|         span.label #{name} |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin returns(name, type, value) |  | ||||||
|   li |  | ||||||
|     if type |  | ||||||
|       <strong>#{name}</strong> (!{type}) – |  | ||||||
|     else |  | ||||||
|       <strong>#{name}</strong> – |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin returns(type) |  | ||||||
|   | tmp |  | ||||||
| 
 |  | ||||||
| mixin init |  | ||||||
|   details |  | ||||||
|     summary: h4 Init |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin callable |  | ||||||
|   details |  | ||||||
|     summary: h4 Callable |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin sequence |  | ||||||
|   details |  | ||||||
|     summary: h4 Sequence |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin maptype |  | ||||||
|   details |  | ||||||
|     summary: h4 Map |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin summary |  | ||||||
|   block |  | ||||||
| 
 |  | ||||||
| mixin en_example |  | ||||||
|   pre.language-python |  | ||||||
|     code |  | ||||||
|       | from spacy.en import English |  | ||||||
|       | from spacy._doc_examples import download_war_and_peace |  | ||||||
|       |  |  | ||||||
|       | unprocessed_unicode = download_war_and_peace() |  | ||||||
|       |  |  | ||||||
|       | nlp = English() |  | ||||||
|       | doc = nlp(unprocessed_unicode) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| block intro_block |  | ||||||
|   section(class="intro") |  | ||||||
| 
 |  | ||||||
|     nav(role="navigation") |  | ||||||
|       ul |  | ||||||
|         li: a(href="#api" class="button") API |  | ||||||
|         li: a(href="#tutorials" class="button") Tutorials |  | ||||||
|         li: a(href="#spec" class="button") Spec |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| block body_block |  | ||||||
|   - var py_docs = '<a class="reference" href="http://docs.python.org/library/' |  | ||||||
| 
 |  | ||||||
|   - |  | ||||||
|     var types = { |  | ||||||
|       'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>', |  | ||||||
|       'bool': py_docs + 'functions.html#bool"><em>bool</em></a>', |  | ||||||
|       'int': py_docs + 'functions.html#int"><em>int</em></a>', |  | ||||||
|       'generator': "", |  | ||||||
|       'Vocab': "", |  | ||||||
|       'Span': "", |  | ||||||
|       'Doc': "" |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|   article |  | ||||||
| 
 |  | ||||||
|     +Section("API", "api", "api.jade") |  | ||||||
|     +Section("Tutorials", "tutorials", "tutorials.jade") |  | ||||||
|     +Section("Annotation Specifications", "spec", "spec.jade") |  | ||||||
|  | @ -1,88 +0,0 @@ | ||||||
| extends ./outline.jade |  | ||||||
| 
 |  | ||||||
| include ./mixins.jade |  | ||||||
| 
 |  | ||||||
| // Notes |  | ||||||
| // |  | ||||||
| // 1. Where to put version notice? Should say something like |  | ||||||
| //   2015-08-12: v0.89 |  | ||||||
| //   and be a link |  | ||||||
| //    |  | ||||||
| //   Only needs to appear on home page. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| - var slogan = "Build Tomorrow's Language Technologies" |  | ||||||
| - var tag_line = "spaCy – " + slogan |  | ||||||
| 
 |  | ||||||
| mixin lede |  | ||||||
|   - var state_of_the_art = '<a href="#">state-of-the-art</a>' |  | ||||||
|   - var a_minor_miracle = '<a href="">a minor miracle</a>' |  | ||||||
|   - var great_documentation = '<a href="">great documentation</a>' |  | ||||||
|   - var concise_API = '<a href="">concise API</a>' |  | ||||||
|    |  | ||||||
|   p. |  | ||||||
|     <a href="https://github.com/honnibal/spaCy"><strong>spaCy</strong></a> is a |  | ||||||
|     library for industrial-strength natural language processing in Python and |  | ||||||
|     Cython.  It features !{state_of_the_art} speed and accuracy, a !{concise_API}, |  | ||||||
|     and <a href="#license">license terms</a> designed to get out of your way. |  | ||||||
|     If you're a small company doing NLP, we want <strong>spaCy</strong> to seem |  | ||||||
|     like !{a_minor_miracle}. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin comparison(name) |  | ||||||
|   details |  | ||||||
|     summary |  | ||||||
|       h4= name |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
|   |  | ||||||
| mixin columns(...names) |  | ||||||
|   tr |  | ||||||
|     each name in names |  | ||||||
|       th= name |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin row(...cells) |  | ||||||
|   tr |  | ||||||
|     each cell in cells |  | ||||||
|       td= cell |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin social       |  | ||||||
|   footer(role="contentinfo") |  | ||||||
|     a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter |  | ||||||
| 
 |  | ||||||
|     div.discuss |  | ||||||
|       a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn") |  | ||||||
|         | Discuss on Hacker News |  | ||||||
| 
 |  | ||||||
|       a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit") |  | ||||||
|         | Discuss on Reddit |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| block intro_block |  | ||||||
|   section(class="intro") |  | ||||||
|     +lede |  | ||||||
| 
 |  | ||||||
|     nav(role="navigation") |  | ||||||
|       ul |  | ||||||
|         li: a(href="#example-use" class="button") Examples |  | ||||||
|         li: a(href="#comparisons" class="button") Comparisons |  | ||||||
|         li: a(href="#online-demo" class="button") Try Online |  | ||||||
|         li: a(href="#install" class="button") |  | ||||||
|           | Install |  | ||||||
|           <span class="button-caption">v0.89</span> |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| block body_block |  | ||||||
|   article(class="page landing-page") |  | ||||||
| 
 |  | ||||||
|     +Section("Usage by Example", "example-use", "./usage_examples.jade") |  | ||||||
| 
 |  | ||||||
|     +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade") |  | ||||||
|        |  | ||||||
|     +Section("Online Demo", "online-demo", "./online_demo.jade") |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     +Section("Install", "install", "./install.jade") |  | ||||||
|  | @ -1,71 +0,0 @@ | ||||||
| mixin Option(name, open) |  | ||||||
|   details(open=open) |  | ||||||
|     summary |  | ||||||
|       h4= name |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| +Option("conda", true) |  | ||||||
|   pre.language-bash: code |  | ||||||
|     | $ conda install spacy |  | ||||||
|     | $ python -m spacy.en.download |  | ||||||
| 
 |  | ||||||
| +Option("pip and virtualenv", true) |  | ||||||
|   p With Python 2.7 or Python 3, using Linux or OSX, run: |  | ||||||
| 
 |  | ||||||
|     pre.language-bash: code |  | ||||||
|       | $ pip install spacy |  | ||||||
|       | $ python -m spacy.en.download |  | ||||||
| 
 |  | ||||||
|   p |  | ||||||
|     | The download command fetches and installs about 300mb of data, for |  | ||||||
|     | the parser model and word vectors, which it installs within the spacy.en |  | ||||||
|     | package directory. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|   +Option("Workaround for obsolete system Python", false) |  | ||||||
|     p |  | ||||||
|       | If you're stuck using a server with an old version of Python, and you |  | ||||||
|       | don't have root access, I've prepared a bootstrap script to help you |  | ||||||
|       | compile a local Python install.  Run: |  | ||||||
| 
 |  | ||||||
|     pre.language-bash: code |  | ||||||
|       | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +Option("Compile from source", false) |  | ||||||
|   p |  | ||||||
|     | The other way to install the package is to clone the github repository, |  | ||||||
|     | and build it from source.  This installs an additional dependency, |  | ||||||
|     | Cython.  If you're using Python 2, I also recommend installing fabric |  | ||||||
|     | and fabtools – this is how I build the project. |  | ||||||
| 
 |  | ||||||
|   pre.language-bash: code |  | ||||||
|     | $ git clone https://github.com/honnibal/spaCy.git |  | ||||||
|     | $ cd spaCy |  | ||||||
|     | $ virtualenv .env && source .env/bin/activate |  | ||||||
|     | $ export PYTHONPATH=`pwd` |  | ||||||
|     | $ pip install -r requirements.txt |  | ||||||
|     | $ python setup.py build_ext --inplace |  | ||||||
|     | $ python -m spacy.en.download |  | ||||||
|     | $ pip install pytest |  | ||||||
|     | $ py.test tests/ |  | ||||||
| 
 |  | ||||||
|   p |  | ||||||
|     | Python packaging is awkward at the best of times, and it's particularly tricky |  | ||||||
|     | with C extensions, built via Cython, requiring large data files.  So, |  | ||||||
|     | please report issues as you encounter them. |  | ||||||
| 
 |  | ||||||
| +Option("pypy (Unsupported)") |  | ||||||
|   | If PyPy support is a priority for you, please get in touch.  We could likely |  | ||||||
|   | fix the remaining issues, if necessary.  However, the library is likely to |  | ||||||
|   | be much slower on PyPy, as it's written in Cython, which produces code tuned |  | ||||||
|   | for the performance of CPython. |  | ||||||
| 
 |  | ||||||
| +Option("Windows (Unsupported)") |  | ||||||
|   | Unfortunately we don't currently have access to a Windows machine, and have |  | ||||||
|   | no experience developing on a MicroSoft stack. In theory the only problems are |  | ||||||
|   | with the installation and packaging – there should be no deep platform |  | ||||||
|   | dependency. Unfortunately we can't debug these issues at present, simply due |  | ||||||
|   | to lack of a development environment. |  | ||||||
| 
 |  | ||||||
|  | @ -1,179 +0,0 @@ | ||||||
| extends ./outline.jade |  | ||||||
| 
 |  | ||||||
| mixin columns(...names) |  | ||||||
|   tr |  | ||||||
|     each name in names |  | ||||||
|       th= name |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin row(...cells) |  | ||||||
|   tr |  | ||||||
|     each cell in cells |  | ||||||
|       td= cell |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin LicenseOption(name, period, price, audience) |  | ||||||
|     .item |  | ||||||
|       h4 #{name} |  | ||||||
|          |  | ||||||
|       .focus #{period} |  | ||||||
| 
 |  | ||||||
|       span #{price} |  | ||||||
|          |  | ||||||
|       h5 Suggested for: |  | ||||||
|          |  | ||||||
|       span #{audience} |  | ||||||
|          |  | ||||||
|       a.button(href="spacy_trial_free.docx") Download license |  | ||||||
| 
 |  | ||||||
|       span or  |  | ||||||
|         a(href="#") get in touch |  | ||||||
| 
 |  | ||||||
|   |  | ||||||
| block body_block |  | ||||||
|   article.pricing |  | ||||||
| 
 |  | ||||||
|     .box.license |  | ||||||
|       +LicenseOption("Trial", "90 days", "$0", "Evaluation") |  | ||||||
|       +LicenseOption("Production", "1 year", "$5,000", "Production") |  | ||||||
|       +LicenseOption("Certainty", "5 years", "$20,000", "Secure Planning") |  | ||||||
| 
 |  | ||||||
|     p.caption |  | ||||||
|       | Researcher, hobbyist, or open-source developer? spaCy also offers  |  | ||||||
|       a(href="http://www.gnu.org/licenses/agpl-3.0.en.html") AGPLv3  |  | ||||||
|       | licenses. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       What we offer is a rare, simple certainty: a long-term, permissive license |  | ||||||
|       that comes with full access to the source, complete transparency, and almost |  | ||||||
|       complete flexibility.  The difference between this and a black-box API is |  | ||||||
|       night and day.  You cannot build a great product against a service you |  | ||||||
|       don't understand, and you can't build a great business on a service you |  | ||||||
|       don't control. |  | ||||||
|        |  | ||||||
|     p |  | ||||||
|       | Let's face it: services disappear.  Constantly. The good start-ups get |  | ||||||
|       | bought; the bad ones go bankrupt.  Open-source projects become abandoned |  | ||||||
|       | or bloated.  Google's graveyard is over-flowing – ditto for Yahoo!, |  | ||||||
|       | Microsoft, etc. Sure, IBM won't be broke...But will BlueMix be sunset? |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | A 5 year license won't expire until 2020.  spaCy will be with you for |  | ||||||
|       | longer than most of your current staff.  If that's still not enough, |  | ||||||
|       | get in touch. I'm sure we can work something out. |  | ||||||
| 
 |  | ||||||
|     //p. |  | ||||||
|     //  To make spaCy as valuable as possible, licenses to it are for life.  You get |  | ||||||
|     //  complete transparency, certainty and control.  If you need to use spaCy |  | ||||||
|     //  as an API, it's trivial to host it yourself – and you don't need to |  | ||||||
|     //  worry about the service changing or disappearing.  And if you're ever in |  | ||||||
|     //  acquisition or IPO talks, the story is simple. |  | ||||||
| 
 |  | ||||||
|     //p. |  | ||||||
|     //  spaCy can also be used as free open-source software, under the Aferro GPL |  | ||||||
|     //  license.  If you use it this way, you must comply with the AGPL license |  | ||||||
|     //  terms.  When you distribute your project, or offer it as a network service, |  | ||||||
|     //  you must distribute the source-code and grant users an AGPL license to it. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     //h3 Examples |  | ||||||
| 
 |  | ||||||
|     //p. |  | ||||||
|     //  In order to clarify how spaCy's license structure might apply to you, I've |  | ||||||
|     //  written a few examples, in the form of user-stories. |  | ||||||
| 
 |  | ||||||
|     //details |  | ||||||
|     //  summary: h4 Seed stage start-ups |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    Ashley and Casey have an idea for a start-up.  To explore their idea, they |  | ||||||
|     //    want to build a minimum viable product they can put in front of potential |  | ||||||
|     //    users and investors. |  | ||||||
| 
 |  | ||||||
|     //  p. They have two options. |  | ||||||
| 
 |  | ||||||
|     //  ol |  | ||||||
|     //    li |  | ||||||
|     //      p. |  | ||||||
|     //        <strong>Trial commercial license.</strong> With a simple form, they can |  | ||||||
|     //        use spaCy for 90 days, for a nominal fee of $1.  They are free to modify |  | ||||||
|     //        spaCy, and they will own the copyright to their modifications for the |  | ||||||
|     //        duration of the license.  After the trial period elapses, they can either |  | ||||||
|     //        pay the license fee, stop using spaCy, release their project under the |  | ||||||
|     //        AGPL. |  | ||||||
|     // |  | ||||||
|     //    li |  | ||||||
|     //      p. |  | ||||||
|     //        <strong>AGPL.</strong> Casey and Pat can instead use spaCy under the AGPL |  | ||||||
|     //        license. However, they must then release any code that statically or |  | ||||||
|     //        dynamically links to spaCy under the AGPL as well (e.g. if they import |  | ||||||
|     //        the module, or import a module that imports it, etc).  They also cannot |  | ||||||
|     //        use spaCy as a network resource, by running it as a service --- this is |  | ||||||
|     //        the loophole that the "A" part of the AGPL is designed to close. |  | ||||||
|     //   |  | ||||||
|     //  p. |  | ||||||
|     //    Ashley and Casey find the AGPL license unattractive for commercial use. |  | ||||||
|     //    They decide to take up the trial commercial license.  However,  over the |  | ||||||
|     //    next 90 days, Ashley has to move house twice, and Casey gets sick.  By |  | ||||||
|     //    the time the trial expires, they still don't have a demo they can show |  | ||||||
|     //    investors.  They send an email explaining the situation, and a 90 day extension |  | ||||||
|     //    to their trial license is granted. |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    By the time the extension period has elapsed, spaCy has helped them secure |  | ||||||
|     //    funding, and they even have a little revenue.  They are glad to pay the |  | ||||||
|     //    $5,000 commercial license fee. |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    spaCy is now permanently licensed for the product Ashley and Casey are |  | ||||||
|     //    developing.  They own the copyright to any modifications they make to spaCy, |  | ||||||
|     //    but not to the original spaCy code. |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    No additional fees will be due when they hire new developers, run spaCy on |  | ||||||
|     //    additional internal servers, etc.  If their company is acquired, the license |  | ||||||
|     //    will be transferred to the company acquiring them.  However, to use spaCy |  | ||||||
|     //    in another product, they will have to buy a second license. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     // details |  | ||||||
|     //  summary: h4 University academics |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    Alex and Sasha are post-doctoral researchers working for a university. |  | ||||||
|     //    Part of their funding comes from a grant from Google, but Google will not |  | ||||||
|     //    own any part of the work that they produce.  Their mission is just to write |  | ||||||
|     //    papers. |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    Alex and Sasha find spaCy convenient, so they use it in their system under |  | ||||||
|     //    the AGPL.  This means that their system must also be released under the |  | ||||||
|     //    AGPL, but they're cool with that – they were going to release their |  | ||||||
|     //    code anyway, as it's the only way to ensure their experiments are properly |  | ||||||
|     //    repeatable. |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    Alex and Sasha find and fix a few bugs in spaCy.  They must release these |  | ||||||
|     //    modifications, and they ask that they be accepted into the main spaCy repo. |  | ||||||
|     //    In order to do this, they must sign a contributor agreement, ceding their |  | ||||||
|     //    copyright.  When commercial licenses to spaCy are sold, Alex and Sasha will |  | ||||||
|     //    not be able to claim any royalties from their contributions. |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    Later, Alex and Sasha implement new features into spaCy, for another paper. |  | ||||||
|     //    The code was quite rushed, and they don't want to take the time to put |  | ||||||
|     //    together a proper pull request.  They must release their modifications |  | ||||||
|     //    under the AGPL, but they are not obliged to contribute it to the spaCy |  | ||||||
|     //    repository, or concede their copyright. |  | ||||||
| 
 |  | ||||||
|     // details |  | ||||||
|     //  summary: h4 Open Source developers |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    Phuong and Jessie use the open-source software Calibre to manage their |  | ||||||
|     //    e-book libraries.  They have an idea for a search feature, and they want |  | ||||||
|     //    to use spaCy to implement it.  Calibre is released under the GPLv3.  The |  | ||||||
|     //    AGPL has additional restrictions for projects used as a network resource, |  | ||||||
|     //    but they don't apply to this project, so Phuong and Jessie can use spaCy |  | ||||||
|     //    to improve Calibre.  They'll have to release their code, but that was |  | ||||||
|     //    always their intention anyway. |  | ||||||
|  | @ -1,17 +0,0 @@ | ||||||
| mixin Section(title_text, link_name, include_file) |  | ||||||
|   h3: a(name=link_name) #{title_text} |  | ||||||
| 
 |  | ||||||
|   if (link_name == "example-use") |  | ||||||
|     include ./usage_examples.jade |  | ||||||
|   else if (link_name == "online-demo") |  | ||||||
|     include ./online_demo.jade |  | ||||||
|   else if (link_name == "comparisons") |  | ||||||
|     include ./comparisons.jade |  | ||||||
|   else if (link_name == "install") |  | ||||||
|     include ./installation.jade |  | ||||||
|   else if (link_name == "api") |  | ||||||
|     include ./api.jade |  | ||||||
|   else if (link_name == "tutorials") |  | ||||||
|     include ./tutorials.jade |  | ||||||
|   else if (link_name == "spec") |  | ||||||
|     include ./spec.jade |  | ||||||
|  | @ -1,18 +0,0 @@ | ||||||
| mixin Displacy(sentence, caption_text, height) |  | ||||||
|   - var url = "http://ines.io/displacy/?full=" + sentence.replace(" ", "%20") |  | ||||||
| 
 |  | ||||||
|   .displacy |  | ||||||
|     iframe.displacy(src="displacy/displacy_demo.html" height=height) |  | ||||||
|      |  | ||||||
|     a.view-displacy(href=url) |  | ||||||
|       | Interactive Visualizer |  | ||||||
| 
 |  | ||||||
|     p.caption. |  | ||||||
|       #{caption_text} |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +Displacy( |  | ||||||
|   "Click the button to see this sentence in displaCy.", |  | ||||||
|   "The best parse-tree visualizer and annotation tool in all the land.", |  | ||||||
|   275 |  | ||||||
| ) |  | ||||||
|  | @ -1,37 +0,0 @@ | ||||||
| - var slogan = "Build Tomorrow's Language Technologies" |  | ||||||
| - var tag_line = "spaCy – " + slogan |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| doctype html |  | ||||||
| html(lang="en") |  | ||||||
|   head |  | ||||||
|     meta(charset="utf-8") |  | ||||||
|     title!= tag_line |  | ||||||
|     meta(name="description" content="") |  | ||||||
|     meta(name="author" content="Matthew Honnibal") |  | ||||||
|     link(rel="stylesheet" href="css/style.css") |  | ||||||
|     <!--[if lt IE 9]> |  | ||||||
|     script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js") |  | ||||||
|     <![endif]--> |  | ||||||
| 
 |  | ||||||
|   body(id="home" role="document") |  | ||||||
|     header(role="banner") |  | ||||||
|       h1(class="logo")!= tag_line |  | ||||||
|       div(class="slogan")!= slogan |  | ||||||
| 
 |  | ||||||
|     nav(role="navigation") |  | ||||||
|       ul |  | ||||||
|         li: a(href="home.html") Home |  | ||||||
|         li: a(href="docs.html") Docs |  | ||||||
|         li: a(href="license.html") License |  | ||||||
|         li: a(href="blog.html") Blog |  | ||||||
| 
 |  | ||||||
|     main(id="content" role="main") |  | ||||||
|       block intro_block |  | ||||||
| 
 |  | ||||||
|       block body_block |  | ||||||
|   |  | ||||||
|   footer(role="contentinfo") |  | ||||||
| 
 |  | ||||||
|   script(src="js/prism.js") |  | ||||||
|   script(src="js/details_polyfill.js") |  | ||||||
|  | @ -1,129 +0,0 @@ | ||||||
| mixin columns(...names) |  | ||||||
|   tr |  | ||||||
|     each name in names |  | ||||||
|       th= name |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin row(...cells) |  | ||||||
|   tr |  | ||||||
|     each cell in cells |  | ||||||
|       td= cell |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Overview |  | ||||||
|    |  | ||||||
|   p. |  | ||||||
|     This document describes the target annotations spaCy is trained to predict. |  | ||||||
|     This is currently a work in progress. Please ask questions on the issue tracker, |  | ||||||
|     so that the answers can be integrated here to improve the documentation. |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Tokenization |  | ||||||
| 
 |  | ||||||
|   p Tokenization standards are based on the OntoNotes 5 corpus. |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     The tokenizer differs from most by including tokens for significant |  | ||||||
|     whitespace. Any sequence of whitespace characters beyond a single space |  | ||||||
|     (' ') is included as a token. For instance: |  | ||||||
| 
 |  | ||||||
|   pre.language-python |  | ||||||
|     code |  | ||||||
|       | from spacy.en import English |  | ||||||
|       | nlp = English(parse=False) |  | ||||||
|       | tokens = nlp('Some\nspaces  and\ttab characters') |  | ||||||
|       | print([t.orth_ for t in tokens]) |  | ||||||
|          |  | ||||||
|   p Which produces: |  | ||||||
|      |  | ||||||
|   pre.language-python |  | ||||||
|     code |  | ||||||
|       | ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters'] |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     The whitespace tokens are useful for much the same reason punctuation is |  | ||||||
|     – it's often an important delimiter in the text.  By preserving |  | ||||||
|     it in the token output, we are able to maintain a simple alignment |  | ||||||
|     between the tokens and the original string, and we ensure that no |  | ||||||
|     information is lost during processing. |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Sentence boundary detection |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     Sentence boundaries are calculated from the syntactic parse tree, so |  | ||||||
|     features such as punctuation and capitalisation play an important but |  | ||||||
|     non-decisive role in determining the sentence boundaries.  Usually this |  | ||||||
|     means that the sentence boundaries will at least coincide with clause |  | ||||||
|     boundaries, even given poorly punctuated text. |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Part-of-speech Tagging |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank |  | ||||||
|     tag set.  We also map the tags to the simpler Google Universal POS Tag set. |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124 |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Lemmatization |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     A "lemma" is the uninflected form of a word. In English, this means: |  | ||||||
| 
 |  | ||||||
|   ul |  | ||||||
|     li Adjectives: The form like "happy", not "happier" or "happiest" |  | ||||||
|     li Adverbs: The form like "badly", not "worse" or "worst" |  | ||||||
|     li Nouns: The form like "dog", not "dogs"; like "child", not "children" |  | ||||||
|     li Verbs: The form like "write", not "writes", "writing", "wrote" or "written"  |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     The lemmatization data is taken from WordNet. However, we also add a |  | ||||||
|     special case for pronouns: all pronouns are lemmatized to the special |  | ||||||
|     token -PRON-. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Syntactic Dependency Parsing |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     The parser is trained on data produced by the ClearNLP converter. Details |  | ||||||
|     of the annotation scheme can be found here:  http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Named Entity Recognition |  | ||||||
| 
 |  | ||||||
|   table |  | ||||||
|     thead |  | ||||||
|       +columns("Entity Type", "Description") |  | ||||||
|        |  | ||||||
|     tbody |  | ||||||
|       +row("PERSON", "People, including fictional.") |  | ||||||
|       +row("NORP", "Nationalities or religious or political groups.") |  | ||||||
|       +row("FACILITY", "Buildings, airports, highways, bridges, etc.") |  | ||||||
|       +row("ORG", "Companies, agencies, institutions, etc.") |  | ||||||
|       +row("GPE", "Countries, cities, states.") |  | ||||||
|       +row("LOC", "Non-GPE locations, mountain ranges, bodies of water.") |  | ||||||
|       +row("PRODUCT", "Vehicles, weapons, foods, etc. (Not services") |  | ||||||
|       +row("EVENT", "Named hurricanes, battles, wars, sports events, etc.") |  | ||||||
|       +row("WORK_OF_ART", "Titles of books, songs, etc.") |  | ||||||
|       +row("LAW", "Named documents made into laws") |  | ||||||
|       +row("LANGUAGE", "Any named language") |  | ||||||
| 
 |  | ||||||
|   p The following values are also annotated in a style similar to names: |  | ||||||
| 
 |  | ||||||
|   table |  | ||||||
|     thead |  | ||||||
|       +columns("Entity Type", "Description") |  | ||||||
|        |  | ||||||
|     tbody |  | ||||||
|       +row("DATE", "Absolute or relative dates or periods") |  | ||||||
|       +row("TIME", "Times smaller than a day") |  | ||||||
|       +row("PERCENT", 'Percentage (including “%”)') |  | ||||||
|       +row("MONEY", "Monetary values, including unit") |  | ||||||
|       +row("QUANTITY", "Measurements, as of weight or distance") |  | ||||||
|       +row("ORDINAL", 'first", "second"') |  | ||||||
|       +row("CARDINAL", "Numerals that do not fall under another type") |  | ||||||
|  | @ -1,31 +0,0 @@ | ||||||
| doctype html |  | ||||||
| html(lang='en') |  | ||||||
|   head |  | ||||||
|     meta(charset='utf-8') |  | ||||||
|     title spaCy Blog |  | ||||||
|     meta(name='description', content='') |  | ||||||
|     meta(name='author', content='Matthew Honnibal') |  | ||||||
|     link(rel='stylesheet', href='css/style.css') |  | ||||||
|     //if lt IE 9 |  | ||||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') |  | ||||||
|   body#blog(role="document") |  | ||||||
|     header(role='banner') |  | ||||||
|       h1.logo spaCy Blog |  | ||||||
|       .slogan Blog |  | ||||||
| 
 |  | ||||||
|     nav(role="navigation") |  | ||||||
|       ul |  | ||||||
|         li: a(href="home.html")        Home |  | ||||||
|         li: a(href="docs.html")        Docs |  | ||||||
|         li.active: a(href="blog.html") Blog |  | ||||||
|         li: a(href="license.html")     License |  | ||||||
| 
 |  | ||||||
|     main#content(role='main') |  | ||||||
|       block intro_block |  | ||||||
| 
 |  | ||||||
|       block body_block |  | ||||||
|   |  | ||||||
|   footer(role='contentinfo') |  | ||||||
| 
 |  | ||||||
|   script(src="js/prism.js") |  | ||||||
|   script(src="js/details_polyfill.js") |  | ||||||
|  | @ -1,200 +0,0 @@ | ||||||
| doctype html |  | ||||||
| html(lang='en') |  | ||||||
|   head |  | ||||||
|     meta(charset='utf-8') |  | ||||||
|     title spaCy Blog |  | ||||||
|     meta(name='description', content='') |  | ||||||
|     meta(name='author', content='Matthew Honnibal') |  | ||||||
|     link(rel='stylesheet', href='css/style.css') |  | ||||||
|     //if lt IE 9 |  | ||||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') |  | ||||||
|   body#blog |  | ||||||
|     header(role='banner') |  | ||||||
|       h1.logo spaCy Blog |  | ||||||
|       .slogan Blog |  | ||||||
|     main#content(role='main') |  | ||||||
|       article.post |  | ||||||
|   |  | ||||||
| 
 |  | ||||||
|         :markdown-it |  | ||||||
|           # Adverbs |  | ||||||
|    |  | ||||||
|           Let's say you're developing a proofreading tool, or possibly an IDE for |  | ||||||
|           writers.  You're convinced by Stephen King's advice that `adverbs are |  | ||||||
|           not your friend <http://www.brainpickings.org/2013/03/13/stephen-king-on-adverbs/>`_, |  | ||||||
|           so you want to **highlight all adverbs**.  We'll use one of the examples |  | ||||||
|           he finds particularly egregious: |  | ||||||
|      |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | import spacy.en |  | ||||||
|             | >>> from spacy.parts_of_speech import ADV |  | ||||||
|             | >>> # Load the pipeline, and call it with some text. |  | ||||||
|             | >>> nlp = spacy.en.English() |  | ||||||
|             | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", tag=True, parse=False) |  | ||||||
|             | >>> print u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) |  | ||||||
|             | u‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’ |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           Easy enough --- but the problem is that we've also highlighted "back". |  | ||||||
|           While "back" is undoubtedly an adverb, we probably don't want to highlight |  | ||||||
|           it. If what we're trying to do is flag dubious stylistic choices, we'll |  | ||||||
|           need to refine our logic.  It turns out only a certain type of adverb |  | ||||||
|           is of interest to us. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|         :markdown-it |  | ||||||
|           There are lots of ways we might do this, depending on just what words |  | ||||||
|           we want to flag.  The simplest way to exclude adverbs like "back" and |  | ||||||
|           "not" is by word frequency: these words are much more common than the |  | ||||||
|           prototypical manner adverbs that the style guides are worried about. |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           The :py:attr:`Lexeme.prob` and :py:attr:`Token.prob` attribute gives a |  | ||||||
|           log probability estimate of the word: |  | ||||||
|    |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | >>> nlp.vocab[u'back'].prob |  | ||||||
|             | -7.403977394104004 |  | ||||||
|             | >>> nlp.vocab[u'not'].prob |  | ||||||
|             | -5.407193660736084 |  | ||||||
|             | >>> nlp.vocab[u'quietly'].prob |  | ||||||
|             | -11.07155704498291 |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           (The probability estimate is based on counts from a 3 billion word corpus, |  | ||||||
|           smoothed using the `Simple Good-Turing`_ method.) |  | ||||||
|    |  | ||||||
|           So we can easily exclude the N most frequent words in English from our |  | ||||||
|           adverb marker.  Let's try N=1000 for now: |  | ||||||
|   |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | >>> import spacy.en |  | ||||||
|             | >>> from spacy.parts_of_speech import ADV |  | ||||||
|             | >>> nlp = spacy.en.English() |  | ||||||
|             | >>> # Find log probability of Nth most frequent word |  | ||||||
|             | >>> probs = [lex.prob for lex in nlp.vocab] |  | ||||||
|             | >>> probs.sort() |  | ||||||
|             | >>> is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] |  | ||||||
|             | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") |  | ||||||
|             | >>> print u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) |  | ||||||
|             | ‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’ |  | ||||||
|          |  | ||||||
|         :markdown-it |  | ||||||
|           There are lots of other ways we could refine the logic, depending on |  | ||||||
|           just what words we want to flag.  Let's say we wanted to only flag |  | ||||||
|           adverbs that modified words similar to "pleaded".  This is easy to do, |  | ||||||
|           as spaCy loads a vector-space representation for every word (by default, |  | ||||||
|           the vectors produced by `Levy and Goldberg (2014)`_).  Naturally, the |  | ||||||
|           vector is provided as a numpy array: |  | ||||||
| 
 |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | >>> pleaded = tokens[7] |  | ||||||
|             | >>> pleaded.repvec.shape |  | ||||||
|             | (300,) |  | ||||||
|             | >>> pleaded.repvec[:5] |  | ||||||
|             | array([ 0.04229792,  0.07459262,  0.00820188, -0.02181299,  0.07519238], dtype=float32) |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           We want to sort the words in our vocabulary by their similarity to |  | ||||||
|           "pleaded".  There are lots of ways to measure the similarity of two |  | ||||||
|           vectors.  We'll use the cosine metric: |  | ||||||
| 
 |  | ||||||
|         pre.language-python |  | ||||||
|           code  |  | ||||||
|             | >>> from numpy import dot |  | ||||||
|             | >>> from numpy.linalg import norm |  | ||||||
|    |  | ||||||
|             | >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) |  | ||||||
|             | >>> words = [w for w in nlp.vocab if w.has_repvec] |  | ||||||
|             | >>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) |  | ||||||
|             | >>> words.reverse() |  | ||||||
|             | >>> print('1-20', ', '.join(w.orth_ for w in words[0:20])) |  | ||||||
|             | 1-20 pleaded, pled, plead, confessed, interceded, pleads, testified, conspired, motioned, demurred, countersued, remonstrated, begged, apologised, consented, acquiesced, petitioned, quarreled, appealed, pleading |  | ||||||
|             | >>> print('50-60', ', '.join(w.orth_ for w in words[50:60])) |  | ||||||
|             | 50-60 counselled, bragged, backtracked, caucused, refiled, dueled, mused, dissented, yearned, confesses |  | ||||||
|             | >>> print('100-110', ', '.join(w.orth_ for w in words[100:110])) |  | ||||||
|             | 100-110 cabled, ducked, sentenced, perjured, absconded, bargained, overstayed, clerked, confided, sympathizes |  | ||||||
|             | >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010])) |  | ||||||
|             | 1000-1010 scorned, baled, righted, requested, swindled, posited, firebombed, slimed, deferred, sagged |  | ||||||
|             | >>> print('50000-50010', ', '.join(w.orth_ for w in words[50000:50010])) |  | ||||||
|             | 50000-50010, fb, ford, systems, puck, anglers, ik, tabloid, dirty, rims, artists |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           As you can see, the similarity model that these vectors give us is excellent |  | ||||||
|           --- we're still getting meaningful results at 1000 words, off a single |  | ||||||
|           prototype!  The only problem is that the list really contains two clusters of |  | ||||||
|           words: one associated with the legal meaning of "pleaded", and one for the more |  | ||||||
|           general sense.  Sorting out these clusters is an area of active research. |  | ||||||
|    |  | ||||||
|           A simple work-around is to average the vectors of several words, and use that |  | ||||||
|           as our target: |  | ||||||
|    |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | >>> say_verbs = ['pleaded', 'confessed', 'remonstrated', 'begged', 'bragged', 'confided', 'requested'] |  | ||||||
|             | >>> say_vector = sum(nlp.vocab[verb].repvec for verb in say_verbs) / len(say_verbs) |  | ||||||
|             | >>> words.sort(key=lambda w: cosine(w.repvec * say_vector)) |  | ||||||
|             | >>> words.reverse() |  | ||||||
|             | >>> print('1-20', ', '.join(w.orth_ for w in words[0:20])) |  | ||||||
|             | 1-20 bragged, remonstrated, enquired, demurred, sighed, mused, intimated, retorted, entreated, motioned, ranted, confided, countersued, gestured, implored, interceded, muttered, marvelled, bickered, despaired |  | ||||||
|             | >>> print('50-60', ', '.join(w.orth_ for w in words[50:60])) |  | ||||||
|             | 50-60 flaunted, quarrelled, ingratiated, vouched, agonized, apologised, lunched, joked, chafed, schemed |  | ||||||
|             | >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010])) |  | ||||||
|             | 1000-1010 hoarded, waded, ensnared, clamoring, abided, deploring, shriveled, endeared, rethought, berate |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           These definitely look like words that King might scold a writer for attaching |  | ||||||
|           adverbs to.  Recall that our original adverb highlighting function looked like |  | ||||||
|           this: |  | ||||||
|    |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | >>> import spacy.en |  | ||||||
|             | >>> from spacy.parts_of_speech import ADV |  | ||||||
|             | >>> # Load the pipeline, and call it with some text. |  | ||||||
|             | >>> nlp = spacy.en.English() |  | ||||||
|             | >>> tokens = nlp("‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", |  | ||||||
|             |                  tag=True, parse=False) |  | ||||||
|             | >>> print(''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)) |  | ||||||
|             | ‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’ |  | ||||||
|    |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           We wanted to refine the logic so that only adverbs modifying evocative |  | ||||||
|           verbs of communication, like "pleaded", were highlighted.  We've now |  | ||||||
|           built a vector that represents that type of word, so now we can highlight |  | ||||||
|           adverbs based on subtle logic, honing in on adverbs that seem the most |  | ||||||
|           stylistically problematic, given our starting assumptions: |  | ||||||
|    |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | >>> import numpy |  | ||||||
|             | >>> from numpy import dot |  | ||||||
|             | >>> from numpy.linalg import norm |  | ||||||
|             | >>> import spacy.en |  | ||||||
|             | >>> from spacy.parts_of_speech import ADV, VERB |  | ||||||
|             | >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) |  | ||||||
|             | >>> def is_bad_adverb(token, target_verb, tol): |  | ||||||
|             | ...   if token.pos != ADV |  | ||||||
|             | ...     return False |  | ||||||
|             | ...   elif token.head.pos != VERB: |  | ||||||
|             | ...     return False |  | ||||||
|             | ...   elif cosine(token.head.repvec, target_verb) < tol: |  | ||||||
|             | ...     return False |  | ||||||
|             | ...   else: |  | ||||||
|             | ...     return True |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           This example was somewhat contrived --- and, truth be told, I've never |  | ||||||
|           really bought the idea that adverbs were a grave stylistic sin.  But |  | ||||||
|           hopefully it got the message across: the state-of-the-art NLP technologies |  | ||||||
|           are very powerful. spaCy gives you easy and efficient access to them, |  | ||||||
|           which lets you build all sorts of useful products and features that |  | ||||||
|           were previously impossible. |  | ||||||
| 
 |  | ||||||
|   footer(role='contentinfo') |  | ||||||
|   script(src='js/prism.js') |  | ||||||
|  | @ -1,132 +0,0 @@ | ||||||
| doctype html |  | ||||||
| html(lang='en') |  | ||||||
|   head |  | ||||||
|     meta(charset='utf-8') |  | ||||||
|     title spaCy Blog |  | ||||||
|     meta(name='description', content='') |  | ||||||
|     meta(name='author', content='Matthew Honnibal') |  | ||||||
|     link(rel='stylesheet', href='css/style.css') |  | ||||||
|     //if lt IE 9 |  | ||||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') |  | ||||||
|   body#blog |  | ||||||
|     header(role='banner') |  | ||||||
|       h1.logo spaCy Blog |  | ||||||
|       .slogan Blog |  | ||||||
|     main#content(role='main') |  | ||||||
|       section.intro |  | ||||||
|         p |  | ||||||
|           | Example use of the spaCy NLP tools for data exploration. |  | ||||||
|           | Here we will look for reddit comments that describe Google doing something, |  | ||||||
|           | i.e. discuss the company's actions. This is difficult, because other senses of |  | ||||||
|           | "Google" now dominate usage of the word in conversation, particularly references to |  | ||||||
|           | using Google products. |  | ||||||
|          |  | ||||||
|         p |  | ||||||
|           | The heuristics used are quick and dirty – about 5 minutes work. |  | ||||||
|            |  | ||||||
|         //| A better approach is to use the word vector of the verb. But, the |  | ||||||
|         //  | demo here is just to show what's possible to build up quickly, to |  | ||||||
|         //  | start to understand some data. |  | ||||||
| 
 |  | ||||||
|       article.post |  | ||||||
|         header |  | ||||||
|           h2 Syntax-specific Search |  | ||||||
|           .subhead |  | ||||||
|             | by  |  | ||||||
|             a(href='#', rel='author') Matthew Honnibal |  | ||||||
|             |  on  |  | ||||||
|             time(datetime='2015-08-14') August |  | ||||||
|            |  | ||||||
|         details |  | ||||||
|           summary: h4 Imports |  | ||||||
| 
 |  | ||||||
|           pre.language-python |  | ||||||
|             code |  | ||||||
|               | from __future__ import unicode_literals |  | ||||||
|               | from __future__ import print_function |  | ||||||
|               | import sys |  | ||||||
|               |  |  | ||||||
|               | import plac |  | ||||||
|               | import bz2 |  | ||||||
|               | import ujson |  | ||||||
|               | import spacy.en |  | ||||||
|            |  | ||||||
|         details |  | ||||||
|           summary: h4 Load the model and iterate over the data |  | ||||||
| 
 |  | ||||||
|           pre.language-python |  | ||||||
|             code  |  | ||||||
|               | def main(input_loc): |  | ||||||
|               |     nlp = spacy.en.English()                 # Load the model takes 10-20 seconds. |  | ||||||
|               |     for line in bz2.BZ2File(input_loc):      # Iterate over the reddit comments from the dump.  |  | ||||||
|               |         comment_str = ujson.loads(line)['body']  # Parse the json object, and extract the 'body' attribute.  |  | ||||||
|               |          |  | ||||||
|         details |  | ||||||
|           summary: h4 Apply the spaCy NLP pipeline, and look for the cases we want |  | ||||||
| 
 |  | ||||||
|           pre.language-python |  | ||||||
|             code |  | ||||||
|               |         comment_parse = nlp(comment_str)  |  | ||||||
|               |         for word in comment_parse:   |  | ||||||
|               |             if google_doing_something(word): |  | ||||||
|               |                 # Print the clause |  | ||||||
|               |                 print(''.join(w.string for w in word.head.subtree).strip()) |  | ||||||
|         details |  | ||||||
|           summary: h4 Define the filter function |  | ||||||
| 
 |  | ||||||
|           pre.language-python |  | ||||||
|             code |  | ||||||
| 
 |  | ||||||
|               |  |  | ||||||
|               | def google_doing_something(w): |  | ||||||
|               |     if w.lower_ != 'google': |  | ||||||
|               |         return False |  | ||||||
|               |     # Is it the subject of a verb? |  | ||||||
|               |     elif w.dep_ != 'nsubj':  |  | ||||||
|               |         return False |  | ||||||
|               |     # And not 'is' |  | ||||||
|               |     elif w.head.lemma_ == 'be' and w.head.dep_ != 'aux':  |  | ||||||
|               |         return False |  | ||||||
|               |     # Exclude e.g. "Google says..." |  | ||||||
|               |     elif w.head.lemma_ in ('say', 'show'):  |  | ||||||
|               |         return False |  | ||||||
|               |     else: |  | ||||||
|               |         return True |  | ||||||
|               |  |  | ||||||
|               |  |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 Call main |  | ||||||
| 
 |  | ||||||
|           pre.language-python |  | ||||||
|             code |  | ||||||
|               | if __name__ == '__main__': |  | ||||||
|               |     plac.call(main) |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 Example output |  | ||||||
| 
 |  | ||||||
|           p. |  | ||||||
|             Many false positives remain. Some are from incorrect interpretations |  | ||||||
|             of the sentence by spaCy, some are flaws in our filtering logic. But |  | ||||||
|             the results are vastly better than a string-based search, which returns |  | ||||||
|             almost no examples of the pattern we're looking for. |  | ||||||
| 
 |  | ||||||
|           code |  | ||||||
|             | Google dropped support for Android < 4.0 already |  | ||||||
|             | google drive |  | ||||||
|             | Google to enforce a little more uniformity in its hardware so that we can see a better 3rd party market for things like mounts, cases, etc |  | ||||||
|             | When Google responds |  | ||||||
|             | Google translate cyka pasterino. |  | ||||||
|             | A quick google looks like Synology does have a sync'ing feature which does support block level so that should work  |  | ||||||
|             | (google came up with some weird One Piece/FairyTail crossover stuff), and is their knowledge universally infallible? |  | ||||||
|             | Until you have the gear, google some videos on best farming runs on each planet, you can get a lot REAL fast with the right loop. |  | ||||||
|             | Google offers something like this already, but it is truly terrible. |  | ||||||
|             | google isn't helping me |  | ||||||
|             | Google tells me: 0 results, 250 pages removed from google. |  | ||||||
|             | how did Google swoop in and eat our lunch |  | ||||||
| 
 |  | ||||||
|              |  | ||||||
| 
 |  | ||||||
|   script(src="js/prism.js") |  | ||||||
|   script(src="js/details_polyfill.js") |  | ||||||
|  | @ -1,204 +0,0 @@ | ||||||
| doctype html |  | ||||||
| html(lang='en') |  | ||||||
|   head |  | ||||||
|     meta(charset='utf-8') |  | ||||||
|     title spaCy Blog |  | ||||||
|     meta(name='description', content='') |  | ||||||
|     meta(name='author', content='Matthew Honnibal') |  | ||||||
|     link(rel='stylesheet', href='css/style.css') |  | ||||||
|     //if lt IE 9 |  | ||||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') |  | ||||||
|   body#blog |  | ||||||
|     header(role='banner') |  | ||||||
|       h1.logo spaCy Blog |  | ||||||
|       .slogan Blog |  | ||||||
|     main#content(role='main') |  | ||||||
|       article.post |  | ||||||
|         header |  | ||||||
|           h2 Finding Relevant Tweets |  | ||||||
|           .subhead |  | ||||||
|             | by  |  | ||||||
|             a(href='#', rel='author') Matthew Honnibal |  | ||||||
|             |  on  |  | ||||||
|             time(datetime='2015-08-14') December |  | ||||||
|            |  | ||||||
|         details |  | ||||||
|           summary: h4 Imports |  | ||||||
|           pre.language-python |  | ||||||
| 
 |  | ||||||
|             | from __future__ import unicode_literals, print_function |  | ||||||
|             | import plac |  | ||||||
|             | import codecs |  | ||||||
|             | import sys |  | ||||||
|             | import math |  | ||||||
|             |  |  | ||||||
|             | import spacy.en |  | ||||||
|             | from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ |  | ||||||
|             |  |  | ||||||
|             | from termcolor import colored |  | ||||||
|             | from twython import TwythonStreamer |  | ||||||
|             |  |  | ||||||
|             | from os import path |  | ||||||
|             | from math import sqrt |  | ||||||
|             |  |  | ||||||
|             | from numpy import dot |  | ||||||
|             | from numpy.linalg import norm |  | ||||||
|             |  |  | ||||||
|             |  |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 Simple vector-averaging similarity |  | ||||||
| 
 |  | ||||||
|           pre.language-python: code |  | ||||||
| 
 |  | ||||||
|             | class Meaning(object): |  | ||||||
|             |     def __init__(self, vectors): |  | ||||||
|             |         if vectors: |  | ||||||
|             |             self.vector = sum(vectors) / len(vectors) |  | ||||||
|             |             self.norm = norm(self.vector) |  | ||||||
|             |         else: |  | ||||||
|             |             self.vector = None |  | ||||||
|             |             self.norm = 0 |  | ||||||
|             |  |  | ||||||
|             |     @classmethod |  | ||||||
|             |     def from_path(cls, nlp, loc): |  | ||||||
|             |         with codecs.open(loc, 'r', 'utf8') as file_: |  | ||||||
|             |             terms = file_.read().strip().split() |  | ||||||
|             |         return cls.from_terms(nlp, terms) |  | ||||||
|             |  |  | ||||||
|             |     @classmethod |  | ||||||
|             |     def from_tokens(cls, nlp, tokens): |  | ||||||
|             |         vectors = [t.repvec for t in tokens] |  | ||||||
|             |         return cls(vectors) |  | ||||||
|             |  |  | ||||||
|             |     @classmethod |  | ||||||
|             |     def from_terms(cls, nlp, examples): |  | ||||||
|             |         lexemes = [nlp.vocab[eg] for eg in examples] |  | ||||||
|             |         vectors = [eg.repvec for eg in lexemes] |  | ||||||
|             |         return cls(vectors) |  | ||||||
|             |  |  | ||||||
|             |     def similarity(self, other): |  | ||||||
|             |         if not self.norm or not other.norm: |  | ||||||
|             |             return -1 |  | ||||||
|             |         return dot(self.vector, other.vector) / (self.norm * other.norm) |  | ||||||
|             |  |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 Print matches |  | ||||||
|                |  | ||||||
|           pre.language-python: code |  | ||||||
| 
 |  | ||||||
|             |  |  | ||||||
|             | def print_colored(model, stream=sys.stdout): |  | ||||||
|             |     if model['is_match']: |  | ||||||
|             |         color = 'green' |  | ||||||
|             |     elif model['is_reject']: |  | ||||||
|             |         color = 'red' |  | ||||||
|             |     else: |  | ||||||
|             |         color = 'grey' |  | ||||||
|             |      |  | ||||||
|             |     if not model['is_rare'] and model['is_match'] and not model['is_reject']: |  | ||||||
|             |         match_score = colored('%.3f' % model['match_score'], 'green') |  | ||||||
|             |         reject_score = colored('%.3f' % model['reject_score'], 'red') |  | ||||||
|             |         prob = '%.5f' % model['prob'] |  | ||||||
|             |  |  | ||||||
|             |         print(match_score, reject_score, prob) |  | ||||||
|             |         print(repr(model['text']), color) |  | ||||||
|             |         print('') |  | ||||||
|             |  |  | ||||||
|             |  |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 TextMatcher: Process the tweets using spaCy |  | ||||||
| 
 |  | ||||||
|           pre.language-python: code |  | ||||||
| 
 |  | ||||||
|             | class TextMatcher(object): |  | ||||||
|             |     def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject): |  | ||||||
|             |         self.nlp = nlp |  | ||||||
|             |         self.get_target = get_target |  | ||||||
|             |         self.get_reject = get_reject |  | ||||||
|             |         self.min_prob = min_prob |  | ||||||
|             |         self.min_match = min_match |  | ||||||
|             |         self.max_reject = max_reject |  | ||||||
|             |  |  | ||||||
|             |     def __call__(self, text): |  | ||||||
|             |         tweet = self.nlp(text) |  | ||||||
|             |         target_terms = self.get_target() |  | ||||||
|             |         reject_terms = self.get_reject() |  | ||||||
|             |  |  | ||||||
|             |         prob = sum(math.exp(w.prob) for w in tweet) / len(tweet) |  | ||||||
|             |         meaning = Meaning.from_tokens(self, tweet) |  | ||||||
|             |          |  | ||||||
|             |         match_score = meaning.similarity(self.get_target()) |  | ||||||
|             |         reject_score = meaning.similarity(self.get_reject()) |  | ||||||
|             |         return { |  | ||||||
|             |             'text': tweet.string, |  | ||||||
|             |             'prob': prob, |  | ||||||
|             |             'match_score': match_score, |  | ||||||
|             |             'reject_score': reject_score, |  | ||||||
|             |             'is_rare': prob < self.min_prob, |  | ||||||
|             |             'is_match': prob >= self.min_prob  and match_score  >= self.min_match, |  | ||||||
|             |             'is_reject': prob >= self.min_prob and reject_score >= self.max_reject |  | ||||||
|             |         } |  | ||||||
|             |  |  | ||||||
|             |  |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 Connect to Twitter and stream tweets |  | ||||||
| 
 |  | ||||||
|           pre.language-python: code |  | ||||||
| 
 |  | ||||||
|             | class Connection(TwythonStreamer): |  | ||||||
|             |     def __init__(self, keys_dir, handler, view): |  | ||||||
|             |         keys = Secrets(keys_dir) |  | ||||||
|             |         TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)  |  | ||||||
|             |         self.handler = handler |  | ||||||
|             |         self.view = view |  | ||||||
|             |  |  | ||||||
|             |     def on_success(self, data): |  | ||||||
|             |         text = data.get('text', u'') |  | ||||||
|             |         # Twython returns either bytes or unicode, depending on tweet. |  | ||||||
|             |         # #APIshaming |  | ||||||
|             |         try: |  | ||||||
|             |             model = self.handler(text) |  | ||||||
|             |         except TypeError: |  | ||||||
|             |             model = self.handler(text.decode('utf8')) |  | ||||||
|             |         status = self.view(model, sys.stdin) |  | ||||||
|             |  |  | ||||||
|             |     def on_error(self, status_code, data): |  | ||||||
|             |         print(status_code) |  | ||||||
|             |  |  | ||||||
|             |  |  | ||||||
|             | class Secrets(object): |  | ||||||
|             |     def __init__(self, key_dir): |  | ||||||
|             |         self.key = open(path.join(key_dir, 'key.txt')).read().strip() |  | ||||||
|             |         self.secret = open(path.join(key_dir, 'secret.txt')).read().strip() |  | ||||||
|             |         self.token = open(path.join(key_dir, 'token.txt')).read().strip() |  | ||||||
|             |         self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip() |  | ||||||
|             |  |  | ||||||
|             |  |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 Command-line interface |  | ||||||
| 
 |  | ||||||
|           pre.language-python: code |  | ||||||
| 
 |  | ||||||
|             | def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5): |  | ||||||
|             |     # We don't need the parser for this demo, so may as well save the loading time |  | ||||||
|             |     nlp = spacy.en.English(Parser=None) |  | ||||||
|             |     get_target = lambda: Meaning.from_path(nlp, target_loc) |  | ||||||
|             |     get_reject = lambda: Meaning.from_path(nlp, reject_loc) |  | ||||||
|             |     matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject) |  | ||||||
|             |  |  | ||||||
|             |     twitter = Connection(keys_dir, matcher, print_colored) |  | ||||||
|             |     twitter.statuses.filter(track=term) |  | ||||||
|             |  |  | ||||||
|             |  |  | ||||||
|             | if __name__ == '__main__': |  | ||||||
|             |     plac.call(main) |  | ||||||
|             |    |  | ||||||
| 
 |  | ||||||
|   footer(role='contentinfo') |  | ||||||
|   script(src='js/prism.js') |  | ||||||
| 
 |  | ||||||
|  | @ -1,29 +0,0 @@ | ||||||
| mixin Tutorial(title) |  | ||||||
|   details |  | ||||||
|     summary |  | ||||||
|       h4= title  |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| +Tutorial("Mark-up all manner adverbs, especially for verbs of speech") |  | ||||||
|   | Let's say you're developing a proofreading tool, or possibly an IDE for |  | ||||||
|   | writers.  You're convinced by Stephen King's advice that  |  | ||||||
|   | adverbs are not your friend |  | ||||||
|   | so you want to  |  | ||||||
|   a.readmore(href='tute_adverbs.html')  |  | ||||||
|     | highlight all adverbs. ► |  | ||||||
| 
 |  | ||||||
| +Tutorial("Search Reddit for comments about Google doing something") |  | ||||||
|   | Example use of the spaCy NLP tools for data exploration. |  | ||||||
|   | Here we will look for Reddit comments that describe Google doing something, |  | ||||||
|   | i.e. discuss the company's actions. This is difficult, because other |  | ||||||
|   | senses of "Google" now dominate usage of the word in conversation, |  | ||||||
|   | particularly references to using Google products.  |  | ||||||
|   a.readmore(href='tute_adverbs.html')  |  | ||||||
|     | ► |  | ||||||
| 
 |  | ||||||
| +Tutorial("Use word vectors for semantic search of Twitter") |  | ||||||
|   | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. |  | ||||||
|   | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. |  | ||||||
|   a.readmore(href='tute_twitter.html')  |  | ||||||
|     | ► |  | ||||||
|  | @ -1,167 +0,0 @@ | ||||||
| mixin example(name) |  | ||||||
|   details |  | ||||||
|     summary |  | ||||||
|       h4= name |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +example("Load resources and process text") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | from __future__ import unicode_literals, print_function |  | ||||||
|     | from spacy.en import English |  | ||||||
|     | nlp = English() |  | ||||||
|     | doc = nlp('Hello, world. Here are two sentences.') |  | ||||||
| 
 |  | ||||||
| +example("Get tokens and sentences") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | token = doc[0] |  | ||||||
|     | sentence = doc.sents[0] |  | ||||||
|     | assert token[0] is sentence[0] |  | ||||||
| 
 |  | ||||||
| +example("Use integer IDs for any string") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | hello_id = nlp.vocab.strings['Hello'] |  | ||||||
|     | hello_str = nlp.vocab.strings[hello_id] |  | ||||||
|     |  |  | ||||||
|     | assert token.orth  == hello_id  == 52 |  | ||||||
|     | assert token.orth_ == hello_str == 'Hello' |  | ||||||
| 
 |  | ||||||
| +example("Get and set string views and flags") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | assert token.shape_ == 'Xxxx' |  | ||||||
|     | for lexeme in nlp.vocab: |  | ||||||
|     |     if lexeme.is_alpha: |  | ||||||
|     |         lexeme.shape_ = 'W' |  | ||||||
|     |     elif lexeme.is_digit: |  | ||||||
|     |         lexeme.shape_ = 'D' |  | ||||||
|     |     elif lexeme.is_punct: |  | ||||||
|     |         lexeme.shape_ = 'P' |  | ||||||
|     |     else: |  | ||||||
|     |         lexeme.shape_ = 'M' |  | ||||||
|     | assert token.shape_ == 'W' |  | ||||||
| 
 |  | ||||||
| +example("Export to numpy arrays") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV |  | ||||||
|     |  |  | ||||||
|     | attr_ids = [ORTH, LIKE_URL, IS_OOV] |  | ||||||
|     | doc_array = doc.to_array(attr_ids) |  | ||||||
|     | assert doc_array.shape == (len(doc), len(attrs) |  | ||||||
|     | assert doc[0].orth == doc_array[0, 0] |  | ||||||
|     | assert doc[1].orth == doc_array[1, 0] |  | ||||||
|     | assert doc[0].like_url == doc_array[0, 1] |  | ||||||
|     | assert doc_array[, 1] == [t.like_url for t in doc] |  | ||||||
| 
 |  | ||||||
| +example("Word vectors") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") |  | ||||||
|     |  |  | ||||||
|     | apples = doc[0] |  | ||||||
|     | oranges = doc[1] |  | ||||||
|     | boots = doc[6] |  | ||||||
|     | hippos = doc[8] |  | ||||||
|     |  |  | ||||||
|     | assert apples.similarity(oranges) > boots.similarity(hippos) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +example("Part-of-speech tags") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | from spacy.parts_of_speech import ADV |  | ||||||
|     |  |  | ||||||
|     | def is_adverb(token): |  | ||||||
|     |     return token.pos == spacy.parts_of_speech.ADV |  | ||||||
|     |  |  | ||||||
|     | # These are data-specific, so no constants are provided. You have to look |  | ||||||
|     | # up the IDs from the StringStore. |  | ||||||
|     | NNS = nlp.vocab.strings['NNS'] |  | ||||||
|     | NNPS = nlp.vocab.strings['NNPS'] |  | ||||||
|     | def is_plural_noun(token): |  | ||||||
|     |     return token.tag == NNS or token.tag == NNPS |  | ||||||
|     |  |  | ||||||
|     | def print_coarse_pos(token): |  | ||||||
|     |     print(token.pos_) |  | ||||||
|     |  |  | ||||||
|     | def print_fine_pos(token): |  | ||||||
|     |     print(token.tag_) |  | ||||||
| 
 |  | ||||||
| +example("Syntactic dependencies") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | def dependency_labels_to_root(token): |  | ||||||
|     |     '''Walk up the syntactic tree, collecting the arc labels.''' |  | ||||||
|     |     dep_labels = [] |  | ||||||
|     |     while token.root is not token: |  | ||||||
|     |         dep_labels.append(token.dep) |  | ||||||
|     |         token = token.head |  | ||||||
|     |     return dep_labels |  | ||||||
| 
 |  | ||||||
| +example("Named entities") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | def iter_products(docs): |  | ||||||
|     |     for doc in docs: |  | ||||||
|     |         for ent in doc.ents: |  | ||||||
|     |             if ent.label_ == 'PRODUCT': |  | ||||||
|     |                 yield ent |  | ||||||
|     |  |  | ||||||
|     | def word_is_in_entity(word): |  | ||||||
|     |     return word.ent_type != 0 |  | ||||||
|     |  |  | ||||||
|     | def count_parent_verb_by_person(docs): |  | ||||||
|     |     counts = defaultdict(defaultdict(int)) |  | ||||||
|     |     for doc in docs: |  | ||||||
|     |         for ent in doc.ents: |  | ||||||
|     |             if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: |  | ||||||
|     |                 counts[ent.orth_][ent.root.head.lemma_] += 1 |  | ||||||
|     |     return counts |  | ||||||
| 
 |  | ||||||
|   //+example("Define custom NER rules") |  | ||||||
|   //  pre.language-python: code |  | ||||||
|   //    | nlp.matcher |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +example("Calculate inline mark-up on original string") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | def put_spans_around_tokens(doc, get_classes): |  | ||||||
|     |     '''Given some function to compute class names, put each token in a |  | ||||||
|     |     span element, with the appropriate classes computed. |  | ||||||
|     |   |  | ||||||
|     |     All whitespace is preserved, outside of the spans. (Yes, I know HTML |  | ||||||
|     |     won't display it. But the point is no information is lost, so you can |  | ||||||
|     |     calculate what you need, e.g. <br /> tags, <p> tags, etc.) |  | ||||||
|     |     ''' |  | ||||||
|     |     output = [] |  | ||||||
|     |     template = '<span classes="{classes}">{word}</span>{space}' |  | ||||||
|     |     for token in doc: |  | ||||||
|     |         if token.is_space: |  | ||||||
|     |             output.append(token.orth_) |  | ||||||
|     |         else: |  | ||||||
|     |             output.append( |  | ||||||
|     |               template.format( |  | ||||||
|     |                 classes=' '.join(get_classes(token)), |  | ||||||
|     |                 word=token.orth_, |  | ||||||
|     |                 space=token.whitespace_)) |  | ||||||
|     |     string = ''.join(output) |  | ||||||
|     |     string = string.replace('\n', '<br />') |  | ||||||
|     |     string = string.replace('\t', '    ' |  | ||||||
|     |     return string |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +example("Efficient binary serialization") |  | ||||||
|   pre.language-python: code |  | ||||||
|     |  |  | ||||||
|     | byte_string = doc.as_bytes() |  | ||||||
|     | open('/tmp/moby_dick.bin', 'wb').write(byte_string) |  | ||||||
|     |  |  | ||||||
|     | nlp = spacy.en.English() |  | ||||||
|     | for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')): |  | ||||||
|     |    doc = Doc(nlp.vocab) |  | ||||||
|     |    doc.from_bytes(byte_string) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|   | See the  |  | ||||||
|   a(href="docs.html") docs page  |  | ||||||
|   | for  |  | ||||||
|   a(href="docs.html#api") API documentation,  |  | ||||||
|   a(href="docs.html#tutorials") tutorials,  |  | ||||||
|   | and  |  | ||||||
|   a(href="docs.html#spec") annotation specs. |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user