mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Remove old docs
This commit is contained in:
		
							parent
							
								
									cad0cca4e3
								
							
						
					
					
						commit
						890d6aa216
					
				|  | @ -1,661 +0,0 @@ | ||||||
| mixin declare_class(name) |  | ||||||
|   details |  | ||||||
|     summary |  | ||||||
|       span.declaration |  | ||||||
|         span.label class |  | ||||||
|         code #{name} |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| mixin method(name, parameters) |  | ||||||
|   details(open=attributes.open) |  | ||||||
|     summary |  | ||||||
|       span.declaration |  | ||||||
|         span.label #{name} |  | ||||||
|         span.parameters |  | ||||||
|           | self, #{parameters} |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin params |  | ||||||
|   ul |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin param(name, type, value) |  | ||||||
|   li |  | ||||||
|     if type |  | ||||||
|       <strong>#{name}</strong> (!{type}) – |  | ||||||
|     else |  | ||||||
|       <strong>#{name}</strong> – |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin attribute(name, type, value) |  | ||||||
|   details(open=attributes.open) |  | ||||||
|     summary |  | ||||||
|       span.declaration |  | ||||||
|         span.label #{name} |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin returns(name, type, value) |  | ||||||
|   li |  | ||||||
|     if type |  | ||||||
|       <strong>#{name}</strong> (!{type}) – |  | ||||||
|     else |  | ||||||
|       <strong>#{name}</strong> – |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin returns(type) |  | ||||||
|   | tmp |  | ||||||
| 
 |  | ||||||
| mixin init |  | ||||||
|   details |  | ||||||
|     summary: h4 Init |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin callable |  | ||||||
|   details |  | ||||||
|     summary: h4 Callable |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin sequence |  | ||||||
|   details |  | ||||||
|     summary: h4 Sequence |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin maptype |  | ||||||
|   details |  | ||||||
|     summary: h4 Map |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin summary |  | ||||||
|   block |  | ||||||
| 
 |  | ||||||
| mixin en_example |  | ||||||
|   pre.language-python |  | ||||||
|     code |  | ||||||
|       | from spacy.en import English |  | ||||||
|       | from spacy._doc_examples import download_war_and_peace |  | ||||||
|       |  |  | ||||||
|       | unprocessed_unicode = download_war_and_peace() |  | ||||||
|       |  |  | ||||||
|       | nlp = English() |  | ||||||
|       | doc = nlp(unprocessed_unicode) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +declare_class("English") |  | ||||||
|   p Load models into a callable object to process English text. |  | ||||||
| 
 |  | ||||||
|   +summary |  | ||||||
|     +en_example |  | ||||||
| 
 |  | ||||||
|   +init |  | ||||||
|     p |  | ||||||
|       | Load the resources.  Loading takes 20 seconds, and the instance |  | ||||||
|       | consumes 2 to 3 gigabytes of memory. |  | ||||||
|      |  | ||||||
|     p  |  | ||||||
|       | Intended use is for one instance to be created per process. |  | ||||||
|       | You can create more if you're doing something unusual. |  | ||||||
|     p |  | ||||||
|       | You may wish to make the instance a global variable or "singleton". |  | ||||||
|       | We usually instantiate the object in the <code>main()</code> |  | ||||||
|       | function and pass it around as an explicit argument.  |  | ||||||
|     +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true") |  | ||||||
| 
 |  | ||||||
|       +params |  | ||||||
|         +param("data_dir") |  | ||||||
|           | The data directory.  May be #{None}, to disable any data loading |  | ||||||
|           | (including the vocabulary). |  | ||||||
| 
 |  | ||||||
|         +param("Tokenizer") |  | ||||||
|           | A class/function that creates the tokenizer. |  | ||||||
| 
 |  | ||||||
|         +param("Tagger") |  | ||||||
|           | A class/function that creates the part-of-speech tagger. |  | ||||||
| 
 |  | ||||||
|         +param("Parser") |  | ||||||
|           | A class/function that creates the dependency parser. |  | ||||||
| 
 |  | ||||||
|         +param("Entity") |  | ||||||
|           | A class/function that creates the named entity recogniser. |  | ||||||
| 
 |  | ||||||
|         +param("load_vectors") |  | ||||||
|           | A boolean value to control whether the word vectors are loaded. |  | ||||||
|    |  | ||||||
|   +callable |  | ||||||
|     +method("__call__", "text, tag=True, parse=True, entity=True") |  | ||||||
| 
 |  | ||||||
|       +params |  | ||||||
|         +param("text", types.unicode) |  | ||||||
|           | The text to be processed.  No pre-processing needs to be applied, |  | ||||||
|           | and any length of text can be submitted.  Usually you will submit |  | ||||||
|           | a whole document. Text may be zero-length. An exception is raised |  | ||||||
|           | if byte strings are supplied. |  | ||||||
| 
 |  | ||||||
|         +param("tag", types.bool) |  | ||||||
|           | Whether to apply the part-of-speech tagger. Required for parsing |  | ||||||
|           | and entity recognition. |  | ||||||
| 
 |  | ||||||
|         +param("parse", types.bool) |  | ||||||
|           | Whether to apply the syntactic dependency parser. |  | ||||||
| 
 |  | ||||||
|         +param("entity", types.bool) |  | ||||||
|           | Whether to apply the named entity recognizer. |  | ||||||
| 
 |  | ||||||
|       pre.language-python |  | ||||||
|         code |  | ||||||
|           | from spacy.en import English |  | ||||||
|           | nlp = English() |  | ||||||
|           | doc = nlp(u'Some text.) # Applies tagger, parser, entity |  | ||||||
|           | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser |  | ||||||
|           | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity |  | ||||||
|           | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser |  | ||||||
|           | doc = nlp(u'') # Zero-length tokens, not an error |  | ||||||
|           | # doc = nlp(b'Some text') <-- Error: need unicode |  | ||||||
|           | doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +declare_class("Doc") |  | ||||||
|   p I'm a doc |  | ||||||
| 
 |  | ||||||
|   +init |  | ||||||
|     +method("__init__", "vocab") |  | ||||||
|       +params |  | ||||||
|         +param("vocab", vocab_type) |  | ||||||
|           | A vocabulary object |  | ||||||
| 
 |  | ||||||
|   +sequence |  | ||||||
|     +method("__getitem__", "i", types.int) |  | ||||||
|       +returns(types.Token) |  | ||||||
| 
 |  | ||||||
|     +method("__getitem__", "start_end", types.slice) |  | ||||||
|       +returns(types.Span) |  | ||||||
| 
 |  | ||||||
|     +method("__iter__") |  | ||||||
|       | Iterate over tokens |  | ||||||
| 
 |  | ||||||
|     +method("__len__") |  | ||||||
|       | Number of tokens in the document. |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Spans |  | ||||||
|      |  | ||||||
|     +attribute("sents", types.generator) |  | ||||||
|       | Iterate over sentences in the document. |  | ||||||
|    |  | ||||||
|     +attribute("ents", types.generator) |  | ||||||
|       | Iterate over named entities in the document. |  | ||||||
| 
 |  | ||||||
|     +attribute("noun_chunks", types.generator) |  | ||||||
|    |  | ||||||
|   details |  | ||||||
|     summary: h4 Export/Import |  | ||||||
|      |  | ||||||
|     +method("to_array", "attr_ids") |  | ||||||
| 
 |  | ||||||
|       | Given a list of M attribute IDs, export the tokens to a numpy ndarray |  | ||||||
|       | of shape N*M, where N is the length of the sentence. |  | ||||||
| 
 |  | ||||||
|       +params |  | ||||||
|         +param("attr_ids", "list[int]") |  | ||||||
|           | A list of attribute ID ints. |  | ||||||
| 
 |  | ||||||
|       +returns("feat_array") |  | ||||||
|         | A feature matrix, with one row per word, and one column per attribute |  | ||||||
|         | indicated in the input attr_ids. |  | ||||||
| 
 |  | ||||||
|     +method("count_by", "attr_id") |  | ||||||
|       | Produce a dict of {attribute (int): count (ints)} frequencies, keyed |  | ||||||
|       | by the values of the given attribute ID. |  | ||||||
|      |  | ||||||
|       pre.language-python |  | ||||||
|         code |  | ||||||
|           | >>> from spacy.en import English, attrs |  | ||||||
|           | >>> nlp = English() |  | ||||||
|           | >>> tokens = nlp(u'apple apple orange banana') |  | ||||||
|           | >>> tokens.count_by(attrs.ORTH) |  | ||||||
|           | {12800L: 1, 11880L: 2, 7561L: 1} |  | ||||||
|           | >>> tokens.to_array([attrs.ORTH]) |  | ||||||
|           | array([[11880], |  | ||||||
|           |         [11880], |  | ||||||
|           |         [7561], |  | ||||||
|           |         [12800]]) |  | ||||||
| 
 |  | ||||||
|     +method("from_array", "attrs, array") |  | ||||||
|       | Load from array |  | ||||||
|    |  | ||||||
|     +method("from_bytes") |  | ||||||
|       | Deserialize, loading from bytes |  | ||||||
| 
 |  | ||||||
|     +method("read_bytes") |  | ||||||
|       | classmethod |  | ||||||
| 
 |  | ||||||
|     //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type") |  | ||||||
| 
 |  | ||||||
|     //  | Merge a multi-word expression into a single token.  Currently |  | ||||||
|     //  | experimental; API is likely to change. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +declare_class("Token") |  | ||||||
|   +init |  | ||||||
|     +method("__init__", "vocab, doc, offset") |  | ||||||
|       +params |  | ||||||
|         +param("vocab", types.Vocab) |  | ||||||
|           p A Vocab object |  | ||||||
| 
 |  | ||||||
|         +param("doc", types.Doc) |  | ||||||
|           p The parent sequence |  | ||||||
| 
 |  | ||||||
|       +param("offset", types.int) |  | ||||||
|         p The index of the token within the document |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 String Views |  | ||||||
| 
 |  | ||||||
|     +attribute("orth / orth_") |  | ||||||
|       | The form of the word with no string normalization or processing, as |  | ||||||
|       | it appears in the string, without trailing whitespace. |  | ||||||
| 
 |  | ||||||
|     +attribute("lemma / lemma_") |  | ||||||
|       | The "base" of the word, with no inflectional suffixes, e.g. the lemma of |  | ||||||
|       | "developing" is "develop", the lemma of "geese" is "goose", etc.  Note that |  | ||||||
|       | <em>derivational</em> suffixes are not stripped, e.g. the lemma of |  | ||||||
|       | "instutitions" is "institution", not "institute".  Lemmatization is |  | ||||||
|       | performed using the WordNet data, but extended to also cover closed-class |  | ||||||
|       | words such as pronouns.  By default, the WN lemmatizer returns "hi" |  | ||||||
|       | as the lemma of "his". We assign pronouns the lemma -PRON-. |  | ||||||
| 
 |  | ||||||
|     +attribute("lower / lower_") |  | ||||||
|       | The form of the word, but forced to lower-case, i.e. |  | ||||||
|       pre.language-python: code lower = word.orth\_.lower() |  | ||||||
| 
 |  | ||||||
|     //+attribute("norm / norm_") |  | ||||||
|     //  | The form of the word, after language-specific normalizations has been |  | ||||||
|     //  | applied. |  | ||||||
| 
 |  | ||||||
|     +attribute("shape / shape_") |  | ||||||
|       | A transform of the word's string, to show orthographic features. |  | ||||||
|       | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped |  | ||||||
|       | to d. After these mappings, sequences of 4 or more of the same character |  | ||||||
|       | are truncated to length 4.  Examples: C3Po --> XdXx, favorite --> xxxx, |  | ||||||
|       | :) --> :) |  | ||||||
| 
 |  | ||||||
|     +attribute("prefix / prefix_") |  | ||||||
|       | A length-N substring from the start of the word.  Length may vary by |  | ||||||
|       | language; currently for English n=1, i.e. |  | ||||||
|       pre.language-python: code prefix = word.orth\_[:1] |  | ||||||
| 
 |  | ||||||
|     +attribute("suffix / suffix_") |  | ||||||
|       | A length-N substring from the end of the word.  Length may vary by |  | ||||||
|       | language; currently for English n=3, i.e. |  | ||||||
|       pre.language-python: code suffix = word.orth\_[-3:] |  | ||||||
| 
 |  | ||||||
|     //+attribute("lex_id") |  | ||||||
|     //  | lex_id |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Alignment and Output |  | ||||||
| 
 |  | ||||||
|     +attribute("idx") |  | ||||||
|       p Start index of the token in the string |  | ||||||
| 
 |  | ||||||
|     +method("__len__", "") |  | ||||||
|       p Length of the token's orth string, in unicode code-points. |  | ||||||
| 
 |  | ||||||
|     +method("__unicode__", "") |  | ||||||
|       p Same as token.orth_ |  | ||||||
| 
 |  | ||||||
|     +method("__str__", "") |  | ||||||
|       p Varies between Python 2 and Python 3 |  | ||||||
| 
 |  | ||||||
|     +attribute("string") |  | ||||||
|       p |  | ||||||
|         | The form of the word as it appears in the string, <strong>including |  | ||||||
|         | trailing whitespace</strong>.  This is useful when you need to use |  | ||||||
|         | linguistic features to add inline mark-up to the string. |  | ||||||
| 
 |  | ||||||
|     +method("nbor, i=1") |  | ||||||
|       +params |  | ||||||
|         +param("i") |  | ||||||
|           p Offset relative to token |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Distributional Features |  | ||||||
| 
 |  | ||||||
|     +attribute("repvec") |  | ||||||
|       p |  | ||||||
|         | A "word embedding" representation: a dense real-valued vector that supports |  | ||||||
|         | similarity queries between words.  By default, spaCy currently loads |  | ||||||
|         | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec |  | ||||||
|         | model. |  | ||||||
| 
 |  | ||||||
|     +attribute("cluster") |  | ||||||
|       p |  | ||||||
|         | The Brown cluster ID of the word.  These are often useful features for |  | ||||||
|         | linear models.  If you're using a non-linear model, particularly a |  | ||||||
|         | neural net or random forest, consider using the real-valued word |  | ||||||
|         | representation vector, in Token.repvec, instead. |  | ||||||
| 
 |  | ||||||
|     +attribute("prob") |  | ||||||
|       p |  | ||||||
|         | The unigram log-probability of the word, estimated from counts from a |  | ||||||
|         | large corpus, smoothed using Simple Good Turing estimation. |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Syntactic Tags |  | ||||||
| 
 |  | ||||||
|     +attribute("pos / pos_") |  | ||||||
|       p |  | ||||||
|         | A part-of-speech tag, from the Google Universal Tag Set, e.g.  |  | ||||||
|         | code>NOUN</code>, <code>VERB</code>, <code>ADV</code>.  Constants for |  | ||||||
|         | the 17 tag values are provided in <code>spacy.parts_of_speech.</code> |  | ||||||
| 
 |  | ||||||
|     +attribute("tag / tag_") |  | ||||||
|       p |  | ||||||
|         | A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>, |  | ||||||
|         | <code>DT</code>, etc.  These tags are language/corpus specific, and |  | ||||||
|         | typically describe part-of-speech and some amount of morphological |  | ||||||
|         | information.  For instance, in the Penn Treebank tag set, <code>VBZ</code> |  | ||||||
|         | is assigned to a present-tense singular verb. |  | ||||||
| 
 |  | ||||||
|     +attribute("dep / dep_") |  | ||||||
|       p |  | ||||||
|         | The type of syntactic dependency relation between the word and its |  | ||||||
|         | syntactic head. |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Navigating the Parse Tree |  | ||||||
|    |  | ||||||
|     +attribute("head") |  | ||||||
|       p |  | ||||||
|         | The Token that is the immediate syntactic head of the word.  If the |  | ||||||
|         | word is the root of the dependency tree, the same word is returned. |  | ||||||
| 
 |  | ||||||
|     +attribute("lefts") |  | ||||||
|       p |  | ||||||
|         | An iterator for the immediate leftward syntactic children of the |  | ||||||
|         | word. |  | ||||||
| 
 |  | ||||||
|     +attribute("rights") |  | ||||||
|       p |  | ||||||
|         | An iterator for the immediate rightward syntactic children of the |  | ||||||
|         | word. |  | ||||||
| 
 |  | ||||||
|     +attribute("n_lefts") |  | ||||||
|       p |  | ||||||
|         | The number of immediate syntactic children preceding the word in  |  | ||||||
|         | the string. |  | ||||||
| 
 |  | ||||||
|     +attribute("n_rights") |  | ||||||
|       p |  | ||||||
|         | The number of immediate syntactic children following the word in |  | ||||||
|         | the string. |  | ||||||
| 
 |  | ||||||
|     +attribute("children") |  | ||||||
|       p |  | ||||||
|         | An iterator that yields from lefts, and then yields from rights. |  | ||||||
| 
 |  | ||||||
|     +attribute("subtree") |  | ||||||
|       p |  | ||||||
|         | An iterator for the part of the sentence syntactically governed by |  | ||||||
|         | the word, including the word itself. |  | ||||||
| 
 |  | ||||||
|     +attribute("left_edge") |  | ||||||
|       p The leftmost edge of the token's subtree |  | ||||||
| 
 |  | ||||||
|     +attribute("right_edge") |  | ||||||
|       p The rightmost edge of the token's subtree |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Named Entities |  | ||||||
| 
 |  | ||||||
|     +attribute("ent_type") |  | ||||||
|       p If the token is part of an entity, its entity type. |  | ||||||
| 
 |  | ||||||
|     +attribute("ent_iob") |  | ||||||
|       p The IOB (inside, outside, begin) entity recognition tag for the token. |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Lexeme Flags |  | ||||||
| 
 |  | ||||||
|     +method("check_flag", "flag_id") |  | ||||||
|       +params |  | ||||||
|         +param("flag_id") |  | ||||||
|           | flag ID |  | ||||||
| 
 |  | ||||||
|     +attribute("is_oov") |  | ||||||
|     +attribute("is_alpha") |  | ||||||
|     +attribute("is_ascii") |  | ||||||
|     +attribute("is_digit") |  | ||||||
|     +attribute("is_lower") |  | ||||||
|     +attribute("is_title") |  | ||||||
|     +attribute("is_punct") |  | ||||||
|     +attribute("is_space") |  | ||||||
|     +attribute("like_url") |  | ||||||
|     +attribute("like_num") |  | ||||||
|     +attribute("like_email") |  | ||||||
| 
 |  | ||||||
|     //+attribute("conjuncts") |  | ||||||
|     //  | Conjuncts |  | ||||||
| 
 |  | ||||||
| +declare_class("Span") |  | ||||||
|   +init |  | ||||||
|     +method("__init__") |  | ||||||
|       Temp |  | ||||||
| 
 |  | ||||||
|     <code>span = doc[0:4]</code> |  | ||||||
| 
 |  | ||||||
|   +sequence |  | ||||||
|     +method("__getitem__") |  | ||||||
|       p Get item |  | ||||||
| 
 |  | ||||||
|     +method("__iter__") |  | ||||||
|       p Iter |  | ||||||
|          |  | ||||||
|     +method("__len__") |  | ||||||
|       p Len |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Parse |  | ||||||
| 
 |  | ||||||
|     +attribute("root") |  | ||||||
|       p Syntactic head |  | ||||||
| 
 |  | ||||||
|     +attribute("lefts") |  | ||||||
|       p Tokens that are: |  | ||||||
|       ol |  | ||||||
|         li To the left of the span; |  | ||||||
|         li Syntactic children of words within the span |  | ||||||
| 
 |  | ||||||
|       p i.e. |  | ||||||
| 
 |  | ||||||
|       pre.language-python |  | ||||||
|         code |  | ||||||
|           | lefts = [span.doc[i] for i in range(0, span.start) |  | ||||||
|           |          if span.doc[i].head in span] |  | ||||||
| 
 |  | ||||||
|     +attribute("rights") |  | ||||||
|       p Tokens that are: |  | ||||||
|         ol  |  | ||||||
|           li To the right of the span; |  | ||||||
|           li Syntactic children of words within the span |  | ||||||
|       p i.e. |  | ||||||
|       pre.language-python |  | ||||||
|         code |  | ||||||
|           | rights = [span.doc[i] for i in range(span.end, len(span.doc)) |  | ||||||
|           |           if span.doc[i].head in span] |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     +attribute("subtree") |  | ||||||
|       p String |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 String Views |  | ||||||
| 
 |  | ||||||
|     +attribute("string") |  | ||||||
|       p String |  | ||||||
| 
 |  | ||||||
|     +attribute("lemma / lemma_") |  | ||||||
|       p String |  | ||||||
| 
 |  | ||||||
|     +attribute("label / label_") |  | ||||||
|       p String |  | ||||||
| 
 |  | ||||||
| +declare_class("Lexeme") |  | ||||||
|   p |  | ||||||
|     | The Lexeme object represents a lexical type, stored in the vocabulary |  | ||||||
|     | – as opposed to a token, occurring in a document. |  | ||||||
|   p |  | ||||||
|     | Lexemes store various features, so that these features can be computed |  | ||||||
|     | once per type, rather than once per token. As job sizes grow, this |  | ||||||
|     | can amount to a substantial efficiency improvement. |  | ||||||
| 
 |  | ||||||
|   p |  | ||||||
|     | All Lexeme attributes are therefore context independent, as a single |  | ||||||
|     | lexeme is reused for all usages of that word. Lexemes are keyed by |  | ||||||
|     | the “orth” attribute. |  | ||||||
| 
 |  | ||||||
|   p |  | ||||||
|     All Lexeme attributes are accessible directly on the Token object. |  | ||||||
| 
 |  | ||||||
|   +init |  | ||||||
|     +method("__init__") |  | ||||||
|       p Init |  | ||||||
| 
 |  | ||||||
|     details |  | ||||||
|       summary: h4 String Features |  | ||||||
| 
 |  | ||||||
|         +attribute("orth / orth_") |  | ||||||
|           p |  | ||||||
|             | The form of the word with no string normalization or processing, |  | ||||||
|             | as it appears in the string, without trailing whitespace. |  | ||||||
|        |  | ||||||
|         +attribute("lower / lower_") |  | ||||||
|           p Tmp |  | ||||||
|        |  | ||||||
|         +attribute("norm / norm_") |  | ||||||
|           p Tmp |  | ||||||
|        |  | ||||||
|         +attribute("shape / shape_") |  | ||||||
|           p Tmp |  | ||||||
|        |  | ||||||
|         +attribute("prefix / prefix_") |  | ||||||
|           p Tmp |  | ||||||
|        |  | ||||||
|         +attribute("suffix / suffix_") |  | ||||||
|           p TMP |  | ||||||
| 
 |  | ||||||
| +declare_class("Vocab", "data_dir=None, lex_props_getter=None") |  | ||||||
|   +sequence |  | ||||||
|     +method("__len__") |  | ||||||
|       +returns |  | ||||||
|         p Number of words in the vocabulary. |  | ||||||
| 
 |  | ||||||
|     +method("__iter__") |  | ||||||
|       +returns |  | ||||||
|         p Lexeme |  | ||||||
| 
 |  | ||||||
|   +maptype |  | ||||||
|     +method("__getitem__", "key_int") |  | ||||||
|       +params |  | ||||||
|         +param("key") |  | ||||||
|           p Integer ID |  | ||||||
| 
 |  | ||||||
|       +returns: p A Lexeme object |  | ||||||
| 
 |  | ||||||
|     +method("__getitem__", "key_str") |  | ||||||
|       +params |  | ||||||
|         +param("key_str", types.unicode) |  | ||||||
|           p A string in the vocabulary |  | ||||||
| 
 |  | ||||||
|       +returns("Lexeme") |  | ||||||
| 
 |  | ||||||
|     +method("__setitem__", "orth_str", "props") |  | ||||||
|       +params |  | ||||||
|         +param("orth_str", types.unicode) |  | ||||||
|           p The orth key |  | ||||||
| 
 |  | ||||||
|         +param("props", types.dict) |  | ||||||
|           p A props dictionary |  | ||||||
| 
 |  | ||||||
|       +returns("None") |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Import/Export |  | ||||||
| 
 |  | ||||||
|     +method("dump", "loc") |  | ||||||
|       +params |  | ||||||
|         +param("loc", types.unicode) |  | ||||||
|           p Path where the vocabulary should be saved |  | ||||||
| 
 |  | ||||||
|     +method("load_lexemes", "loc") |  | ||||||
|     +params |  | ||||||
|       +param("loc", types.unicode) |  | ||||||
|         p Path to load the lexemes.bin file from |  | ||||||
| 
 |  | ||||||
|     +method("load_vectors", "loc") |  | ||||||
|       +params |  | ||||||
|         +param("loc", types.unicode) |  | ||||||
|           p Path to load the vectors.bin from |  | ||||||
| 
 |  | ||||||
| +declare_class("StringStore") |  | ||||||
|   +init |  | ||||||
|     Tmp |  | ||||||
| 
 |  | ||||||
|   +sequence |  | ||||||
|     +method("__len__") |  | ||||||
|       +returns("int") |  | ||||||
|         p Number of strings in the string-store |  | ||||||
| 
 |  | ||||||
|     +method("__iter__") |  | ||||||
|       +returns |  | ||||||
|         p Lexeme |  | ||||||
| 
 |  | ||||||
|   +maptype |  | ||||||
|     +method("__getitem__", "key_int") |  | ||||||
|       +params |  | ||||||
|         +param("key_int") |  | ||||||
|           p An integer key |  | ||||||
| 
 |  | ||||||
|       +returns(types.unicode) |  | ||||||
|         p The string that the integer key maps to |  | ||||||
| 
 |  | ||||||
|     +method("__getitem__", "key_unicode") |  | ||||||
|       +params |  | ||||||
|         +param("key_unicode") |  | ||||||
|           p A key, as a unicode string |  | ||||||
| 
 |  | ||||||
|       +returns(types.int) |  | ||||||
|         p The integer ID of the string. |  | ||||||
| 
 |  | ||||||
|     +method("__getitem__", "key_utf8_bytes") |  | ||||||
|       +params |  | ||||||
|         +param("key_utf8_bytes", types.bytes) |  | ||||||
|           p p A key, as a UTF-8 encoded byte-string |  | ||||||
| 
 |  | ||||||
|       +returns(types.int) |  | ||||||
|         p The integer ID of the string. |  | ||||||
| 
 |  | ||||||
|   details |  | ||||||
|     summary: h4 Import/Export |  | ||||||
| 
 |  | ||||||
|     +method("dump", "loc") |  | ||||||
|       +params |  | ||||||
|         +param("loc") |  | ||||||
|           p File path to save the strings.txt to. |  | ||||||
| 
 |  | ||||||
|     +method("load") |  | ||||||
|       +params |  | ||||||
|         +param("loc") |  | ||||||
|           p File path to load the strings.txt from. |  | ||||||
|  | @ -1,95 +0,0 @@ | ||||||
| mixin Teaser(title, url, date_long, date_short, author, lede) |  | ||||||
|   article.post |  | ||||||
|     header |  | ||||||
|       h2 |  | ||||||
|         a(href=url)= title |  | ||||||
|       .subhead |  | ||||||
|         | by  |  | ||||||
|         a(href='#', rel='author')= author |  | ||||||
|         |  on  |  | ||||||
|         time(datetime=date_short)= date_long |  | ||||||
|     p!= lede |  | ||||||
|         |  | ||||||
|       a.readmore(href='#') ► |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| doctype html |  | ||||||
| html(lang='en') |  | ||||||
|   head |  | ||||||
|     meta(charset='utf-8') |  | ||||||
|     title spaCy Blog |  | ||||||
|     meta(name='description', content='') |  | ||||||
|     meta(name='author', content='Matthew Honnibal') |  | ||||||
|     link(rel='stylesheet', href='css/style.css') |  | ||||||
|     //if lt IE 9 |  | ||||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') |  | ||||||
|   body#blog |  | ||||||
|     header(role='banner') |  | ||||||
|       h1.logo spaCy Blog |  | ||||||
|       .slogan Blog |  | ||||||
| 
 |  | ||||||
|     nav(role="navigation") |  | ||||||
|       ul |  | ||||||
|         li: a(href="home.html")        Home |  | ||||||
|         li: a(href="docs.html")        Docs |  | ||||||
|         li.active: a(href="blog.html") Blog |  | ||||||
|         li: a(href="license.html")     License |  | ||||||
| 
 |  | ||||||
|     main#content(role='main') |  | ||||||
|       section.intro.profile |  | ||||||
|         p |  | ||||||
|           img(src='img/matt.png') |  | ||||||
|           | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. |  | ||||||
|           span.social |  | ||||||
|             a(href='#') Follow me on Twitter |  | ||||||
|         nav(role='navigation') |  | ||||||
|           ul |  | ||||||
|             li |  | ||||||
|               a.button(href='#') Blog |  | ||||||
|             li |  | ||||||
|               a.button(href='#tutorials') Tutorials |  | ||||||
|       section.blogs |  | ||||||
|         +Teaser( |  | ||||||
|           "Introducing spaCy", |  | ||||||
|           "blog_intro.html", |  | ||||||
|           "February 2015", |  | ||||||
|           "2015-02-18", |  | ||||||
|           "Matthew Honnibal", |  | ||||||
|           "<strong>spaCy</strong> is a new library for text processing in Python " + |  | ||||||
|           "and Cython. I wrote it because I think small companies are terrible at " + |  | ||||||
|           "natural language processing (NLP).  Or rather: small companies are using " + |  | ||||||
|           "terrible NLP technology." |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
|         +Teaser( |  | ||||||
|           "Parsing English with 500 lines of Python", |  | ||||||
|           "blog_parser.html", |  | ||||||
|           "December 18, 2013", |  | ||||||
|           "2013-12-18", |  | ||||||
|           "Matthew Hannibal", |  | ||||||
|           "The Natural Language Processing (NLP) community has made big progress" + |  | ||||||
|           "in syntactic parsing over the last few years. It’s now possible for a" + |  | ||||||
|           "tiny Python implementation to perform better than the widely-used Stanford " + |  | ||||||
|           "PCFG parser.") |  | ||||||
|         +Teaser( |  | ||||||
|           "A good Part-of-Speech tagger in about 200 lines of Python", |  | ||||||
|           "blog_tagger.html", |  | ||||||
|           "October 11, 2013", |  | ||||||
|           "2013-09-11", |  | ||||||
|           "Matthew Honnibal", |  | ||||||
|           "There are a tonne of “best known techniques” for POS tagging, and you " + |  | ||||||
|           "should ignore the others and just use greedy Averaged Perceptron." |  | ||||||
|         ) |  | ||||||
| 
 |  | ||||||
|       section.intro |  | ||||||
|         h2 |  | ||||||
|           a.permalink(href='#tutorials', name='tutorials') Tutorials |  | ||||||
| 
 |  | ||||||
|       section.tutorials |  | ||||||
|         include ./tutorials.jade |  | ||||||
| 
 |  | ||||||
|     footer(role="contentinfo") |  | ||||||
|       span.slogan.copyright © 2015 Syllogism Co. |  | ||||||
| 
 |  | ||||||
|     script(src='js/prism.js') |  | ||||||
|  | @ -1,81 +0,0 @@ | ||||||
| extends ./template_post.jade |  | ||||||
| 
 |  | ||||||
| - |  | ||||||
|   var urls = { |  | ||||||
|     'pos_post': 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/', |  | ||||||
|     'google_ngrams': "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html", |  | ||||||
|     'implementation': 'https://gist.github.com/syllog1sm/10343947', |  | ||||||
|     'redshift': 'http://github.com/syllog1sm/redshift', |  | ||||||
|     'tasker': 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm', |  | ||||||
|     'acl_anthology': 'http://aclweb.org/anthology/', |  | ||||||
|     'share_twitter': 'http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal' |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
| - var my_research_software = '<a href="https://github.com/syllog1sm/redshift/tree/develop">my research software</a>' |  | ||||||
| 
 |  | ||||||
| - var how_to_write_a_POS_tagger = '<a href="https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/">how to write a part-of-speech tagger</a>' |  | ||||||
| 
 |  | ||||||
| - var parser_lnk = '<a href="https://honnibal.wordpress.com/2013/12/18/a-simple-fast-algorithm-for-natural-language-dependency-parsing/">parser</a>' |  | ||||||
| 
 |  | ||||||
| - var buy_a_commercial_license = '<a href="license.html">buy a commercial license</a>' |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| block body_block |  | ||||||
|   article.post |  | ||||||
|     p. |  | ||||||
|       <strong>spaCy</strong> is a new library for text processing in Python |  | ||||||
|       and Cython. I wrote it because I think small companies are terrible at |  | ||||||
|       natural language processing (NLP).  Or rather: small companies are using |  | ||||||
|       terrible NLP technology. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       To do great NLP, you have to know a little about linguistics, a lot |  | ||||||
|       about machine learning, and almost everything about the latest research. |  | ||||||
|       The people who fit this description seldom join small companies. |  | ||||||
|       Most are broke – they've just finished grad school. |  | ||||||
|       If they don't want to stay in academia, they join Google, IBM, etc. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       The net result is that outside of the tech giants, commercial NLP has |  | ||||||
|       changed little in the last ten years.  In academia, it's changed entirely. |  | ||||||
|       Amazing improvements in quality.  Orders of magnitude faster.  But the |  | ||||||
|       academic code is always GPL, undocumented, unuseable, or all three.  |  | ||||||
|       You could implement the ideas yourself, but the papers are hard to read, |  | ||||||
|       and training data is exorbitantly expensive.  So what are you left with? |  | ||||||
|       A common answer is NLTK, which was written primarily as an educational resource. |  | ||||||
|       Nothing past the tokenizer is suitable for production use. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       I used to think that the NLP community just needed to do more to communicate |  | ||||||
|       its findings to software engineers.  So I wrote two blog posts, explaining |  | ||||||
|       !{how_to_write_a_POS_tagger} and !{parser_lnk}.  Both were well |  | ||||||
|       received, and there's been a bit of interest in !{my_research_software} |  | ||||||
|       – even though it's entirely undocumented, and mostly unuseable to |  | ||||||
|       anyone but me. |  | ||||||
|     p. |  | ||||||
|       So six months ago I quit my post-doc, and I've been working day and night |  | ||||||
|       on spaCy since.  I'm now pleased to announce an alpha release. |  | ||||||
|    |  | ||||||
|     p. |  | ||||||
|       If you're a small company doing NLP, I think spaCy will seem like a minor |  | ||||||
|       miracle.  It's by far the fastest NLP software ever released.  The |  | ||||||
|       full processing pipeline completes in 20ms per document, including accurate |  | ||||||
|       tagging and parsing.  All strings are mapped to integer IDs, tokens are |  | ||||||
|       linked to embedded word representations, and a range of useful features |  | ||||||
|       are pre-calculated and cached. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       If none of that made any sense to you, here's the gist of it.  Computers |  | ||||||
|       don't understand text.  This is unfortunate, because that's what the |  | ||||||
|       web almost entirely consists of.  We want to recommend people text based |  | ||||||
|       on other text they liked.  We want to shorten text to display it on a |  | ||||||
|       mobile screen.  We want to aggregate it, link it, filter it, categorise |  | ||||||
|       it, generate it and correct it. |  | ||||||
| 
 |  | ||||||
|     p.  |  | ||||||
|       spaCy provides a library of utility functions that help programmers |  | ||||||
|       build such products.  It's commercial open source software: you can |  | ||||||
|       either use it under the AGPL, or you can !{buy_a_commercial_license} |  | ||||||
|       under generous terms. |  | ||||||
| 
 |  | ||||||
|   footer(role='contentinfo') |  | ||||||
|  | @ -1,938 +0,0 @@ | ||||||
| extends ./template_post.jade |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| block body_block |  | ||||||
|   - var urls = {} |  | ||||||
|   //- urls.pos_post = 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/' |  | ||||||
|   - urls.parser_post = "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html" |  | ||||||
|   - urls.implementation = 'https://gist.github.com/syllog1sm/10343947' |  | ||||||
|   - urls.redshift = 'http://github.com/syllog1sm/redshift' |  | ||||||
|   - urls.tasker = 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm' |  | ||||||
|   - urls.acl_anthology = 'http://aclweb.org/anthology/' |  | ||||||
|   - urls.share_twitter = "http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" |  | ||||||
| 
 |  | ||||||
|   // A comment |  | ||||||
| 
 |  | ||||||
|   article.post |  | ||||||
|     header |  | ||||||
|       h2 Parsing English in 500 lines of Python |  | ||||||
|       .subhead |  | ||||||
|         | by  |  | ||||||
|         a(href='#', rel='author') Matthew Honnibal |  | ||||||
|         |  on  |  | ||||||
|         time(datetime='2013-12-18') December 18, 2013 |  | ||||||
|     p |  | ||||||
|       | A   |  | ||||||
|       a(href=urls.parser_post) syntactic parser  |  | ||||||
|       | describes a sentence’s grammatical structure, to help another |  | ||||||
|       | application reason about it. Natural languages introduce many unexpected |  | ||||||
|       | ambiguities, which our world-knowledge immediately filters out. A |  | ||||||
|       | favourite example: |  | ||||||
| 
 |  | ||||||
|     p.example They ate the pizza with anchovies |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       img(src='img/blog01.png', alt='Eat-with pizza-with ambiguity') |  | ||||||
|     p |  | ||||||
|       | A correct parse links “with” to “pizza”, while an incorrect parse |  | ||||||
|       | links “with” to “eat”: |  | ||||||
| 
 |  | ||||||
|     .displacy |  | ||||||
|       iframe(src='displacy/anchovies_bad.html', height='275') |  | ||||||
| 
 |  | ||||||
|     .displacy |  | ||||||
|       iframe.displacy(src='displacy/anchovies_good.html', height='275') |  | ||||||
|       a.view-displacy(href='#') View on displaCy |  | ||||||
|       p.caption |  | ||||||
|         | The Natural Language Processing (NLP) community has made big progress |  | ||||||
|         | in syntactic parsing over the last few years. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The Natural Language Processing (NLP) community has made big progress |  | ||||||
|       | in syntactic parsing over the last few years. It’s now possible for |  | ||||||
|       | a tiny Python implementation to perform better than the widely-used |  | ||||||
|       | Stanford PCFG parser. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       strong Update! |  | ||||||
|       |  The Stanford CoreNLP library now includes a greedy transition-based |  | ||||||
|       | dependency parser, similar to the one described in this post, but with |  | ||||||
|       | an improved learning strategy. It is much faster and more accurate |  | ||||||
|       | than this simple Python implementation. |  | ||||||
| 
 |  | ||||||
|     table |  | ||||||
|       thead |  | ||||||
|         tr |  | ||||||
|           th Parser |  | ||||||
|           th Accuracy |  | ||||||
|           th Speed (w/s) |  | ||||||
|           th Language |  | ||||||
|           th LOC |  | ||||||
|       tbody |  | ||||||
|         tr |  | ||||||
|           td Stanford |  | ||||||
|           td 89.6% |  | ||||||
|           td 19 |  | ||||||
|           td Java |  | ||||||
|           td |  | ||||||
|             | > 4,000 |  | ||||||
|             sup |  | ||||||
|               a(href='#note-1') [1] |  | ||||||
|         tr |  | ||||||
|           td |  | ||||||
|             strong parser.py |  | ||||||
|           td 89.8% |  | ||||||
|           td 2,020 |  | ||||||
|           td Python |  | ||||||
|           td |  | ||||||
|             strong ~500 |  | ||||||
|         tr |  | ||||||
|           td Redshift |  | ||||||
|           td |  | ||||||
|             strong 93.6% |  | ||||||
|           td |  | ||||||
|             strong 2,580 |  | ||||||
|           td Cython |  | ||||||
|           td ~4,000 |  | ||||||
|     p |  | ||||||
|       | The rest of the post sets up the problem, and then takes you through  |  | ||||||
|       a(href=urls.implementation) a concise implementation |  | ||||||
|       | , prepared for this post. The first 200 lines of parser.py, the |  | ||||||
|       | part-of-speech tagger and learner, are described  |  | ||||||
|       a(href='https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/') here |  | ||||||
|       | . You should probably at least skim that |  | ||||||
|       | post before reading this one, unless you’re very familiar with NLP |  | ||||||
|       | research. |  | ||||||
|     p |  | ||||||
|       | The Cython system, Redshift, was written for my current research. I |  | ||||||
|       | plan to improve it for general use in June, after my contract ends |  | ||||||
|       | at Macquarie University. The current version is  |  | ||||||
|       a(href=urls.redshift) hosted on GitHub |  | ||||||
|       | . |  | ||||||
|     h3 Problem Description |  | ||||||
| 
 |  | ||||||
|     p It’d be nice to type an instruction like this into your phone: |  | ||||||
| 
 |  | ||||||
|     p.example |  | ||||||
|       Set volume to zero when I’m in a meeting, unless John’s school calls. |  | ||||||
|     p |  | ||||||
|       | And have it set the appropriate policy. On Android you can do this |  | ||||||
|       | sort of thing with  |  | ||||||
|       a(href=urls.tasker) Tasker |  | ||||||
|       | , but an NL interface would be much better. It’d be especially nice |  | ||||||
|       | to receive a meaning representation you could edit, so you could see |  | ||||||
|       | what it thinks you said, and correct it. |  | ||||||
|     p |  | ||||||
|       | There are lots of problems to solve to make that work, but some sort |  | ||||||
|       | of syntactic representation is definitely necessary. We need to know that: |  | ||||||
| 
 |  | ||||||
|     p.example |  | ||||||
|       Unless John’s school calls, when I’m in a meeting, set volume to zero |  | ||||||
| 
 |  | ||||||
|     p is another way of phrasing the first instruction, while: |  | ||||||
| 
 |  | ||||||
|     p.example |  | ||||||
|       Unless John’s school, call when I’m in a meeting |  | ||||||
| 
 |  | ||||||
|     p means something completely different. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | A dependency parser returns a graph of word-word relationships, |  | ||||||
|       | intended to make such reasoning easier. Our graphs will be trees – |  | ||||||
|       | edges will be directed, and every node (word) will have exactly one |  | ||||||
|       | incoming arc (one dependency, with its head), except one. |  | ||||||
| 
 |  | ||||||
|     h4 Example usage |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | >>> parser = parser.Parser() |  | ||||||
|         | >>> tokens = "Set the volume to zero when I 'm in a meeting unless John 's school calls".split() |  | ||||||
|         | >>> tags, heads = parser.parse(tokens) |  | ||||||
|         | >>> heads |  | ||||||
|         | [-1, 2, 0, 0, 3, 0, 7, 5, 7, 10, 8, 0, 13, 15, 15, 11] |  | ||||||
|         | >>> for i, h in enumerate(heads):  |  | ||||||
|         | ...   head = tokens[h] if h >= 0 else 'None' |  | ||||||
|         | ...   print(tokens[i] + ' <-- ' + head) |  | ||||||
|         | Set <-- None |  | ||||||
|         | the <-- volume |  | ||||||
|         | volume <-- Set |  | ||||||
|         | to <-- Set |  | ||||||
|         | zero <-- to |  | ||||||
|         | when <-- Set |  | ||||||
|         | I <-- 'm |  | ||||||
|         | 'm <-- when |  | ||||||
|         | in <-- 'm |  | ||||||
|         | a <-- meeting |  | ||||||
|         | meeting <-- in |  | ||||||
|         | unless <-- Set |  | ||||||
|         | John <-- 's |  | ||||||
|         | 's   <-- calls |  | ||||||
|         | school <-- calls |  | ||||||
|         | calls <-- unless |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       The idea is that it should be slightly easier to reason from the parse, |  | ||||||
|       than it was from the string. The parse-to-meaning mapping is hopefully |  | ||||||
|       simpler than the string-to-meaning mapping. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       The most confusing thing about this problem area is that “correctness” |  | ||||||
|       is defined by convention — by annotation guidelines. If you haven’t |  | ||||||
|       read the guidelines and you’re not a linguist, you can’t tell whether |  | ||||||
|       the parse is “wrong” or “right”, which makes the whole task feel weird |  | ||||||
|       and artificial. |  | ||||||
|      |  | ||||||
|     p. |  | ||||||
|       For instance, there’s a mistake in the parse above: “John’s school |  | ||||||
|       calls” is structured wrongly, according to the Stanford annotation |  | ||||||
|       guidelines. The structure of that part of the sentence is how the |  | ||||||
|       annotators were instructed to parse an example like “John’s school |  | ||||||
|       clothes”. |  | ||||||
|      |  | ||||||
|     p |  | ||||||
|       | It’s worth dwelling on this point a bit. We could, in theory, have |  | ||||||
|       | written our guidelines so that the “correct” parses were reversed. |  | ||||||
|       | There’s good reason to believe the parsing task will be harder if we |  | ||||||
|       | reversed our convention, as it’d be less consistent with the rest of |  | ||||||
|       | the grammar.  |  | ||||||
|       sup: a(href='#note-2') [2] |  | ||||||
|       | But we could test that empirically, and we’d be pleased to gain an |  | ||||||
|       | advantage by reversing the policy. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | We definitely do want that distinction in the guidelines — we don’t |  | ||||||
|       | want both to receive the same structure, or our output will be less |  | ||||||
|       | useful. The annotation guidelines strike a balance between what |  | ||||||
|       | distinctions downstream applications will find useful, and what |  | ||||||
|       | parsers will be able to predict easily. |  | ||||||
| 
 |  | ||||||
|     h4 Projective trees |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | There’s a particularly useful simplification that we can make, when |  | ||||||
|       | deciding what we want the graph to look like: we can restrict the |  | ||||||
|       | graph structures we’ll be dealing with. This doesn’t just give us a |  | ||||||
|       | likely advantage in learnability; it can have deep algorithmic |  | ||||||
|       | implications. We follow most work on English in constraining the |  | ||||||
|       | dependency graphs to be  |  | ||||||
|       em projective trees |  | ||||||
|       | : |  | ||||||
| 
 |  | ||||||
|     ol |  | ||||||
|       li Tree. Every word has exactly one head, except for the dummy ROOT symbol. |  | ||||||
|       li |  | ||||||
|         | Projective. For every pair of dependencies (a1, a2) and (b1, b2), |  | ||||||
|         | if a1 < b1 < a2, then b2 <= a2. In other words, dependencies cannot “cross”. |  | ||||||
|         | You can’t have a pair of dependencies that goes a1 b1 a2 b2, or |  | ||||||
|         | b1 a1 b2 a2. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | There’s a rich literature on parsing non-projective trees, and a |  | ||||||
|       | smaller literature on parsing DAGs. But the parsing algorithm I’ll |  | ||||||
|       | be explaining deals with projective trees. |  | ||||||
| 
 |  | ||||||
|     h3 Greedy transition-based parsing |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | Our parser takes as input a list of string tokens, and outputs a |  | ||||||
|       | list of head indices, representing edges in the graph. If the  |  | ||||||
| 
 |  | ||||||
|       em i |  | ||||||
| 
 |  | ||||||
|       | th member of heads is  |  | ||||||
| 
 |  | ||||||
|       em j |  | ||||||
| 
 |  | ||||||
|       | , the dependency parse contains an edge (j, i). A transition-based |  | ||||||
|       | parser is a finite-state transducer; it maps an array of N words |  | ||||||
|       | onto an output array of N head indices: |  | ||||||
| 
 |  | ||||||
|     table.center |  | ||||||
|       tbody |  | ||||||
|         tr |  | ||||||
|           td |  | ||||||
|             em start |  | ||||||
|           td MSNBC |  | ||||||
|           td reported |  | ||||||
|           td that |  | ||||||
|           td Facebook |  | ||||||
|           td bought |  | ||||||
|           td WhatsApp |  | ||||||
|           td for |  | ||||||
|           td $16bn |  | ||||||
|           td |  | ||||||
|             em root |  | ||||||
|         tr |  | ||||||
|           td 0 |  | ||||||
|           td 2 |  | ||||||
|           td 9 |  | ||||||
|           td 2 |  | ||||||
|           td 4 |  | ||||||
|           td 2 |  | ||||||
|           td 4 |  | ||||||
|           td 4 |  | ||||||
|           td 7 |  | ||||||
|           td 0 |  | ||||||
|     p |  | ||||||
|       | The heads array denotes that the head of  |  | ||||||
|       em MSNBC |  | ||||||
|       |  is  |  | ||||||
|       em reported |  | ||||||
|       | :  |  | ||||||
|       em MSNBC |  | ||||||
|       |  is word 1, and  |  | ||||||
|       em reported |  | ||||||
|       |  is word 2, and  |  | ||||||
|       code.language-python heads[1] == 2 |  | ||||||
|       | . You can already see why parsing a tree is handy — this data structure |  | ||||||
|       | wouldn’t work if we had to output a DAG, where words may have multiple |  | ||||||
|       | heads. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | Although  |  | ||||||
|       code.language-python heads |  | ||||||
|       | can be represented as an array, we’d actually like to maintain some |  | ||||||
|       | alternate ways to access the parse, to make it easy and efficient to |  | ||||||
|       | extract features. Our  |  | ||||||
| 
 |  | ||||||
|       code.language-python Parse |  | ||||||
|       | class looks like this: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | class Parse(object): |  | ||||||
|         |     def __init__(self, n): |  | ||||||
|         |         self.n = n |  | ||||||
|         |         self.heads = [None] * (n-1) |  | ||||||
|         |         self.lefts = [] |  | ||||||
|         |         self.rights = [] |  | ||||||
|         |         for i in range(n+1): |  | ||||||
|         |             self.lefts.append(DefaultList(0)) |  | ||||||
|         |             self.rights.append(DefaultList(0)) |  | ||||||
|         |      |  | ||||||
|         |     def add_arc(self, head, child): |  | ||||||
|         |         self.heads[child] = head |  | ||||||
|         |         if child < head: |  | ||||||
|         |             self.lefts[head].append(child) |  | ||||||
|         |         else: |  | ||||||
|         |             self.rights[head].append(child) |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | As well as the parse, we also have to keep track of where we’re up |  | ||||||
|       | to in the sentence. We’ll do this with an index into the  |  | ||||||
|       code.language-python words |  | ||||||
|       |  array, and a stack, to which we’ll push words, before popping them |  | ||||||
|       | once their head is set. So our state data structure is fundamentally: |  | ||||||
| 
 |  | ||||||
|     ul |  | ||||||
|       li An index, i, into the list of tokens; |  | ||||||
|       li The dependencies added so far, in Parse |  | ||||||
|       li |  | ||||||
|         | A stack, containing words that occurred before i, for which we’re |  | ||||||
|         | yet to assign a head. |  | ||||||
| 
 |  | ||||||
|     p Each step of the parsing process applies one of three actions to the state: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | SHIFT = 0; RIGHT = 1; LEFT = 2 |  | ||||||
|         | MOVES = [SHIFT, RIGHT, LEFT] |  | ||||||
|         |  |  | ||||||
|         | def transition(move, i, stack, parse): |  | ||||||
|         |     global SHIFT, RIGHT, LEFT |  | ||||||
|         |     if move == SHIFT: |  | ||||||
|         |         stack.append(i) |  | ||||||
|         |         return i + 1 |  | ||||||
|         |     elif move == RIGHT: |  | ||||||
|         |         parse.add_arc(stack[-2], stack.pop()) |  | ||||||
|         |         return i |  | ||||||
|         |     elif move == LEFT: |  | ||||||
|         |         parse.add_arc(i, stack.pop()) |  | ||||||
|         |         return i |  | ||||||
|         |     raise GrammarError("Unknown move: %d" % move) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The  |  | ||||||
|       code.language-python LEFT |  | ||||||
|       |  and  |  | ||||||
|       code.language-python RIGHT |  | ||||||
|       |  actions add dependencies and pop the stack, while  |  | ||||||
|       code.language-python SHIFT |  | ||||||
|       |  pushes the stack and advances i into the buffer. |  | ||||||
|     p. |  | ||||||
|       So, the parser starts with an empty stack, and a buffer index at 0, with |  | ||||||
|       no dependencies recorded. It chooses one of the (valid) actions, and |  | ||||||
|       applies it to the state. It continues choosing actions and applying |  | ||||||
|       them until the stack is empty and the buffer index is at the end of |  | ||||||
|       the input. (It’s hard to understand this sort of algorithm without |  | ||||||
|       stepping through it. Try coming up with a sentence, drawing a projective |  | ||||||
|       parse tree over it, and then try to reach the parse tree by choosing |  | ||||||
|       the right sequence of transitions.) |  | ||||||
| 
 |  | ||||||
|     p Here’s what the parsing loop looks like in code: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | class Parser(object): |  | ||||||
|         |     ... |  | ||||||
|         |     def parse(self, words): |  | ||||||
|         |         tags = self.tagger(words) |  | ||||||
|         |         n = len(words) |  | ||||||
|         |         idx = 1 |  | ||||||
|         |         stack = [0] |  | ||||||
|         |         deps = Parse(n) |  | ||||||
|         |         while stack or idx < n: |  | ||||||
|         |             features = extract_features(words, tags, idx, n, stack, deps) |  | ||||||
|         |             scores = self.model.score(features) |  | ||||||
|         |             valid_moves = get_valid_moves(idx, n, len(stack)) |  | ||||||
|         |             next_move = max(valid_moves, key=lambda move: scores[move]) |  | ||||||
|         |             idx = transition(next_move, idx, stack, deps) |  | ||||||
|         |         return tags, deps |  | ||||||
|         |  |  | ||||||
|         | def get_valid_moves(i, n, stack_depth): |  | ||||||
|         |     moves = [] |  | ||||||
|         |     if i < n: |  | ||||||
|         |         moves.append(SHIFT) |  | ||||||
|         |     if stack_depth >= 2: |  | ||||||
|         |         moves.append(RIGHT) |  | ||||||
|         |     if stack_depth >= 1: |  | ||||||
|         |         moves.append(LEFT) |  | ||||||
|         |     return moves |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       We start by tagging the sentence, and initializing the state. We then |  | ||||||
|       map the state to a set of features, which we score using a linear model. |  | ||||||
|       We then find the best-scoring valid move, and apply it to the state. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The model scoring works the same as it did in  |  | ||||||
|       a(href='https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/') the POS tagger. |  | ||||||
|       | If you’re confused about the idea of extracting features and scoring |  | ||||||
|       | them with a linear model, you should review that post. Here’s a reminder |  | ||||||
|       | of how the model scoring works: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | class Perceptron(object): |  | ||||||
|         |     ... |  | ||||||
|         |     def score(self, features): |  | ||||||
|         |         all_weights = self.weights |  | ||||||
|         |         scores = dict((clas, 0) for clas in self.classes) |  | ||||||
|         |         for feat, value in features.items(): |  | ||||||
|         |             if value == 0: |  | ||||||
|         |                 continue |  | ||||||
|         |             if feat not in all_weights: |  | ||||||
|         |                 continue |  | ||||||
|         |             weights = all_weights[feat] |  | ||||||
|         |             for clas, weight in weights.items(): |  | ||||||
|         |                 scores[clas] += value * weight |  | ||||||
|         |         return scores |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       It’s just summing the class-weights for each feature. This is often |  | ||||||
|       expressed as a dot-product, but when you’re dealing with multiple |  | ||||||
|       classes, that gets awkward, I find. |  | ||||||
|      |  | ||||||
|     p. |  | ||||||
|       The beam parser (RedShift) tracks multiple candidates, and only decides |  | ||||||
|       on the best one at the very end. We’re going to trade away accuracy |  | ||||||
|       in favour of efficiency and simplicity. We’ll only follow a single |  | ||||||
|       analysis. Our search strategy will be entirely greedy, as it was with |  | ||||||
|       the POS tagger. We’ll lock-in our choices at every step. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       If you read the POS tagger post carefully, you might see the underlying |  | ||||||
|       similarity. What we’ve done is mapped the parsing problem onto a |  | ||||||
|       sequence-labelling problem, which we address using a “flat”, or unstructured, |  | ||||||
|       learning algorithm (by doing greedy search). |  | ||||||
| 
 |  | ||||||
|     h3 Features |  | ||||||
|     p. |  | ||||||
|       Feature extraction code is always pretty ugly. The features for the parser |  | ||||||
|       refer to a few tokens from the context: |  | ||||||
| 
 |  | ||||||
|     ul |  | ||||||
|       li The first three words of the buffer (n0, n1, n2) |  | ||||||
|       li The top three words of the stack (s0, s1, s2) |  | ||||||
|       li The two leftmost children of s0 (s0b1, s0b2); |  | ||||||
|       li The two rightmost children of s0 (s0f1, s0f2); |  | ||||||
|       li The two leftmost children of n0 (n0b1, n0b2) |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       For these 12 tokens, we refer to the word-form, the part-of-speech tag, |  | ||||||
|       and the number of left and right children attached to the token. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       Because we’re using a linear model, we have our features refer to pairs |  | ||||||
|       and triples of these atomic properties. |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def extract_features(words, tags, n0, n, stack, parse): |  | ||||||
|         |     def get_stack_context(depth, stack, data): |  | ||||||
|         |         if depth >= 3: |  | ||||||
|         |             return data[stack[-1]], data[stack[-2]], data[stack[-3]] |  | ||||||
|         |         elif depth >= 2: |  | ||||||
|         |             return data[stack[-1]], data[stack[-2]], '' |  | ||||||
|         |         elif depth == 1: |  | ||||||
|         |             return data[stack[-1]], '', '' |  | ||||||
|         |         else: |  | ||||||
|         |             return '', '', '' |  | ||||||
|         |  |  | ||||||
|         |     def get_buffer_context(i, n, data): |  | ||||||
|         |         if i + 1 >= n: |  | ||||||
|         |             return data[i], '', '' |  | ||||||
|         |         elif i + 2 >= n: |  | ||||||
|         |             return data[i], data[i + 1], '' |  | ||||||
|         |         else: |  | ||||||
|         |             return data[i], data[i + 1], data[i + 2] |  | ||||||
|         |  |  | ||||||
|         |     def get_parse_context(word, deps, data): |  | ||||||
|         |         if word == -1: |  | ||||||
|         |             return 0, '', '' |  | ||||||
|         |         deps = deps[word] |  | ||||||
|         |         valency = len(deps) |  | ||||||
|         |         if not valency: |  | ||||||
|         |             return 0, '', '' |  | ||||||
|         |         elif valency == 1: |  | ||||||
|         |             return 1, data[deps[-1]], '' |  | ||||||
|         |         else: |  | ||||||
|         |             return valency, data[deps[-1]], data[deps[-2]] |  | ||||||
|         |  |  | ||||||
|         |     features = {} |  | ||||||
|         |     # Set up the context pieces --- the word, W, and tag, T, of: |  | ||||||
|         |     # S0-2: Top three words on the stack |  | ||||||
|         |     # N0-2: First three words of the buffer |  | ||||||
|         |     # n0b1, n0b2: Two leftmost children of the first word of the buffer |  | ||||||
|         |     # s0b1, s0b2: Two leftmost children of the top word of the stack |  | ||||||
|         |     # s0f1, s0f2: Two rightmost children of the top word of the stack |  | ||||||
|         |  |  | ||||||
|         |     depth = len(stack) |  | ||||||
|         |     s0 = stack[-1] if depth else -1 |  | ||||||
|         |  |  | ||||||
|         |     Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words) |  | ||||||
|         |     Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags) |  | ||||||
|         |  |  | ||||||
|         |     Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words) |  | ||||||
|         |     Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags) |  | ||||||
|         |  |  | ||||||
|         |     Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words) |  | ||||||
|         |     Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags) |  | ||||||
|         |  |  | ||||||
|         |     Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words) |  | ||||||
|         |     _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags) |  | ||||||
|         |  |  | ||||||
|         |     Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words) |  | ||||||
|         |     _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags) |  | ||||||
|         |  |  | ||||||
|         |     Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words) |  | ||||||
|         |     _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags) |  | ||||||
|         |  |  | ||||||
|         |     # Cap numeric features at 5?  |  | ||||||
|         |     # String-distance |  | ||||||
|         |     Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0 |  | ||||||
|         |  |  | ||||||
|         |     features['bias'] = 1 |  | ||||||
|         |     # Add word and tag unigrams |  | ||||||
|         |     for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2): |  | ||||||
|         |         if w: |  | ||||||
|         |             features['w=%s' % w] = 1 |  | ||||||
|         |     for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2): |  | ||||||
|         |         if t: |  | ||||||
|         |             features['t=%s' % t] = 1 |  | ||||||
|         |  |  | ||||||
|         |     # Add word/tag pairs |  | ||||||
|         |     for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))): |  | ||||||
|         |         if w or t: |  | ||||||
|         |             features['%d w=%s, t=%s' % (i, w, t)] = 1 |  | ||||||
|         |  |  | ||||||
|         |     # Add some bigrams |  | ||||||
|         |     features['s0w=%s,  n0w=%s' % (Ws0, Wn0)] = 1 |  | ||||||
|         |     features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1 |  | ||||||
|         |     features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1 |  | ||||||
|         |     features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1 |  | ||||||
|         |     features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1 |  | ||||||
|         |     features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1 |  | ||||||
|         |     features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1 |  | ||||||
|         |     features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1 |  | ||||||
|         |  |  | ||||||
|         |     # Add some tag trigrams |  | ||||||
|         |     trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0),  |  | ||||||
|         |                 (Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1), |  | ||||||
|         |                 (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2), |  | ||||||
|         |                 (Ts0, Ts1, Ts1)) |  | ||||||
|         |     for i, (t1, t2, t3) in enumerate(trigrams): |  | ||||||
|         |         if t1 or t2 or t3: |  | ||||||
|         |             features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1 |  | ||||||
|         |  |  | ||||||
|         |     # Add some valency and distance features |  | ||||||
|         |     vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b)) |  | ||||||
|         |     vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b)) |  | ||||||
|         |     d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0), |  | ||||||
|         |         ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0)) |  | ||||||
|         |     for i, (w_t, v_d) in enumerate(vw + vt + d): |  | ||||||
|         |         if w_t or v_d: |  | ||||||
|         |             features['val/d-%d %s %d' % (i, w_t, v_d)] = 1 |  | ||||||
|         |     return features</code></pre> |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     h3 Training |  | ||||||
|      |  | ||||||
|     p. |  | ||||||
|       Weights are learned using the same algorithm, averaged perceptron, that |  | ||||||
|       we used for part-of-speech tagging. Its key strength is that it’s an |  | ||||||
|       online learning algorithm: examples stream in one-by-one, we make our |  | ||||||
|       prediction, check the actual answer, and adjust our beliefs (weights) |  | ||||||
|       if we were wrong. |  | ||||||
|          |  | ||||||
|     p The training loop looks like this: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|           | class Parser(object): |  | ||||||
|           |     ... |  | ||||||
|           |     def train_one(self, itn, words, gold_tags, gold_heads): |  | ||||||
|           |         n = len(words) |  | ||||||
|           |         i = 2; stack = [1]; parse = Parse(n) |  | ||||||
|           |         tags = self.tagger.tag(words) |  | ||||||
|           |         while stack or (i + 1) < n: |  | ||||||
|           |             features = extract_features(words, tags, i, n, stack, parse) |  | ||||||
|           |             scores = self.model.score(features) |  | ||||||
|           |             valid_moves = get_valid_moves(i, n, len(stack)) |  | ||||||
|           |             guess = max(valid_moves, key=lambda move: scores[move]) |  | ||||||
|           |             gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads) |  | ||||||
|           |             best = max(gold_moves, key=lambda move: scores[move]) |  | ||||||
|           |             self.model.update(best, guess, features) |  | ||||||
|           |             i = transition(guess, i, stack, parse) |  | ||||||
|           |         # Return number correct |  | ||||||
|           |         return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]]) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|        |  | ||||||
|     p  |  | ||||||
|       | The most interesting part of the training process is in  |  | ||||||
|       code.language-python get_gold_moves. |  | ||||||
|       | The performance of our parser is made possible by an advance by Goldberg |  | ||||||
|       | and Nivre (2012), who showed that we’d been doing this wrong for years. |  | ||||||
|      |  | ||||||
|     p |  | ||||||
|       | In the POS-tagging post, I cautioned that during training you need to |  | ||||||
|       | make sure you pass in the last two |  | ||||||
|       em predicted |  | ||||||
|       | tags as features for the current tag, not the last two  |  | ||||||
|       em gold |  | ||||||
|       | tags. At test time you’ll only have the predicted tags, so if you |  | ||||||
|       | base your features on the gold sequence during training, your training |  | ||||||
|       | contexts won’t resemble your test-time contexts, so you’ll learn the |  | ||||||
|       | wrong weights. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       In parsing, the problem was that we didn’t know  |  | ||||||
|       em how |  | ||||||
|       | to pass in the predicted sequence! Training worked by taking the |  | ||||||
|       | gold-standard tree, and finding a transition sequence that led to it. |  | ||||||
|       | i.e., you got back a sequence of moves, with the guarantee that if |  | ||||||
|       | you followed those moves, you’d get the gold-standard dependencies. |  | ||||||
|      |  | ||||||
|     p |  | ||||||
|       | The problem is, we didn’t know how to define the “correct” move to |  | ||||||
|       | teach a parser to make if it was in any state that  |  | ||||||
|       em wasn’t |  | ||||||
|       |  along that gold-standard sequence. Once the parser had made a mistake, |  | ||||||
|       | we didn’t know how to train from that example. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | That was a big problem, because it meant that once the parser started |  | ||||||
|       | making mistakes, it would end up in states unlike any in its training |  | ||||||
|       | data – leading to yet more mistakes. The problem was specific |  | ||||||
|       | to greedy parsers: once you use a beam, there’s a natural way to do |  | ||||||
|       | structured prediction. |  | ||||||
|     p |  | ||||||
|       | The solution seems obvious once you know it, like all the best breakthroughs. |  | ||||||
|       | What we do is define a function that asks “How many gold-standard |  | ||||||
|       | dependencies can be recovered from this state?”. If you can define |  | ||||||
|       | that function, then you can apply each move in turn, and ask, “How |  | ||||||
|       | many gold-standard dependencies can be recovered from  |  | ||||||
|       em this |  | ||||||
|       | state?”. If the action you applied allows  |  | ||||||
|       em fewer |  | ||||||
|       | gold-standard dependencies to be reached, then it is sub-optimal. |  | ||||||
| 
 |  | ||||||
|     p That’s a lot to take in. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | So we have this function  |  | ||||||
|       code Oracle(state) |  | ||||||
|       | : |  | ||||||
|       pre |  | ||||||
|         code |  | ||||||
|           | Oracle(state) = | gold_arcs ∩ reachable_arcs(state) | |  | ||||||
|     p |  | ||||||
|       | We also have a set of actions, each of which returns a new state. |  | ||||||
|       | We want to know: |  | ||||||
| 
 |  | ||||||
|     ul |  | ||||||
|       li shift_cost = Oracle(state) – Oracle(shift(state)) |  | ||||||
|       li right_cost = Oracle(state) – Oracle(right(state)) |  | ||||||
|       li left_cost = Oracle(state) – Oracle(left(state)) |  | ||||||
|      |  | ||||||
|     p |  | ||||||
|       | Now, at least one of those costs  |  | ||||||
|       em has |  | ||||||
|       | to be zero. Oracle(state) is asking, “what’s the cost of the best |  | ||||||
|       | path forward?”, and the first action of that best path has to be |  | ||||||
|       | shift, right, or left. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | It turns out that we can derive Oracle fairly simply for many transition |  | ||||||
|       | systems. The derivation for the transition system we’re using, Arc |  | ||||||
|       | Hybrid, is in Goldberg and Nivre (2013). |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | We’re going to implement the oracle as a function that returns the |  | ||||||
|       | zero-cost moves, rather than implementing a function Oracle(state). |  | ||||||
|       | This prevents us from doing a bunch of costly copy operations. |  | ||||||
|       | Hopefully the reasoning in the code isn’t too hard to follow, but |  | ||||||
|       | you can also consult Goldberg and Nivre’s papers if you’re confused |  | ||||||
|       | and want to get to the bottom of this. |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def get_gold_moves(n0, n, stack, heads, gold): |  | ||||||
|         |     def deps_between(target, others, gold): |  | ||||||
|         |         for word in others: |  | ||||||
|         |             if gold[word] == target or gold[target] == word: |  | ||||||
|         |                 return True |  | ||||||
|         |         return False |  | ||||||
|         |  |  | ||||||
|         |     valid = get_valid_moves(n0, n, len(stack)) |  | ||||||
|         |     if not stack or (SHIFT in valid and gold[n0] == stack[-1]): |  | ||||||
|         |         return [SHIFT] |  | ||||||
|         |     if gold[stack[-1]] == n0: |  | ||||||
|         |         return [LEFT] |  | ||||||
|         |     costly = set([m for m in MOVES if m not in valid]) |  | ||||||
|         |     # If the word behind s0 is its gold head, Left is incorrect |  | ||||||
|         |     if len(stack) >= 2 and gold[stack[-1]] == stack[-2]: |  | ||||||
|         |         costly.add(LEFT) |  | ||||||
|         |     # If there are any dependencies between n0 and the stack, |  | ||||||
|         |     # pushing n0 will lose them. |  | ||||||
|         |     if SHIFT not in costly and deps_between(n0, stack, gold): |  | ||||||
|         |         costly.add(SHIFT) |  | ||||||
|         |     # If there are any dependencies between s0 and the buffer, popping |  | ||||||
|         |     # s0 will lose them. |  | ||||||
|         |     if deps_between(stack[-1], range(n0+1, n-1), gold): |  | ||||||
|         |         costly.add(LEFT) |  | ||||||
|         |         costly.add(RIGHT) |  | ||||||
|         |     return [m for m in MOVES if m not in costly]</code></pre> |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | Doing this “dynamic oracle” training procedure makes a big difference |  | ||||||
|       | to accuracy — typically 1-2%, with no difference to the way the run-time |  | ||||||
|       | works. The old “static oracle” greedy training procedure is fully |  | ||||||
|       | obsolete; there’s no reason to do it that way any more. |  | ||||||
| 
 |  | ||||||
|     h3 Conclusion |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | I have the sense that language technologies, particularly those relating |  | ||||||
|       | to grammar, are particularly mysterious. I can imagine having no idea |  | ||||||
|       | what the program might even do. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | I think it therefore seems natural to people that the best solutions |  | ||||||
|       | would be over-whelmingly complicated. A 200,000 line Java package |  | ||||||
|       | feels appropriate. |  | ||||||
|     p |  | ||||||
|       | But, algorithmic code is usually short, when only a single algorithm |  | ||||||
|       | is implemented. And when you only implement one algorithm, and you |  | ||||||
|       | know exactly what you want to write before you write a line, you |  | ||||||
|       | also don’t pay for any unnecessary abstractions, which can have a |  | ||||||
|       | big performance impact. |  | ||||||
| 
 |  | ||||||
|     h3 Notes |  | ||||||
|     p |  | ||||||
|       a(name='note-1') |  | ||||||
|         | [1] I wasn’t really sure how to count the lines of code in the Stanford |  | ||||||
|         | parser. Its jar file ships over 200k, but there are a lot of different |  | ||||||
|         | models in it. It’s not important, but it's certainly over 4k. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       a(name='note-2') |  | ||||||
|       | [2] For instance, how would you parse, “John’s school of music calls”? |  | ||||||
|       | You want to make sure the phrase “John’s school” has a consistent |  | ||||||
|       | structure in both “John’s school calls” and “John’s school of music |  | ||||||
|       | calls”. Reasoning about the different “slots” you can put a phrase |  | ||||||
|       | into is a key way we reason about what syntactic analyses look like. |  | ||||||
|       | You can think of each phrase as having a different shaped connector, |  | ||||||
|       | which you need to plug into different slots — which each phrase also |  | ||||||
|       | has a certain number of, each of a different shape. We’re trying to |  | ||||||
|       | figure out what connectors are where, so we can figure out how the |  | ||||||
|       | sentences are put together. |  | ||||||
| 
 |  | ||||||
|     h3 Idle speculation |  | ||||||
|     p |  | ||||||
|       | For a long time, incremental language processing algorithms were |  | ||||||
|       | primarily of scientific interest. If you want to write a parser to |  | ||||||
|       | test a theory about how the human sentence processor might work, well, |  | ||||||
|       | that parser needs to build partial interpretations. There’s a wealth |  | ||||||
|       | of evidence, including commonsense introspection, that establishes |  | ||||||
|       | that we don’t buffer input and analyse it once the speaker has finished. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | But now algorithms with that neat scientific feature are winning! |  | ||||||
|       | As best as I can tell, the secret to that success is to be: |  | ||||||
| 
 |  | ||||||
|     ul |  | ||||||
|       li Incremental. Earlier words constrain the search. |  | ||||||
|       li |  | ||||||
|         | Error-driven. Training involves a working hypothesis, which is |  | ||||||
|         | updated as it makes mistakes. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The links to human sentence processing seem tantalising. I look |  | ||||||
|       | forward to seeing whether these engineering breakthroughs lead to |  | ||||||
|       | any psycholinguistic advances. |  | ||||||
| 
 |  | ||||||
|     h3 Bibliography |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The NLP literature is almost entirely open access. All of the relevant |  | ||||||
|       | papers can be found  |  | ||||||
|       a(href=urls.acl_anthology, rel='nofollow') here |  | ||||||
|       | . |  | ||||||
|     p |  | ||||||
|       | The parser I’ve described is an implementation of the dynamic-oracle |  | ||||||
|       | Arc-Hybrid system here: |  | ||||||
| 
 |  | ||||||
|       span.bib-item |  | ||||||
|         | Goldberg, Yoav; Nivre, Joakim.  |  | ||||||
|         em Training Deterministic Parsers with Non-Deterministic Oracles |  | ||||||
|         | . TACL 2013 |  | ||||||
|     p |  | ||||||
|       | However, I wrote my own features for it. The arc-hybrid system was |  | ||||||
|       | originally described here: |  | ||||||
| 
 |  | ||||||
|       span.bib-item |  | ||||||
|         | Kuhlmann, Marco; Gomez-Rodriguez, Carlos; Satta, Giorgio. Dynamic |  | ||||||
|         | programming algorithms for transition-based dependency parsers. ACL 2011 |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The dynamic oracle training method was first described here: |  | ||||||
|       span.bib-item |  | ||||||
|         | A Dynamic Oracle for Arc-Eager Dependency Parsing. Goldberg, Yoav; |  | ||||||
|         | Nivre, Joakim. COLING 2012 |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | This work depended on a big break-through in accuracy for transition-based |  | ||||||
|       | parsers, when beam-search was properly explored by Zhang and Clark. |  | ||||||
|       | They have several papers, but the preferred citation is: |  | ||||||
| 
 |  | ||||||
|       span.bib-item |  | ||||||
|         | Zhang, Yue; Clark, Stephen. Syntactic Processing Using the Generalized |  | ||||||
|         | Perceptron and Beam Search. Computational Linguistics 2011 (1) |  | ||||||
|     p |  | ||||||
|       | Another important paper was this little feature engineering paper, |  | ||||||
|       | which further improved the accuracy: |  | ||||||
| 
 |  | ||||||
|       span.bib-item |  | ||||||
|         | Zhang, Yue;  Nivre, Joakim. Transition-based Dependency Parsing with |  | ||||||
|         | Rich Non-local Features. ACL 2011 |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The generalised perceptron, which is the learning framework for these |  | ||||||
|       | beam parsers, is from this paper: |  | ||||||
|       span.bib-item |  | ||||||
|         | Collins, Michael. Discriminative Training Methods for Hidden Markov |  | ||||||
|         | Models: Theory and Experiments with Perceptron Algorithms. EMNLP 2002 |  | ||||||
| 
 |  | ||||||
|     h3 Experimental details |  | ||||||
|     p |  | ||||||
|       | The results at the start of the post refer to Section 22 of the Wall |  | ||||||
|       | Street Journal corpus. The Stanford parser was run as follows: |  | ||||||
| 
 |  | ||||||
|     pre.language-bash |  | ||||||
|       code |  | ||||||
|         | java -mx10000m -cp "$scriptdir/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser \ |  | ||||||
|         | -outputFormat "penn" edu/stanford/nlp/models/lexparser/englishFactored.ser.gz $* |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | A small post-process was applied, to undo the fancy tokenisation |  | ||||||
|       | Stanford adds for numbers, to make them match the PTB tokenisation: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | """Stanford parser retokenises numbers. Split them.""" |  | ||||||
|         | import sys |  | ||||||
|         | import re |  | ||||||
|         |   |  | ||||||
|         | qp_re = re.compile('\xc2\xa0') |  | ||||||
|         | for line in sys.stdin: |  | ||||||
|         |     line = line.rstrip() |  | ||||||
|         |     if qp_re.search(line): |  | ||||||
|         |         line = line.replace('(CD', '(QP (CD', 1) + ')' |  | ||||||
|         |         line = line.replace('\xc2\xa0', ') (CD ') |  | ||||||
|         |     print line |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The resulting PTB-format files were then converted into dependencies |  | ||||||
|       | using the Stanford converter: |  | ||||||
| 
 |  | ||||||
|     pre.language-bash |  | ||||||
|       code |  | ||||||
|         | ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp |  | ||||||
|         | ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/ |  | ||||||
|         | ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll |  | ||||||
|     p |  | ||||||
|       | I can’t easily read that anymore, but it should just convert every |  | ||||||
|       | .mrg file in a folder to a CoNLL-format Stanford basic dependencies |  | ||||||
|       | file, using the settings common in the dependency literature. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | I then converted the gold-standard trees from WSJ 22, for the evaluation. |  | ||||||
|       | Accuracy scores refer to unlabelled attachment score (i.e. the head index) |  | ||||||
|       | of all non-punctuation tokens. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | To train parser.py, I fed the gold-standard PTB trees for WSJ 02-21 |  | ||||||
|       | into the same conversion script. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | In a nutshell: The Stanford model and parser.py are trained on the |  | ||||||
|       | same set of sentences, and they each make their predictions on a |  | ||||||
|       | held-out test set, for which we know the answers. Accuracy refers |  | ||||||
|       | to how many of the words’ heads we got correct. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | Speeds were measured on a 2.4Ghz Xeon. I ran the experiments on a |  | ||||||
|       | server, to give the Stanford parser more memory. The parser.py system |  | ||||||
|       | runs fine on my MacBook Air. I used PyPy for the parser.py experiments; |  | ||||||
|       | CPython was about half as fast on an early benchmark. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | One of the reasons parser.py is so fast is that it does unlabelled |  | ||||||
|       | parsing. Based on previous experiments, a labelled parser would likely |  | ||||||
|       | be about 40x slower, and about 1% more accurate. Adapting the program |  | ||||||
|       | to labelled parsing would be a good exercise for the reader, if you |  | ||||||
|       | have access to the data. |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | The result from the Redshift parser was produced from commit  |  | ||||||
|       code.language-python b6b624c9900f3bf |  | ||||||
|       | , which was run as follows: |  | ||||||
|     pre.language-bash |  | ||||||
|       code |  | ||||||
|         | ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp |  | ||||||
|         | ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/ |  | ||||||
|         | ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll |  | ||||||
| 
 |  | ||||||
|     footer.meta(role='contentinfo') |  | ||||||
|       a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter |  | ||||||
|       .discuss |  | ||||||
|         a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News |  | ||||||
|         |  |  | ||||||
|         a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit |  | ||||||
|  | @ -1,492 +0,0 @@ | ||||||
| extends ./template_post.jade |  | ||||||
| 
 |  | ||||||
| block body_block |  | ||||||
|   - var urls = {} |  | ||||||
|   - urls.share_twitter = "http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|   article.post |  | ||||||
|     header |  | ||||||
|       h2 A good Part-of-Speech tagger in about 200 lines of Python |  | ||||||
|       .subhead |  | ||||||
|         | by  |  | ||||||
|         a(href="#" rel="author") Matthew Honnibal |  | ||||||
|         | on  |  | ||||||
|         time(datetime='2013-09-11') October 11, 2013 |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       Up-to-date knowledge about natural language processing is mostly locked away |  | ||||||
|       in academia. And academics are mostly pretty self-conscious when we write. |  | ||||||
|       We’re careful. We don’t want to stick our necks out too much. But under-confident |  | ||||||
|       recommendations suck, so here’s how to write a good part-of-speech tagger. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       There are a tonne of “best known techniques” for POS tagging, and you should |  | ||||||
|       ignore the others and just use Averaged Perceptron. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       You should use two tags of history, and features derived from the Brown word |  | ||||||
|       clusters distributed here. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       If you only need the tagger to work on carefully edited text, you should |  | ||||||
|       use case-sensitive features, but if you want a more robust tagger you |  | ||||||
|       should avoid them because they’ll make you over-fit to the conventions |  | ||||||
|       of your training domain. Instead, features that ask “how frequently is |  | ||||||
|       this word title-cased, in a large sample from the web?” work well. Then |  | ||||||
|       you can lower-case your comparatively tiny training corpus. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       For efficiency, you should figure out which frequent words in your training |  | ||||||
|       data have unambiguous tags, so you don’t have to do anything but output |  | ||||||
|       their tags when they come up. About 50% of the words can be tagged that way. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       And unless you really, really can’t do without an extra 0.1% of accuracy, |  | ||||||
|       you probably shouldn’t bother with any kind of search strategy — you should |  | ||||||
|       just use a greedy model. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       If you do all that, you’ll find your tagger easy to write and understand, |  | ||||||
|       and an efficient Cython implementation will perform as follows on the standard |  | ||||||
|       evaluation, 130,000 words of text from the Wall Street Journal: |  | ||||||
|        |  | ||||||
|     table |  | ||||||
|       thead |  | ||||||
|         tr |  | ||||||
|           th Tagger |  | ||||||
|           th Accuracy |  | ||||||
|           th Time (130k words) |  | ||||||
|       tbody |  | ||||||
|         tr |  | ||||||
|           td CyGreedyAP |  | ||||||
|           td 97.1% |  | ||||||
|           td 4s |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       The 4s includes initialisation time — the actual per-token speed is high |  | ||||||
|       enough to be irrelevant; it won’t be your bottleneck. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       It’s tempting to look at 97% accuracy and say something similar, but that’s |  | ||||||
|       not true. My parser is about 1% more accurate if the input has hand-labelled |  | ||||||
|       POS tags, and the taggers all perform much worse on out-of-domain data. |  | ||||||
|       Unfortunately accuracies have been fairly flat for the last ten years. |  | ||||||
|       That’s why my recommendation is to just use a simple and fast tagger that’s |  | ||||||
|       roughly as good. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       The thing is though, it’s very common to see people using taggers that |  | ||||||
|       aren’t anywhere near that good!  For an example of what a non-expert is |  | ||||||
|       likely to use, these were the two taggers wrapped by TextBlob, a new Python |  | ||||||
|       api that I think is quite neat: |  | ||||||
|        |  | ||||||
|     table |  | ||||||
|       thead |  | ||||||
|         tr |  | ||||||
|           th Tagger |  | ||||||
|           th Accuracy |  | ||||||
|           th Time (130k words) |  | ||||||
|       tbody |  | ||||||
|         tr |  | ||||||
|           td NLTK |  | ||||||
|           td 94.0% |  | ||||||
|           td 3m56s |  | ||||||
|         tr |  | ||||||
|           td Pattern |  | ||||||
|           td 93.5% |  | ||||||
|           td 26s |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       Both Pattern and NLTK are very robust and beautifully well documented, so |  | ||||||
|       the appeal of using them is obvious. But Pattern’s algorithms are pretty |  | ||||||
|       crappy, and NLTK carries tremendous baggage around in its implementation |  | ||||||
|       because of its massive framework, and double-duty as a teaching tool. |  | ||||||
| 
 |  | ||||||
|     p.   |  | ||||||
|       As a stand-alone tagger, my Cython implementation is needlessly complicated |  | ||||||
|       – it was written for my parser. So today I wrote a 200 line version |  | ||||||
|       of my recommended algorithm for TextBlob. It gets: |  | ||||||
|        |  | ||||||
|     table |  | ||||||
|       thead |  | ||||||
|         tr |  | ||||||
|           th Tagger |  | ||||||
|           th Accuracy |  | ||||||
|           th Time (130k words) |  | ||||||
|       tbody |  | ||||||
|         tr |  | ||||||
|           td PyGreedyAP |  | ||||||
|           td 96.8% |  | ||||||
|           td 12s |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       I traded some accuracy and a lot of efficiency to keep the implementation |  | ||||||
|       simple. Here’s a far-too-brief description of how it works. |  | ||||||
|        |  | ||||||
|     h3 Averaged perceptron |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       POS tagging is a “supervised learning problem”. You’re given a table of data, |  | ||||||
|       and you’re told that the values in the last column will be missing during |  | ||||||
|       run-time. You have to find correlations from the other columns to predict |  | ||||||
|       that value. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       So for us, the missing column will be “part of speech at word i”. The predictor |  | ||||||
|       columns (features) will be things like “part of speech at word i-1”, “last three |  | ||||||
|       letters of word at i+1”, etc. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       First, here’s what prediction looks like at run-time: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def predict(self, features): |  | ||||||
|         |     '''Dot-product the features and current weights and return the best class.''' |  | ||||||
|         |     scores = defaultdict(float) |  | ||||||
|         |     for feat in features: |  | ||||||
|         |         if feat not in self.weights: |  | ||||||
|         |             continue |  | ||||||
|         |         weights = self.weights[feat] |  | ||||||
|         |         for clas, weight in weights.items(): |  | ||||||
|         |             scores[clas] += weight |  | ||||||
|         |     # Do a secondary alphabetic sort, for stability |  | ||||||
|         |     return max(self.classes, key=lambda clas: (scores[clas], clas)) |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       Earlier I described the learning problem as a table, with one of the columns |  | ||||||
|       marked as missing-at-runtime. For NLP, our tables are always exceedingly |  | ||||||
|       sparse. You have columns like “word i-1=Parliament”, which is almost always |  | ||||||
|       0. So our “weight vectors” can pretty much never be implemented as vectors. |  | ||||||
|       Map-types are good though — here we use dictionaries. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       The input data, features, is a set with a member for every non-zero “column” |  | ||||||
|       in our “table” – every active feature. Usually this is actually a dictionary, |  | ||||||
|       to let you set values for the features. But here all my features are binary |  | ||||||
|       present-or-absent type deals. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       The weights data-structure is a dictionary of dictionaries, that ultimately |  | ||||||
|       associates feature/class pairs with some weight. You want to structure it |  | ||||||
|       this way instead of the reverse because of the way word frequencies are |  | ||||||
|       distributed: most words are rare, frequent words are very frequent. |  | ||||||
|        |  | ||||||
|     h3 Learning the weights |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       Okay, so how do we get the values for the weights? We start with an empty |  | ||||||
|       weights dictionary, and iteratively do the following: |  | ||||||
| 
 |  | ||||||
|     ol |  | ||||||
|       li Receive a new (features, POS-tag) pair |  | ||||||
|       li Guess the value of the POS tag given the current “weights” for the features |  | ||||||
|       li If guess is wrong, add +1 to the weights associated with the correct class for these features, and -1 to the weights for the predicted class. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       It’s one of the simplest learning algorithms. Whenever you make a mistake, |  | ||||||
|       increment the weights for the correct class, and penalise the weights that |  | ||||||
|       led to your false prediction. In code: |  | ||||||
|      |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def train(self, nr_iter, examples): |  | ||||||
|         |     for i in range(nr_iter): |  | ||||||
|         |         for features, true_tag in examples: |  | ||||||
|         |             guess = self.predict(features) |  | ||||||
|         |             if guess != true_tag: |  | ||||||
|         |                 for f in features: |  | ||||||
|         |                     self.weights[f][true_tag] += 1 |  | ||||||
|         |                     self.weights[f][guess] -= 1 |  | ||||||
|         |         random.shuffle(examples) |  | ||||||
|     p. |  | ||||||
|       If you iterate over the same example this way, the weights for the correct |  | ||||||
|       class would have to come out ahead, and you’d get the example right. If |  | ||||||
|       you think about what happens with two examples, you should be able to |  | ||||||
|       see that it will get them both right unless the features are identical. |  | ||||||
|       In general the algorithm will converge so long as the examples are |  | ||||||
|       linearly separable, although that doesn’t matter for our purpose. |  | ||||||
|        |  | ||||||
|     h3 Averaging the weights |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       We need to do one more thing to make the perceptron algorithm competitive. |  | ||||||
|       The problem with the algorithm so far is that if you train it twice on |  | ||||||
|       slightly different sets of examples, you end up with really different models. |  | ||||||
|       It doesn’t generalise that smartly. And the problem is really in the later |  | ||||||
|       iterations — if you let it run to convergence, it’ll pay lots of attention |  | ||||||
|       to the few examples it’s getting wrong, and mutate its whole model around |  | ||||||
|       them. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       So, what we’re going to do is make the weights more "sticky" – give |  | ||||||
|       the model less chance to ruin all its hard work in the later rounds. And |  | ||||||
|       we’re going to do that by returning the averaged weights, not the final |  | ||||||
|       weights. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       I doubt there are many people who are convinced that’s the most obvious |  | ||||||
|       solution to the problem, but whatever. We’re not here to innovate, and this |  | ||||||
|       way is time tested on lots of problems. If you have another idea, run the |  | ||||||
|       experiments and tell us what you find. Actually I’d love to see more work |  | ||||||
|       on this, now that the averaged perceptron has become such a prominent learning |  | ||||||
|       algorithm in NLP. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Okay. So this averaging. How’s that going to work? Note that we don’t want |  | ||||||
|       to just average after each outer-loop iteration. We want the average of all |  | ||||||
|       the values — from the inner loop. So if we have 5,000 examples, and we train |  | ||||||
|       for 10 iterations, we’ll average across 50,000 values for each weight. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Obviously we’re not going to store all those intermediate values. Instead, |  | ||||||
|       we’ll track an accumulator for each weight, and divide it by the number of |  | ||||||
|       iterations at the end. Again: we want the average weight assigned to a |  | ||||||
|       feature/class pair during learning, so the key component we need is the total |  | ||||||
|       weight it was assigned. But we also want to be careful about how we compute |  | ||||||
|       that accumulator, too. On almost any instance, we’re going to see a tiny |  | ||||||
|       fraction of active feature/class pairs. All the other feature/class weights |  | ||||||
|       won’t change. So we shouldn’t have to go back and add the unchanged value |  | ||||||
|       to our accumulators anyway, like chumps. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Since we’re not chumps, we’ll make the obvious improvement. We’ll maintain |  | ||||||
|       another dictionary that tracks how long each weight has gone unchanged. Now |  | ||||||
|       when we do change a weight, we can do a fast-forwarded update to the accumulator, |  | ||||||
|       for all those iterations where it lay unchanged. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Here’s what a weight update looks like now that we have to maintain the |  | ||||||
|       totals and the time-stamps: |  | ||||||
|        |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def update(self, truth, guess, features): |  | ||||||
|         |     def upd_feat(c, f, v): |  | ||||||
|         |         nr_iters_at_this_weight = self.i - self._timestamps[f][c] |  | ||||||
|         |         self._totals[f][c] += nr_iters_at_this_weight * self.weights[f][c] |  | ||||||
|         |         self.weights[f][c] += v |  | ||||||
|         |         self._timestamps[f][c] = self.i |  | ||||||
|         |  | ||||||
|         |     self.i += 1 |  | ||||||
|         |     for f in features: |  | ||||||
|         |         upd_feat(truth, f, 1.0) |  | ||||||
|         |         upd_feat(guess, f, -1.0) |  | ||||||
| 
 |  | ||||||
|     h3 Features and pre-processing |  | ||||||
|      |  | ||||||
|     p. |  | ||||||
|       The POS tagging literature has tonnes of intricate features sensitive to |  | ||||||
|       case, punctuation, etc. They help on the standard test-set, which is from |  | ||||||
|       Wall Street Journal articles from the 1980s, but I don’t see how they’ll |  | ||||||
|       help us learn models that are useful on other text. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       To help us learn a more general model, we’ll pre-process the data prior |  | ||||||
|       to feature extraction, as follows: |  | ||||||
|        |  | ||||||
|     ul |  | ||||||
|       li All words are lower cased; |  | ||||||
|       li Digits in the range 1800-2100 are represented as !YEAR; |  | ||||||
|       li Other digit strings are represented as !DIGITS; |  | ||||||
|       li |  | ||||||
|         | It would be better to have a module recognising dates, phone numbers, |  | ||||||
|         | emails, hash-tags, etc. but that will have to be pushed back into the |  | ||||||
|         | tokenization. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       I played around with the features a little, and this seems to be a reasonable |  | ||||||
|       bang-for-buck configuration in terms of getting the development-data accuracy |  | ||||||
|       to 97% (where it typically converges anyway), and having a smaller memory |  | ||||||
|       foot-print: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def _get_features(self, i, word, context, prev, prev2): |  | ||||||
|         |     '''Map tokens-in-contexts into a feature representation, implemented as a |  | ||||||
|         |     set. If the features change, a new model must be trained.''' |  | ||||||
|         |     def add(name, *args): |  | ||||||
|         |         features.add('+'.join((name,) + tuple(args))) |  | ||||||
|         |  | ||||||
|         |     features = set() |  | ||||||
|         |     add('bias') # This acts sort of like a prior |  | ||||||
|         |     add('i suffix', word[-3:]) |  | ||||||
|         |     add('i pref1', word[0]) |  | ||||||
|         |     add('i-1 tag', prev) |  | ||||||
|         |     add('i-2 tag', prev2) |  | ||||||
|         |     add('i tag+i-2 tag', prev, prev2) |  | ||||||
|         |     add('i word', context[i]) |  | ||||||
|         |     add('i-1 tag+i word', prev, context[i]) |  | ||||||
|         |     add('i-1 word', context[i-1]) |  | ||||||
|         |     add('i-1 suffix', context[i-1][-3:]) |  | ||||||
|         |     add('i-2 word', context[i-2]) |  | ||||||
|         |     add('i+1 word', context[i+1]) |  | ||||||
|         |     add('i+1 suffix', context[i+1][-3:]) |  | ||||||
|         |     add('i+2 word', context[i+2]) |  | ||||||
|         |     return features |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       I haven’t added any features from external data, such as case frequency |  | ||||||
|       statistics from the Google Web 1T corpus. I might add those later, but for |  | ||||||
|       now I figured I’d keep things simple. |  | ||||||
|        |  | ||||||
|     h3 What about search? |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       The model I’ve recommended commits to its predictions on each word, and |  | ||||||
|       moves on to the next one. Those predictions are then used as features for |  | ||||||
|       the next word. There’s a potential problem here, but it turns out it doesn’t |  | ||||||
|       matter much. It’s easy to fix with beam-search, but I say it’s not really |  | ||||||
|       worth bothering. And it definitely doesn’t matter enough to adopt a slow |  | ||||||
|       and complicated algorithm like Conditional Random Fields. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Here’s the problem. The best indicator for the tag at position, say, 3 in |  | ||||||
|       a sentence is the word at position 3. But the next-best indicators are the |  | ||||||
|       tags at positions 2 and 4. So there’s a chicken-and-egg problem: we want |  | ||||||
|       the predictions for the surrounding words in hand before we commit to a |  | ||||||
|       prediction for the current word. Here’s an example where search might matter: |  | ||||||
|        |  | ||||||
|     p.example. |  | ||||||
|       Their management plan reforms worked |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Depending on just what you’ve learned from your training data, you can |  | ||||||
|       imagine making a different decision if you started at the left and moved |  | ||||||
|       right, conditioning on your previous decisions, than if you’d started at |  | ||||||
|       the right and moved left. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       If that’s not obvious to you, think about it this way: “worked” is almost |  | ||||||
|       surely a verb, so if you tag “reforms” with that in hand, you’ll have a |  | ||||||
|       different idea of its tag than if you’d just come from “plan”, which you |  | ||||||
|       might have regarded as either a noun or a verb. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Search can only help you when you make a mistake. It can prevent that error |  | ||||||
|       from throwing off your subsequent decisions, or sometimes your future choices |  | ||||||
|       will correct the mistake. And that’s why for POS tagging, search hardly matters! |  | ||||||
|       Your model is so good straight-up that your past predictions are almost always |  | ||||||
|       true. So you really need the planets to align for search to matter at all. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       And as we improve our taggers, search will matter less and less. Instead |  | ||||||
|       of search, what we should be caring about is multi-tagging. If we let the |  | ||||||
|       model be a bit uncertain, we can get over 99% accuracy assigning an average |  | ||||||
|       of 1.05 tags per word (Vadas et al, ACL 2006). The averaged perceptron is |  | ||||||
|       rubbish at multi-tagging though. That’s its big weakness. You really want |  | ||||||
|       a probability distribution for that. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       One caveat when doing greedy search, though. It’s very important that your |  | ||||||
|       training data model the fact that the history will be imperfect at run-time. |  | ||||||
|       Otherwise, it will be way over-reliant on the tag-history features. Because |  | ||||||
|       the Perceptron is iterative, this is very easy. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       Here’s the training loop for the tagger: |  | ||||||
| 
 |  | ||||||
|     pre.language-python |  | ||||||
|       code |  | ||||||
|         | def train(self, sentences, save_loc=None, nr_iter=5, quiet=False): |  | ||||||
|         |     '''Train a model from sentences, and save it at save_loc. nr_iter |  | ||||||
|         |     controls the number of Perceptron training iterations.''' |  | ||||||
|         |     self._make_tagdict(sentences, quiet=quiet) |  | ||||||
|         |     self.model.classes = self.classes |  | ||||||
|         |     prev, prev2 = START |  | ||||||
|         |     for iter_ in range(nr_iter): |  | ||||||
|         |         c = 0; n = 0 |  | ||||||
|         |         for words, tags in sentences: |  | ||||||
|         |             context = START + [self._normalize(w) for w in words] + END |  | ||||||
|         |             for i, word in enumerate(words): |  | ||||||
|         |                 guess = self.tagdict.get(word) |  | ||||||
|         |                 if not guess: |  | ||||||
|         |                     feats = self._get_features( |  | ||||||
|         |                               i, word, context, prev, prev2) |  | ||||||
|         |                     guess = self.model.predict(feats) |  | ||||||
|         |                     self.model.update(tags[i], guess, feats) |  | ||||||
|         |                 # Set the history features from the guesses, not the |  | ||||||
|         |                 # true tags |  | ||||||
|         |                 prev2 = prev; prev = guess |  | ||||||
|         |                 c += guess == tags[i]; n += 1 |  | ||||||
|         |         random.shuffle(sentences) |  | ||||||
|         |         if not quiet: |  | ||||||
|         |             print("Iter %d: %d/%d=%.3f" % (iter_, c, n, _pc(c, n))) |  | ||||||
|         |     self.model.average_weights() |  | ||||||
|         |     # Pickle as a binary file |  | ||||||
|         |     if save_loc is not None: |  | ||||||
|         |         cPickle.dump((self.model.weights, self.tagdict, self.classes), |  | ||||||
|         |                      open(save_loc, 'wb'), -1) |  | ||||||
|     p. |  | ||||||
|       Unlike the previous snippets, this one’s literal – I tended to edit the |  | ||||||
|       previous ones to simplify. So if they have bugs, hopefully that’s why! |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       At the time of writing, I’m just finishing up the implementation before I |  | ||||||
|       submit a pull request to TextBlob. You can see the rest of the source here: |  | ||||||
|        |  | ||||||
|     ul |  | ||||||
|       li |  | ||||||
|         a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/taggers.py") taggers.py |  | ||||||
|       li |  | ||||||
|         a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/_perceptron.py") _perceptron.py |  | ||||||
|        |  | ||||||
|     h3 A final comparison… |  | ||||||
|      |  | ||||||
|     p. |  | ||||||
|       Over the years I’ve seen a lot of cynicism about the WSJ evaluation methodology. |  | ||||||
|       The claim is that we’ve just been meticulously over-fitting our methods to this |  | ||||||
|       data. Actually the evidence doesn’t really bear this out. Mostly, if a technique |  | ||||||
|       is clearly better on one evaluation, it improves others as well. Still, it’s |  | ||||||
|       very reasonable to want to know how these tools perform on other text. So I |  | ||||||
|       ran the unchanged models over two other sections from the OntoNotes corpus: |  | ||||||
|        |  | ||||||
|     table |  | ||||||
|       thead |  | ||||||
|         tr |  | ||||||
|           th Tagger |  | ||||||
|           th WSJ |  | ||||||
|           th ABC |  | ||||||
|           th Web |  | ||||||
|       tbody |  | ||||||
|         tr |  | ||||||
|           td Pattern |  | ||||||
|           td 93.5 |  | ||||||
|           td 90.7 |  | ||||||
|           td 88.1 |  | ||||||
|         tr |  | ||||||
|           td NLTK |  | ||||||
|           td 94.0 |  | ||||||
|           td 91.5 |  | ||||||
|           td 88.4 |  | ||||||
|         tr |  | ||||||
|           td PyGreedyAP |  | ||||||
|           td 96.8 |  | ||||||
|           td 94.8 |  | ||||||
|           td 91.8 |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       The ABC section is broadcast news, Web is text from the web (blogs etc — I haven’t |  | ||||||
|       looked at the data much). |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       As you can see, the order of the systems is stable across the three comparisons, |  | ||||||
|       and the advantage of our Averaged Perceptron tagger over the other two is real |  | ||||||
|       enough. Actually the pattern tagger does very poorly on out-of-domain text. |  | ||||||
|       It mostly just looks up the words, so it’s very domain dependent. I hadn’t |  | ||||||
|       realised it before, but it’s obvious enough now that I think about it. |  | ||||||
|        |  | ||||||
|     p. |  | ||||||
|       We can improve our score greatly by training on some of the foreign data. |  | ||||||
|       The technique described in this paper (Daume III, 2007) is the first thing |  | ||||||
|       I try when I have to do that. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     footer.meta(role='contentinfo') |  | ||||||
|       a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter |  | ||||||
|       .discuss |  | ||||||
|         a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News |  | ||||||
|         |  |  | ||||||
|         a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit |  | ||||||
|  | @ -1,139 +0,0 @@ | ||||||
| - var urls = {} |  | ||||||
| - urls.choi_paper = "http://aclweb.org/anthology/P/P15/P15-1038.pdf" |  | ||||||
| - urls.emnlp_paper = "honnibal_johnson_emnlp2015.pdf" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +comparison("NLTK") |  | ||||||
|   p spaCy is: |  | ||||||
|   ul |  | ||||||
|     li.pro 100x faster; |  | ||||||
|     li.pro 50% more accurate; |  | ||||||
|     li.pro Serializes TODO% smaller; |  | ||||||
| 
 |  | ||||||
|   p spaCy features: |  | ||||||
|     ul  |  | ||||||
|       li.pro Integrated word vectors; |  | ||||||
|       li.pro Efficient binary serialization; |  | ||||||
| 
 |  | ||||||
|   p NLTK features: |  | ||||||
|     ul |  | ||||||
|       li.con Multiple languages;  |  | ||||||
|       li.neutral Educational resources |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| //+comparison("Pattern") |  | ||||||
| +comparison("CoreNLP") |  | ||||||
|   p spaCy is: |  | ||||||
| 
 |  | ||||||
|   ul |  | ||||||
|     li.pro TODO% faster; |  | ||||||
|     li.pro TODO% more accurate; |  | ||||||
|     li.pro Not Java; |  | ||||||
|     li.pro Well documented; |  | ||||||
|     li.pro Cheaper to license commercially; |  | ||||||
|     li.neutral |  | ||||||
|       | Opinionated/Minimalist. spaCy avoids providing redundant or overlapping |  | ||||||
|       | options.   |  | ||||||
| 
 |  | ||||||
|   p CoreNLP features: |  | ||||||
| 
 |  | ||||||
|   ul |  | ||||||
|     li.con Multiple Languages; |  | ||||||
|     li.con Sentiment analysis  |  | ||||||
|     li.con Coreference resolution |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +comparison("ClearNLP") |  | ||||||
|   p spaCy is: |  | ||||||
| 
 |  | ||||||
|   ul |  | ||||||
|     li.pro Not Java; |  | ||||||
|     li.pro TODO% faster; |  | ||||||
|     li.pro Well documented; |  | ||||||
|     li.neutral Slightly more accurate; |  | ||||||
| 
 |  | ||||||
|   p ClearNLP features: |  | ||||||
| 
 |  | ||||||
|   ul |  | ||||||
|     li.con Semantic Role Labelling |  | ||||||
|     li.con Multiple Languages |  | ||||||
|     li.con Model for biology/life-science; |  | ||||||
| 
 |  | ||||||
| //+comparison("Accuracy Summary") |  | ||||||
| 
 |  | ||||||
| //+comparison("Speed Summary") |  | ||||||
| //  table |  | ||||||
| //    thead |  | ||||||
| //      tr |  | ||||||
| //        th. |  | ||||||
| //        th(colspan=3) Absolute (ms per doc) |  | ||||||
| //        th(colspan=3) Relative (to spaCy) |  | ||||||
| // |  | ||||||
| //    tbody |  | ||||||
| //      tr |  | ||||||
| //        td: strong System |  | ||||||
| //        td: strong Split |  | ||||||
| //        td: strong Tag |  | ||||||
| //        td: strong Parse |  | ||||||
| //        td: strong Split |  | ||||||
| //        td: strong Tag |  | ||||||
| //        td: strong Parse |  | ||||||
| // |  | ||||||
| //      +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") |  | ||||||
| //      +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") |  | ||||||
| //      +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x") |  | ||||||
| //      +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x") |  | ||||||
| //      +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a") |  | ||||||
| // |  | ||||||
| //  p |  | ||||||
| //    | <strong>Set up</strong>: 100,000 plain-text documents were streamed |  | ||||||
| //    | from an SQLite3 database, and processed with an NLP library, to one |  | ||||||
| //    | of three levels of detail – tokenization, tagging, or parsing. |  | ||||||
| //    | The tasks are additive: to parse the text you have to tokenize and |  | ||||||
| //    | tag it.  The  pre-processing was not subtracted from the times – |  | ||||||
| //    | I report the time required for the pipeline to complete.  I report |  | ||||||
| //    | mean times per document, in milliseconds. |  | ||||||
| // |  | ||||||
| //  p |  | ||||||
| //    | <strong>Hardware</strong>: Intel i7-3770 (2012) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +comparison("Peer-reviewed Evaluations") |  | ||||||
|   p. |  | ||||||
|     spaCy is committed to rigorous evaluation under standard methodology.  Two |  | ||||||
|     papers in 2015 confirm that: |  | ||||||
|   ol |  | ||||||
|     li spaCy is the fastest syntactic parser in the world; |  | ||||||
|     li Its accuracy is within 1% of the best available; |  | ||||||
|     li The few systems that are more accurate are 20× slower or more. |  | ||||||
| 
 |  | ||||||
|   p |  | ||||||
|     | spaCy v0.84 was evaluated by researchers at Yahoo! Labs and Emory University, |  | ||||||
|     | as part of a survey paper benchmarking the current state-of-the-art dependency |  | ||||||
|     | parsers  |  | ||||||
|     a(href=urls.choi_paper) (Choi et al., 2015) |  | ||||||
|     | . |  | ||||||
| 
 |  | ||||||
|   table |  | ||||||
|     thead |  | ||||||
|       +columns("System", "Language", "Accuracy", "Speed") |  | ||||||
| 
 |  | ||||||
|     tbody |  | ||||||
|       +row("spaCy v0.84", "Cython", "90.6", "13,963") |  | ||||||
|       +row("spaCy v0.89", "Cython", "91.8", "13,000 (est.)") |  | ||||||
|       +row("ClearNLP", "Java", "91.7", "10,271") |  | ||||||
|       +row("CoreNLP", "Java", "89.6", "8,602") |  | ||||||
|       +row("MATE", "Java", "92.5", "550") |  | ||||||
|       +row("Turbo", "C++", "92.4", "349") |  | ||||||
|       +row("Yara", "Java", "92.3", "340") |  | ||||||
| 
 |  | ||||||
|   p |  | ||||||
|     | Discussion with the authors led to accuracy improvements in spaCy, which |  | ||||||
|     | have been accepted for publication in EMNLP, in joint work with Macquarie |  | ||||||
|     | University |  | ||||||
|     a(href=urls.emnlp_paper) (Honnibal and Johnson, 2015) |  | ||||||
|     | .  |  | ||||||
| 
 |  | ||||||
|  | @ -1,129 +0,0 @@ | ||||||
| extends ./outline.jade |  | ||||||
| 
 |  | ||||||
| include ./mixins.jade |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin declare_class(name) |  | ||||||
|   details |  | ||||||
|     summary |  | ||||||
|       span.declaration |  | ||||||
|         span.label class |  | ||||||
|         code #{name} |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| mixin method(name, parameters) |  | ||||||
|   details(open=attributes.open) |  | ||||||
|     summary |  | ||||||
|       span.declaration |  | ||||||
|         span.label #{name} |  | ||||||
|         span.parameters |  | ||||||
|           | self, #{parameters} |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin params |  | ||||||
|   ul |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin param(name, type, value) |  | ||||||
|   li |  | ||||||
|     if type |  | ||||||
|       <strong>#{name}</strong> (!{type}) – |  | ||||||
|     else |  | ||||||
|       <strong>#{name}</strong> – |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin attribute(name, type, value) |  | ||||||
|   details(open=attributes.open) |  | ||||||
|     summary |  | ||||||
|       span.declaration |  | ||||||
|         span.label #{name} |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin returns(name, type, value) |  | ||||||
|   li |  | ||||||
|     if type |  | ||||||
|       <strong>#{name}</strong> (!{type}) – |  | ||||||
|     else |  | ||||||
|       <strong>#{name}</strong> – |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin returns(type) |  | ||||||
|   | tmp |  | ||||||
| 
 |  | ||||||
| mixin init |  | ||||||
|   details |  | ||||||
|     summary: h4 Init |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin callable |  | ||||||
|   details |  | ||||||
|     summary: h4 Callable |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin sequence |  | ||||||
|   details |  | ||||||
|     summary: h4 Sequence |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin maptype |  | ||||||
|   details |  | ||||||
|     summary: h4 Map |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin summary |  | ||||||
|   block |  | ||||||
| 
 |  | ||||||
| mixin en_example |  | ||||||
|   pre.language-python |  | ||||||
|     code |  | ||||||
|       | from spacy.en import English |  | ||||||
|       | from spacy._doc_examples import download_war_and_peace |  | ||||||
|       |  |  | ||||||
|       | unprocessed_unicode = download_war_and_peace() |  | ||||||
|       |  |  | ||||||
|       | nlp = English() |  | ||||||
|       | doc = nlp(unprocessed_unicode) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| block intro_block |  | ||||||
|   section(class="intro") |  | ||||||
| 
 |  | ||||||
|     nav(role="navigation") |  | ||||||
|       ul |  | ||||||
|         li: a(href="#api" class="button") API |  | ||||||
|         li: a(href="#tutorials" class="button") Tutorials |  | ||||||
|         li: a(href="#spec" class="button") Spec |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| block body_block |  | ||||||
|   - var py_docs = '<a class="reference" href="http://docs.python.org/library/' |  | ||||||
| 
 |  | ||||||
|   - |  | ||||||
|     var types = { |  | ||||||
|       'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>', |  | ||||||
|       'bool': py_docs + 'functions.html#bool"><em>bool</em></a>', |  | ||||||
|       'int': py_docs + 'functions.html#int"><em>int</em></a>', |  | ||||||
|       'generator': "", |  | ||||||
|       'Vocab': "", |  | ||||||
|       'Span': "", |  | ||||||
|       'Doc': "" |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|   article |  | ||||||
| 
 |  | ||||||
|     +Section("API", "api", "api.jade") |  | ||||||
|     +Section("Tutorials", "tutorials", "tutorials.jade") |  | ||||||
|     +Section("Annotation Specifications", "spec", "spec.jade") |  | ||||||
|  | @ -1,88 +0,0 @@ | ||||||
| extends ./outline.jade |  | ||||||
| 
 |  | ||||||
| include ./mixins.jade |  | ||||||
| 
 |  | ||||||
| // Notes |  | ||||||
| // |  | ||||||
| // 1. Where to put version notice? Should say something like |  | ||||||
| //   2015-08-12: v0.89 |  | ||||||
| //   and be a link |  | ||||||
| //    |  | ||||||
| //   Only needs to appear on home page. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| - var slogan = "Build Tomorrow's Language Technologies" |  | ||||||
| - var tag_line = "spaCy – " + slogan |  | ||||||
| 
 |  | ||||||
| mixin lede |  | ||||||
|   - var state_of_the_art = '<a href="#">state-of-the-art</a>' |  | ||||||
|   - var a_minor_miracle = '<a href="">a minor miracle</a>' |  | ||||||
|   - var great_documentation = '<a href="">great documentation</a>' |  | ||||||
|   - var concise_API = '<a href="">concise API</a>' |  | ||||||
|    |  | ||||||
|   p. |  | ||||||
|     <a href="https://github.com/honnibal/spaCy"><strong>spaCy</strong></a> is a |  | ||||||
|     library for industrial-strength natural language processing in Python and |  | ||||||
|     Cython.  It features !{state_of_the_art} speed and accuracy, a !{concise_API}, |  | ||||||
|     and <a href="#license">license terms</a> designed to get out of your way. |  | ||||||
|     If you're a small company doing NLP, we want <strong>spaCy</strong> to seem |  | ||||||
|     like !{a_minor_miracle}. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin comparison(name) |  | ||||||
|   details |  | ||||||
|     summary |  | ||||||
|       h4= name |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
|   |  | ||||||
| mixin columns(...names) |  | ||||||
|   tr |  | ||||||
|     each name in names |  | ||||||
|       th= name |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin row(...cells) |  | ||||||
|   tr |  | ||||||
|     each cell in cells |  | ||||||
|       td= cell |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin social       |  | ||||||
|   footer(role="contentinfo") |  | ||||||
|     a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter |  | ||||||
| 
 |  | ||||||
|     div.discuss |  | ||||||
|       a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn") |  | ||||||
|         | Discuss on Hacker News |  | ||||||
| 
 |  | ||||||
|       a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit") |  | ||||||
|         | Discuss on Reddit |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| block intro_block |  | ||||||
|   section(class="intro") |  | ||||||
|     +lede |  | ||||||
| 
 |  | ||||||
|     nav(role="navigation") |  | ||||||
|       ul |  | ||||||
|         li: a(href="#example-use" class="button") Examples |  | ||||||
|         li: a(href="#comparisons" class="button") Comparisons |  | ||||||
|         li: a(href="#online-demo" class="button") Try Online |  | ||||||
|         li: a(href="#install" class="button") |  | ||||||
|           | Install |  | ||||||
|           <span class="button-caption">v0.89</span> |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| block body_block |  | ||||||
|   article(class="page landing-page") |  | ||||||
| 
 |  | ||||||
|     +Section("Usage by Example", "example-use", "./usage_examples.jade") |  | ||||||
| 
 |  | ||||||
|     +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade") |  | ||||||
|        |  | ||||||
|     +Section("Online Demo", "online-demo", "./online_demo.jade") |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     +Section("Install", "install", "./install.jade") |  | ||||||
|  | @ -1,71 +0,0 @@ | ||||||
| mixin Option(name, open) |  | ||||||
|   details(open=open) |  | ||||||
|     summary |  | ||||||
|       h4= name |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| +Option("conda", true) |  | ||||||
|   pre.language-bash: code |  | ||||||
|     | $ conda install spacy |  | ||||||
|     | $ python -m spacy.en.download |  | ||||||
| 
 |  | ||||||
| +Option("pip and virtualenv", true) |  | ||||||
|   p With Python 2.7 or Python 3, using Linux or OSX, run: |  | ||||||
| 
 |  | ||||||
|     pre.language-bash: code |  | ||||||
|       | $ pip install spacy |  | ||||||
|       | $ python -m spacy.en.download |  | ||||||
| 
 |  | ||||||
|   p |  | ||||||
|     | The download command fetches and installs about 300mb of data, for |  | ||||||
|     | the parser model and word vectors, which it installs within the spacy.en |  | ||||||
|     | package directory. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|   +Option("Workaround for obsolete system Python", false) |  | ||||||
|     p |  | ||||||
|       | If you're stuck using a server with an old version of Python, and you |  | ||||||
|       | don't have root access, I've prepared a bootstrap script to help you |  | ||||||
|       | compile a local Python install.  Run: |  | ||||||
| 
 |  | ||||||
|     pre.language-bash: code |  | ||||||
|       | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +Option("Compile from source", false) |  | ||||||
|   p |  | ||||||
|     | The other way to install the package is to clone the github repository, |  | ||||||
|     | and build it from source.  This installs an additional dependency, |  | ||||||
|     | Cython.  If you're using Python 2, I also recommend installing fabric |  | ||||||
|     | and fabtools – this is how I build the project. |  | ||||||
| 
 |  | ||||||
|   pre.language-bash: code |  | ||||||
|     | $ git clone https://github.com/honnibal/spaCy.git |  | ||||||
|     | $ cd spaCy |  | ||||||
|     | $ virtualenv .env && source .env/bin/activate |  | ||||||
|     | $ export PYTHONPATH=`pwd` |  | ||||||
|     | $ pip install -r requirements.txt |  | ||||||
|     | $ python setup.py build_ext --inplace |  | ||||||
|     | $ python -m spacy.en.download |  | ||||||
|     | $ pip install pytest |  | ||||||
|     | $ py.test tests/ |  | ||||||
| 
 |  | ||||||
|   p |  | ||||||
|     | Python packaging is awkward at the best of times, and it's particularly tricky |  | ||||||
|     | with C extensions, built via Cython, requiring large data files.  So, |  | ||||||
|     | please report issues as you encounter them. |  | ||||||
| 
 |  | ||||||
| +Option("pypy (Unsupported)") |  | ||||||
|   | If PyPy support is a priority for you, please get in touch.  We could likely |  | ||||||
|   | fix the remaining issues, if necessary.  However, the library is likely to |  | ||||||
|   | be much slower on PyPy, as it's written in Cython, which produces code tuned |  | ||||||
|   | for the performance of CPython. |  | ||||||
| 
 |  | ||||||
| +Option("Windows (Unsupported)") |  | ||||||
|   | Unfortunately we don't currently have access to a Windows machine, and have |  | ||||||
|   | no experience developing on a MicroSoft stack. In theory the only problems are |  | ||||||
|   | with the installation and packaging – there should be no deep platform |  | ||||||
|   | dependency. Unfortunately we can't debug these issues at present, simply due |  | ||||||
|   | to lack of a development environment. |  | ||||||
| 
 |  | ||||||
|  | @ -1,179 +0,0 @@ | ||||||
| extends ./outline.jade |  | ||||||
| 
 |  | ||||||
| mixin columns(...names) |  | ||||||
|   tr |  | ||||||
|     each name in names |  | ||||||
|       th= name |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin row(...cells) |  | ||||||
|   tr |  | ||||||
|     each cell in cells |  | ||||||
|       td= cell |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin LicenseOption(name, period, price, audience) |  | ||||||
|     .item |  | ||||||
|       h4 #{name} |  | ||||||
|          |  | ||||||
|       .focus #{period} |  | ||||||
| 
 |  | ||||||
|       span #{price} |  | ||||||
|          |  | ||||||
|       h5 Suggested for: |  | ||||||
|          |  | ||||||
|       span #{audience} |  | ||||||
|          |  | ||||||
|       a.button(href="spacy_trial_free.docx") Download license |  | ||||||
| 
 |  | ||||||
|       span or  |  | ||||||
|         a(href="#") get in touch |  | ||||||
| 
 |  | ||||||
|   |  | ||||||
| block body_block |  | ||||||
|   article.pricing |  | ||||||
| 
 |  | ||||||
|     .box.license |  | ||||||
|       +LicenseOption("Trial", "90 days", "$0", "Evaluation") |  | ||||||
|       +LicenseOption("Production", "1 year", "$5,000", "Production") |  | ||||||
|       +LicenseOption("Certainty", "5 years", "$20,000", "Secure Planning") |  | ||||||
| 
 |  | ||||||
|     p.caption |  | ||||||
|       | Researcher, hobbyist, or open-source developer? spaCy also offers  |  | ||||||
|       a(href="http://www.gnu.org/licenses/agpl-3.0.en.html") AGPLv3  |  | ||||||
|       | licenses. |  | ||||||
| 
 |  | ||||||
|     p. |  | ||||||
|       What we offer is a rare, simple certainty: a long-term, permissive license |  | ||||||
|       that comes with full access to the source, complete transparency, and almost |  | ||||||
|       complete flexibility.  The difference between this and a black-box API is |  | ||||||
|       night and day.  You cannot build a great product against a service you |  | ||||||
|       don't understand, and you can't build a great business on a service you |  | ||||||
|       don't control. |  | ||||||
|        |  | ||||||
|     p |  | ||||||
|       | Let's face it: services disappear.  Constantly. The good start-ups get |  | ||||||
|       | bought; the bad ones go bankrupt.  Open-source projects become abandoned |  | ||||||
|       | or bloated.  Google's graveyard is over-flowing – ditto for Yahoo!, |  | ||||||
|       | Microsoft, etc. Sure, IBM won't be broke...But will BlueMix be sunset? |  | ||||||
| 
 |  | ||||||
|     p |  | ||||||
|       | A 5 year license won't expire until 2020.  spaCy will be with you for |  | ||||||
|       | longer than most of your current staff.  If that's still not enough, |  | ||||||
|       | get in touch. I'm sure we can work something out. |  | ||||||
| 
 |  | ||||||
|     //p. |  | ||||||
|     //  To make spaCy as valuable as possible, licenses to it are for life.  You get |  | ||||||
|     //  complete transparency, certainty and control.  If you need to use spaCy |  | ||||||
|     //  as an API, it's trivial to host it yourself – and you don't need to |  | ||||||
|     //  worry about the service changing or disappearing.  And if you're ever in |  | ||||||
|     //  acquisition or IPO talks, the story is simple. |  | ||||||
| 
 |  | ||||||
|     //p. |  | ||||||
|     //  spaCy can also be used as free open-source software, under the Aferro GPL |  | ||||||
|     //  license.  If you use it this way, you must comply with the AGPL license |  | ||||||
|     //  terms.  When you distribute your project, or offer it as a network service, |  | ||||||
|     //  you must distribute the source-code and grant users an AGPL license to it. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     //h3 Examples |  | ||||||
| 
 |  | ||||||
|     //p. |  | ||||||
|     //  In order to clarify how spaCy's license structure might apply to you, I've |  | ||||||
|     //  written a few examples, in the form of user-stories. |  | ||||||
| 
 |  | ||||||
|     //details |  | ||||||
|     //  summary: h4 Seed stage start-ups |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    Ashley and Casey have an idea for a start-up.  To explore their idea, they |  | ||||||
|     //    want to build a minimum viable product they can put in front of potential |  | ||||||
|     //    users and investors. |  | ||||||
| 
 |  | ||||||
|     //  p. They have two options. |  | ||||||
| 
 |  | ||||||
|     //  ol |  | ||||||
|     //    li |  | ||||||
|     //      p. |  | ||||||
|     //        <strong>Trial commercial license.</strong> With a simple form, they can |  | ||||||
|     //        use spaCy for 90 days, for a nominal fee of $1.  They are free to modify |  | ||||||
|     //        spaCy, and they will own the copyright to their modifications for the |  | ||||||
|     //        duration of the license.  After the trial period elapses, they can either |  | ||||||
|     //        pay the license fee, stop using spaCy, release their project under the |  | ||||||
|     //        AGPL. |  | ||||||
|     // |  | ||||||
|     //    li |  | ||||||
|     //      p. |  | ||||||
|     //        <strong>AGPL.</strong> Casey and Pat can instead use spaCy under the AGPL |  | ||||||
|     //        license. However, they must then release any code that statically or |  | ||||||
|     //        dynamically links to spaCy under the AGPL as well (e.g. if they import |  | ||||||
|     //        the module, or import a module that imports it, etc).  They also cannot |  | ||||||
|     //        use spaCy as a network resource, by running it as a service --- this is |  | ||||||
|     //        the loophole that the "A" part of the AGPL is designed to close. |  | ||||||
|     //   |  | ||||||
|     //  p. |  | ||||||
|     //    Ashley and Casey find the AGPL license unattractive for commercial use. |  | ||||||
|     //    They decide to take up the trial commercial license.  However,  over the |  | ||||||
|     //    next 90 days, Ashley has to move house twice, and Casey gets sick.  By |  | ||||||
|     //    the time the trial expires, they still don't have a demo they can show |  | ||||||
|     //    investors.  They send an email explaining the situation, and a 90 day extension |  | ||||||
|     //    to their trial license is granted. |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    By the time the extension period has elapsed, spaCy has helped them secure |  | ||||||
|     //    funding, and they even have a little revenue.  They are glad to pay the |  | ||||||
|     //    $5,000 commercial license fee. |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    spaCy is now permanently licensed for the product Ashley and Casey are |  | ||||||
|     //    developing.  They own the copyright to any modifications they make to spaCy, |  | ||||||
|     //    but not to the original spaCy code. |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    No additional fees will be due when they hire new developers, run spaCy on |  | ||||||
|     //    additional internal servers, etc.  If their company is acquired, the license |  | ||||||
|     //    will be transferred to the company acquiring them.  However, to use spaCy |  | ||||||
|     //    in another product, they will have to buy a second license. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     // details |  | ||||||
|     //  summary: h4 University academics |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    Alex and Sasha are post-doctoral researchers working for a university. |  | ||||||
|     //    Part of their funding comes from a grant from Google, but Google will not |  | ||||||
|     //    own any part of the work that they produce.  Their mission is just to write |  | ||||||
|     //    papers. |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    Alex and Sasha find spaCy convenient, so they use it in their system under |  | ||||||
|     //    the AGPL.  This means that their system must also be released under the |  | ||||||
|     //    AGPL, but they're cool with that – they were going to release their |  | ||||||
|     //    code anyway, as it's the only way to ensure their experiments are properly |  | ||||||
|     //    repeatable. |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    Alex and Sasha find and fix a few bugs in spaCy.  They must release these |  | ||||||
|     //    modifications, and they ask that they be accepted into the main spaCy repo. |  | ||||||
|     //    In order to do this, they must sign a contributor agreement, ceding their |  | ||||||
|     //    copyright.  When commercial licenses to spaCy are sold, Alex and Sasha will |  | ||||||
|     //    not be able to claim any royalties from their contributions. |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    Later, Alex and Sasha implement new features into spaCy, for another paper. |  | ||||||
|     //    The code was quite rushed, and they don't want to take the time to put |  | ||||||
|     //    together a proper pull request.  They must release their modifications |  | ||||||
|     //    under the AGPL, but they are not obliged to contribute it to the spaCy |  | ||||||
|     //    repository, or concede their copyright. |  | ||||||
| 
 |  | ||||||
|     // details |  | ||||||
|     //  summary: h4 Open Source developers |  | ||||||
| 
 |  | ||||||
|     //  p. |  | ||||||
|     //    Phuong and Jessie use the open-source software Calibre to manage their |  | ||||||
|     //    e-book libraries.  They have an idea for a search feature, and they want |  | ||||||
|     //    to use spaCy to implement it.  Calibre is released under the GPLv3.  The |  | ||||||
|     //    AGPL has additional restrictions for projects used as a network resource, |  | ||||||
|     //    but they don't apply to this project, so Phuong and Jessie can use spaCy |  | ||||||
|     //    to improve Calibre.  They'll have to release their code, but that was |  | ||||||
|     //    always their intention anyway. |  | ||||||
|  | @ -1,17 +0,0 @@ | ||||||
| mixin Section(title_text, link_name, include_file) |  | ||||||
|   h3: a(name=link_name) #{title_text} |  | ||||||
| 
 |  | ||||||
|   if (link_name == "example-use") |  | ||||||
|     include ./usage_examples.jade |  | ||||||
|   else if (link_name == "online-demo") |  | ||||||
|     include ./online_demo.jade |  | ||||||
|   else if (link_name == "comparisons") |  | ||||||
|     include ./comparisons.jade |  | ||||||
|   else if (link_name == "install") |  | ||||||
|     include ./installation.jade |  | ||||||
|   else if (link_name == "api") |  | ||||||
|     include ./api.jade |  | ||||||
|   else if (link_name == "tutorials") |  | ||||||
|     include ./tutorials.jade |  | ||||||
|   else if (link_name == "spec") |  | ||||||
|     include ./spec.jade |  | ||||||
|  | @ -1,18 +0,0 @@ | ||||||
| mixin Displacy(sentence, caption_text, height) |  | ||||||
|   - var url = "http://ines.io/displacy/?full=" + sentence.replace(" ", "%20") |  | ||||||
| 
 |  | ||||||
|   .displacy |  | ||||||
|     iframe.displacy(src="displacy/displacy_demo.html" height=height) |  | ||||||
|      |  | ||||||
|     a.view-displacy(href=url) |  | ||||||
|       | Interactive Visualizer |  | ||||||
| 
 |  | ||||||
|     p.caption. |  | ||||||
|       #{caption_text} |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +Displacy( |  | ||||||
|   "Click the button to see this sentence in displaCy.", |  | ||||||
|   "The best parse-tree visualizer and annotation tool in all the land.", |  | ||||||
|   275 |  | ||||||
| ) |  | ||||||
|  | @ -1,37 +0,0 @@ | ||||||
| - var slogan = "Build Tomorrow's Language Technologies" |  | ||||||
| - var tag_line = "spaCy – " + slogan |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| doctype html |  | ||||||
| html(lang="en") |  | ||||||
|   head |  | ||||||
|     meta(charset="utf-8") |  | ||||||
|     title!= tag_line |  | ||||||
|     meta(name="description" content="") |  | ||||||
|     meta(name="author" content="Matthew Honnibal") |  | ||||||
|     link(rel="stylesheet" href="css/style.css") |  | ||||||
|     <!--[if lt IE 9]> |  | ||||||
|     script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js") |  | ||||||
|     <![endif]--> |  | ||||||
| 
 |  | ||||||
|   body(id="home" role="document") |  | ||||||
|     header(role="banner") |  | ||||||
|       h1(class="logo")!= tag_line |  | ||||||
|       div(class="slogan")!= slogan |  | ||||||
| 
 |  | ||||||
|     nav(role="navigation") |  | ||||||
|       ul |  | ||||||
|         li: a(href="home.html") Home |  | ||||||
|         li: a(href="docs.html") Docs |  | ||||||
|         li: a(href="license.html") License |  | ||||||
|         li: a(href="blog.html") Blog |  | ||||||
| 
 |  | ||||||
|     main(id="content" role="main") |  | ||||||
|       block intro_block |  | ||||||
| 
 |  | ||||||
|       block body_block |  | ||||||
|   |  | ||||||
|   footer(role="contentinfo") |  | ||||||
| 
 |  | ||||||
|   script(src="js/prism.js") |  | ||||||
|   script(src="js/details_polyfill.js") |  | ||||||
|  | @ -1,129 +0,0 @@ | ||||||
| mixin columns(...names) |  | ||||||
|   tr |  | ||||||
|     each name in names |  | ||||||
|       th= name |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| mixin row(...cells) |  | ||||||
|   tr |  | ||||||
|     each cell in cells |  | ||||||
|       td= cell |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Overview |  | ||||||
|    |  | ||||||
|   p. |  | ||||||
|     This document describes the target annotations spaCy is trained to predict. |  | ||||||
|     This is currently a work in progress. Please ask questions on the issue tracker, |  | ||||||
|     so that the answers can be integrated here to improve the documentation. |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Tokenization |  | ||||||
| 
 |  | ||||||
|   p Tokenization standards are based on the OntoNotes 5 corpus. |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     The tokenizer differs from most by including tokens for significant |  | ||||||
|     whitespace. Any sequence of whitespace characters beyond a single space |  | ||||||
|     (' ') is included as a token. For instance: |  | ||||||
| 
 |  | ||||||
|   pre.language-python |  | ||||||
|     code |  | ||||||
|       | from spacy.en import English |  | ||||||
|       | nlp = English(parse=False) |  | ||||||
|       | tokens = nlp('Some\nspaces  and\ttab characters') |  | ||||||
|       | print([t.orth_ for t in tokens]) |  | ||||||
|          |  | ||||||
|   p Which produces: |  | ||||||
|      |  | ||||||
|   pre.language-python |  | ||||||
|     code |  | ||||||
|       | ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters'] |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     The whitespace tokens are useful for much the same reason punctuation is |  | ||||||
|     – it's often an important delimiter in the text.  By preserving |  | ||||||
|     it in the token output, we are able to maintain a simple alignment |  | ||||||
|     between the tokens and the original string, and we ensure that no |  | ||||||
|     information is lost during processing. |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Sentence boundary detection |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     Sentence boundaries are calculated from the syntactic parse tree, so |  | ||||||
|     features such as punctuation and capitalisation play an important but |  | ||||||
|     non-decisive role in determining the sentence boundaries.  Usually this |  | ||||||
|     means that the sentence boundaries will at least coincide with clause |  | ||||||
|     boundaries, even given poorly punctuated text. |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Part-of-speech Tagging |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank |  | ||||||
|     tag set.  We also map the tags to the simpler Google Universal POS Tag set. |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124 |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Lemmatization |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     A "lemma" is the uninflected form of a word. In English, this means: |  | ||||||
| 
 |  | ||||||
|   ul |  | ||||||
|     li Adjectives: The form like "happy", not "happier" or "happiest" |  | ||||||
|     li Adverbs: The form like "badly", not "worse" or "worst" |  | ||||||
|     li Nouns: The form like "dog", not "dogs"; like "child", not "children" |  | ||||||
|     li Verbs: The form like "write", not "writes", "writing", "wrote" or "written"  |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     The lemmatization data is taken from WordNet. However, we also add a |  | ||||||
|     special case for pronouns: all pronouns are lemmatized to the special |  | ||||||
|     token -PRON-. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Syntactic Dependency Parsing |  | ||||||
| 
 |  | ||||||
|   p. |  | ||||||
|     The parser is trained on data produced by the ClearNLP converter. Details |  | ||||||
|     of the annotation scheme can be found here:  http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf |  | ||||||
| 
 |  | ||||||
| details |  | ||||||
|   summary: h4 Named Entity Recognition |  | ||||||
| 
 |  | ||||||
|   table |  | ||||||
|     thead |  | ||||||
|       +columns("Entity Type", "Description") |  | ||||||
|        |  | ||||||
|     tbody |  | ||||||
|       +row("PERSON", "People, including fictional.") |  | ||||||
|       +row("NORP", "Nationalities or religious or political groups.") |  | ||||||
|       +row("FACILITY", "Buildings, airports, highways, bridges, etc.") |  | ||||||
|       +row("ORG", "Companies, agencies, institutions, etc.") |  | ||||||
|       +row("GPE", "Countries, cities, states.") |  | ||||||
|       +row("LOC", "Non-GPE locations, mountain ranges, bodies of water.") |  | ||||||
|       +row("PRODUCT", "Vehicles, weapons, foods, etc. (Not services") |  | ||||||
|       +row("EVENT", "Named hurricanes, battles, wars, sports events, etc.") |  | ||||||
|       +row("WORK_OF_ART", "Titles of books, songs, etc.") |  | ||||||
|       +row("LAW", "Named documents made into laws") |  | ||||||
|       +row("LANGUAGE", "Any named language") |  | ||||||
| 
 |  | ||||||
|   p The following values are also annotated in a style similar to names: |  | ||||||
| 
 |  | ||||||
|   table |  | ||||||
|     thead |  | ||||||
|       +columns("Entity Type", "Description") |  | ||||||
|        |  | ||||||
|     tbody |  | ||||||
|       +row("DATE", "Absolute or relative dates or periods") |  | ||||||
|       +row("TIME", "Times smaller than a day") |  | ||||||
|       +row("PERCENT", 'Percentage (including “%”)') |  | ||||||
|       +row("MONEY", "Monetary values, including unit") |  | ||||||
|       +row("QUANTITY", "Measurements, as of weight or distance") |  | ||||||
|       +row("ORDINAL", 'first", "second"') |  | ||||||
|       +row("CARDINAL", "Numerals that do not fall under another type") |  | ||||||
|  | @ -1,31 +0,0 @@ | ||||||
| doctype html |  | ||||||
| html(lang='en') |  | ||||||
|   head |  | ||||||
|     meta(charset='utf-8') |  | ||||||
|     title spaCy Blog |  | ||||||
|     meta(name='description', content='') |  | ||||||
|     meta(name='author', content='Matthew Honnibal') |  | ||||||
|     link(rel='stylesheet', href='css/style.css') |  | ||||||
|     //if lt IE 9 |  | ||||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') |  | ||||||
|   body#blog(role="document") |  | ||||||
|     header(role='banner') |  | ||||||
|       h1.logo spaCy Blog |  | ||||||
|       .slogan Blog |  | ||||||
| 
 |  | ||||||
|     nav(role="navigation") |  | ||||||
|       ul |  | ||||||
|         li: a(href="home.html")        Home |  | ||||||
|         li: a(href="docs.html")        Docs |  | ||||||
|         li.active: a(href="blog.html") Blog |  | ||||||
|         li: a(href="license.html")     License |  | ||||||
| 
 |  | ||||||
|     main#content(role='main') |  | ||||||
|       block intro_block |  | ||||||
| 
 |  | ||||||
|       block body_block |  | ||||||
|   |  | ||||||
|   footer(role='contentinfo') |  | ||||||
| 
 |  | ||||||
|   script(src="js/prism.js") |  | ||||||
|   script(src="js/details_polyfill.js") |  | ||||||
|  | @ -1,200 +0,0 @@ | ||||||
| doctype html |  | ||||||
| html(lang='en') |  | ||||||
|   head |  | ||||||
|     meta(charset='utf-8') |  | ||||||
|     title spaCy Blog |  | ||||||
|     meta(name='description', content='') |  | ||||||
|     meta(name='author', content='Matthew Honnibal') |  | ||||||
|     link(rel='stylesheet', href='css/style.css') |  | ||||||
|     //if lt IE 9 |  | ||||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') |  | ||||||
|   body#blog |  | ||||||
|     header(role='banner') |  | ||||||
|       h1.logo spaCy Blog |  | ||||||
|       .slogan Blog |  | ||||||
|     main#content(role='main') |  | ||||||
|       article.post |  | ||||||
|   |  | ||||||
| 
 |  | ||||||
|         :markdown-it |  | ||||||
|           # Adverbs |  | ||||||
|    |  | ||||||
|           Let's say you're developing a proofreading tool, or possibly an IDE for |  | ||||||
|           writers.  You're convinced by Stephen King's advice that `adverbs are |  | ||||||
|           not your friend <http://www.brainpickings.org/2013/03/13/stephen-king-on-adverbs/>`_, |  | ||||||
|           so you want to **highlight all adverbs**.  We'll use one of the examples |  | ||||||
|           he finds particularly egregious: |  | ||||||
|      |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | import spacy.en |  | ||||||
|             | >>> from spacy.parts_of_speech import ADV |  | ||||||
|             | >>> # Load the pipeline, and call it with some text. |  | ||||||
|             | >>> nlp = spacy.en.English() |  | ||||||
|             | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", tag=True, parse=False) |  | ||||||
|             | >>> print u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) |  | ||||||
|             | u‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’ |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           Easy enough --- but the problem is that we've also highlighted "back". |  | ||||||
|           While "back" is undoubtedly an adverb, we probably don't want to highlight |  | ||||||
|           it. If what we're trying to do is flag dubious stylistic choices, we'll |  | ||||||
|           need to refine our logic.  It turns out only a certain type of adverb |  | ||||||
|           is of interest to us. |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|         :markdown-it |  | ||||||
|           There are lots of ways we might do this, depending on just what words |  | ||||||
|           we want to flag.  The simplest way to exclude adverbs like "back" and |  | ||||||
|           "not" is by word frequency: these words are much more common than the |  | ||||||
|           prototypical manner adverbs that the style guides are worried about. |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           The :py:attr:`Lexeme.prob` and :py:attr:`Token.prob` attribute gives a |  | ||||||
|           log probability estimate of the word: |  | ||||||
|    |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | >>> nlp.vocab[u'back'].prob |  | ||||||
|             | -7.403977394104004 |  | ||||||
|             | >>> nlp.vocab[u'not'].prob |  | ||||||
|             | -5.407193660736084 |  | ||||||
|             | >>> nlp.vocab[u'quietly'].prob |  | ||||||
|             | -11.07155704498291 |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           (The probability estimate is based on counts from a 3 billion word corpus, |  | ||||||
|           smoothed using the `Simple Good-Turing`_ method.) |  | ||||||
|    |  | ||||||
|           So we can easily exclude the N most frequent words in English from our |  | ||||||
|           adverb marker.  Let's try N=1000 for now: |  | ||||||
|   |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | >>> import spacy.en |  | ||||||
|             | >>> from spacy.parts_of_speech import ADV |  | ||||||
|             | >>> nlp = spacy.en.English() |  | ||||||
|             | >>> # Find log probability of Nth most frequent word |  | ||||||
|             | >>> probs = [lex.prob for lex in nlp.vocab] |  | ||||||
|             | >>> probs.sort() |  | ||||||
|             | >>> is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] |  | ||||||
|             | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") |  | ||||||
|             | >>> print u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) |  | ||||||
|             | ‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’ |  | ||||||
|          |  | ||||||
|         :markdown-it |  | ||||||
|           There are lots of other ways we could refine the logic, depending on |  | ||||||
|           just what words we want to flag.  Let's say we wanted to only flag |  | ||||||
|           adverbs that modified words similar to "pleaded".  This is easy to do, |  | ||||||
|           as spaCy loads a vector-space representation for every word (by default, |  | ||||||
|           the vectors produced by `Levy and Goldberg (2014)`_).  Naturally, the |  | ||||||
|           vector is provided as a numpy array: |  | ||||||
| 
 |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | >>> pleaded = tokens[7] |  | ||||||
|             | >>> pleaded.repvec.shape |  | ||||||
|             | (300,) |  | ||||||
|             | >>> pleaded.repvec[:5] |  | ||||||
|             | array([ 0.04229792,  0.07459262,  0.00820188, -0.02181299,  0.07519238], dtype=float32) |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           We want to sort the words in our vocabulary by their similarity to |  | ||||||
|           "pleaded".  There are lots of ways to measure the similarity of two |  | ||||||
|           vectors.  We'll use the cosine metric: |  | ||||||
| 
 |  | ||||||
|         pre.language-python |  | ||||||
|           code  |  | ||||||
|             | >>> from numpy import dot |  | ||||||
|             | >>> from numpy.linalg import norm |  | ||||||
|    |  | ||||||
|             | >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) |  | ||||||
|             | >>> words = [w for w in nlp.vocab if w.has_repvec] |  | ||||||
|             | >>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) |  | ||||||
|             | >>> words.reverse() |  | ||||||
|             | >>> print('1-20', ', '.join(w.orth_ for w in words[0:20])) |  | ||||||
|             | 1-20 pleaded, pled, plead, confessed, interceded, pleads, testified, conspired, motioned, demurred, countersued, remonstrated, begged, apologised, consented, acquiesced, petitioned, quarreled, appealed, pleading |  | ||||||
|             | >>> print('50-60', ', '.join(w.orth_ for w in words[50:60])) |  | ||||||
|             | 50-60 counselled, bragged, backtracked, caucused, refiled, dueled, mused, dissented, yearned, confesses |  | ||||||
|             | >>> print('100-110', ', '.join(w.orth_ for w in words[100:110])) |  | ||||||
|             | 100-110 cabled, ducked, sentenced, perjured, absconded, bargained, overstayed, clerked, confided, sympathizes |  | ||||||
|             | >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010])) |  | ||||||
|             | 1000-1010 scorned, baled, righted, requested, swindled, posited, firebombed, slimed, deferred, sagged |  | ||||||
|             | >>> print('50000-50010', ', '.join(w.orth_ for w in words[50000:50010])) |  | ||||||
|             | 50000-50010, fb, ford, systems, puck, anglers, ik, tabloid, dirty, rims, artists |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           As you can see, the similarity model that these vectors give us is excellent |  | ||||||
|           --- we're still getting meaningful results at 1000 words, off a single |  | ||||||
|           prototype!  The only problem is that the list really contains two clusters of |  | ||||||
|           words: one associated with the legal meaning of "pleaded", and one for the more |  | ||||||
|           general sense.  Sorting out these clusters is an area of active research. |  | ||||||
|    |  | ||||||
|           A simple work-around is to average the vectors of several words, and use that |  | ||||||
|           as our target: |  | ||||||
|    |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | >>> say_verbs = ['pleaded', 'confessed', 'remonstrated', 'begged', 'bragged', 'confided', 'requested'] |  | ||||||
|             | >>> say_vector = sum(nlp.vocab[verb].repvec for verb in say_verbs) / len(say_verbs) |  | ||||||
|             | >>> words.sort(key=lambda w: cosine(w.repvec * say_vector)) |  | ||||||
|             | >>> words.reverse() |  | ||||||
|             | >>> print('1-20', ', '.join(w.orth_ for w in words[0:20])) |  | ||||||
|             | 1-20 bragged, remonstrated, enquired, demurred, sighed, mused, intimated, retorted, entreated, motioned, ranted, confided, countersued, gestured, implored, interceded, muttered, marvelled, bickered, despaired |  | ||||||
|             | >>> print('50-60', ', '.join(w.orth_ for w in words[50:60])) |  | ||||||
|             | 50-60 flaunted, quarrelled, ingratiated, vouched, agonized, apologised, lunched, joked, chafed, schemed |  | ||||||
|             | >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010])) |  | ||||||
|             | 1000-1010 hoarded, waded, ensnared, clamoring, abided, deploring, shriveled, endeared, rethought, berate |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           These definitely look like words that King might scold a writer for attaching |  | ||||||
|           adverbs to.  Recall that our original adverb highlighting function looked like |  | ||||||
|           this: |  | ||||||
|    |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | >>> import spacy.en |  | ||||||
|             | >>> from spacy.parts_of_speech import ADV |  | ||||||
|             | >>> # Load the pipeline, and call it with some text. |  | ||||||
|             | >>> nlp = spacy.en.English() |  | ||||||
|             | >>> tokens = nlp("‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", |  | ||||||
|             |                  tag=True, parse=False) |  | ||||||
|             | >>> print(''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)) |  | ||||||
|             | ‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’ |  | ||||||
|    |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           We wanted to refine the logic so that only adverbs modifying evocative |  | ||||||
|           verbs of communication, like "pleaded", were highlighted.  We've now |  | ||||||
|           built a vector that represents that type of word, so now we can highlight |  | ||||||
|           adverbs based on subtle logic, honing in on adverbs that seem the most |  | ||||||
|           stylistically problematic, given our starting assumptions: |  | ||||||
|    |  | ||||||
|         pre.language-python |  | ||||||
|           code |  | ||||||
|             | >>> import numpy |  | ||||||
|             | >>> from numpy import dot |  | ||||||
|             | >>> from numpy.linalg import norm |  | ||||||
|             | >>> import spacy.en |  | ||||||
|             | >>> from spacy.parts_of_speech import ADV, VERB |  | ||||||
|             | >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) |  | ||||||
|             | >>> def is_bad_adverb(token, target_verb, tol): |  | ||||||
|             | ...   if token.pos != ADV |  | ||||||
|             | ...     return False |  | ||||||
|             | ...   elif token.head.pos != VERB: |  | ||||||
|             | ...     return False |  | ||||||
|             | ...   elif cosine(token.head.repvec, target_verb) < tol: |  | ||||||
|             | ...     return False |  | ||||||
|             | ...   else: |  | ||||||
|             | ...     return True |  | ||||||
|    |  | ||||||
|         :markdown-it |  | ||||||
|           This example was somewhat contrived --- and, truth be told, I've never |  | ||||||
|           really bought the idea that adverbs were a grave stylistic sin.  But |  | ||||||
|           hopefully it got the message across: the state-of-the-art NLP technologies |  | ||||||
|           are very powerful. spaCy gives you easy and efficient access to them, |  | ||||||
|           which lets you build all sorts of useful products and features that |  | ||||||
|           were previously impossible. |  | ||||||
| 
 |  | ||||||
|   footer(role='contentinfo') |  | ||||||
|   script(src='js/prism.js') |  | ||||||
|  | @ -1,132 +0,0 @@ | ||||||
| doctype html |  | ||||||
| html(lang='en') |  | ||||||
|   head |  | ||||||
|     meta(charset='utf-8') |  | ||||||
|     title spaCy Blog |  | ||||||
|     meta(name='description', content='') |  | ||||||
|     meta(name='author', content='Matthew Honnibal') |  | ||||||
|     link(rel='stylesheet', href='css/style.css') |  | ||||||
|     //if lt IE 9 |  | ||||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') |  | ||||||
|   body#blog |  | ||||||
|     header(role='banner') |  | ||||||
|       h1.logo spaCy Blog |  | ||||||
|       .slogan Blog |  | ||||||
|     main#content(role='main') |  | ||||||
|       section.intro |  | ||||||
|         p |  | ||||||
|           | Example use of the spaCy NLP tools for data exploration. |  | ||||||
|           | Here we will look for reddit comments that describe Google doing something, |  | ||||||
|           | i.e. discuss the company's actions. This is difficult, because other senses of |  | ||||||
|           | "Google" now dominate usage of the word in conversation, particularly references to |  | ||||||
|           | using Google products. |  | ||||||
|          |  | ||||||
|         p |  | ||||||
|           | The heuristics used are quick and dirty – about 5 minutes work. |  | ||||||
|            |  | ||||||
|         //| A better approach is to use the word vector of the verb. But, the |  | ||||||
|         //  | demo here is just to show what's possible to build up quickly, to |  | ||||||
|         //  | start to understand some data. |  | ||||||
| 
 |  | ||||||
|       article.post |  | ||||||
|         header |  | ||||||
|           h2 Syntax-specific Search |  | ||||||
|           .subhead |  | ||||||
|             | by  |  | ||||||
|             a(href='#', rel='author') Matthew Honnibal |  | ||||||
|             |  on  |  | ||||||
|             time(datetime='2015-08-14') August |  | ||||||
|            |  | ||||||
|         details |  | ||||||
|           summary: h4 Imports |  | ||||||
| 
 |  | ||||||
|           pre.language-python |  | ||||||
|             code |  | ||||||
|               | from __future__ import unicode_literals |  | ||||||
|               | from __future__ import print_function |  | ||||||
|               | import sys |  | ||||||
|               |  |  | ||||||
|               | import plac |  | ||||||
|               | import bz2 |  | ||||||
|               | import ujson |  | ||||||
|               | import spacy.en |  | ||||||
|            |  | ||||||
|         details |  | ||||||
|           summary: h4 Load the model and iterate over the data |  | ||||||
| 
 |  | ||||||
|           pre.language-python |  | ||||||
|             code  |  | ||||||
|               | def main(input_loc): |  | ||||||
|               |     nlp = spacy.en.English()                 # Load the model takes 10-20 seconds. |  | ||||||
|               |     for line in bz2.BZ2File(input_loc):      # Iterate over the reddit comments from the dump.  |  | ||||||
|               |         comment_str = ujson.loads(line)['body']  # Parse the json object, and extract the 'body' attribute.  |  | ||||||
|               |          |  | ||||||
|         details |  | ||||||
|           summary: h4 Apply the spaCy NLP pipeline, and look for the cases we want |  | ||||||
| 
 |  | ||||||
|           pre.language-python |  | ||||||
|             code |  | ||||||
|               |         comment_parse = nlp(comment_str)  |  | ||||||
|               |         for word in comment_parse:   |  | ||||||
|               |             if google_doing_something(word): |  | ||||||
|               |                 # Print the clause |  | ||||||
|               |                 print(''.join(w.string for w in word.head.subtree).strip()) |  | ||||||
|         details |  | ||||||
|           summary: h4 Define the filter function |  | ||||||
| 
 |  | ||||||
|           pre.language-python |  | ||||||
|             code |  | ||||||
| 
 |  | ||||||
|               |  |  | ||||||
|               | def google_doing_something(w): |  | ||||||
|               |     if w.lower_ != 'google': |  | ||||||
|               |         return False |  | ||||||
|               |     # Is it the subject of a verb? |  | ||||||
|               |     elif w.dep_ != 'nsubj':  |  | ||||||
|               |         return False |  | ||||||
|               |     # And not 'is' |  | ||||||
|               |     elif w.head.lemma_ == 'be' and w.head.dep_ != 'aux':  |  | ||||||
|               |         return False |  | ||||||
|               |     # Exclude e.g. "Google says..." |  | ||||||
|               |     elif w.head.lemma_ in ('say', 'show'):  |  | ||||||
|               |         return False |  | ||||||
|               |     else: |  | ||||||
|               |         return True |  | ||||||
|               |  |  | ||||||
|               |  |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 Call main |  | ||||||
| 
 |  | ||||||
|           pre.language-python |  | ||||||
|             code |  | ||||||
|               | if __name__ == '__main__': |  | ||||||
|               |     plac.call(main) |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 Example output |  | ||||||
| 
 |  | ||||||
|           p. |  | ||||||
|             Many false positives remain. Some are from incorrect interpretations |  | ||||||
|             of the sentence by spaCy, some are flaws in our filtering logic. But |  | ||||||
|             the results are vastly better than a string-based search, which returns |  | ||||||
|             almost no examples of the pattern we're looking for. |  | ||||||
| 
 |  | ||||||
|           code |  | ||||||
|             | Google dropped support for Android < 4.0 already |  | ||||||
|             | google drive |  | ||||||
|             | Google to enforce a little more uniformity in its hardware so that we can see a better 3rd party market for things like mounts, cases, etc |  | ||||||
|             | When Google responds |  | ||||||
|             | Google translate cyka pasterino. |  | ||||||
|             | A quick google looks like Synology does have a sync'ing feature which does support block level so that should work  |  | ||||||
|             | (google came up with some weird One Piece/FairyTail crossover stuff), and is their knowledge universally infallible? |  | ||||||
|             | Until you have the gear, google some videos on best farming runs on each planet, you can get a lot REAL fast with the right loop. |  | ||||||
|             | Google offers something like this already, but it is truly terrible. |  | ||||||
|             | google isn't helping me |  | ||||||
|             | Google tells me: 0 results, 250 pages removed from google. |  | ||||||
|             | how did Google swoop in and eat our lunch |  | ||||||
| 
 |  | ||||||
|              |  | ||||||
| 
 |  | ||||||
|   script(src="js/prism.js") |  | ||||||
|   script(src="js/details_polyfill.js") |  | ||||||
|  | @ -1,204 +0,0 @@ | ||||||
| doctype html |  | ||||||
| html(lang='en') |  | ||||||
|   head |  | ||||||
|     meta(charset='utf-8') |  | ||||||
|     title spaCy Blog |  | ||||||
|     meta(name='description', content='') |  | ||||||
|     meta(name='author', content='Matthew Honnibal') |  | ||||||
|     link(rel='stylesheet', href='css/style.css') |  | ||||||
|     //if lt IE 9 |  | ||||||
|       script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') |  | ||||||
|   body#blog |  | ||||||
|     header(role='banner') |  | ||||||
|       h1.logo spaCy Blog |  | ||||||
|       .slogan Blog |  | ||||||
|     main#content(role='main') |  | ||||||
|       article.post |  | ||||||
|         header |  | ||||||
|           h2 Finding Relevant Tweets |  | ||||||
|           .subhead |  | ||||||
|             | by  |  | ||||||
|             a(href='#', rel='author') Matthew Honnibal |  | ||||||
|             |  on  |  | ||||||
|             time(datetime='2015-08-14') December |  | ||||||
|            |  | ||||||
|         details |  | ||||||
|           summary: h4 Imports |  | ||||||
|           pre.language-python |  | ||||||
| 
 |  | ||||||
|             | from __future__ import unicode_literals, print_function |  | ||||||
|             | import plac |  | ||||||
|             | import codecs |  | ||||||
|             | import sys |  | ||||||
|             | import math |  | ||||||
|             |  |  | ||||||
|             | import spacy.en |  | ||||||
|             | from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ |  | ||||||
|             |  |  | ||||||
|             | from termcolor import colored |  | ||||||
|             | from twython import TwythonStreamer |  | ||||||
|             |  |  | ||||||
|             | from os import path |  | ||||||
|             | from math import sqrt |  | ||||||
|             |  |  | ||||||
|             | from numpy import dot |  | ||||||
|             | from numpy.linalg import norm |  | ||||||
|             |  |  | ||||||
|             |  |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 Simple vector-averaging similarity |  | ||||||
| 
 |  | ||||||
|           pre.language-python: code |  | ||||||
| 
 |  | ||||||
|             | class Meaning(object): |  | ||||||
|             |     def __init__(self, vectors): |  | ||||||
|             |         if vectors: |  | ||||||
|             |             self.vector = sum(vectors) / len(vectors) |  | ||||||
|             |             self.norm = norm(self.vector) |  | ||||||
|             |         else: |  | ||||||
|             |             self.vector = None |  | ||||||
|             |             self.norm = 0 |  | ||||||
|             |  |  | ||||||
|             |     @classmethod |  | ||||||
|             |     def from_path(cls, nlp, loc): |  | ||||||
|             |         with codecs.open(loc, 'r', 'utf8') as file_: |  | ||||||
|             |             terms = file_.read().strip().split() |  | ||||||
|             |         return cls.from_terms(nlp, terms) |  | ||||||
|             |  |  | ||||||
|             |     @classmethod |  | ||||||
|             |     def from_tokens(cls, nlp, tokens): |  | ||||||
|             |         vectors = [t.repvec for t in tokens] |  | ||||||
|             |         return cls(vectors) |  | ||||||
|             |  |  | ||||||
|             |     @classmethod |  | ||||||
|             |     def from_terms(cls, nlp, examples): |  | ||||||
|             |         lexemes = [nlp.vocab[eg] for eg in examples] |  | ||||||
|             |         vectors = [eg.repvec for eg in lexemes] |  | ||||||
|             |         return cls(vectors) |  | ||||||
|             |  |  | ||||||
|             |     def similarity(self, other): |  | ||||||
|             |         if not self.norm or not other.norm: |  | ||||||
|             |             return -1 |  | ||||||
|             |         return dot(self.vector, other.vector) / (self.norm * other.norm) |  | ||||||
|             |  |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 Print matches |  | ||||||
|                |  | ||||||
|           pre.language-python: code |  | ||||||
| 
 |  | ||||||
|             |  |  | ||||||
|             | def print_colored(model, stream=sys.stdout): |  | ||||||
|             |     if model['is_match']: |  | ||||||
|             |         color = 'green' |  | ||||||
|             |     elif model['is_reject']: |  | ||||||
|             |         color = 'red' |  | ||||||
|             |     else: |  | ||||||
|             |         color = 'grey' |  | ||||||
|             |      |  | ||||||
|             |     if not model['is_rare'] and model['is_match'] and not model['is_reject']: |  | ||||||
|             |         match_score = colored('%.3f' % model['match_score'], 'green') |  | ||||||
|             |         reject_score = colored('%.3f' % model['reject_score'], 'red') |  | ||||||
|             |         prob = '%.5f' % model['prob'] |  | ||||||
|             |  |  | ||||||
|             |         print(match_score, reject_score, prob) |  | ||||||
|             |         print(repr(model['text']), color) |  | ||||||
|             |         print('') |  | ||||||
|             |  |  | ||||||
|             |  |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 TextMatcher: Process the tweets using spaCy |  | ||||||
| 
 |  | ||||||
|           pre.language-python: code |  | ||||||
| 
 |  | ||||||
|             | class TextMatcher(object): |  | ||||||
|             |     def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject): |  | ||||||
|             |         self.nlp = nlp |  | ||||||
|             |         self.get_target = get_target |  | ||||||
|             |         self.get_reject = get_reject |  | ||||||
|             |         self.min_prob = min_prob |  | ||||||
|             |         self.min_match = min_match |  | ||||||
|             |         self.max_reject = max_reject |  | ||||||
|             |  |  | ||||||
|             |     def __call__(self, text): |  | ||||||
|             |         tweet = self.nlp(text) |  | ||||||
|             |         target_terms = self.get_target() |  | ||||||
|             |         reject_terms = self.get_reject() |  | ||||||
|             |  |  | ||||||
|             |         prob = sum(math.exp(w.prob) for w in tweet) / len(tweet) |  | ||||||
|             |         meaning = Meaning.from_tokens(self, tweet) |  | ||||||
|             |          |  | ||||||
|             |         match_score = meaning.similarity(self.get_target()) |  | ||||||
|             |         reject_score = meaning.similarity(self.get_reject()) |  | ||||||
|             |         return { |  | ||||||
|             |             'text': tweet.string, |  | ||||||
|             |             'prob': prob, |  | ||||||
|             |             'match_score': match_score, |  | ||||||
|             |             'reject_score': reject_score, |  | ||||||
|             |             'is_rare': prob < self.min_prob, |  | ||||||
|             |             'is_match': prob >= self.min_prob  and match_score  >= self.min_match, |  | ||||||
|             |             'is_reject': prob >= self.min_prob and reject_score >= self.max_reject |  | ||||||
|             |         } |  | ||||||
|             |  |  | ||||||
|             |  |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 Connect to Twitter and stream tweets |  | ||||||
| 
 |  | ||||||
|           pre.language-python: code |  | ||||||
| 
 |  | ||||||
|             | class Connection(TwythonStreamer): |  | ||||||
|             |     def __init__(self, keys_dir, handler, view): |  | ||||||
|             |         keys = Secrets(keys_dir) |  | ||||||
|             |         TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)  |  | ||||||
|             |         self.handler = handler |  | ||||||
|             |         self.view = view |  | ||||||
|             |  |  | ||||||
|             |     def on_success(self, data): |  | ||||||
|             |         text = data.get('text', u'') |  | ||||||
|             |         # Twython returns either bytes or unicode, depending on tweet. |  | ||||||
|             |         # #APIshaming |  | ||||||
|             |         try: |  | ||||||
|             |             model = self.handler(text) |  | ||||||
|             |         except TypeError: |  | ||||||
|             |             model = self.handler(text.decode('utf8')) |  | ||||||
|             |         status = self.view(model, sys.stdin) |  | ||||||
|             |  |  | ||||||
|             |     def on_error(self, status_code, data): |  | ||||||
|             |         print(status_code) |  | ||||||
|             |  |  | ||||||
|             |  |  | ||||||
|             | class Secrets(object): |  | ||||||
|             |     def __init__(self, key_dir): |  | ||||||
|             |         self.key = open(path.join(key_dir, 'key.txt')).read().strip() |  | ||||||
|             |         self.secret = open(path.join(key_dir, 'secret.txt')).read().strip() |  | ||||||
|             |         self.token = open(path.join(key_dir, 'token.txt')).read().strip() |  | ||||||
|             |         self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip() |  | ||||||
|             |  |  | ||||||
|             |  |  | ||||||
| 
 |  | ||||||
|         details |  | ||||||
|           summary: h4 Command-line interface |  | ||||||
| 
 |  | ||||||
|           pre.language-python: code |  | ||||||
| 
 |  | ||||||
|             | def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5): |  | ||||||
|             |     # We don't need the parser for this demo, so may as well save the loading time |  | ||||||
|             |     nlp = spacy.en.English(Parser=None) |  | ||||||
|             |     get_target = lambda: Meaning.from_path(nlp, target_loc) |  | ||||||
|             |     get_reject = lambda: Meaning.from_path(nlp, reject_loc) |  | ||||||
|             |     matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject) |  | ||||||
|             |  |  | ||||||
|             |     twitter = Connection(keys_dir, matcher, print_colored) |  | ||||||
|             |     twitter.statuses.filter(track=term) |  | ||||||
|             |  |  | ||||||
|             |  |  | ||||||
|             | if __name__ == '__main__': |  | ||||||
|             |     plac.call(main) |  | ||||||
|             |    |  | ||||||
| 
 |  | ||||||
|   footer(role='contentinfo') |  | ||||||
|   script(src='js/prism.js') |  | ||||||
| 
 |  | ||||||
|  | @ -1,29 +0,0 @@ | ||||||
| mixin Tutorial(title) |  | ||||||
|   details |  | ||||||
|     summary |  | ||||||
|       h4= title  |  | ||||||
| 
 |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| +Tutorial("Mark-up all manner adverbs, especially for verbs of speech") |  | ||||||
|   | Let's say you're developing a proofreading tool, or possibly an IDE for |  | ||||||
|   | writers.  You're convinced by Stephen King's advice that  |  | ||||||
|   | adverbs are not your friend |  | ||||||
|   | so you want to  |  | ||||||
|   a.readmore(href='tute_adverbs.html')  |  | ||||||
|     | highlight all adverbs. ► |  | ||||||
| 
 |  | ||||||
| +Tutorial("Search Reddit for comments about Google doing something") |  | ||||||
|   | Example use of the spaCy NLP tools for data exploration. |  | ||||||
|   | Here we will look for Reddit comments that describe Google doing something, |  | ||||||
|   | i.e. discuss the company's actions. This is difficult, because other |  | ||||||
|   | senses of "Google" now dominate usage of the word in conversation, |  | ||||||
|   | particularly references to using Google products.  |  | ||||||
|   a.readmore(href='tute_adverbs.html')  |  | ||||||
|     | ► |  | ||||||
| 
 |  | ||||||
| +Tutorial("Use word vectors for semantic search of Twitter") |  | ||||||
|   | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. |  | ||||||
|   | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. |  | ||||||
|   a.readmore(href='tute_twitter.html')  |  | ||||||
|     | ► |  | ||||||
|  | @ -1,167 +0,0 @@ | ||||||
| mixin example(name) |  | ||||||
|   details |  | ||||||
|     summary |  | ||||||
|       h4= name |  | ||||||
|     block |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +example("Load resources and process text") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | from __future__ import unicode_literals, print_function |  | ||||||
|     | from spacy.en import English |  | ||||||
|     | nlp = English() |  | ||||||
|     | doc = nlp('Hello, world. Here are two sentences.') |  | ||||||
| 
 |  | ||||||
| +example("Get tokens and sentences") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | token = doc[0] |  | ||||||
|     | sentence = doc.sents[0] |  | ||||||
|     | assert token[0] is sentence[0] |  | ||||||
| 
 |  | ||||||
| +example("Use integer IDs for any string") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | hello_id = nlp.vocab.strings['Hello'] |  | ||||||
|     | hello_str = nlp.vocab.strings[hello_id] |  | ||||||
|     |  |  | ||||||
|     | assert token.orth  == hello_id  == 52 |  | ||||||
|     | assert token.orth_ == hello_str == 'Hello' |  | ||||||
| 
 |  | ||||||
| +example("Get and set string views and flags") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | assert token.shape_ == 'Xxxx' |  | ||||||
|     | for lexeme in nlp.vocab: |  | ||||||
|     |     if lexeme.is_alpha: |  | ||||||
|     |         lexeme.shape_ = 'W' |  | ||||||
|     |     elif lexeme.is_digit: |  | ||||||
|     |         lexeme.shape_ = 'D' |  | ||||||
|     |     elif lexeme.is_punct: |  | ||||||
|     |         lexeme.shape_ = 'P' |  | ||||||
|     |     else: |  | ||||||
|     |         lexeme.shape_ = 'M' |  | ||||||
|     | assert token.shape_ == 'W' |  | ||||||
| 
 |  | ||||||
| +example("Export to numpy arrays") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV |  | ||||||
|     |  |  | ||||||
|     | attr_ids = [ORTH, LIKE_URL, IS_OOV] |  | ||||||
|     | doc_array = doc.to_array(attr_ids) |  | ||||||
|     | assert doc_array.shape == (len(doc), len(attrs) |  | ||||||
|     | assert doc[0].orth == doc_array[0, 0] |  | ||||||
|     | assert doc[1].orth == doc_array[1, 0] |  | ||||||
|     | assert doc[0].like_url == doc_array[0, 1] |  | ||||||
|     | assert doc_array[, 1] == [t.like_url for t in doc] |  | ||||||
| 
 |  | ||||||
| +example("Word vectors") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") |  | ||||||
|     |  |  | ||||||
|     | apples = doc[0] |  | ||||||
|     | oranges = doc[1] |  | ||||||
|     | boots = doc[6] |  | ||||||
|     | hippos = doc[8] |  | ||||||
|     |  |  | ||||||
|     | assert apples.similarity(oranges) > boots.similarity(hippos) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +example("Part-of-speech tags") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | from spacy.parts_of_speech import ADV |  | ||||||
|     |  |  | ||||||
|     | def is_adverb(token): |  | ||||||
|     |     return token.pos == spacy.parts_of_speech.ADV |  | ||||||
|     |  |  | ||||||
|     | # These are data-specific, so no constants are provided. You have to look |  | ||||||
|     | # up the IDs from the StringStore. |  | ||||||
|     | NNS = nlp.vocab.strings['NNS'] |  | ||||||
|     | NNPS = nlp.vocab.strings['NNPS'] |  | ||||||
|     | def is_plural_noun(token): |  | ||||||
|     |     return token.tag == NNS or token.tag == NNPS |  | ||||||
|     |  |  | ||||||
|     | def print_coarse_pos(token): |  | ||||||
|     |     print(token.pos_) |  | ||||||
|     |  |  | ||||||
|     | def print_fine_pos(token): |  | ||||||
|     |     print(token.tag_) |  | ||||||
| 
 |  | ||||||
| +example("Syntactic dependencies") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | def dependency_labels_to_root(token): |  | ||||||
|     |     '''Walk up the syntactic tree, collecting the arc labels.''' |  | ||||||
|     |     dep_labels = [] |  | ||||||
|     |     while token.root is not token: |  | ||||||
|     |         dep_labels.append(token.dep) |  | ||||||
|     |         token = token.head |  | ||||||
|     |     return dep_labels |  | ||||||
| 
 |  | ||||||
| +example("Named entities") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | def iter_products(docs): |  | ||||||
|     |     for doc in docs: |  | ||||||
|     |         for ent in doc.ents: |  | ||||||
|     |             if ent.label_ == 'PRODUCT': |  | ||||||
|     |                 yield ent |  | ||||||
|     |  |  | ||||||
|     | def word_is_in_entity(word): |  | ||||||
|     |     return word.ent_type != 0 |  | ||||||
|     |  |  | ||||||
|     | def count_parent_verb_by_person(docs): |  | ||||||
|     |     counts = defaultdict(defaultdict(int)) |  | ||||||
|     |     for doc in docs: |  | ||||||
|     |         for ent in doc.ents: |  | ||||||
|     |             if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: |  | ||||||
|     |                 counts[ent.orth_][ent.root.head.lemma_] += 1 |  | ||||||
|     |     return counts |  | ||||||
| 
 |  | ||||||
|   //+example("Define custom NER rules") |  | ||||||
|   //  pre.language-python: code |  | ||||||
|   //    | nlp.matcher |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +example("Calculate inline mark-up on original string") |  | ||||||
|   pre.language-python: code |  | ||||||
|     | def put_spans_around_tokens(doc, get_classes): |  | ||||||
|     |     '''Given some function to compute class names, put each token in a |  | ||||||
|     |     span element, with the appropriate classes computed. |  | ||||||
|     |   |  | ||||||
|     |     All whitespace is preserved, outside of the spans. (Yes, I know HTML |  | ||||||
|     |     won't display it. But the point is no information is lost, so you can |  | ||||||
|     |     calculate what you need, e.g. <br /> tags, <p> tags, etc.) |  | ||||||
|     |     ''' |  | ||||||
|     |     output = [] |  | ||||||
|     |     template = '<span classes="{classes}">{word}</span>{space}' |  | ||||||
|     |     for token in doc: |  | ||||||
|     |         if token.is_space: |  | ||||||
|     |             output.append(token.orth_) |  | ||||||
|     |         else: |  | ||||||
|     |             output.append( |  | ||||||
|     |               template.format( |  | ||||||
|     |                 classes=' '.join(get_classes(token)), |  | ||||||
|     |                 word=token.orth_, |  | ||||||
|     |                 space=token.whitespace_)) |  | ||||||
|     |     string = ''.join(output) |  | ||||||
|     |     string = string.replace('\n', '<br />') |  | ||||||
|     |     string = string.replace('\t', '    ' |  | ||||||
|     |     return string |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| +example("Efficient binary serialization") |  | ||||||
|   pre.language-python: code |  | ||||||
|     |  |  | ||||||
|     | byte_string = doc.as_bytes() |  | ||||||
|     | open('/tmp/moby_dick.bin', 'wb').write(byte_string) |  | ||||||
|     |  |  | ||||||
|     | nlp = spacy.en.English() |  | ||||||
|     | for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')): |  | ||||||
|     |    doc = Doc(nlp.vocab) |  | ||||||
|     |    doc.from_bytes(byte_string) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| p |  | ||||||
|   | See the  |  | ||||||
|   a(href="docs.html") docs page  |  | ||||||
|   | for  |  | ||||||
|   a(href="docs.html#api") API documentation,  |  | ||||||
|   a(href="docs.html#tutorials") tutorials,  |  | ||||||
|   | and  |  | ||||||
|   a(href="docs.html#spec") annotation specs. |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user