Merge branch 'gaz' of https://github.com/honnibal/spaCy into gaz

This commit is contained in:
Matthew Honnibal 2015-09-06 14:07:43 +02:00
commit 4f765eee79
7 changed files with 999 additions and 2 deletions

705
docs/redesign/docs.jade Normal file
View File

@ -0,0 +1,705 @@
//- Shared link helpers for the API docs: `py_docs` is the base URL for the
//- Python stdlib reference; `types` maps a type name to a ready-made HTML
//- link (an empty string means "no link rendered for this type yet").
//- NOTE(review): call sites below also reference types.Token, types.slice,
//- types.dict and types.bytes, which are not defined here — confirm and add.
- var py_docs = '<a class="reference" href="http://docs.python.org/library/'
-
var types = {
'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
'int': py_docs + 'functions.html#int"><em>int</em></a>',
'generator': "",
'Vocab': "",
'Span': "",
'Doc': ""
}
//- Collapsible class entry: renders "class <name>" inside a <details> summary,
//- with the mixin's block content as the expandable body.
mixin declare_class(name)
details
summary
span.declaration
span.label class
code #{name}
block
//- Collapsible method entry: renders the signature as "name(self, <parameters>)".
//- Pass (open="true") via &attributes at the call site to expand by default.
mixin method(name, parameters)
details(open=attributes.open)
summary
span.declaration
span.label #{name}
span.parameters
| self, #{parameters}
block
//- Container for a parameter list (plain <ul>; items come from the block).
mixin params
ul
block
//- Single parameter entry: bolded name, optionally followed by the (linked)
//- type. `type` is interpolated unescaped (!{...}) because it contains HTML.
//- NOTE(review): the `value` argument is accepted but never used — confirm.
mixin param(name, type, value)
li
if type
<strong>#{name}</strong> (!{type}) &#8211;
else
<strong>#{name}</strong> &#8211;
block
//- Collapsible attribute entry (label only; description goes in the block).
//- NOTE(review): `type` and `value` are accepted but never rendered — confirm.
mixin attribute(name, type, value)
details(open=attributes.open)
summary
span.declaration
span.label #{name}
block
//- FIXME: `returns` is defined twice. In Jade the later definition silently
//- overrides the earlier one, so the list-item variant below is dead code and
//- every +returns(...) call in this file renders the literal text "tmp".
//- Renaming either mixin would require touching all call sites — flagged only.
mixin returns(name, type, value)
li
if type
<strong>#{name}</strong> (!{type}) &#8211;
else
<strong>#{name}</strong> &#8211;
block
mixin returns(type)
| tmp
//- Section-header helpers: each wraps its block in a <details> element with a
//- fixed h4 label (Init / Callable / Sequence / Map); `summary` is a plain
//- pass-through wrapper.
mixin init
details
summary: h4 Init
block
mixin callable
details
summary: h4 Callable
block
mixin sequence
details
summary: h4 Sequence
block
mixin maptype
details
summary: h4 Map
block
mixin summary
block
//- Shared intro example: load the English pipeline and process raw unicode.
mixin en_example
pre.language-python
code
| from spacy.en import English
| from spacy._doc_examples import download_war_and_peace
|
| unprocessed_unicode = download_war_and_peace()
|
| nlp = English()
| doc = nlp(unprocessed_unicode)
doctype html
html(lang="en")
head
meta(charset="utf-8")
title spaCy &ndash; Industrial-strength NLP
meta(name="description" content="")
meta(name="author" content="Matthew Honnibal")
link(rel="stylesheet" href="css/style.css")
<!--[if lt IE 9]>
script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js")
<![endif]-->
body(id="docs")
header(role="banner")
h1.logo spaCy &ndash; Industrial-strength NLP
div.slogan API
nav(role="navigation")
ul
li: a(href="#") Home
li.active: a(href="#") Docs
li: a(href="#") License
li: a(href="#") Blog
main.docs#content
article
+declare_class("English")
p Load models into a callable object to process English text.
+summary
+en_example
+init
p
| Load the resources. Loading takes 20 seconds, and the instance
| consumes 2 to 3 gigabytes of memory.
p
| Intended use is for one instance to be created per process.
| You can create more if you're doing something unusual.
p
| You may wish to make the instance a global variable or "singleton".
| We usually instantiate the object in the <code>main()</code>
| function and pass it around as an explicit argument.
+method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true")
+params
+param("data_dir")
| The data directory. May be #{None}, to disable any data loading
| (including the vocabulary).
+param("Tokenizer")
| A class/function that creates the tokenizer.
+param("Tagger")
| A class/function that creates the part-of-speech tagger.
+param("Parser")
| A class/function that creates the dependency parser.
+param("Entity")
| A class/function that creates the named entity recogniser.
+param("load_vectors")
| A boolean value to control whether the word vectors are loaded.
+callable
+method("__call__", "text, tag=True, parse=True, entity=True")
+params
+param("text", types.unicode)
| The text to be processed. No pre-processing needs to be applied,
| and any length of text can be submitted. Usually you will submit
| a whole document. Text may be zero-length. An exception is raised
| if byte strings are supplied.
+param("tag", types.bool)
| Whether to apply the part-of-speech tagger. Required for parsing
| and entity recognition.
+param("parse", types.bool)
| Whether to apply the syntactic dependency parser.
+param("entity", types.bool)
| Whether to apply the named entity recognizer.
pre.language-python
code
| from spacy.en import English
| nlp = English()
| doc = nlp(u'Some text.') # Applies tagger, parser, entity
| doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
| doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
| doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
| doc = nlp(u'') # Zero-length tokens, not an error
| # doc = nlp(b'Some text') <-- Error: need unicode
| doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first.
+declare_class("Doc")
p I'm a doc
+init
+method("__init__", "vocab")
+params
+param("vocab", types.Vocab)
| A vocabulary object
+sequence
+method("__getitem__", "i", types.int)
+returns(types.Token)
+method("__getitem__", "start_end", types.slice)
+returns(types.Span)
+method("__iter__")
| Iterate over tokens
+method("__len__")
| Number of tokens in the document.
details
summary: h4 Spans
+attribute("sents", types.generator)
| Iterate over sentences in the document.
+attribute("ents", types.generator)
| Iterate over named entities in the document.
+attribute("noun_chunks", types.generator)
details
summary: h4 Export/Import
+method("to_array", "attr_ids")
| Given a list of M attribute IDs, export the tokens to a numpy ndarray
| of shape N*M, where N is the length of the sentence.
+params
+param("attr_ids", "list[int]")
| A list of attribute ID ints.
+returns("feat_array")
| A feature matrix, with one row per word, and one column per attribute
| indicated in the input attr_ids.
+method("count_by", "attr_id")
| Produce a dict of {attribute (int): count (ints)} frequencies, keyed
| by the values of the given attribute ID.
pre.language-python
code
| >>> from spacy.en import English, attrs
| >>> nlp = English()
| >>> tokens = nlp(u'apple apple orange banana')
| >>> tokens.count_by(attrs.ORTH)
| {12800L: 1, 11880L: 2, 7561L: 1}
| >>> tokens.to_array([attrs.ORTH])
| array([[11880],
| [11880],
| [7561],
| [12800]])
+method("from_array", "attrs, array")
| Load from array
+method("from_bytes")
| Deserialize, loading from bytes
+method("read_bytes")
| classmethod
//+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
// | Merge a multi-word expression into a single token. Currently
// | experimental; API is likely to change.
+declare_class("Token")
+init
+method("__init__", "vocab, doc, offset")
+params
+param("vocab", types.Vocab)
p A Vocab object
+param("doc", types.Doc)
p The parent sequence
+param("offset", types.int)
p The index of the token within the document
details
summary: h4 String Views
+attribute("orth / orth_")
| The form of the word with no string normalization or processing, as
| it appears in the string, without trailing whitespace.
+attribute("lemma / lemma_")
| The "base" of the word, with no inflectional suffixes, e.g. the lemma of
| "developing" is "develop", the lemma of "geese" is "goose", etc. Note that
| <em>derivational</em> suffixes are not stripped, e.g. the lemma of
| "institutions" is "institution", not "institute". Lemmatization is
| performed using the WordNet data, but extended to also cover closed-class
| words such as pronouns. By default, the WN lemmatizer returns "hi"
| as the lemma of "his". We assign pronouns the lemma -PRON-.
+attribute("lower / lower_")
| The form of the word, but forced to lower-case, i.e.
pre.language-python: code lower = word.orth\_.lower()
//+attribute("norm / norm_")
// | The form of the word, after language-specific normalizations has been
// | applied.
+attribute("shape / shape_")
| A transform of the word's string, to show orthographic features.
| The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped
| to d. After these mappings, sequences of 4 or more of the same character
| are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
| :) --> :)
+attribute("prefix / prefix_")
| A length-N substring from the start of the word. Length may vary by
| language; currently for English n=1, i.e.
pre.language-python: code prefix = word.orth\_[:1]
+attribute("suffix / suffix_")
| A length-N substring from the end of the word. Length may vary by
| language; currently for English n=3, i.e.
pre.language-python: code suffix = word.orth\_[-3:]
//+attribute("lex_id")
// | lex_id
details
summary: h4 Alignment and Output
+attribute("idx")
p Start index of the token in the string
+method("__len__", "")
p Length of the token's orth string, in unicode code-points.
+method("__unicode__", "")
p Same as token.orth_
+method("__str__", "")
p Varies between Python 2 and Python 3
+attribute("string")
p
| The form of the word as it appears in the string, <strong>including
| trailing whitespace</strong>. This is useful when you need to use
| linguistic features to add inline mark-up to the string.
+method("nbor, i=1")
+params
+param("i")
p Offset relative to token
details
summary: h4 Distributional Features
+attribute("repvec")
p
| A "word embedding" representation: a dense real-valued vector that supports
| similarity queries between words. By default, spaCy currently loads
| vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
| model.
+attribute("cluster")
p
| The Brown cluster ID of the word. These are often useful features for
| linear models. If you're using a non-linear model, particularly a
| neural net or random forest, consider using the real-valued word
| representation vector, in Token.repvec, instead.
+attribute("prob")
p
| The unigram log-probability of the word, estimated from counts from a
| large corpus, smoothed using Simple Good Turing estimation.
details
summary: h4 Syntactic Tags
+attribute("pos / pos_")
p
| A part-of-speech tag, from the Google Universal Tag Set, e.g.
| <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>. Constants for
| the 17 tag values are provided in <code>spacy.parts_of_speech.</code>
+attribute("tag / tag_")
p
| A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
| <code>DT</code>, etc. These tags are language/corpus specific, and
| typically describe part-of-speech and some amount of morphological
| information. For instance, in the Penn Treebank tag set, <code>VBZ</code>
| is assigned to a present-tense singular verb.
+attribute("dep / dep_")
p
| The type of syntactic dependency relation between the word and its
| syntactic head.
details
summary: h4 Navigating the Parse Tree
+attribute("head")
p
| The Token that is the immediate syntactic head of the word. If the
| word is the root of the dependency tree, the same word is returned.
+attribute("lefts")
p
| An iterator for the immediate leftward syntactic children of the
| word.
+attribute("rights")
p
| An iterator for the immediate rightward syntactic children of the
| word.
+attribute("n_lefts")
p
| The number of immediate syntactic children preceding the word in
| the string.
+attribute("n_rights")
p
| The number of immediate syntactic children following the word in
| the string.
+attribute("children")
p
| An iterator that yields from lefts, and then yields from rights.
+attribute("subtree")
p
| An iterator for the part of the sentence syntactically governed by
| the word, including the word itself.
+attribute("left_edge")
p The leftmost edge of the token's subtree
+attribute("right_edge")
p The rightmost edge of the token's subtree
details
summary: h4 Named Entities
+attribute("ent_type")
p If the token is part of an entity, its entity type.
+attribute("ent_iob")
p The IOB (inside, outside, begin) entity recognition tag for the token.
details
summary: h4 Lexeme Flags
+method("check_flag", "flag_id")
+params
+param("flag_id")
| flag ID
+attribute("is_oov")
+attribute("is_alpha")
+attribute("is_ascii")
+attribute("is_digit")
+attribute("is_lower")
+attribute("is_title")
+attribute("is_punct")
+attribute("is_space")
+attribute("like_url")
+attribute("like_num")
+attribute("like_email")
//+attribute("conjuncts")
// | Conjuncts
+declare_class("Span")
+init
+method("__init__")
Temp
<code>span = doc[0:4]</code>
+sequence
+method("__getitem__")
p Get item
+method("__iter__")
p Iter
+method("__len__")
p Len
details
summary: h4 Parse
+attribute("root")
p Syntactic head
+attribute("lefts")
p Tokens that are:
ol
li To the left of the span;
li Syntactic children of words within the span
p i.e.
pre.language-python
code
| lefts = [span.doc[i] for i in range(0, span.start)
| if span.doc[i].head in span]
+attribute("rights")
p Tokens that are:
ol
li To the right of the span;
li Syntactic children of words within the span
p i.e.
pre.language-python
code
| rights = [span.doc[i] for i in range(span.end, len(span.doc))
| if span.doc[i].head in span]
+attribute("subtree")
p String
details
summary: h4 String Views
+attribute("string")
p String
+attribute("lemma / lemma_")
p String
+attribute("label / label_")
p String
+declare_class("Lexeme")
p
| The Lexeme object represents a lexical type, stored in the vocabulary
| &ndash; as opposed to a token, occurring in a document.
p
| Lexemes store various features, so that these features can be computed
| once per type, rather than once per token. As job sizes grow, this
| can amount to a substantial efficiency improvement.
p
| All Lexeme attributes are therefore context independent, as a single
| lexeme is reused for all usages of that word. Lexemes are keyed by
| the “orth” attribute.
p
All Lexeme attributes are accessible directly on the Token object.
+init
+method("__init__")
p Init
details
summary: h4 String Features
+attribute("orth / orth_")
p
| The form of the word with no string normalization or processing,
| as it appears in the string, without trailing whitespace.
+attribute("lower / lower_")
p Tmp
+attribute("norm / norm_")
p Tmp
+attribute("shape / shape_")
p Tmp
+attribute("prefix / prefix_")
p Tmp
+attribute("suffix / suffix_")
p TMP
+declare_class("Vocab", "data_dir=None, lex_props_getter=None")
+sequence
+method("__len__")
+returns
p Number of words in the vocabulary.
+method("__iter__")
+returns
p Lexeme
+maptype
+method("__getitem__", "key_int")
+params
+param("key")
p Integer ID
+returns: p A Lexeme object
+method("__getitem__", "key_str")
+params
+param("key_str", types.unicode)
p A string in the vocabulary
+returns("Lexeme")
+method("__setitem__", "orth_str", "props")
+params
+param("orth_str", types.unicode)
p The orth key
+param("props", types.dict)
p A props dictionary
+returns("None")
details
summary: h4 Import/Export
+method("dump", "loc")
+params
+param("loc", types.unicode)
p Path where the vocabulary should be saved
+method("load_lexemes", "loc")
+params
+param("loc", types.unicode)
p Path to load the lexemes.bin file from
+method("load_vectors", "loc")
+params
+param("loc", types.unicode)
p Path to load the vectors.bin from
+declare_class("StringStore")
+init
Tmp
+sequence
+method("__len__")
+returns("int")
p Number of strings in the string-store
+method("__iter__")
+returns
p Lexeme
+maptype
+method("__getitem__", "key_int")
+params
+param("key_int")
p An integer key
+returns(types.unicode)
p The string that the integer key maps to
+method("__getitem__", "key_unicode")
+params
+param("key_unicode")
p A key, as a unicode string
+returns(types.int)
p The integer ID of the string.
+method("__getitem__", "key_utf8_bytes")
+params
+param("key_utf8_bytes", types.bytes)
p A key, as a UTF-8 encoded byte-string
+returns(types.int)
p The integer ID of the string.
details
summary: h4 Import/Export
+method("dump", "loc")
+params
+param("loc")
p File path to save the strings.txt to.
+method("load")
+params
+param("loc")
p File path to load the strings.txt from.
script(src="js/prism.js")

106
docs/redesign/home.jade Normal file
View File

@ -0,0 +1,106 @@
extends ./outline.jade
// Notes
//
// 1. Where to put version notice? Should say something like
// 2015-08-12: v0.89
// and be a link
//
// Only needs to appear on home page.
//- Page copy shared with the header in outline.jade.
- var slogan = "Build Tomorrow's Language Technologies"
- var tag_line = "spaCy &ndash; " + slogan
//- Lede paragraph; link strings are interpolated UNescaped via !{...} because
//- they contain raw HTML anchors.
//- NOTE(review): `great_documentation` is defined but never interpolated in
//- the paragraph below — confirm whether the plain-text mention should link.
mixin lede
- var state_of_the_art = '<a href="#">state-of-the-art</a>'
- var a_minor_miracle = '<a href="">a minor miracle</a>'
- var great_documentation = '<a href="">great documentation</a>'
p.
<a href="https://github.com/honnibal/spaCy"><strong>spaCy</strong></a> is a
library for industrial-strength NLP in Python and Cython. It features
!{state_of_the_art} speed and accuracy, a concise API, and great documentation.
If you're a small company doing NLP, we want <strong>spaCy</strong> to seem
like !{a_minor_miracle}.
//- Placeholder sections — TODO: replace stub text with real content.
mixin overview()
p.
Overview text
mixin benchmarks()
p.
Benchmarks
mixin get_started()
p.
Get Started
//- Collapsible comparison entry (title in summary, body from the block).
mixin comparison(name)
details
summary
h4= name
block
//- Table helpers: `columns` renders a header row, `row` a data row; both take
//- a variadic argument list (Jade rest args).
mixin columns(...names)
tr
each name in names
th= name
mixin row(...cells)
tr
each cell in cells
td= cell
//- Footer share/discuss buttons.
//- NOTE(review): the tweet text still contains [ARTICLE HEADLINE]/[ARTICLE LINK]
//- placeholders and the HN/Reddit hrefs are "#" stubs — fill in before launch.
mixin social
footer(role="contentinfo")
a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter
div.discuss
a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn")
| Discuss on Hacker News
a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit")
| Discuss on Reddit
//- Anchored page section. Jade's `include` only accepts a static path, so the
//- `include_file` argument cannot be used directly; the file is selected by a
//- static if/else chain on `link_name` instead, and `include_file` is ignored.
//- NOTE(review): the "install" branch includes ./installation.jade while the
//- caller passes "./install.jade" — harmless today (param unused), but the
//- mismatch is confusing; align the caller's string with the real filename.
mixin Section(title_text, link_name, include_file)
a(name=link_name): h3 #{title_text}
if (link_name == "example-use")
include ./usage_examples.jade
else if (link_name == "online-demo")
include ./online_demo.jade
else if (link_name == "comparisons")
include ./comparisons.jade
else if (link_name == "install")
include ./installation.jade
//- Fills the intro_block / body_block slots declared in outline.jade:
//- the lede + in-page nav buttons, then the four content sections.
block intro_block
section(class="intro")
+lede
nav(role="navigation")
ul
li: a(href="#example-use" class="button") Examples
li: a(href="#online-demo" class="button") Demo
li: a(href="#comparisons" class="button") Comparisons
li: a(href="#install" class="button") Install v0.89
block body_block
article(class="page landing-page")
+Section("Usage by Example", "example-use", "./usage_examples.jade")
+Section("Online Demo", "online-demo", "./online_demo.jade")
+Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade")
+Section("Install", "install", "./install.jade")

View File

@ -0,0 +1,40 @@
p With Python 2.7 or Python 3, using Linux or OSX, run:
pre.language-bash: code
| $ pip install spacy
| $ python -m spacy.en.download
p
| The download command fetches and installs about 300mb of data, for
| the parser model and word vectors, which it installs within the spacy.en
| package directory.
p
| If you're stuck using a server with an old version of Python, and you
| don't have root access, I've prepared a bootstrap script to help you
| compile a local Python install. Run:
pre.language-bash: code
| $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate
p
| The other way to install the package is to clone the github repository,
| and build it from source. This installs an additional dependency,
| Cython. If you're using Python 2, I also recommend installing fabric
| and fabtools &ndash; this is how I build the project.
pre.language-bash: code
| $ git clone https://github.com/honnibal/spaCy.git
| $ cd spaCy
| $ virtualenv .env && source .env/bin/activate
| $ export PYTHONPATH=`pwd`
| $ pip install -r requirements.txt
| $ python setup.py build_ext --inplace
| $ python -m spacy.en.download
| $ pip install pytest
| $ py.test tests/
p
| Python packaging is awkward at the best of times, and it's particularly tricky
| with C extensions, built via Cython, requiring large data files. So,
| please report issues as you encounter them.

View File

View File

@ -0,0 +1,37 @@
//- Base layout (outline.jade): page chrome plus the intro_block / body_block
//- slots that home.jade overrides. `!=` outputs unescaped, since tag_line and
//- slogan contain the &ndash; HTML entity.
- var slogan = "Build Tomorrow's Language Technologies"
- var tag_line = "spaCy &ndash; " + slogan
doctype html
html(lang="en")
head
meta(charset="utf-8")
title!= tag_line
meta(name="description" content="")
meta(name="author" content="Matthew Honnibal")
link(rel="stylesheet" href="css/style.css")
<!--[if lt IE 9]>
script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js")
<![endif]-->
body(id="home" role="document")
header(role="banner")
h1(class="logo")!= tag_line
div(class="slogan")!= slogan
nav(role="navigation")
ul
li: a(href="#") Home
li: a(href="#") Docs
li: a(href="#") License
li: a(href="#") More
main(id="content" role="main")
block intro_block
block body_block
footer(role="contentinfo")
script(src="js/prism.js")
script(src="js/details_polyfill.js")

View File

@ -0,0 +1,109 @@
//- Collapsible usage example: h4 title in the summary, code sample supplied
//- as the mixin's block content.
mixin example(name)
details
summary
h4= name
block
+example("Load resources and process text")
pre.language-python: code
| from __future__ import unicode_literals, print_function
| from spacy.en import English
| nlp = English()
| doc = nlp('Hello, world. Here are two sentences.')
+example("Get tokens and sentences")
pre.language-python: code
| token = doc[0]
| sentence = doc.sents[0]
| assert token is sentence[0]
+example("Use integer IDs for any string")
pre.language-python: code
| hello_id = nlp.vocab.strings['Hello']
| hello_str = nlp.vocab.strings[hello_id]
|
| assert token.orth == hello_id == 52
| assert token.orth_ == hello_str == 'Hello'
+example("Get and set string views and flags")
pre.language-python: code
| assert token.shape_ == 'Xxxx'
| for lexeme in nlp.vocab:
| if lexeme.is_alpha:
| lexeme.shape_ = 'W'
| elif lexeme.is_digit:
| lexeme.shape_ = 'D'
| elif lexeme.is_punct:
| lexeme.shape_ = 'P'
| else:
| lexeme.shape_ = 'M'
| assert token.shape_ == 'W'
+example("Export to numpy arrays")
pre.language-python: code
| from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
|
| attr_ids = [ORTH, LIKE_URL, IS_OOV]
| doc_array = doc.to_array(attr_ids)
| assert doc_array.shape == (len(doc), len(attr_ids))
| assert doc[0].orth == doc_array[0, 0]
| assert doc[1].orth == doc_array[1, 0]
| assert doc[0].like_url == doc_array[0, 1]
| assert doc_array[:, 1] == [t.like_url for t in doc]
+example("Word vectors")
pre.language-python: code
| doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
|
| apples = doc[0]
| oranges = doc[1]
| boots = doc[6]
| hippos = doc[8]
|
| assert apples.similarity(oranges) > boots.similarity(hippos)
+example("Part-of-speech tags")
pre.language-python: code
| doc[0].pos
| doc[0].tag
+example("Syntactic dependencies")
pre.language-python: code
| for head in tokens:
| for child in head.lefts:
| assert child.head is head
| for child in head.rights:
| assert child.head is head
| sent = nlp('The four wheels on the bus turned quickly.')
| wheels = sent[2]
| bus = sent[5]
| assert len(list(wheels.lefts)) == 2
| assert len(list(wheels.rights)) == 1
| assert len(list(wheels.children)) == 3
| assert len(list(bus.lefts)) == 1
| assert len(list(bus.rights)) == 0
| assert len(list(bus.children)) == 1
|
| assert len(list(wheels.subtree)) == 6
+example("Named entities")
pre.language-python: code
| doc.ents
| token.ent_type
| token.ent_iob
+example("Define custom NER rules")
pre.language-python: code
| nlp.matcher
+example("Calculate inline mark-up on original string")
pre.language-python: code
| token.string
| token.spacy
| token.whitespace_
+example("Efficient binary serialization")
pre.language-python: code
|

View File

@ -14,8 +14,8 @@
{"orth": "9/11"}
],
[
{"lower": "Septmber"},
{"lower": "Eleven"}
{"lower": "septmber"},
{"lower": "eleven"}
],
[
{"lower": "september"},