diff --git a/docs/redesign/docs.jade b/docs/redesign/docs.jade
new file mode 100644
index 000000000..29f0512e7
--- /dev/null
+++ b/docs/redesign/docs.jade
@@ -0,0 +1,705 @@
+- var py_docs = '<a class="reference" href="http://docs.python.org/library/'
+
+-
+ var types = {
+ 'unicode': py_docs + 'functions.html#unicode">unicode</a>',
+ 'bool': py_docs + 'functions.html#bool">bool</a>',
+ 'int': py_docs + 'functions.html#int">int</a>',
+ 'generator': "",
+ 'Token': "",
+ 'Vocab': "",
+ 'Span': "",
+ 'Doc': "",
+ 'slice': "",
+ 'dict': "",
+ 'bytes': ""
+ }
+
+
+mixin declare_class(name)
+ details
+ summary
+ span.declaration
+ span.label class
+ code #{name}
+ block
+
+mixin method(name, parameters)
+ details(open=attributes.open)
+ summary
+ span.declaration
+ span.label #{name}
+ span.parameters
+ | self, #{parameters}
+ block
+
+
+mixin params
+ ul
+ block
+
+
+mixin param(name, type, value)
+ li
+ if type
+ #{name} (!{type}) –
+ else
+ #{name} –
+ block
+
+
+mixin attribute(name, type, value)
+ details(open=attributes.open)
+ summary
+ span.declaration
+ span.label #{name}
+ block
+
+
+mixin returns(name, type, value)
+ li
+ if type
+ #{name} (!{type}) –
+ else
+ #{name} –
+ block
+
+
+mixin returns(type)
+ | tmp
+
+mixin init
+ details
+ summary: h4 Init
+
+ block
+
+
+mixin callable
+ details
+ summary: h4 Callable
+
+ block
+
+
+mixin sequence
+ details
+ summary: h4 Sequence
+
+ block
+
+
+mixin maptype
+ details
+ summary: h4 Map
+
+ block
+
+
+mixin summary
+ block
+
+mixin en_example
+ pre.language-python
+ code
+ | from spacy.en import English
+ | from spacy._doc_examples import download_war_and_peace
+ |
+ | unprocessed_unicode = download_war_and_peace()
+ |
+ | nlp = English()
+ | doc = nlp(unprocessed_unicode)
+
+
+doctype html
+html(lang="en")
+ head
+ meta(charset="utf-8")
+ title spaCy – Industrial-strength NLP
+ meta(name="description" content="")
+ meta(name="author" content="Matthew Honnibal")
+ link(rel="stylesheet" href="css/style.css")
+
+
+ body(id="docs")
+ header(role="banner")
+ h1.logo spaCy – Industrial-strength NLP
+ div.slogan API
+
+
+ nav(role="navigation")
+ ul
+ li: a(href="#") Home
+ li.active: a(href="#") Docs
+ li: a(href="#") License
+ li: a(href="#") Blog
+
+ main.docs#content
+
+ article
+ +declare_class("English")
+ p Load models into a callable object to process English text.
+
+ +summary
+ +en_example
+
+ +init
+ p
+ | Load the resources. Loading takes 20 seconds, and the instance
+ | consumes 2 to 3 gigabytes of memory.
+
+ p
+ | Intended use is for one instance to be created per process.
+ | You can create more if you're doing something unusual.
+ p
+ | You may wish to make the instance a global variable or "singleton".
+ | We usually instantiate the object in the main()
+ | function and pass it around as an explicit argument.
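+ p
+ | For instance, a minimal sketch of that pattern (the helper function
+ | below is purely illustrative):
+
+ pre.language-python
+ code
+ | from spacy.en import English
+ |
+ | def count_nouns(nlp, text):
+ |     doc = nlp(text)  # re-use the already-loaded pipeline; never build English() here
+ |     return sum(1 for token in doc if token.pos_ == 'NOUN')
+ |
+ | def main():
+ |     nlp = English()  # load once per process
+ |     print(count_nouns(nlp, u'The cat sat on the mat.'))
+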
+ +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true")
+
+ +params
+ +param("data_dir")
+ | The data directory. May be None, to disable any data loading
+ | (including the vocabulary).
+
+ +param("Tokenizer")
+ | A class/function that creates the tokenizer.
+
+ +param("Tagger")
+ | A class/function that creates the part-of-speech tagger.
+
+ +param("Parser")
+ | A class/function that creates the dependency parser.
+
+ +param("Entity")
+ | A class/function that creates the named entity recogniser.
+
+ +param("load_vectors")
+ | A boolean value to control whether the word vectors are loaded.
+
+ +callable
+ +method("__call__", "text, tag=True, parse=True, entity=True")
+
+ +params
+ +param("text", types.unicode)
+ | The text to be processed. No pre-processing needs to be applied,
+ | and any length of text can be submitted. Usually you will submit
+ | a whole document. Text may be zero-length. An exception is raised
+ | if byte strings are supplied.
+
+ +param("tag", types.bool)
+ | Whether to apply the part-of-speech tagger. Required for parsing
+ | and entity recognition.
+
+ +param("parse", types.bool)
+ | Whether to apply the syntactic dependency parser.
+
+ +param("entity", types.bool)
+ | Whether to apply the named entity recognizer.
+
+ pre.language-python
+ code
+ | from spacy.en import English
+ | nlp = English()
+ | doc = nlp(u'Some text.') # Applies tagger, parser, entity
+ | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
+ | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
+ | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
+ | doc = nlp(u'') # Zero-length tokens, not an error
+ | # doc = nlp(b'Some text') <-- Error: need unicode
+ | doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.
+
+
+ +declare_class("Doc")
+ p A Doc is a sequence of Token objects, holding the annotations for a piece of text.
+
+ +init
+ +method("__init__", "vocab")
+ +params
+ +param("vocab", types.Vocab)
+ | A vocabulary object
+
+ +sequence
+ +method("__getitem__", "i", types.int)
+ +returns(types.Token)
+
+ +method("__getitem__", "start_end", types.slice)
+ +returns(types.Span)
+
+ +method("__iter__")
+ | Iterate over tokens
+
+ +method("__len__")
+ | Number of tokens in the document.
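+
+ p
+ | A minimal sketch of sequence access:
+
+ pre.language-python
+ code
+ | doc = nlp(u'Some text.')
+ | token = doc[0]    # integer index returns a Token
+ | span = doc[0:2]   # slice returns a Span
+ | assert len(doc) == sum(1 for _ in doc)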
+
+ details
+ summary: h4 Spans
+
+ +attribute("sents", types.generator)
+ | Iterate over sentences in the document.
+
+ +attribute("ents", types.generator)
+ | Iterate over named entities in the document.
+
+ +attribute("noun_chunks", types.generator)
+ | Iterate over the base noun phrases in the document.
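+
+ p
+ | For example (the entities and chunks found depend on the statistical
+ | models):
+
+ pre.language-python
+ code
+ | doc = nlp(u'London is a big city in the United Kingdom.')
+ | for sentence in doc.sents:
+ |     print(sentence[0].orth_)              # first token of each sentence
+ | for entity in doc.ents:
+ |     print(entity.label_, entity.string)   # e.g. GPE London
+ | for chunk in doc.noun_chunks:
+ |     print(chunk.string)                   # e.g. a big city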
+
+ details
+ summary: h4 Export/Import
+
+ +method("to_array", "attr_ids")
+
+ | Given a list of M attribute IDs, export the tokens to a numpy ndarray
+ | of shape N*M, where N is the length of the document.
+
+ +params
+ +param("attr_ids", "list[int]")
+ | A list of attribute ID ints.
+
+ +returns("feat_array")
+ | A feature matrix, with one row per word, and one column per attribute
+ | indicated in the input attr_ids.
+
+ +method("count_by", "attr_id")
+ | Produce a dict of {attribute (int): count (ints)} frequencies, keyed
+ | by the values of the given attribute ID.
+
+ pre.language-python
+ code
+ | >>> from spacy.en import English, attrs
+ | >>> nlp = English()
+ | >>> tokens = nlp(u'apple apple orange banana')
+ | >>> tokens.count_by(attrs.ORTH)
+ | {12800L: 1, 11880L: 2, 7561L: 1}
+ | >>> tokens.to_array([attrs.ORTH])
+ | array([[11880],
+ | [11880],
+ | [7561],
+ | [12800]])
+
+ +method("from_array", "attrs, array")
+ | Load from array
+
+ +method("from_bytes")
+ | Deserialize, loading from bytes
+
+ +method("read_bytes")
+ | A classmethod: reads serialized documents from a file-like object,
+ | yielding byte strings that can be passed to from_bytes.
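+
+ p
+ | A rough sketch of the intended round-trip, assuming an existing nlp
+ | pipeline and doc. The import path and the to_bytes counterpart are
+ | assumptions here, not part of the documented API above:
+
+ pre.language-python
+ code
+ | from spacy.tokens import Doc  # assumed import path
+ |
+ | byte_string = doc.to_bytes()  # assumed serializer counterpart to from_bytes
+ | with open('doc.bin', 'wb') as file_:
+ |     file_.write(byte_string)
+ |
+ | with open('doc.bin', 'rb') as file_:
+ |     for byte_string in Doc.read_bytes(file_):
+ |         new_doc = Doc(nlp.vocab)
+ |         new_doc.from_bytes(byte_string)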
+
+ //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")
+
+ // | Merge a multi-word expression into a single token. Currently
+ // | experimental; API is likely to change.
+
+
+ +declare_class("Token")
+ +init
+ +method("__init__", "vocab, doc, offset")
+ +params
+ +param("vocab", types.Vocab)
+ p A Vocab object
+
+ +param("doc", types.Doc)
+ p The parent sequence
+
+ +param("offset", types.int)
+ p The index of the token within the document
+
+ details
+ summary: h4 String Views
+
+ +attribute("orth / orth_")
+ | The form of the word with no string normalization or processing, as
+ | it appears in the string, without trailing whitespace.
+
+ +attribute("lemma / lemma_")
+ | The "base" of the word, with no inflectional suffixes, e.g. the lemma of
+ | "developing" is "develop", the lemma of "geese" is "goose", etc. Note that
+ | derivational suffixes are not stripped, e.g. the lemma of
+ | "institutions" is "institution", not "institute". Lemmatization is
+ | performed using the WordNet data, but extended to also cover closed-class
+ | words such as pronouns. By default, the WN lemmatizer returns "hi"
+ | as the lemma of "his". We assign pronouns the lemma -PRON-.
+
+ +attribute("lower / lower_")
+ | The form of the word, but forced to lower-case, i.e.
+ pre.language-python: code lower = word.orth\_.lower()
+
+ //+attribute("norm / norm_")
+ // | The form of the word, after language-specific normalizations has been
+ // | applied.
+
+ +attribute("shape / shape_")
+ | A transform of the word's string, to show orthographic features.
+ | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped
+ | to d. After these mappings, sequences of 4 or more of the same character
+ | are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx,
+ | :) --> :)
+
+ +attribute("prefix / prefix_")
+ | A length-N substring from the start of the word. Length may vary by
+ | language; currently for English n=1, i.e.
+ pre.language-python: code prefix = word.orth\_[:1]
+
+ +attribute("suffix / suffix_")
+ | A length-N substring from the end of the word. Length may vary by
+ | language; currently for English n=3, i.e.
+ pre.language-python: code suffix = word.orth\_[-3:]
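+
+ p
+ | Taken together, a small sketch (the lemma shown depends on the tagger
+ | and lemmatizer):
+
+ pre.language-python
+ code
+ | word = nlp(u'Apples fell')[0]
+ | word.orth_    # u'Apples'
+ | word.lower_   # u'apples'
+ | word.lemma_   # u'apple'
+ | word.shape_   # u'Xxxxx'
+ | word.prefix_  # u'A'
+ | word.suffix_  # u'les'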
+
+ //+attribute("lex_id")
+ // | lex_id
+
+ details
+ summary: h4 Alignment and Output
+
+ +attribute("idx")
+ p Start index of the token in the string
+
+ +method("__len__", "")
+ p Length of the token's orth string, in unicode code-points.
+
+ +method("__unicode__", "")
+ p Same as token.orth_
+
+ +method("__str__", "")
+ p Varies between Python 2 and Python 3
+
+ +attribute("string")
+ p
+ | The form of the word as it appears in the string, including
+ | trailing whitespace. This is useful when you need to use
+ | linguistic features to add inline mark-up to the string.
+
+ +method("nbor", "i=1")
+ +params
+ +param("i")
+ p Offset relative to token
+
+ details
+ summary: h4 Distributional Features
+
+ +attribute("repvec")
+ p
+ | A "word embedding" representation: a dense real-valued vector that supports
+ | similarity queries between words. By default, spaCy currently loads
+ | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
+ | model.
+
+ +attribute("cluster")
+ p
+ | The Brown cluster ID of the word. These are often useful features for
+ | linear models. If you're using a non-linear model, particularly a
+ | neural net or random forest, consider using the real-valued word
+ | representation vector, in Token.repvec, instead.
+
+ +attribute("prob")
+ p
+ | The unigram log-probability of the word, estimated from counts from a
+ | large corpus, smoothed using Simple Good Turing estimation.
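+
+ p
+ | For instance, a rough similarity check on the raw vectors (the cosine
+ | computation here is illustrative, not part of the API):
+
+ pre.language-python
+ code
+ | import numpy
+ |
+ | apples, _, oranges = nlp(u'apples and oranges')
+ | cosine = numpy.dot(apples.repvec, oranges.repvec) / (
+ |     numpy.linalg.norm(apples.repvec) * numpy.linalg.norm(oranges.repvec))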
+
+ details
+ summary: h4 Syntactic Tags
+
+ +attribute("pos / pos_")
+ p
+ | A part-of-speech tag, from the Google Universal Tag Set, e.g.
+ | NOUN, VERB, ADV. Constants for
+ | the 17 tag values are provided in spacy.parts_of_speech.
+
+ +attribute("tag / tag_")
+ p
+ | A morphosyntactic tag, e.g. NN, VBZ, DT, etc. These tags are
+ | language/corpus specific, and typically describe part-of-speech and
+ | some amount of morphological information. For instance, in the Penn
+ | Treebank tag set, VBZ is assigned to a present-tense singular verb.
+
+ +attribute("dep / dep_")
+ p
+ | The type of syntactic dependency relation between the word and its
+ | syntactic head.
+
+ details
+ summary: h4 Navigating the Parse Tree
+
+ +attribute("head")
+ p
+ | The Token that is the immediate syntactic head of the word. If the
+ | word is the root of the dependency tree, the same word is returned.
+
+ +attribute("lefts")
+ p
+ | An iterator for the immediate leftward syntactic children of the
+ | word.
+
+ +attribute("rights")
+ p
+ | An iterator for the immediate rightward syntactic children of the
+ | word.
+
+ +attribute("n_lefts")
+ p
+ | The number of immediate syntactic children preceding the word in
+ | the string.
+
+ +attribute("n_rights")
+ p
+ | The number of immediate syntactic children following the word in
+ | the string.
+
+ +attribute("children")
+ p
+ | An iterator that yields from lefts, and then yields from rights.
+
+ +attribute("subtree")
+ p
+ | An iterator for the part of the sentence syntactically governed by
+ | the word, including the word itself.
+
+ +attribute("left_edge")
+ p The leftmost edge of the token's subtree
+
+ +attribute("right_edge")
+ p The rightmost edge of the token's subtree
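+
+ p
+ | A short sketch; the exact parse, and so the words and counts asserted
+ | below, depend on the parser model:
+
+ pre.language-python
+ code
+ | doc = nlp(u'The cat sat on the mat.')
+ | sat = doc[2]
+ | assert sat.head.orth_ == u'sat'             # the root's head is the root itself
+ | assert [w.orth_ for w in sat.lefts] == [u'cat']
+ | assert len(list(sat.children)) == sat.n_lefts + sat.n_rights
+ | assert len(list(sat.subtree)) == len(doc)   # the root governs the whole sentence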
+
+ details
+ summary: h4 Named Entities
+
+ +attribute("ent_type")
+ p If the token is part of an entity, its entity type.
+
+ +attribute("ent_iob")
+ p The IOB (inside, outside, begin) entity recognition tag for the token.
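+
+ p
+ | For example (the entity predictions depend on the model):
+
+ pre.language-python
+ code
+ | doc = nlp(u'Google is based in Mountain View.')
+ | google = doc[0]
+ | google.ent_type   # integer ID of the entity type, e.g. ORG; 0 if not part of an entity
+ | google.ent_iob    # integer IOB code for the token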
+
+ details
+ summary: h4 Lexeme Flags
+
+ +method("check_flag", "flag_id")
+ +params
+ +param("flag_id")
+ | flag ID
+
+ +attribute("is_oov")
+ +attribute("is_alpha")
+ +attribute("is_ascii")
+ +attribute("is_digit")
+ +attribute("is_lower")
+ +attribute("is_title")
+ +attribute("is_punct")
+ +attribute("is_space")
+ +attribute("like_url")
+ +attribute("like_num")
+ +attribute("like_email")
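+
+ p
+ | For example (the IS_ALPHA constant is assumed to be importable from
+ | spacy.en.attrs, alongside the attributes used elsewhere in these docs):
+
+ pre.language-python
+ code
+ | from spacy.en.attrs import IS_ALPHA  # assumed flag constant
+ |
+ | doc = nlp(u'Hello 2015')
+ | assert doc[0].is_alpha and doc[0].is_title
+ | assert doc[1].is_digit and doc[1].like_num
+ | assert doc[0].check_flag(IS_ALPHA)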
+
+ //+attribute("conjuncts")
+ // | Conjuncts
+
+ +declare_class("Span")
+ +init
+ +method("__init__")
+ p
+ | A Span is a slice of a Doc. Spans are usually created by indexing a
+ | Doc, rather than constructed directly, e.g.
+ pre.language-python: code span = doc[0:4]
+
+ +sequence
+ +method("__getitem__")
+ p Get a Token object, by index within the span.
+
+ +method("__iter__")
+ p Iterate over the tokens in the span.
+
+ +method("__len__")
+ p The number of tokens in the span.
+
+ details
+ summary: h4 Parse
+
+ +attribute("root")
+ p The token within the span that heads the span's syntactic parse.
+
+ +attribute("lefts")
+ p Tokens that are:
+ ol
+ li To the left of the span;
+ li Syntactic children of words within the span
+
+ p i.e.
+
+ pre.language-python
+ code
+ | lefts = [span.doc[i] for i in range(0, span.start)
+ | if span.doc[i].head in span]
+
+ +attribute("rights")
+ p Tokens that are:
+ ol
+ li To the right of the span;
+ li Syntactic children of words within the span
+ p i.e.
+ pre.language-python
+ code
+ | rights = [span.doc[i] for i in range(span.end, len(span.doc))
+ | if span.doc[i].head in span]
+
+
+ +attribute("subtree")
+ p The tokens syntactically governed by words within the span.
+
+ details
+ summary: h4 String Views
+
+ +attribute("string")
+ p The text content of the span, as it appears in the document, including trailing whitespace.
+
+ +attribute("lemma / lemma_")
+ p The lemmatized form of the span's text.
+
+ +attribute("label / label_")
+ p The span's label, e.g. the entity type for spans produced by doc.ents.
+
+ +declare_class("Lexeme")
+ p
+ | The Lexeme object represents a lexical type, stored in the vocabulary
+ | – as opposed to a token, occurring in a document.
+ p
+ | Lexemes store various features, so that these features can be computed
+ | once per type, rather than once per token. As job sizes grow, this
+ | can amount to a substantial efficiency improvement.
+
+ p
+ | All Lexeme attributes are therefore context independent, as a single
+ | lexeme is reused for all usages of that word. Lexemes are keyed by
+ | the “orth” attribute.
+
+ p
+ | All Lexeme attributes are accessible directly on the Token object.
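+
+ p
+ | For example, a minimal sketch of looking a Lexeme up through the
+ | vocabulary:
+
+ pre.language-python
+ code
+ | apple = nlp.vocab[u'apple']   # unicode key returns the Lexeme
+ | assert apple.orth_ == u'apple'
+ | assert apple.is_alpha and apple.is_lower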
+
+ +init
+ +method("__init__")
+ p Init
+
+ details
+ summary: h4 String Features
+
+ +attribute("orth / orth_")
+ p
+ | The form of the word with no string normalization or processing,
+ | as it appears in the string, without trailing whitespace.
+
+ +attribute("lower / lower_")
+ p The form of the word, forced to lower-case.
+
+ +attribute("norm / norm_")
+ p The form of the word, after language-specific normalizations have been applied.
+
+ +attribute("shape / shape_")
+ p A transform of the word's string, to show orthographic features, as described for Token above.
+
+ +attribute("prefix / prefix_")
+ p A length-N substring from the start of the word (currently N=1 for English).
+
+ +attribute("suffix / suffix_")
+ p A length-N substring from the end of the word (currently N=3 for English).
+
+ +declare_class("Vocab", "data_dir=None, lex_props_getter=None")
+ +sequence
+ +method("__len__")
+ +returns
+ p Number of words in the vocabulary.
+
+ +method("__iter__")
+ +returns
+ p Lexeme
+
+ +maptype
+ +method("__getitem__", "key_int")
+ +params
+ +param("key_int", types.int)
+ p Integer ID
+
+ +returns: p A Lexeme object
+
+ +method("__getitem__", "key_str")
+ +params
+ +param("key_str", types.unicode)
+ p A string in the vocabulary
+
+ +returns("Lexeme")
+
+ +method("__setitem__", "orth_str, props")
+ +params
+ +param("orth_str", types.unicode)
+ p The orth key
+
+ +param("props", types.dict)
+ p A props dictionary
+
+ +returns("None")
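+
+ p
+ | A minimal sketch of the two look-up paths:
+
+ pre.language-python
+ code
+ | lexeme = nlp.vocab[u'circumnavigate']   # unicode key
+ | same_lexeme = nlp.vocab[lexeme.orth]    # integer key
+ | assert lexeme.orth == same_lexeme.orth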
+
+ details
+ summary: h4 Import/Export
+
+ +method("dump", "loc")
+ +params
+ +param("loc", types.unicode)
+ p Path where the vocabulary should be saved
+
+ +method("load_lexemes", "loc")
+ +params
+ +param("loc", types.unicode)
+ p Path to load the lexemes.bin file from
+
+ +method("load_vectors", "loc")
+ +params
+ +param("loc", types.unicode)
+ p Path to load the vectors.bin from
+
+ +declare_class("StringStore")
+ +init
+ Tmp
+
+ +sequence
+ +method("__len__")
+ +returns("int")
+ p Number of strings in the string-store
+
+ +method("__iter__")
+ +returns
+ p Lexeme
+
+ +maptype
+ +method("__getitem__", "key_int")
+ +params
+ +param("key_int")
+ p An integer key
+
+ +returns(types.unicode)
+ p The string that the integer key maps to
+
+ +method("__getitem__", "key_unicode")
+ +params
+ +param("key_unicode")
+ p A key, as a unicode string
+
+ +returns(types.int)
+ p The integer ID of the string.
+
+ +method("__getitem__", "key_utf8_bytes")
+ +params
+ +param("key_utf8_bytes", types.bytes)
+ p A key, as a UTF-8 encoded byte-string
+
+ +returns(types.int)
+ p The integer ID of the string.
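+
+ p
+ | For example (the integer values are model-dependent, so none are
+ | hard-coded here):
+
+ pre.language-python
+ code
+ | string_store = nlp.vocab.strings
+ | the_id = string_store[u'the']            # unicode key returns the integer ID
+ | assert string_store[the_id] == u'the'    # integer key returns the unicode string
+ | assert string_store[b'the'] == the_id    # UTF-8 bytes key returns the integer ID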
+
+ details
+ summary: h4 Import/Export
+
+ +method("dump", "loc")
+ +params
+ +param("loc")
+ p File path to save the strings.txt to.
+
+ +method("load")
+ +params
+ +param("loc")
+ p File path to load the strings.txt from.
+
+ script(src="js/prism.js")
diff --git a/docs/redesign/home.jade b/docs/redesign/home.jade
new file mode 100644
index 000000000..c89d830cd
--- /dev/null
+++ b/docs/redesign/home.jade
@@ -0,0 +1,106 @@
+extends ./outline.jade
+
+// Notes
+//
+// 1. Where to put version notice? Should say something like
+// 2015-08-12: v0.89
+// and be a link
+//
+// Only needs to appear on home page.
+
+
+- var slogan = "Build Tomorrow's Language Technologies"
+- var tag_line = "spaCy – " + slogan
+
+mixin lede
+ - var state_of_the_art = 'state-of-the-art'
+ - var a_minor_miracle = 'a minor miracle'
+ - var great_documentation = 'great documentation'
+
+ p.
+ spaCy is a
+ library for industrial-strength NLP in Python and Cython. It features
+ !{state_of_the_art} speed and accuracy, a concise API, and great documentation.
+ If you're a small company doing NLP, we want spaCy to seem
+ like !{a_minor_miracle}.
+
+mixin overview()
+ p.
+ Overview text
+
+mixin benchmarks()
+ p.
+ Benchmarks
+
+mixin get_started()
+ p.
+ Get Started
+
+
+mixin comparison(name)
+ details
+ summary
+ h4= name
+
+ block
+
+mixin columns(...names)
+ tr
+ each name in names
+ th= name
+
+
+mixin row(...cells)
+ tr
+ each cell in cells
+ td= cell
+
+
+mixin social
+ footer(role="contentinfo")
+ a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter
+
+ div.discuss
+ a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn")
+ | Discuss on Hacker News
+
+ a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit")
+ | Discuss on Reddit
+
+
+mixin Section(title_text, link_name, include_file)
+ a(name=link_name): h3 #{title_text}
+
+ if (link_name == "example-use")
+ include ./usage_examples.jade
+ else if (link_name == "online-demo")
+ include ./online_demo.jade
+ else if (link_name == "comparisons")
+ include ./comparisons.jade
+ else if (link_name == "install")
+ include ./installation.jade
+
+
+block intro_block
+ section(class="intro")
+ +lede
+
+ nav(role="navigation")
+ ul
+ li: a(href="#example-use" class="button") Examples
+ li: a(href="#online-demo" class="button") Demo
+ li: a(href="#comparisons" class="button") Comparisons
+ li: a(href="#install" class="button") Install v0.89
+
+
+block body_block
+ article(class="page landing-page")
+
+ +Section("Usage by Example", "example-use", "./usage_examples.jade")
+
+ +Section("Online Demo", "online-demo", "./online_demo.jade")
+
+ +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade")
+
+ +Section("Install", "install", "./installation.jade")
+
diff --git a/docs/redesign/installation.jade b/docs/redesign/installation.jade
new file mode 100644
index 000000000..05f89dd24
--- /dev/null
+++ b/docs/redesign/installation.jade
@@ -0,0 +1,40 @@
+p With Python 2.7 or Python 3, using Linux or OSX, run:
+
+pre.language-bash: code
+ | $ pip install spacy
+ | $ python -m spacy.en.download
+
+p
+ | The download command fetches about 300 MB of data (the parser model
+ | and word vectors) and installs it within the spacy.en package
+ | directory.
+
+p
+ | If you're stuck using a server with an old version of Python, and you
+ | don't have root access, I've prepared a bootstrap script to help you
+ | compile a local Python install. Run:
+
+pre.language-bash: code
+ | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate
+
+p
+ | The other way to install the package is to clone the GitHub repository,
+ | and build it from source. This installs an additional dependency,
+ | Cython. If you're using Python 2, I also recommend installing fabric
+ | and fabtools – this is how I build the project.
+
+pre.language-bash: code
+ | $ git clone https://github.com/honnibal/spaCy.git
+ | $ cd spaCy
+ | $ virtualenv .env && source .env/bin/activate
+ | $ export PYTHONPATH=`pwd`
+ | $ pip install -r requirements.txt
+ | $ python setup.py build_ext --inplace
+ | $ python -m spacy.en.download
+ | $ pip install pytest
+ | $ py.test tests/
+
+p
+ | Python packaging is awkward at the best of times, and it's particularly
+ | tricky with C extensions built via Cython that require large data files.
+ | So, please report issues as you encounter them.
diff --git a/docs/redesign/online_demo.jade b/docs/redesign/online_demo.jade
new file mode 100644
index 000000000..e69de29bb
diff --git a/docs/redesign/outline.jade b/docs/redesign/outline.jade
new file mode 100644
index 000000000..2389dc71e
--- /dev/null
+++ b/docs/redesign/outline.jade
@@ -0,0 +1,37 @@
+- var slogan = "Build Tomorrow's Language Technologies"
+- var tag_line = "spaCy – " + slogan
+
+
+doctype html
+html(lang="en")
+ head
+ meta(charset="utf-8")
+ title!= tag_line
+ meta(name="description" content="")
+ meta(name="author" content="Matthew Honnibal")
+ link(rel="stylesheet" href="css/style.css")
+
+
+ body(id="home" role="document")
+ header(role="banner")
+ h1(class="logo")!= tag_line
+ div(class="slogan")!= slogan
+
+ nav(role="navigation")
+ ul
+ li: a(href="#") Home
+ li: a(href="#") Docs
+ li: a(href="#") License
+ li: a(href="#") More
+
+ main(id="content" role="main")
+ block intro_block
+
+ block body_block
+
+ footer(role="contentinfo")
+
+ script(src="js/prism.js")
+ script(src="js/details_polyfill.js")
diff --git a/docs/redesign/usage_examples.jade b/docs/redesign/usage_examples.jade
new file mode 100644
index 000000000..d429339d4
--- /dev/null
+++ b/docs/redesign/usage_examples.jade
@@ -0,0 +1,109 @@
+mixin example(name)
+ details
+ summary
+ h4= name
+ block
+
+
++example("Load resources and process text")
+ pre.language-python: code
+ | from __future__ import unicode_literals, print_function
+ | from spacy.en import English
+ | nlp = English()
+ | doc = nlp('Hello, world. Here are two sentences.')
+
++example("Get tokens and sentences")
+ pre.language-python: code
+ | token = doc[0]
+ | sentence = next(doc.sents)
+ | assert token is sentence[0]
+
++example("Use integer IDs for any string")
+ pre.language-python: code
+ | hello_id = nlp.vocab.strings['Hello']
+ | hello_str = nlp.vocab.strings[hello_id]
+ |
+ | assert token.orth == hello_id == 52
+ | assert token.orth_ == hello_str == 'Hello'
+
++example("Get and set string views and flags")
+ pre.language-python: code
+ | assert token.shape_ == 'Xxxxx'
+ | for lexeme in nlp.vocab:
+ | if lexeme.is_alpha:
+ | lexeme.shape_ = 'W'
+ | elif lexeme.is_digit:
+ | lexeme.shape_ = 'D'
+ | elif lexeme.is_punct:
+ | lexeme.shape_ = 'P'
+ | else:
+ | lexeme.shape_ = 'M'
+ | assert token.shape_ == 'W'
+
++example("Export to numpy arrays")
+ pre.language-python: code
+ | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV
+ |
+ | attr_ids = [ORTH, LIKE_URL, IS_OOV]
+ | doc_array = doc.to_array(attr_ids)
+ | assert doc_array.shape == (len(doc), len(attr_ids))
+ | assert doc[0].orth == doc_array[0, 0]
+ | assert doc[1].orth == doc_array[1, 0]
+ | assert doc[0].like_url == doc_array[0, 1]
+ | assert list(doc_array[:, 1]) == [t.like_url for t in doc]
+
++example("Word vectors")
+ pre.language-python: code
+ | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.")
+ |
+ | apples = doc[0]
+ | oranges = doc[2]
+ | boots = doc[6]
+ | hippos = doc[8]
+ |
+ | assert apples.similarity(oranges) > boots.similarity(hippos)
+
+
++example("Part-of-speech tags")
+ pre.language-python: code
+ | doc[0].pos
+ | doc[0].tag
+
++example("Syntactic dependencies")
+ pre.language-python: code
+ | for head in doc:
+ | for child in head.lefts:
+ | assert child.head is head
+ | for child in head.rights:
+ | assert child.head is head
+ | sent = nlp('The four wheels on the bus turned quickly.')
+ | wheels = sent[2]
+ | bus = sent[5]
+ | assert len(list(wheels.lefts)) == 2
+ | assert len(list(wheels.rights)) == 1
+ | assert len(list(wheels.children)) == 3
+ | assert len(list(bus.lefts)) == 1
+ | assert len(list(bus.rights)) == 0
+ | assert len(list(bus.children)) == 1
+ |
+ | assert len(list(wheels.subtree)) == 6
+
++example("Named entities")
+ pre.language-python: code
+ | doc.ents
+ | token.ent_type
+ | token.ent_iob
+
++example("Define custom NER rules")
+ pre.language-python: code
+ | nlp.matcher
+
++example("Calculate inline mark-up on original string")
+ pre.language-python: code
+ | token.string
+ | token.spacy
+ | token.whitespace_
+
++example("Efficient binary serialization")
+ pre.language-python: code
+ |
diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json
index 1aa6b9514..dce2e1f2a 100644
--- a/lang_data/en/gazetteer.json
+++ b/lang_data/en/gazetteer.json
@@ -14,8 +14,8 @@
{"orth": "9/11"}
],
[
- {"lower": "Septmber"},
- {"lower": "Eleven"}
+ {"lower": "septmber"},
+ {"lower": "eleven"}
],
[
{"lower": "september"},