	Merge branch 'gaz' of https://github.com/honnibal/spaCy into gaz

commit 4f765eee79

docs/redesign/docs.jade (new file, 705 lines)
@@ -0,0 +1,705 @@
- var py_docs = '<a class="reference" href="http://docs.python.org/library/'

-
  var types = {
   'unicode': py_docs + 'functions.html#unicode"><em>unicode</em></a>',
   'bool': py_docs + 'functions.html#bool"><em>bool</em></a>',
   'int': py_docs + 'functions.html#int"><em>int</em></a>',
   'bytes': "",
   'slice': "",
   'dict': "",
   'generator': "",
   'Token': "",
   'Vocab': "",
   'Span': "",
   'Doc': ""
  }


mixin declare_class(name)
  details
    summary
      span.declaration
        span.label class
        code #{name}
    block

mixin method(name, parameters)
  details(open=attributes.open)
    summary
      span.declaration
        span.label #{name}
        span.parameters
          | self, #{parameters}
    block


mixin params
  ul
    block


mixin param(name, type, value)
  li
    if type
      <strong>#{name}</strong> (!{type}) –
    else
      <strong>#{name}</strong> –
    block


mixin attribute(name, type, value)
  details(open=attributes.open)
    summary
      span.declaration
        span.label #{name}
    block


mixin returns(name, type, value)
  li
    if type
      <strong>#{name}</strong> (!{type}) –
    else
      <strong>#{name}</strong> –
    block

mixin init
  details
    summary: h4 Init

    block


mixin callable
  details
    summary: h4 Callable

    block


mixin sequence
  details
    summary: h4 Sequence

    block


mixin maptype
  details
    summary: h4 Map

    block


mixin summary
  block

mixin en_example
  pre.language-python
    code
      | from spacy.en import English
      | from spacy._doc_examples import download_war_and_peace
      |
      | unprocessed_unicode = download_war_and_peace()
      |
      | nlp = English()
      | doc = nlp(unprocessed_unicode)


doctype html
html(lang="en")
  head
    meta(charset="utf-8")
    title spaCy – Industrial-strength NLP
    meta(name="description" content="")
    meta(name="author" content="Matthew Honnibal")
    link(rel="stylesheet" href="css/style.css")
    <!--[if lt IE 9]>
    script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js")
    <![endif]-->

  body(id="docs")
    header(role="banner")
      h1.logo spaCy – Industrial-strength NLP
      div.slogan API


    nav(role="navigation")
      ul
        li: a(href="#") Home
        li.active: a(href="#") Docs
        li: a(href="#") License
        li: a(href="#") Blog

    main.docs#content

      article
        +declare_class("English")
          p Load models into a callable object to process English text.

          +summary
            +en_example

          +init
            p
              | Load the resources.  Loading takes 20 seconds, and the instance
              | consumes 2 to 3 gigabytes of memory.

            p
              | Intended use is for one instance to be created per process.
              | You can create more if you're doing something unusual.
            p
              | You may wish to make the instance a global variable or "singleton".
              | We usually instantiate the object in the <code>main()</code>
              | function and pass it around as an explicit argument.
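            p
              | A minimal sketch of that pattern (<code>count_nouns</code> is an
              | illustrative helper, not part of the library):
            pre.language-python: code
              | def count_nouns(nlp, text):
              |     doc = nlp(text)
              |     return sum(1 for t in doc if t.pos_ == u'NOUN')
              |
              | def main():
              |     nlp = English()  # build once per process
              |     n = count_nouns(nlp, u'The cat sat on the mat.')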
|  |             +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true") | ||||||
|  | 
 | ||||||
              +params
                +param("data_dir")
                  | The data directory.  May be <code>None</code>, to disable any data loading
                  | (including the vocabulary).

                +param("Tokenizer")
                  | A class/function that creates the tokenizer.

                +param("Tagger")
                  | A class/function that creates the part-of-speech tagger.

                +param("Parser")
                  | A class/function that creates the dependency parser.

                +param("Entity")
                  | A class/function that creates the named entity recogniser.

                +param("load_vectors")
                  | A boolean value to control whether the word vectors are loaded.

          +callable
            +method("__call__", "text, tag=True, parse=True, entity=True")

              +params
                +param("text", types.unicode)
                  | The text to be processed.  No pre-processing needs to be applied,
                  | and any length of text can be submitted.  Usually you will submit
                  | a whole document. Text may be zero-length. An exception is raised
                  | if byte strings are supplied.

                +param("tag", types.bool)
                  | Whether to apply the part-of-speech tagger. Required for parsing
                  | and entity recognition.

                +param("parse", types.bool)
                  | Whether to apply the syntactic dependency parser.

                +param("entity", types.bool)
                  | Whether to apply the named entity recognizer.

                pre.language-python
                  code
                    | from spacy.en import English
                    | nlp = English()
                    | doc = nlp(u'Some text.') # Applies tagger, parser, entity
                    | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser
                    | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity
                    | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser
                    | doc = nlp(u'') # Zero-length tokens, not an error
                    | # doc = nlp(b'Some text') <-- Error: need unicode
                    | doc = nlp(b'Some text'.decode('utf8')) # Decode to unicode first.


        +declare_class("Doc")
          p A container for accessing the annotations of a processed text, as a sequence of tokens.

          +init
            +method("__init__", "vocab")
              +params
                +param("vocab", types.Vocab)
                  | A vocabulary object

          +sequence
            +method("__getitem__", "i", types.int)
              +returns(types.Token)

            +method("__getitem__", "start_end", types.slice)
              +returns(types.Span)

            +method("__iter__")
              | Iterate over tokens

            +method("__len__")
              | Number of tokens in the document.

          details
            summary: h4 Spans

            +attribute("sents", types.generator)
              | Iterate over sentences in the document.

            +attribute("ents", types.generator)
              | Iterate over named entities in the document.

            +attribute("noun_chunks", types.generator)
              | Iterate over base noun phrases in the document.

          details
            summary: h4 Export/Import

            +method("to_array", "attr_ids")

              | Given a list of M attribute IDs, export the tokens to a numpy ndarray
              | of shape N*M, where N is the length of the document.

              +params
                +param("attr_ids", "list[int]")
                  | A list of attribute ID ints.

              +returns("feat_array")
                | A feature matrix, with one row per word, and one column per attribute
                | indicated in the input attr_ids.

            +method("count_by", "attr_id")
              | Produce a dict of {attribute (int): count (int)} frequencies, keyed
              | by the values of the given attribute ID.

              pre.language-python
                code
                  | >>> from spacy.en import English, attrs
                  | >>> nlp = English()
                  | >>> tokens = nlp(u'apple apple orange banana')
                  | >>> tokens.count_by(attrs.ORTH)
                  | {12800L: 1, 11880L: 2, 7561L: 1}
                  | >>> tokens.to_array([attrs.ORTH])
                  | array([[11880],
                  |         [11880],
                  |         [7561],
                  |         [12800]])

            +method("from_array", "attrs, array")
              | Load from array

            +method("from_bytes")
              | Deserialize, loading from bytes

            +method("read_bytes")
              | classmethod

            //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type")

            //  | Merge a multi-word expression into a single token.  Currently
            //  | experimental; API is likely to change.


|  |         +declare_class("Token") | ||||||
|  |           +init | ||||||
|  |             +method("__init__", "vocab, doc, offset") | ||||||
|  |               +params | ||||||
|  |                 +param("vocab", types.Vocab) | ||||||
|  |                   p A Vocab object | ||||||
|  |    | ||||||
|  |                 +param("doc", types.Doc) | ||||||
|  |                   p The parent sequence | ||||||
|  |    | ||||||
|  |               +param("offset", types.int) | ||||||
|  |                 p The index of the token within the document | ||||||
|  |    | ||||||
|  |           details | ||||||
|  |             summary: h4 String Views | ||||||
|  |    | ||||||
|  |             +attribute("orth / orth_") | ||||||
|  |               | The form of the word with no string normalization or processing, as | ||||||
|  |               | it appears in the string, without trailing whitespace. | ||||||
|  |    | ||||||
|  |             +attribute("lemma / lemma_") | ||||||
|  |               | The "base" of the word, with no inflectional suffixes, e.g. the lemma of | ||||||
|  |               | "developing" is "develop", the lemma of "geese" is "goose", etc.  Note that | ||||||
|  |               | <em>derivational</em> suffixes are not stripped, e.g. the lemma of | ||||||
|  |               | "instutitions" is "institution", not "institute".  Lemmatization is | ||||||
|  |               | performed using the WordNet data, but extended to also cover closed-class | ||||||
|  |               | words such as pronouns.  By default, the WN lemmatizer returns "hi" | ||||||
|  |               | as the lemma of "his". We assign pronouns the lemma -PRON-. | ||||||
|  |    | ||||||
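              p
                | A quick sketch of the behaviour described above:
              pre.language-python: code
                | doc = nlp(u'The geese are developing')
                | assert doc[1].lemma_ == u'goose'
                | assert doc[3].lemma_ == u'develop'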
|  |             +attribute("lower / lower_") | ||||||
|  |               | The form of the word, but forced to lower-case, i.e. | ||||||
|  |               pre.language-python: code lower = word.orth\_.lower() | ||||||
|  |    | ||||||
|  |             //+attribute("norm / norm_") | ||||||
|  |             //  | The form of the word, after language-specific normalizations has been | ||||||
|  |             //  | applied. | ||||||
|  |    | ||||||
|  |             +attribute("shape / shape_") | ||||||
|  |               | A transform of the word's string, to show orthographic features. | ||||||
|  |               | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped | ||||||
|  |               | to d. After these mappings, sequences of 4 or more of the same character | ||||||
|  |               | are truncated to length 4.  Examples: C3Po --> XdXx, favorite --> xxxx, | ||||||
|  |               | :) --> :) | ||||||
|  |    | ||||||
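              p
                | For instance, a sketch using the mapping above:
              pre.language-python: code
                | doc = nlp(u'C3Po is favorite')
                | assert doc[0].shape_ == u'XdXx'
                | assert doc[2].shape_ == u'xxxx'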
|  |             +attribute("prefix / prefix_") | ||||||
|  |               | A length-N substring from the start of the word.  Length may vary by | ||||||
|  |               | language; currently for English n=1, i.e. | ||||||
|  |               pre.language-python: code prefix = word.orth\_[:1] | ||||||
|  |    | ||||||
|  |             +attribute("suffix / suffix_") | ||||||
|  |               | A length-N substring from the end of the word.  Length may vary by | ||||||
|  |               | language; currently for English n=3, i.e. | ||||||
|  |               pre.language-python: code suffix = word.orth\_[-3:] | ||||||
|  |    | ||||||
|  |             //+attribute("lex_id") | ||||||
|  |             //  | lex_id | ||||||
|  |    | ||||||

          details
            summary: h4 Alignment and Output

            +attribute("idx")
              p Start index of the token in the string

            +method("__len__", "")
              p Length of the token's orth string, in unicode code-points.

            +method("__unicode__", "")
              p Same as token.orth_

            +method("__str__", "")
              p Varies between Python 2 and Python 3

            +attribute("string")
              p
                | The form of the word as it appears in the string, <strong>including
                | trailing whitespace</strong>.  This is useful when you need to use
                | linguistic features to add inline mark-up to the string.
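            p
              | A minimal sketch of that use: because each token keeps its trailing
              | whitespace, joining <code>token.string</code> reproduces the input exactly.
            pre.language-python: code
              | text = u'Hello, world.'
              | doc = nlp(text)
              | assert u''.join(token.string for token in doc) == text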
|  |             +method("nbor, i=1") | ||||||
|  |               +params | ||||||
|  |                 +param("i") | ||||||
|  |                   p Offset relative to token | ||||||
|  |      | ||||||
          details
            summary: h4 Distributional Features

            +attribute("repvec")
              p
                | A "word embedding" representation: a dense real-valued vector that supports
                | similarity queries between words.  By default, spaCy currently loads
                | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec
                | model.

            +attribute("cluster")
              p
                | The Brown cluster ID of the word.  These are often useful features for
                | linear models.  If you're using a non-linear model, particularly a
                | neural net or random forest, consider using the real-valued word
                | representation vector, in Token.repvec, instead.

            +attribute("prob")
              p
                | The unigram log-probability of the word, estimated from counts from a
                | large corpus, smoothed using Simple Good Turing estimation.
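            p
              | For instance, a sketch comparing a common and a rare word
              | (log-probabilities are negative; higher means more frequent):
            pre.language-python: code
              | doc = nlp(u'the platypus')
              | assert doc[0].prob > doc[1].prob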
          details
            summary: h4 Syntactic Tags

            +attribute("pos / pos_")
              p
                | A part-of-speech tag, from the Google Universal Tag Set, e.g.
                | <code>NOUN</code>, <code>VERB</code>, <code>ADV</code>.  Constants for
                | the 17 tag values are provided in <code>spacy.parts_of_speech</code>.

            +attribute("tag / tag_")
              p
                | A morphosyntactic tag, e.g. <code>NN</code>, <code>VBZ</code>,
                | <code>DT</code>, etc.  These tags are language/corpus specific, and
                | typically describe part-of-speech and some amount of morphological
                | information.  For instance, in the Penn Treebank tag set, <code>VBZ</code>
                | is assigned to a present-tense singular verb.
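            p
              | A sketch of the distinction (the exact tags depend on the model):
            pre.language-python: code
              | token = nlp(u'She sings')[1]
              | assert token.pos_ == u'VERB'  # coarse-grained, universal tag
              | assert token.tag_ == u'VBZ'   # fine-grained, Penn Treebank tag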
|  |             +attribute("dep / dep_") | ||||||
|  |               p | ||||||
|  |                 | The type of syntactic dependency relation between the word and its | ||||||
|  |                 | syntactic head. | ||||||
|  |      | ||||||
|  |           details | ||||||
|  |             summary: h4 Navigating the Parse Tree | ||||||
|  |            | ||||||
|  |             +attribute("head") | ||||||
|  |               p | ||||||
|  |                 | The Token that is the immediate syntactic head of the word.  If the | ||||||
|  |                 | word is the root of the dependency tree, the same word is returned. | ||||||
|  |      | ||||||
|  |             +attribute("lefts") | ||||||
|  |               p | ||||||
|  |                 | An iterator for the immediate leftward syntactic children of the | ||||||
|  |                 | word. | ||||||
|  |      | ||||||
|  |             +attribute("rights") | ||||||
|  |               p | ||||||
|  |                 | An iterator for the immediate rightward syntactic children of the | ||||||
|  |                 | word. | ||||||
|  |      | ||||||
|  |             +attribute("n_lefts") | ||||||
|  |               p | ||||||
|  |                 | The number of immediate syntactic children preceding the word in  | ||||||
|  |                 | the string. | ||||||
|  |      | ||||||
|  |             +attribute("n_rights") | ||||||
|  |               p | ||||||
|  |                 | The number of immediate syntactic children following the word in | ||||||
|  |                 | the string. | ||||||
|  |      | ||||||
|  |             +attribute("children") | ||||||
|  |               p | ||||||
|  |                 | An iterator that yields from lefts, and then yields from rights. | ||||||
|  |      | ||||||
|  |             +attribute("subtree") | ||||||
|  |               p | ||||||
|  |                 | An iterator for the part of the sentence syntactically governed by | ||||||
|  |                 | the word, including the word itself. | ||||||
|  |      | ||||||
|  |             +attribute("left_edge") | ||||||
|  |               p The leftmost edge of the token's subtree | ||||||
|  |      | ||||||
|  |             +attribute("right_edge") | ||||||
|  |               p The rightmost edge of the token's subtree | ||||||
|  |      | ||||||
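            p
              | A sketch of walking the tree with these attributes, using the
              | same sentence as the usage examples:
            pre.language-python: code
              | doc = nlp(u'The four wheels on the bus turned quickly.')
              | wheels = doc[2]
              | assert wheels.head.orth_ == u'turned'
              | assert [t.orth_ for t in wheels.lefts] == [u'The', u'four']
              | assert len(list(wheels.subtree)) == 6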
          details
            summary: h4 Named Entities

            +attribute("ent_type")
              p If the token is part of an entity, its entity type.

            +attribute("ent_iob")
              p The IOB (inside, outside, begin) entity recognition tag for the token.

          details
            summary: h4 Lexeme Flags

            +method("check_flag", "flag_id")
              +params
                +param("flag_id")
                  | The ID of the boolean flag to check.
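            p
              | A sketch of how a flag relates to the boolean attributes below
              | (this assumes <code>IS_ALPHA</code> is exposed in
              | <code>spacy.en.attrs</code>, as <code>IS_OOV</code> is in the
              | usage examples):
            pre.language-python: code
              | from spacy.en.attrs import IS_ALPHA  # assumed constant
              | token = nlp(u'apple')[0]
              | assert token.check_flag(IS_ALPHA) == token.is_alpha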
|  |             +attribute("is_oov") | ||||||
|  |             +attribute("is_alpha") | ||||||
|  |             +attribute("is_ascii") | ||||||
|  |             +attribute("is_digit") | ||||||
|  |             +attribute("is_lower") | ||||||
|  |             +attribute("is_title") | ||||||
|  |             +attribute("is_punct") | ||||||
|  |             +attribute("is_space") | ||||||
|  |             +attribute("like_url") | ||||||
|  |             +attribute("like_num") | ||||||
|  |             +attribute("like_email") | ||||||
|  |      | ||||||
|  |             //+attribute("conjuncts") | ||||||
|  |             //  | Conjuncts | ||||||
|  |      | ||||||
|  |         +declare_class("Span") | ||||||
|  |           +init | ||||||
|  |             +method("__init__") | ||||||
|  |               Temp | ||||||
|  |    | ||||||
|  |             <code>span = doc[0:4]</code> | ||||||
|  |    | ||||||
|  |           +sequence | ||||||
|  |             +method("__getitem__") | ||||||
|  |               p Get item | ||||||
|  |    | ||||||
|  |             +method("__iter__") | ||||||
|  |               p Iter | ||||||
|  |                  | ||||||
|  |             +method("__len__") | ||||||
|  |               p Len | ||||||
|  |    | ||||||
|  |           details | ||||||
|  |             summary: h4 Parse | ||||||
|  |    | ||||||
|  |             +attribute("root") | ||||||
|  |               p Syntactic head | ||||||
|  |    | ||||||
|  |             +attribute("lefts") | ||||||
|  |               p Tokens that are: | ||||||
|  |               ol | ||||||
|  |                 li To the left of the span; | ||||||
|  |                 li Syntactic children of words within the span | ||||||
|  |    | ||||||
|  |               p i.e. | ||||||
|  |    | ||||||
|  |               pre.language-python | ||||||
|  |                 code | ||||||
|  |                   | lefts = [span.doc[i] for i in range(0, span.start) | ||||||
|  |                   |          if span.doc[i].head in span] | ||||||
|  |    | ||||||
|  |             +attribute("rights") | ||||||
|  |               p Tokens that are: | ||||||
|  |                 ol  | ||||||
|  |                   li To the right of the span; | ||||||
|  |                   li Syntactic children of words within the span | ||||||
|  |               p i.e. | ||||||
|  |               pre.language-python | ||||||
|  |                 code | ||||||
|  |                   | rights = [span.doc[i] for i in range(span.end, len(span.doc)) | ||||||
|  |                   |           if span.doc[i].head in span] | ||||||
|  |    | ||||||
|  |    | ||||||
|  |             +attribute("subtree") | ||||||
|  |               p String | ||||||
|  |    | ||||||
|  |           details | ||||||
|  |             summary: h4 String Views | ||||||
|  |    | ||||||
|  |             +attribute("string") | ||||||
|  |               p String | ||||||
|  |      | ||||||
|  |             +attribute("lemma / lemma_") | ||||||
|  |               p String | ||||||
|  |    | ||||||
|  |             +attribute("label / label_") | ||||||
|  |               p String | ||||||
|  |    | ||||||
|  |         +declare_class("Lexeme") | ||||||
|  |           p | ||||||
|  |             | The Lexeme object represents a lexical type, stored in the vocabulary | ||||||
|  |             | – as opposed to a token, occurring in a document. | ||||||
|  |           p | ||||||
|  |             | Lexemes store various features, so that these features can be computed | ||||||
|  |             | once per type, rather than once per token. As job sizes grow, this | ||||||
|  |             | can amount to a substantial efficiency improvement. | ||||||
|  |    | ||||||
|  |           p | ||||||
|  |             | All Lexeme attributes are therefore context independent, as a single | ||||||
|  |             | lexeme is reused for all usages of that word. Lexemes are keyed by | ||||||
|  |             | the “orth” attribute. | ||||||
|  |    | ||||||
|  |           p | ||||||
|  |             All Lexeme attributes are accessible directly on the Token object. | ||||||
|  |    | ||||||
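          p
            | A short sketch of the type/occurrence distinction:
          pre.language-python: code
            | doc = nlp(u'An apple a day. An apple a week.')
            | # Two Token occurrences of "apple", one Lexeme in the vocabulary
            | apple_lexeme = nlp.vocab[u'apple']
            | assert doc[1].orth == apple_lexeme.orth
            | assert doc[6].orth == apple_lexeme.orth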
          +init
            +method("__init__")
              p Init

            details
              summary: h4 String Features

                +attribute("orth / orth_")
                  p
                    | The form of the word with no string normalization or processing,
                    | as it appears in the string, without trailing whitespace.

                +attribute("lower / lower_")
                  p The lower-cased form of the word.

                +attribute("norm / norm_")
                  p The form of the word, after language-specific normalizations have been applied.

                +attribute("shape / shape_")
                  p The orthographic transform of the word's string, as described for Token above.

                +attribute("prefix / prefix_")
                  p A length-N substring from the start of the word (currently n=1 for English).

                +attribute("suffix / suffix_")
                  p A length-N substring from the end of the word (currently n=3 for English).

        +declare_class("Vocab", "data_dir=None, lex_props_getter=None")
          +sequence
            +method("__len__")
              +returns
                p Number of words in the vocabulary.

            +method("__iter__")
              +returns
                p Lexeme

          +maptype
            +method("__getitem__", "key_int")
              +params
                +param("key_int")
                  p Integer ID

              +returns: p A Lexeme object

            +method("__getitem__", "key_str")
              +params
                +param("key_str", types.unicode)
                  p A string in the vocabulary

              +returns("Lexeme")

            +method("__setitem__", "orth_str, props")
              +params
                +param("orth_str", types.unicode)
                  p The orth key

                +param("props", types.dict)
                  p A props dictionary

              +returns("None")
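          p
            | A sketch of looking the same word up by string and by integer ID:
          pre.language-python: code
            | lexeme = nlp.vocab[u'apple']   # unicode key
            | same = nlp.vocab[lexeme.orth]  # integer key
            | assert lexeme.orth_ == same.orth_ == u'apple'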
          details
            summary: h4 Import/Export

            +method("dump", "loc")
              +params
                +param("loc", types.unicode)
                  p Path where the vocabulary should be saved

            +method("load_lexemes", "loc")
              +params
                +param("loc", types.unicode)
                  p Path to load the lexemes.bin file from

            +method("load_vectors", "loc")
              +params
                +param("loc", types.unicode)
                  p Path to load the vectors.bin from
|  |         +declare_class("StringStore") | ||||||
|  |           +init | ||||||
|  |             Tmp | ||||||
|  |    | ||||||
|  |           +sequence | ||||||
|  |             +method("__len__") | ||||||
|  |               +returns("int") | ||||||
|  |                 p Number of strings in the string-store | ||||||
|  |    | ||||||
|  |             +method("__iter__") | ||||||
|  |               +returns | ||||||
|  |                 p Lexeme | ||||||
|  |    | ||||||
|  |           +maptype | ||||||
|  |             +method("__getitem__", "key_int") | ||||||
|  |               +params | ||||||
|  |                 +param("key_int") | ||||||
|  |                   p An integer key | ||||||
|  |      | ||||||
|  |               +returns(types.unicode) | ||||||
|  |                 p The string that the integer key maps to | ||||||
|  |      | ||||||
|  |             +method("__getitem__", "key_unicode") | ||||||
|  |               +params | ||||||
|  |                 +param("key_unicode") | ||||||
|  |                   p A key, as a unicode string | ||||||
|  |      | ||||||
|  |               +returns(types.int) | ||||||
|  |                 p The integer ID of the string. | ||||||
|  |      | ||||||
|  |             +method("__getitem__", "key_utf8_bytes") | ||||||
|  |               +params | ||||||
|  |                 +param("key_utf8_bytes", types.bytes) | ||||||
|  |                   p p A key, as a UTF-8 encoded byte-string | ||||||
|  |      | ||||||
|  |               +returns(types.int) | ||||||
|  |                 p The integer ID of the string. | ||||||
|  |    | ||||||
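          p
            | A sketch of the round-trip between strings and integer IDs,
            | via the store at <code>nlp.vocab.strings</code>:
          pre.language-python: code
            | hello_id = nlp.vocab.strings[u'Hello']
            | assert nlp.vocab.strings[hello_id] == u'Hello'
            | assert nlp.vocab.strings[b'Hello'] == hello_id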
          details
            summary: h4 Import/Export

            +method("dump", "loc")
              +params
                +param("loc")
                  p File path to save the strings.txt to.

            +method("load")
              +params
                +param("loc")
                  p File path to load the strings.txt from.

    script(src="js/prism.js")

docs/redesign/home.jade (new file, 106 lines)

@@ -0,0 +1,106 @@
extends ./outline.jade

// Notes
//
// 1. Where to put version notice? Should say something like
//   2015-08-12: v0.89
//   and be a link
//
//   Only needs to appear on home page.


- var slogan = "Build Tomorrow's Language Technologies"
- var tag_line = "spaCy – " + slogan

mixin lede
  - var state_of_the_art = '<a href="#">state-of-the-art</a>'
  - var a_minor_miracle = '<a href="">a minor miracle</a>'
  - var great_documentation = '<a href="">great documentation</a>'

  p.
    <a href="https://github.com/honnibal/spaCy"><strong>spaCy</strong></a> is a
    library for industrial-strength NLP in Python and Cython.  It features
    !{state_of_the_art} speed and accuracy, a concise API, and !{great_documentation}.
    If you're a small company doing NLP, we want <strong>spaCy</strong> to seem
    like !{a_minor_miracle}.

mixin overview()
  p.
    Overview text

mixin benchmarks()
  p.
    Benchmarks

mixin get_started()
  p.
    Get Started


mixin comparison(name)
  details
    summary
      h4= name

    block

mixin columns(...names)
  tr
    each name in names
      th= name


mixin row(...cells)
  tr
    each cell in cells
      td= cell


mixin social
  footer(role="contentinfo")
    a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter

    div.discuss
      a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn")
        | Discuss on Hacker News

      a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit")
        | Discuss on Reddit


mixin Section(title_text, link_name, include_file)
  a(name=link_name): h3 #{title_text}

  if (link_name == "example-use")
    include ./usage_examples.jade
  else if (link_name == "online-demo")
    include ./online_demo.jade
  else if (link_name == "comparisons")
    include ./comparisons.jade
  else if (link_name == "install")
    include ./installation.jade


block intro_block
  section(class="intro")
    +lede

    nav(role="navigation")
      ul
        li: a(href="#example-use" class="button") Examples
        li: a(href="#online-demo" class="button") Demo
        li: a(href="#comparisons" class="button") Comparisons
        li: a(href="#install" class="button") Install v0.89


block body_block
  article(class="page landing-page")

    +Section("Usage by Example", "example-use", "./usage_examples.jade")

    +Section("Online Demo", "online-demo", "./online_demo.jade")

    +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade")

    +Section("Install", "install", "./installation.jade")


docs/redesign/installation.jade (new file, 40 lines)

@@ -0,0 +1,40 @@
p With Python 2.7 or Python 3, using Linux or OSX, run:

pre.language-bash: code
  | $ pip install spacy
  | $ python -m spacy.en.download

p
  | The download command fetches and installs about 300 MB of data, for
  | the parser model and word vectors, which it installs within the spacy.en
  | package directory.

p
  | If you're stuck using a server with an old version of Python, and you
  | don't have root access, I've prepared a bootstrap script to help you
  | compile a local Python install.  Run:

pre.language-bash: code
  | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate

p
  | The other way to install the package is to clone the github repository,
  | and build it from source.  This installs an additional dependency,
  | Cython.  If you're using Python 2, I also recommend installing fabric
  | and fabtools – this is how I build the project.

pre.language-bash: code
  | $ git clone https://github.com/honnibal/spaCy.git
  | $ cd spaCy
  | $ virtualenv .env && source .env/bin/activate
  | $ export PYTHONPATH=`pwd`
  | $ pip install -r requirements.txt
  | $ python setup.py build_ext --inplace
  | $ python -m spacy.en.download
  | $ pip install pytest
  | $ py.test tests/

p
  | Python packaging is awkward at the best of times, and it's particularly tricky
  | with C extensions, built via Cython, requiring large data files.  So,
  | please report issues as you encounter them.

docs/redesign/online_demo.jade (new file, 0 lines, empty)

docs/redesign/outline.jade (new file, 37 lines)

@@ -0,0 +1,37 @@
- var slogan = "Build Tomorrow's Language Technologies"
- var tag_line = "spaCy – " + slogan


doctype html
html(lang="en")
  head
    meta(charset="utf-8")
    title!= tag_line
    meta(name="description" content="")
    meta(name="author" content="Matthew Honnibal")
    link(rel="stylesheet" href="css/style.css")
    <!--[if lt IE 9]>
    script(src="http://html5shiv.googlecode.com/svn/trunk/html5.js")
    <![endif]-->

  body(id="home" role="document")
    header(role="banner")
      h1(class="logo")!= tag_line
      div(class="slogan")!= slogan

    nav(role="navigation")
      ul
        li: a(href="#") Home
        li: a(href="#") Docs
        li: a(href="#") License
        li: a(href="#") More

    main(id="content" role="main")
      block intro_block

      block body_block

  footer(role="contentinfo")

  script(src="js/prism.js")
  script(src="js/details_polyfill.js")

docs/redesign/usage_examples.jade (new file, 109 lines)

@@ -0,0 +1,109 @@
mixin example(name)
  details
    summary
      h4= name
    block


+example("Load resources and process text")
  pre.language-python: code
    | from __future__ import unicode_literals, print_function
    | from spacy.en import English
    | nlp = English()
    | doc = nlp('Hello, world. Here are two sentences.')

|  | +example("Get tokens and sentences") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | token = doc[0] | ||||||
|  |     | sentence = doc.sents[0] | ||||||
|  |     | assert token[0] is sentence[0] | ||||||
|  | 
 | ||||||
|  | +example("Use integer IDs for any string") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | hello_id = nlp.vocab.strings['Hello'] | ||||||
|  |     | hello_str = nlp.vocab.strings[hello_id] | ||||||
|  |     |  | ||||||
|  |     | assert token.orth == hello_id == 52 | ||||||
|  |     | assert token.orth_ == hello_str == 'Hello' | ||||||
|  | 
 | ||||||
|  | +example("Get and set string views and flags") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | assert token.shape_ == 'Xxxx' | ||||||
|  |     | for lexeme in nlp.vocab: | ||||||
|  |     |     if lexeme.is_alpha: | ||||||
|  |     |         lexeme.shape_ = 'W' | ||||||
|  |     |     elif lexeme.is_digit: | ||||||
|  |     |         lexeme.shape_ = 'D' | ||||||
|  |     |     elif lexeme.is_punct: | ||||||
|  |     |         lexeme.shape_ = 'P' | ||||||
|  |     |     else: | ||||||
|  |     |         lexeme.shape_ = 'M' | ||||||
|  |     | assert token.shape_ == 'W' | ||||||
|  | 
 | ||||||
|  | +example("Export to numpy arrays") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV | ||||||
|  |     |  | ||||||
|  |     | attr_ids = [ORTH, LIKE_URL, IS_OOV] | ||||||
|  |     | doc_array = doc.to_array(attr_ids) | ||||||
|  |     | assert doc_array.shape == (len(doc), len(attrs) | ||||||
|  |     | assert doc[0].orth == doc_array[0, 0] | ||||||
|  |     | assert doc[1].orth == doc_array[1, 0] | ||||||
|  |     | assert doc[0].like_url == doc_array[0, 1] | ||||||
|  |     | assert doc_array[, 1] == [t.like_url for t in doc] | ||||||
|  | 
 | ||||||
|  | +example("Word vectors") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") | ||||||
|  |     |  | ||||||
|  |     | apples = doc[0] | ||||||
|  |     | oranges = doc[1] | ||||||
|  |     | boots = doc[6] | ||||||
|  |     | hippos = doc[8] | ||||||
|  |     |  | ||||||
|  |     | assert apples.similarity(oranges) > boots.similarity(hippos) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | +example("Part-of-speech tags") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | doc[0].pos | ||||||
|  |     | doc[0].tag | ||||||
|  | 
 | ||||||
|  | +example("Syntactic dependencies") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | for head in tokens: | ||||||
|  |     |     for child in head.lefts: | ||||||
|  |     |         assert child.head is head | ||||||
|  |     |     for child in head.rights: | ||||||
|  |     |         assert child.head is head | ||||||
|  |     | sent = nlp('The four wheels on the bus turned quickly.') | ||||||
|  |     | wheels = sent[2] | ||||||
|  |     | bus = sent[5] | ||||||
|  |     | assert len(list(wheels.lefts)) == 2 | ||||||
|  |     | assert len(list(wheels.rights)) == 1 | ||||||
|  |     | assert len(list(wheels.children)) == 3 | ||||||
|  |     | assert len(list(bus.lefts)) == 1 | ||||||
|  |     | assert len(list(bus.rights)) == 0 | ||||||
|  |     | assert len(list(bus.children)) == 1 | ||||||
|  |     |  | ||||||
|  |     | assert len(list(wheels.subtree)) == 6  | ||||||
|  | 
 | ||||||
|  | +example("Named entities") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | doc.ents | ||||||
|  |     | token.ent_type | ||||||
|  |     | token.ent_iob | ||||||
|  | 
 | ||||||
|  | +example("Define custom NER rules") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | nlp.matcher | ||||||
|  | 
 | ||||||
|  | +example("Calculate inline mark-up on original string") | ||||||
|  |   pre.language-python: code | ||||||
|  |     | token.string | ||||||
|  |     | token.spacy | ||||||
|  |     | token.whitespace_ | ||||||
|  | 
 | ||||||
|  | +example("Efficient binary serialization") | ||||||
|  |   pre.language-python: code | ||||||
|  |     |  | ||||||
@@ -14,8 +14,8 @@
 				{"orth": "9/11"}
 			],
 			[
-				{"lower": "Septmber"},
-				{"lower": "Eleven"}
+				{"lower": "septmber"},
+				{"lower": "eleven"}
 			],
 			[
 				{"lower": "september"},