Merge remote-tracking branch 'origin/develop' into feature/better-parser

2025-12-03 00:06:02 +03:00 · 2017-10-27 12:34:48 +00:00 · 2017-10-27 12:34:48 +00:00 · 531142a933
commit 531142a933
parent 19a2b9bf27 52f1bf2729
78 changed files with 1655 additions and 3278 deletions
--- a/README.rst
+++ b/README.rst
@ -42,7 +42,7 @@ integration. It's commercial open-source software, released under the MIT licens
 ===================  ===
 `spaCy 101`_         New to spaCy? Here's everything you need to know!
 `Usage Guides`_      How to use spaCy and its features.
-`New in v2.0`_       New features, backwards incompatibilitiies and migration guide.
+`New in v2.0`_       New features, backwards incompatibilities and migration guide.
 `API Reference`_     The detailed reference for spaCy's API.
 `Models`_            Download statistical language models for spaCy.
 `Resources`_         Libraries, extensions, demos, books and courses.
--- a/examples/README.md
+++ b/examples/README.md
@ -2,20 +2,18 @@
 # spaCy examples
-The examples are Python scripts with well-behaved command line interfaces. For a full list of spaCy tutorials and code snippets, see the [documentation](https://spacy.io/docs/usage/tutorials).
+The examples are Python scripts with well-behaved command line interfaces. For
 more detailed usage guides, see the [documentation](https://alpha.spacy.io/usage/).
-## How to run an example
+To see the available arguments, you can use the `--help` or `-h` flag:
 For example, to run the [`nn_text_class.py`](nn_text_class.py) script, do:
 ```bash
-$ python examples/nn_text_class.py
+$ python examples/training/train_ner.py --help
 usage: nn_text_class.py [-h] [-d 3] [-H 300] [-i 5] [-w 40000] [-b 24]
                        [-r 0.3] [-p 1e-05] [-e 0.005]
                        data_dir
 nn_text_class.py: error: too few arguments
 ```
-You can print detailed help with the `-h` argument.
+While we try to keep the examples up to date, they are not currently exercised
-
+by the test suite, as some of them require significant data downloads or take
-While we try to keep the examples up to date, they are not currently exercised by the test suite, as some of them require significant data downloads or take time to train. If you find that an example is no longer running, [please tell us](https://github.com/explosion/spaCy/issues)! We know there's nothing worse than trying to figure out what you're doing wrong, and it turns out your code was never the problem.
+time to train. If you find that an example is no longer running,
 [please tell us](https://github.com/explosion/spaCy/issues)! We know there's
 nothing worse than trying to figure out what you're doing wrong, and it turns
 out your code was never the problem.
--- a/examples/_handler.py
+++ b/examples/_handler.py
@ -1,37 +0,0 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 from math import sqrt
 from numpy import dot
 from numpy.linalg import norm
 def handle_tweet(spacy, tweet_data, query):
    """Run match_tweet on the 'text' field of one tweet payload.

    Twython returns either bytes or unicode depending on the tweet, so a
    TypeError on the first attempt triggers a retry with the UTF-8 decoded
    text.
    """
    raw_text = tweet_data.get('text', u'')
    try:
        match_tweet(spacy, raw_text, query)
    except TypeError:
        decoded_text = raw_text.decode('utf8')
        match_tweet(spacy, decoded_text, query)
 def match_tweet(spacy, text, query):
    """Print `text` when its words lean towards the "accept" terms.

    Each alphabetic token other than `query` is compared (cosine similarity,
    negatives clipped to 0) against three accept words and three reject
    words. The tweet is printed when the accept share of the total
    similarity is at least 0.5.

    spacy: callable pipeline that also exposes `.vocab[word].repvec`.
    text: the tweet text to analyse.
    query: lower-cased search term to exclude from the comparison.
    """
    def get_vector(word):
        return spacy.vocab[word].repvec
    tweet = [w.repvec for w in spacy(text) if w.is_alpha and w.lower_ != query]
    if not tweet:
        return
    # Real lists, not map(): on Python 3 a map iterator would be exhausted
    # after the first tweet word in the nested loops below.
    accept = [get_vector(word) for word in 'child classroom teach'.split()]
    reject = [get_vector(word) for word in 'mouth hands giveaway'.split()]
    y = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in accept)
    n = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in reject)
    total = y + n
    # A tweet with no positive similarity at all is rejected rather than
    # dividing by zero. The former `or True` made the test always pass.
    if total > 0 and (y / total) >= 0.5:
        print(text)
 def cos(v1, v2):
    """Return the cosine of the angle between vectors v1 and v2."""
    inner_product = dot(v1, v2)
    magnitude = norm(v1) * norm(v2)
    return inner_product / magnitude
--- a/examples/get_parse_subregions.py
+++ b/examples/get_parse_subregions.py
@ -1,59 +0,0 @@
 """Issue #252
 Question:
 In the documents and tutorials the main thing I haven't found is examples on how to break sentences down into small sub thoughts/chunks. The noun_chunks is handy, but having examples on using the token.head to find small (near-complete) sentence chunks would be neat.
 Let's take the example sentence on https://displacy.spacy.io/displacy/index.html
 displaCy uses CSS and JavaScript to show you how computers understand language
 This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:
 [displaCy] uses CSS and Javascript [to + show]
 &
 show you how computers understand [language]
 I'm assuming that we can use the token.head to build these groups. In one of your examples you had the following function.
 def dependency_labels_to_root(token):
    '''Walk up the syntactic tree, collecting the arc labels.'''
    dep_labels = []
    while token.head is not token:
        dep_labels.append(token.dep)
        token = token.head
    return dep_labels
 """
 from __future__ import print_function, unicode_literals
 # Answer:
 # The easiest way is to find the head of the subtree you want, and then use the
 # `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` is the
 # one that does what you're asking for most directly:
 from spacy.en import English
 # Load the English pipeline once; both demonstrations below reuse it.
 nlp = English()
 doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
 # First approach: print the text of the subtree under every xcomp/ccomp word.
 for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        # text_with_ws keeps each token's trailing whitespace, so a plain
        # join reproduces the original spacing.
        print(''.join(w.text_with_ws for w in word.subtree))
 # It'd probably be better for `word.subtree` to return a `Span` object instead
 # of a generator over the tokens. If you want the `Span` you can get it via the
 # `.right_edge` and `.left_edge` properties. The `Span` object is nice because
 # you can easily get a vector, merge it, etc.
 doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
 # Second approach: slice the doc from the subtree's left edge to its right
 # edge (inclusive, hence the + 1) to obtain a Span.
 for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
        print(subtree_span.text, '|', subtree_span.root.text)
        print(subtree_span.similarity(doc))
        print(subtree_span.similarity(subtree_span.root))
 # You might also want to select a head, and then select a start and end position by
 # walking along its children. You could then take the `.left_edge` and `.right_edge`
 # of those tokens, and use it to calculate a span.
--- a/examples/information_extraction.py
+++ b/examples/information_extraction.py
@ -1,59 +0,0 @@
 import plac
 from spacy.en import English
 from spacy.parts_of_speech import NOUN
 from spacy.parts_of_speech import ADP as PREP
 def _span_to_tuple(span):
    """Snapshot a span as (start_char, end_char, root_tag, text, label).

    The character offsets come from the first token's `idx` and from the
    last token's `idx` plus its length.
    """
    first_token = span[0]
    last_token = span[-1]
    return (
        first_token.idx,
        last_token.idx + len(last_token),
        span.root.tag_,
        span.text,
        span.label_,
    )
 def merge_spans(spans, doc):
    """Merge every span in `spans` into a single token of `doc`.

    A Span is a view into the doc, and merging one span invalidates the
    others. All merge arguments are therefore collected up front, before
    the first merge happens.
    """
    pending = []
    for span in spans:
        pending.append(_span_to_tuple(span))
    for start, end, tag, text, label in pending:
        doc.merge(start, end, tag, text, label)
 def extract_currency_relations(doc):
    """Return (phrase, money) token pairs found in `doc`.

    Entities and noun chunks are merged into single tokens first. A MONEY
    token that is an attr/dobj is paired with the first nsubj to the left
    of its head; a MONEY token that is the object of a preposition is
    paired with the word the preposition attaches to.
    """
    merge_spans(doc.ents, doc)
    merge_spans(doc.noun_chunks, doc)
    relations = []
    for token in doc:
        if token.ent_type_ != 'MONEY':
            continue
        if token.dep_ in ('attr', 'dobj'):
            subjects = [w for w in token.head.lefts if w.dep_ == 'nsubj']
            if subjects:
                relations.append((subjects[0], token))
        elif token.dep_ == 'pobj' and token.head.dep_ == 'prep':
            relations.append((token.head.head, token))
    return relations
 def main():
    """Extract and print currency relations for two sample sentences."""
    nlp = English()
    samples = (
        u'Net income was $9.4 million compared to the prior year of $2.7 million.',
        u'Revenue exceeded twelve billion dollars, with a loss of $1b.',
    )
    for sample in samples:
        parsed = nlp(sample)
        for r1, r2 in extract_currency_relations(parsed):
            print(r1.text, r2.ent_type_, r2.text)
 if __name__ == '__main__':
    plac.call(main)
--- a/examples/information_extraction/entity_relations.py
+++ b/examples/information_extraction/entity_relations.py
@ -0,0 +1,62 @@
 #!/usr/bin/env python
 # coding: utf8
 """
 A simple example of extracting relations between phrases and entities using
 spaCy's named entity recognizer and the dependency parse. Here, we extract
 money and currency values (entities labelled as MONEY) and then check the
 dependency tree to find the noun phrase they are referring to – for example:
 $9.4 million --> Net income.
 Last updated for: spaCy 2.0.0a18
 """
 from __future__ import unicode_literals, print_function
 import plac
 import spacy
 TEXTS = [
    'Net income was $9.4 million compared to the prior year of $2.7 million.',
    'Revenue exceeded twelve billion dollars, with a loss of $1b.',
 ]
@plac.annotations(
    model=("Model to load (needs parser and NER)", "positional", None, str))
 def main(model='en_core_web_sm'):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
    print("Processing %d texts" % len(TEXTS))
    for text in TEXTS:
        doc = nlp(text)
        relations = extract_currency_relations(doc)
        for r1, r2 in relations:
            print('{:<10}\t{}\t{}'.format(r1.text, r2.ent_type_, r2.text))
 def extract_currency_relations(doc):
    """Return (phrase, money) token pairs found in `doc`.

    Entities and noun chunks are merged into single tokens first. A MONEY
    token that is an attr/dobj is paired with the first nsubj to the left
    of its head; a MONEY token that is the object of a preposition is
    paired with the word the preposition attaches to.
    """
    # merge entities and noun chunks into one token
    # Plain list concatenation instead of `[*a, *b]`: star-unpacking inside
    # a list display is a SyntaxError before Python 3.5, and this script is
    # otherwise written to run on Python 2 as well.
    spans = list(doc.ents) + list(doc.noun_chunks)
    for span in spans:
        span.merge()
    relations = []
    for money in filter(lambda w: w.ent_type_ == 'MONEY', doc):
        if money.dep_ in ('attr', 'dobj'):
            subject = [w for w in money.head.lefts if w.dep_ == 'nsubj']
            if subject:
                subject = subject[0]
                relations.append((subject, money))
        elif money.dep_ == 'pobj' and money.head.dep_ == 'prep':
            relations.append((money.head.head, money))
    return relations
 if __name__ == '__main__':
    plac.call(main)
    # Expected output:
    # Net income      MONEY   $9.4 million
    # the prior year  MONEY   $2.7 million
    # Revenue         MONEY   twelve billion dollars
    # a loss          MONEY   1b
--- a/examples/information_extraction/parse_subtrees.py
+++ b/examples/information_extraction/parse_subtrees.py
@ -0,0 +1,65 @@
 #!/usr/bin/env python
 # coding: utf8
 """
 This example shows how to navigate the parse tree including subtrees attached
 to a word.
 Based on issue #252:
 "In the documents and tutorials the main thing I haven't found is
 examples on how to break sentences down into small sub thoughts/chunks. The
 noun_chunks is handy, but having examples on using the token.head to find small
 (near-complete) sentence chunks would be neat. Lets take the example sentence:
 "displaCy uses CSS and JavaScript to show you how computers understand language"
 This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:
 [displaCy] uses CSS and Javascript [to + show]
 show you how computers understand [language]
 I'm assuming that we can use the token.head to build these groups."
 Last updated for: spaCy 2.0.0a18
 """
 from __future__ import unicode_literals, print_function
 import plac
 import spacy
@plac.annotations(
    model=("Model to load", "positional", None, str))
 def main(model='en_core_web_sm'):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
    doc = nlp("displaCy uses CSS and JavaScript to show you how computers "
               "understand language")
    # The easiest way is to find the head of the subtree you want, and then use
    # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree`
    # is the one that does what you're asking for most directly:
    for word in doc:
        if word.dep_ in ('xcomp', 'ccomp'):
            print(''.join(w.text_with_ws for w in word.subtree))
    # It'd probably be better for `word.subtree` to return a `Span` object
    # instead of a generator over the tokens. If you want the `Span` you can
    # get it via the `.right_edge` and `.left_edge` properties. The `Span`
    # object is nice because you can easily get a vector, merge it, etc.
    for word in doc:
        if word.dep_ in ('xcomp', 'ccomp'):
            subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
            print(subtree_span.text, '|', subtree_span.root.text)
    # You might also want to select a head, and then select a start and end
    # position by walking along its children. You could then take the
    # `.left_edge` and `.right_edge` of those tokens, and use it to calculate
    # a span.
 if __name__ == '__main__':
    plac.call(main)
    # Expected output:
    # to show you how computers understand language
    # how computers understand language
    # to show you how computers understand language | show
    # how computers understand language | understand
--- a/examples/information_extraction/phrase_matcher.py
+++ b/examples/information_extraction/phrase_matcher.py
@ -4,22 +4,24 @@ The idea is to associate each word in the vocabulary with a tag, noting whether
 they begin, end, or are inside at least one pattern. An additional tag is used
 for single-word patterns. Complete patterns are also stored in a hash set.
-When we process a document, we look up the words in the vocabulary, to associate
+When we process a document, we look up the words in the vocabulary, to
-the words with the tags.  We then search for tag-sequences that correspond to
+associate the words with the tags.  We then search for tag-sequences that
-valid candidates. Finally, we look up the candidates in the hash set.
+correspond to valid candidates. Finally, we look up the candidates in the hash
 set.
-For instance, to search for the phrases "Barack Hussein Obama" and "Hilary Clinton", we
+For instance, to search for the phrases "Barack Hussein Obama" and "Hilary
-would associate "Barack" and "Hilary" with the B tag, Hussein with the I tag,
+Clinton", we would associate "Barack" and "Hilary" with the B tag, Hussein with
-and Obama and Clinton with the L tag.
+the I tag, and Obama and Clinton with the L tag.
 The document "Barack Clinton and Hilary Clinton" would have the tag sequence
-[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second candidate
+[{B}, {L}, {}, {B}, {L}], so we'd get two matches. However, only the second
-is in the phrase dictionary, so only one is returned as a match.
+candidate is in the phrase dictionary, so only one is returned as a match.
-The algorithm is O(n) at run-time for document of length n because we're only ever
+The algorithm is O(n) at run-time for document of length n because we're only
-matching over the tag patterns. So no matter how many phrases we're looking for,
+ever matching over the tag patterns. So no matter how many phrases we're
-our pattern set stays very small (exact size depends on the maximum length we're
+looking for, our pattern set stays very small (exact size depends on the
-looking for, as the query language currently has no quantifiers)
+maximum length we're looking for, as the query language currently has no
 quantifiers).
 The example expects a .bz2 file from the Reddit corpus, and a patterns file,
 formatted in jsonl as a sequence of entries like this:
@ -32,11 +34,9 @@ formatted in jsonl as a sequence of entries like this:
 {"text":"Argentina"}
 """
 from __future__ import print_function, unicode_literals, division
 from bz2 import BZ2File
 import time
 import math
 import codecs
 import plac
 import ujson
@ -44,6 +44,24 @@ from spacy.matcher import PhraseMatcher
 import spacy
@plac.annotations(
    patterns_loc=("Path to gazetteer", "positional", None, str),
    text_loc=("Path to Reddit corpus file", "positional", None, str),
    n=("Number of texts to read", "option", "n", int),
    lang=("Language class to initialise", "option", "l", str))
 def main(patterns_loc, text_loc, n=10000, lang='en'):
    nlp = spacy.blank('en')
    nlp.vocab.lex_attr_getters = {}
    phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
    count = 0
    t1 = time.time()
    for ent_id, text in get_matches(nlp.tokenizer, phrases,
                                    read_text(text_loc, n=n)):
        count += 1
    t2 = time.time()
    print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
 def read_gazetteer(tokenizer, loc, n=-1):
    for i, line in enumerate(open(loc)):
        data = ujson.loads(line.strip())
@ -75,18 +93,6 @@ def get_matches(tokenizer, phrases, texts, max_length=6):
            yield (ent_id, doc[start:end].text)
 def main(patterns_loc, text_loc, n=10000):
    """Time phrase matching over the first `n` texts and print a summary."""
    nlp = spacy.blank('en')
    nlp.vocab.lex_attr_getters = {}
    tokenizer = nlp.tokenizer
    phrases = read_gazetteer(tokenizer, patterns_loc)
    started = time.time()
    matches = get_matches(tokenizer, phrases, read_text(text_loc, n=n))
    count = sum(1 for ent_id, text in matches)
    elapsed = time.time() - started
    print("%d docs in %.3f s. %d matches" % (n, elapsed, count))
 if __name__ == '__main__':
    if False:
        import cProfile
--- a/examples/inventory_count/Instructions.md
+++ b/examples/inventory_count/Instructions.md
@ -1,5 +0,0 @@
 An example of inventory counting using the spaCy NLP library. It is meant to show how to instantiate spaCy's English class once and reuse it while the main module is reloaded.
 In the future, a better implementation of this library would be to apply machine learning to each query and learn what to classify as the quantitative statement (55 kgs OF), vs the actual item of count (how likely is a preposition object to be the item of count if x,y,z qualifications appear in the statement).
--- a/examples/inventory_count/inventory.py
+++ b/examples/inventory_count/inventory.py
@ -1,35 +0,0 @@
 class Inventory:
    """
        Inventory class - a struct{} like feature to house inventory counts
        across modules.
    """
    # Class-level defaults; instances overwrite them as fields are decoded.
    originalQuery = None
    item = ""
    unit = ""
    amount = ""
    def __init__(self, statement):
        """
        Constructor - only takes in the original query/statement
        :return: new Inventory object
        """
        self.originalQuery = statement
    def __str__(self):
        return str(self.amount) + ' ' + str(self.unit) + ' ' + str(self.item)
    def printInfo(self):
        """Print the original query and the decoded amount, unit and item."""
        # Single-argument print(...) calls behave the same on Python 2 and 3,
        # unlike the print statements they replace.
        print('-------------Inventory Count------------')
        print("Original Query:  " + str(self.originalQuery))
        print('Amount:  ' + str(self.amount))
        print('Unit:    ' + str(self.unit))
        print('Item:    ' + str(self.item))
        print('----------------------------------------')
    def isValid(self):
        """Return True only when item, unit, amount and query are all set."""
        return bool(self.item and self.unit and self.amount
                    and self.originalQuery)
--- a/examples/inventory_count/inventoryCount.py
+++ b/examples/inventory_count/inventoryCount.py
@ -1,92 +0,0 @@
 from inventory import Inventory
 def runTest(nlp):
    """Parse six sample inventory statements and print what was decoded.

    Each statement is decoded with the level-1 decoder first; the level-2
    decoder is used only when the level-1 result is not valid.
    """
    queries = [
        u'6 lobster cakes',
        u'6 avacados',
        u'fifty five carrots',
        u'i have 55 carrots',
        u'i got me some 9 cabbages',
        u'i got 65 kgs of carrots',
    ]
    testset = [nlp(query) for query in queries]
    result = []
    for doc in testset:
        entry = decodeInventoryEntry_level1(doc)
        if not entry.isValid():
            entry = decodeInventoryEntry_level2(doc)
        result.append(entry)
    for entry in result:
        entry.printInfo()
 def decodeInventoryEntry_level1(document):
    """
    Decodes a basic entry such as: '6 lobster cakes' or '6 cakes'
    @param document : NLP Doc object
    :return: Inventory object (check isValid() to see whether decoding worked)
    """
    count = Inventory(str(document))
    for token in document:
        # Membership test: `== (a or b or c)` only ever compared against
        # the first alternative.
        if token.pos_ in (u'NOUN', u'NNS', u'NN'):
            item = str(token)
            for child in token.children:
                if child.dep_ == u'compound' or child.dep_ == u'ad':
                    item = str(child) + str(item)
                elif child.dep_ == u'nummod':
                    count.amount = str(child).strip()
                    for numerical_child in child.children:
                        # string concatenation, not arithmetic: the parts of
                        # a multi-word number are glued together as text
                        count.amount = str(numerical_child) + str(count.amount).strip()
                else:
                    # print(...) with one argument works on Python 2 and 3
                    print("WARNING: unknown child: " + str(child) + ':' + str(child.dep_))
            # level-1 entries have no separate unit, so the item doubles as it
            count.item = item
            count.unit = item
    return count
 def decodeInventoryEntry_level2(document):
    """
    Entry level 2, a more complicated parsing scheme that covers examples such as
    'i have 80 boxes of freshly baked pies'
    @param document : NLP Doc object
    :return: Inventory object (check isValid() to see whether decoding worked)
    """
    count = Inventory(str(document))
    for token in document:
        #  Look for a preposition object that is a noun (this is the item we are counting).
        #  If found, look at its dependency (if a preposition that is not indicative of
        #  inventory location, the dependency of the preposition must be a noun)
        #  Membership tests: `== (a or b)` only ever compared against the
        #  first alternative.
        if token.dep_ in (u'pobj', u'meta') and token.pos_ in (u'NOUN', u'NNS', u'NN'):
            item = ''
            #  Go through all the token's children, these are possible adjectives and other add-ons
            #  this deals with cases such as 'hollow rounded waffle pancakes'
            for i in token.children:
                item += ' ' + str(i)
            item += ' ' + str(token)
            count.item = item
            # Get the head of the item:
            if token.head.dep_ != u'prep':
                #  Break out of the loop, this is a confusing entry
                break
            else:
                # the word the preposition attaches to is taken as the unit
                amountUnit = token.head.head
                count.unit = str(amountUnit)
                for inner in amountUnit.children:
                    if inner.pos_ == u'NUM':
                        count.amount += str(inner)
    return count
--- a/examples/inventory_count/main.py
+++ b/examples/inventory_count/main.py
@ -1,30 +0,0 @@
 import inventoryCount as mainModule
 import os
 from spacy.en import English
 # NOTE: this entry script uses Python 2-only constructs (print statements,
 # the `reload` builtin, `raw_input`, `except Exception, e`).
 if __name__ == '__main__':
    """
    Main module for this example - loads the English main NLP class,
    and keeps it in RAM while waiting for the user to re-run it. Allows the
    developer to re-edit their module under testing without having
    to wait as long to load the English class
    """
    #  Set the NLP object here for the parameters you want to see,
    #  or just leave it blank and get all the opts
    print "Loading English module... this will take a while."
    nlp = English()
    print "Done loading English module."
    # Loop forever: re-import the module under test, run it against the
    # already-loaded pipeline, then wait for Enter before the next round.
    while True:
        try:
            reload(mainModule)
            mainModule.runTest(nlp)
            raw_input('================ To reload main module, press Enter ================')
        except Exception, e:
            # Report the error and retry immediately; there is no pause on
            # this path.
            print "Unexpected error: " + str(e)
            continue
--- a/examples/matcher_example.py
+++ b/examples/matcher_example.py
@ -1,161 +0,0 @@
 from __future__ import unicode_literals, print_function
 import spacy.en
 import spacy.matcher
 from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63
 import plac
 def main():
    """Walk through rule-based entity matching with `nlp.matcher`.

    The demo prints the entities found before and after registering a
    "Google Now" PRODUCT pattern, then shows how to bind a custom lexeme
    flag (FLAG63) to a word set and use that flag inside matcher patterns.
    """
    nlp = spacy.en.English()
    example = u"I prefer Siri to Google Now. I'll google now to find out how the google now service works."
    before = nlp(example)
    print("Before")
    for ent in before.ents:
        print(ent.text, ent.label_, [w.tag_ for w in ent])
    # Output:
    # Google ORG [u'NNP']
    # google ORG [u'VB']
    # google ORG [u'NNP']
    nlp.matcher.add(
        "GoogleNow", # Entity ID: Not really used at the moment.
        "PRODUCT",   # Entity type: should be one of the types in the NER data
        {"wiki_en": "Google_Now"}, # Arbitrary attributes. Currently unused.
        [  # List of patterns that can be Surface Forms of the entity
            # This Surface Form matches "Google Now", verbatim
            [ # Each Surface Form is a list of Token Specifiers.
                { # This Token Specifier matches tokens whose orth field is "Google"
                    ORTH: "Google"
                },
                { # This Token Specifier matches tokens whose orth field is "Now"
                    ORTH: "Now"
                }
            ],
            [ # This Surface Form matches "google now", verbatim, and requires
              # "google" to have the NNP tag. This helps prevent the pattern from
              # matching cases like "I will google now to look up the time"
                {
                    ORTH: "google",
                    TAG: "NNP"
                },
                {
                    ORTH: "now"
                }
            ]
        ]
    )
    after = nlp(example)
    print("After")
    for ent in after.ents:
        print(ent.text, ent.label_, [w.tag_ for w in ent])
    # Output
    # Google Now PRODUCT [u'NNP', u'RB']
    # google ORG [u'VB']
    # google now PRODUCT [u'NNP', u'RB']
    #
    # You can customize attribute values in the lexicon, and then refer to the
    # new attributes in your Token Specifiers.
    # This is particularly good for word-set membership.
    #
    australian_capitals = ['Brisbane', 'Sydney', 'Canberra', 'Melbourne', 'Hobart',
                           'Darwin', 'Adelaide', 'Perth']
    # Internally, the tokenizer immediately maps each token to a pointer to a
    # LexemeC struct. These structs hold various features, e.g. the integer IDs
    # of the normalized string forms.
    # For our purposes, the key attribute is a 64-bit integer, used as a bit field.
    # spaCy currently only uses 12 of the bits for its built-in features, so
    # the others are available for use. It's best to use the higher bits, as
    # future versions of spaCy may add more flags. For instance, we might add
    # a built-in IS_MONTH flag, taking up FLAG13. So, we bind our user-field to
    # FLAG63 here.
    is_australian_capital = FLAG63
    # Now we need to set the flag value. It's False on all tokens by default,
    # so we just need to set it to True for the tokens we want.
    # Here we iterate over the strings, and set it on only the literal matches.
    for string in australian_capitals:
        lexeme = nlp.vocab[string]
        lexeme.set_flag(is_australian_capital, True)
    print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
    print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
    # If we want case-insensitive matching, we have to be a little bit more
    # round-about, as there's no case-insensitive index to the vocabulary. So
    # we have to iterate over the vocabulary.
    # We'll be looking up attribute IDs in this set a lot, so it's good to pre-build it
    target_ids = {nlp.vocab.strings[s.lower()] for s in australian_capitals}
    for lexeme in nlp.vocab:
        if lexeme.lower in target_ids:
            lexeme.set_flag(is_australian_capital, True)
    print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
    print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
    print('SYDNEY', nlp.vocab[u'SYDNEY'].check_flag(is_australian_capital))
    # Output
    # Sydney True
    # sydney False
    # Sydney True
    # sydney True
    # SYDNEY True
    #
    # The key thing to note here is that we're setting these attributes once,
    # over the vocabulary --- and then reusing them at run-time. This means the
    # amortized complexity of anything we do this way is going to be O(1). You
    # can match over expressions that need to have sets with tens of thousands
    # of values, e.g. "all the street names in Germany", and you'll still have
    # O(1) complexity. Most regular expression algorithms don't scale well to
    # this sort of problem.
    #
    # Now, let's use this in a pattern
    nlp.matcher.add("AuCitySportsTeam", "ORG", {},
        [
            [
                {LOWER: "the"},
                {is_australian_capital: True},
                {TAG: "NNS"}
            ],
            [
                {LOWER: "the"},
                {is_australian_capital: True},
                {TAG: "NNPS"}
            ],
            [
                {LOWER: "the"},
                {IS_ALPHA: True}, # Allow a word in between, e.g. The Western Sydney
                {is_australian_capital: True},
                {TAG: "NNS"}
            ],
            [
                {LOWER: "the"},
                {IS_ALPHA: True}, # Allow a word in between, e.g. The Western Sydney
                {is_australian_capital: True},
                {TAG: "NNPS"}
            ]
        ])
    doc = nlp(u'The pattern should match the Brisbane Broncos and the South Darwin Spiders, but not the Colorado Boulders')
    for ent in doc.ents:
        print(ent.text, ent.label_)
    # Output
    # the Brisbane Broncos ORG
    # the South Darwin Spiders ORG
 # Output
 # Before
 # Google ORG [u'NNP']
 # google ORG [u'VB']
 # google ORG [u'NNP']
 # After
 # Google Now PRODUCT [u'NNP', u'RB']
 # google ORG [u'VB']
 # google now PRODUCT [u'NNP', u'RB']
 # Sydney True
 # sydney False
 # Sydney True
 # sydney True
 # SYDNEY True
 # the Brisbane Broncos ORG
 # the South Darwin Spiders ORG
 if __name__ == '__main__':
    main()
--- a/examples/parallel_parse.py
+++ b/examples/parallel_parse.py
@ -1,74 +0,0 @@
 from __future__ import print_function, unicode_literals, division
 import io
 import bz2
 import logging
 from toolz import partition
 from os import path
 import re
 import spacy.en
 from spacy.tokens import Doc
 from joblib import Parallel, delayed
 import plac
 import ujson
 def parallelize(func, iterator, n_jobs, extra, backend='multiprocessing'):
    """Call `func(*(item + extra))` for every item of `iterator` via joblib.

    Returns whatever the joblib `Parallel` call returns.
    """
    extra = tuple(extra)
    executor = Parallel(n_jobs=n_jobs, backend=backend)
    tasks = (delayed(func)(*(item + extra)) for item in iterator)
    return executor(tasks)
 def iter_comments(loc):
    """Yield the 'body' field of each JSON line in the bz2 file at `loc`."""
    with bz2.BZ2File(loc) as file_:
        for line in file_:
            record = ujson.loads(line)
            yield record['body']
 # One leading / trailing markdown format character (backtick, star, tilde).
 pre_format_re = re.compile(r'^[\`\*\~]')
 post_format_re = re.compile(r'[\`\*\~]$')
 # Markdown links: "[label](%%URL)" placeholders and real http(s) targets.
 url_re = re.compile(r'\[([^]]+)\]\(%%URL\)')
 link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)')
 def strip_meta(text):
    """Return `text` with simple markdown and HTML-escape noise removed.

    Markdown links are replaced by their label, `&gt;` / `&lt;` are
    unescaped, one leading and one trailing format character is dropped,
    and surrounding whitespace is stripped.
    """
    unlinked = link_re.sub(r'\1', text)
    unescaped = unlinked.replace('&gt;', '>')
    unescaped = unescaped.replace('&lt;', '<')
    unformatted = post_format_re.sub('', pre_format_re.sub('', unescaped))
    return unformatted.strip()
 def save_parses(batch_id, input_, out_dir, n_threads, batch_size):
    """Parse one batch of texts and write the docs to `<out_dir>/<batch_id>.bin`.

    Returns None without doing any work when the output file already
    exists. Texts are cleaned with strip_meta, and texts that are empty
    after cleaning are dropped.
    """
    out_loc = path.join(out_dir, '%d.bin' % batch_id)
    already_done = path.exists(out_loc)
    if already_done:
        return None
    print('Batch', batch_id)
    nlp = spacy.en.English()
    nlp.matcher = None
    with open(out_loc, 'wb') as file_:
        cleaned = (strip_meta(raw) for raw in input_)
        non_empty = (text for text in cleaned if text.strip())
        parsed = nlp.pipe(non_empty, batch_size=batch_size, n_threads=n_threads)
        for doc in parsed:
            file_.write(doc.to_bytes())
@plac.annotations(
    in_loc=("Location of input file"),
    out_dir=("Location of input file"),
    n_process=("Number of processes", "option", "p", int),
    n_thread=("Number of threads per process", "option", "t", int),
    batch_size=("Number of texts to accumulate in a buffer", "option", "b", int)
 )
 def main(in_loc, out_dir, n_process=1, n_thread=4, batch_size=100):
    if not path.exists(out_dir):
        path.join(out_dir)
    if n_process >= 2:
        texts = partition(200000, iter_comments(in_loc))
        parallelize(save_parses, enumerate(texts), n_process, [out_dir, n_thread, batch_size],
                   backend='multiprocessing')
    else:
        save_parses(0, iter_comments(in_loc), out_dir, n_thread, batch_size)
 if __name__ == '__main__':
    plac.call(main)
--- a/examples/pipeline/custom_attr_methods.py
+++ b/examples/pipeline/custom_attr_methods.py
@ -1,35 +1,60 @@
 #!/usr/bin/env python
 # coding: utf-8
 """This example contains several snippets of methods that can be set via custom
 Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
 they're "bound" to the object and are partially applied – i.e. the object
-they're called on is passed in as the first argument."""
+they're called on is passed in as the first argument.
 from __future__ import unicode_literals
 * Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
 Developed for: spaCy 2.0.0a17
 Last updated for: spaCy 2.0.0a18
 """
 from __future__ import unicode_literals, print_function
 import plac
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span
 from spacy import displacy
 from pathlib import Path
@plac.annotations(
    output_dir=("Output directory for saved HTML", "positional", None, Path))
 def main(output_dir=None):
    """Demonstrate two custom Doc method extensions: 'overlap' and 'to_html'.

    output_dir: Forwarded as the `output` argument of the 'to_html' extension.
    """
    nlp = English()  # start off with blank English class
    # register overlap_tokens so it is callable as doc._.overlap(other_doc)
    Doc.set_extension('overlap', method=overlap_tokens)
    doc1 = nlp(u"Peach emoji is where it has always been.")
    doc2 = nlp(u"Peach is the superior emoji.")
    print("Text 1:", doc1.text)
    print("Text 2:", doc2.text)
    print("Overlapping tokens:", doc1._.overlap(doc2))
    # register to_html so it is callable as doc._.to_html(...)
    Doc.set_extension('to_html', method=to_html)
    doc = nlp(u"This is a sentence about Apple.")
    # add entity manually for demo purposes, to make it work without a model
    doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
    print("Text:", doc.text)
    doc._.to_html(output=output_dir, style='ent')
 def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.
    """
    # generate filename from first six non-punct tokens
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
    output_path = Path(output) / file_name
    html = displacy.render(doc, style=style, page=True)  # render markup
-    output_path.open('w', encoding='utf-8').write(html)  # save to file
+    if output is not None:
-    print('Saved HTML to {}'.format(output_path))
+        output_path = Path(output)
-
+        if not output_path.exists():
-
+            output_path.mkdir()
-Doc.set_extension('to_html', method=to_html)
+        output_file = Path(output) / file_name
-
+        output_file.open('w', encoding='utf-8').write(html)  # save to file
-nlp = English()
+        print('Saved HTML to {}'.format(output_file))
-doc = nlp(u"This is a sentence about Apple.")
+    else:
-# add entity manually for demo purposes, to make it work without a model
+        print(html)
 doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
 doc._.to_html(style='ent')
 def overlap_tokens(doc, other_doc):
@ -43,10 +68,10 @@ def overlap_tokens(doc, other_doc):
    return overlap
-Doc.set_extension('overlap', method=overlap_tokens)
+if __name__ == '__main__':
    plac.call(main)
-nlp = English()
+    # Expected output:
-doc1 = nlp(u"Peach emoji is where it has always been.")
+    # Text 1: Peach emoji is where it has always been.
-doc2 = nlp(u"Peach is the superior emoji.")
+    # Text 2: Peach is the superior emoji.
-tokens = doc1._.overlap(doc2)
+    # Overlapping tokens: [Peach, emoji, is, .]
 print(tokens)
--- a/examples/pipeline/custom_component_countries_api.py
+++ b/examples/pipeline/custom_component_countries_api.py
@ -1,21 +1,45 @@
-# coding: utf-8
+#!/usr/bin/env python
-from __future__ import unicode_literals
+# coding: utf8
 """Example of a spaCy v2.0 pipeline component that requests all countries via
 the REST Countries API, merges country names into one token, assigns entity
 labels and sets attributes on country tokens, e.g. the capital and lat/lng
 coordinates. Can be extended with more details from the API.
 * REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
 * Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
 Developed for: spaCy 2.0.0a17
 Last updated for: spaCy 2.0.0a18
 """
 from __future__ import unicode_literals, print_function
 import requests
-
+import plac
 from spacy.lang.en import English
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc, Span, Token
-class RESTCountriesComponent(object):
+def main():
-    """Example of a spaCy v2.0 pipeline component that requests all countries
+    # For simplicity, we start off with only the blank English Language class
-    via the REST Countries API, merges country names into one token, assigns
+    # and no model or pre-defined pipeline loaded.
-    entity labels and sets attributes on country tokens, e.g. the capital and
+    nlp = English()
-    lat/lng coordinates. Can be extended with more details from the API.
+    rest_countries = RESTCountriesComponent(nlp)  # initialise component
    nlp.add_pipe(rest_countries) # add it to the pipeline
    doc = nlp(u"Some text about Colombia and the Czech Republic")
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Doc has countries', doc._.has_country)  # Doc contains countries
    for token in doc:
        if token._.is_country:
            print(token.text, token._.country_capital, token._.country_latlng,
                token._.country_flag)  # country data
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # entities
-    REST Countries API: https://restcountries.eu
+
-    API License: Mozilla Public License MPL 2.0
+class RESTCountriesComponent(object):
    """spaCy v2.0 pipeline component that requests all countries via
    the REST Countries API, merges country names into one token, assigns entity
    labels and sets attributes on country tokens.
    """
    name = 'rest_countries' # component name, will show up in the pipeline
@ -90,19 +114,12 @@ class RESTCountriesComponent(object):
        return any([t._.get('is_country') for t in tokens])
-# For simplicity, we start off with only the blank English Language class and
+if __name__ == '__main__':
-# no model or pre-defined pipeline loaded.
+    plac.call(main)
-nlp = English()
+    # Expected output:
-rest_countries = RESTCountriesComponent(nlp)  # initialise component
+    # Pipeline ['rest_countries']
-nlp.add_pipe(rest_countries) # add it to the pipeline
+    # Doc has countries True
-
+    # Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg
-doc = nlp(u"Some text about Colombia and the Czech Republic")
+    # Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg
-
+    # Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')]
 print('Pipeline', nlp.pipe_names)  # pipeline contains component name
 print('Doc has countries', doc._.has_country)  # Doc contains countries
 for token in doc:
    if token._.is_country:
        print(token.text, token._.country_capital, token._.country_latlng,
              token._.country_flag)  # country data
 print('Entities', [(e.text, e.label_) for e in doc.ents])  # all countries are entities
--- a/examples/pipeline/custom_component_entities.py
+++ b/examples/pipeline/custom_component_entities.py
@ -1,11 +1,45 @@
-# coding: utf-8
+#!/usr/bin/env python
-from __future__ import unicode_literals
+# coding: utf8
 """Example of a spaCy v2.0 pipeline component that sets entity annotations
 based on list of single or multiple-word company names. Companies are
 labelled as ORG and their spans are merged into one token. Additionally,
 ._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
 respectively.
 * Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
 Developed for: spaCy 2.0.0a17
 Last updated for: spaCy 2.0.0a18
 """
 from __future__ import unicode_literals, print_function
 import plac
 from spacy.lang.en import English
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc, Span, Token
@plac.annotations(
    text=("Text to process", "positional", None, str),
    companies=("Names of technology companies", "positional", None, str))
 def main(text="Alphabet Inc. is the company behind Google.", *companies):
    """Run the TechCompanyRecognizer component over `text` and print the results.

    text: The text to process.
    *companies: Company names to recognise; a built-in default list is used
        when none are given.
    """
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    if not companies:  # set default companies if none are set via args
        companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
    component = TechCompanyRecognizer(nlp, companies)  # initialise component
    nlp.add_pipe(component, last=True)  # add last to the pipeline
    doc = nlp(text)
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Tokens', [t.text for t in doc])  # company names from the list are merged
    print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
    print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
    print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
 class TechCompanyRecognizer(object):
    """Example of a spaCy v2.0 pipeline component that sets entity annotations
    based on list of single or multiple-word company names. Companies are
@ -67,19 +101,13 @@ class TechCompanyRecognizer(object):
        return any([t._.get('is_tech_org') for t in tokens])
-# For simplicity, we start off with only the blank English Language class and
+if __name__ == '__main__':
-# no model or pre-defined pipeline loaded.
+    plac.call(main)
-nlp = English()
+    # Expected output:
-companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
+    # Pipeline ['tech_companies']
-component = TechCompanyRecognizer(nlp, companies)  # initialise component
+    # Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.']
-nlp.add_pipe(component, last=True)  # add it to the pipeline as the last element
+    # Doc has_tech_org True
-
+    # Token 0 is_tech_org True
-doc = nlp(u"Alphabet Inc. is the company behind Google.")
+    # Token 1 is_tech_org False
-
+    # Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')]
 print('Pipeline', nlp.pipe_names)  # pipeline contains component name
 print('Tokens', [t.text for t in doc])  # company names from the list are merged
 print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
 print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
 print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
 print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
--- a/examples/pipeline/multi_processing.py
+++ b/examples/pipeline/multi_processing.py
@ -0,0 +1,73 @@
 """
 Example of multi-processing with Joblib. Here, we're exporting
 part-of-speech-tagged, true-cased, (very roughly) sentence-separated text, with
 each "sentence" on a newline, and spaces between tokens. Data is loaded from
 the IMDB movie reviews dataset and will be loaded automatically via Thinc's
 built-in dataset loader.
 Last updated for: spaCy 2.0.0a18
 """
 from __future__ import print_function, unicode_literals
 from toolz import partition_all
 from pathlib import Path
 from joblib import Parallel, delayed
 import thinc.extra.datasets
 import plac
 import spacy
@plac.annotations(
    output_dir=("Output directory", "positional", None, Path),
    model=("Model name (needs tagger)", "positional", None, str),
    n_jobs=("Number of workers", "option", "n", int),
    batch_size=("Batch-size for each process", "option", "b", int),
    limit=("Limit of entries from the dataset", "option", "l", int))
 def main(output_dir, model='en_core_web_sm', n_jobs=4, batch_size=1000,
         limit=10000):
    """Tag IMDB reviews and write them out in batches using joblib workers.

    output_dir (Path): Directory that receives one '<batch_id>.txt' per batch;
        created if it does not exist.
    model: Name of the spaCy model to load.
    n_jobs: Number of joblib workers.
    batch_size: Number of texts per batch.
    limit: Only the last `limit` entries of the dataset are used.
    """
    nlp = spacy.load(model)  # load spaCy model
    print("Loaded model '%s'" % model)
    if not output_dir.exists():
        output_dir.mkdir()
    # load and pre-process the IMDB dataset
    print("Loading IMDB data...")
    data, _ = thinc.extra.datasets.imdb()
    texts, _ = zip(*data[-limit:])
    partitions = partition_all(batch_size, texts)
    # Each item is (batch_id, parsed docs, output_dir). The nlp(text) calls run
    # inside this generator, i.e. while joblib consumes it, not inside
    # transform_texts.
    # NOTE(review): presumably that means parsing happens in the parent
    # process and only the writing is parallelised — confirm.
    items = ((i, [nlp(text) for text in texts], output_dir) for i, texts
             in enumerate(partitions))
    Parallel(n_jobs=n_jobs)(delayed(transform_texts)(*item) for item in items)
 def transform_texts(batch_id, docs, output_dir):
    """Write one batch of docs to '<batch_id>.txt', one doc per line.

    Each line holds the non-space tokens of a doc, rendered by
    represent_word() and joined by single spaces. Returns None immediately
    if the output file already exists.
    """
    target = Path(output_dir) / ('%d.txt' % batch_id)
    if target.exists():  # same batch requested again: nothing to do
        return None
    print('Processing batch', batch_id)
    with target.open('w', encoding='utf8') as handle:
        for doc in docs:
            tagged = [represent_word(tok) for tok in doc if not tok.is_space]
            handle.write(' '.join(tagged))
            handle.write('\n')
    print('Saved {} texts to {}.txt'.format(len(docs), batch_id))
 def represent_word(word):
    """Return the token as 'text|TAG', true-casing sentence-initial capitals.

    A title-cased token at a sentence start is lower-cased, but only when its
    lower-cased form has the higher `prob` in the vocab.
    """
    text = word.text
    if text.istitle() and is_sent_begin(word):
        lowered = text.lower()
        if word.prob < word.doc.vocab[lowered].prob:
            text = lowered
    return text + '|' + word.tag_
 def is_sent_begin(word):
    """Roughly decide whether `word` starts a sentence.

    True for the first token of the doc, and for any token at index 2 or
    later whose preceding token is '.', '!', '?' or '...'.
    """
    if word.i == 0:
        return True
    if word.i < 2:
        return False
    return word.nbor(-1).text in ('.', '!', '?', '...')
 if __name__ == '__main__':
    plac.call(main)
--- a/examples/pos_tag.py
+++ b/examples/pos_tag.py
@ -1,90 +0,0 @@
 """
 Print part-of-speech tagged, true-cased, (very roughly) sentence-separated
 text, with each "sentence" on a newline, and spaces between tokens. Supports
 multi-processing.
 """
 from __future__ import print_function, unicode_literals, division
 import io
 import bz2
 import logging
 from toolz import partition
 from os import path
 import spacy.en
 from joblib import Parallel, delayed
 import plac
 import ujson
 def parallelize(func, iterator, n_jobs, extra):
    """Run `func` over `iterator` with joblib, appending `extra` to each call.

    Every item of `iterator` must be a tuple; `func` is called with the item's
    elements followed by the elements of `extra`.
    """
    shared_args = tuple(extra)
    worker = delayed(func)
    tasks = (worker(*(item + shared_args)) for item in iterator)
    return Parallel(n_jobs=n_jobs)(tasks)
 def iter_texts_from_json_bz2(loc):
    """
    Iterator of unicode strings, one per document (here, a comment).
    Expects a path to a BZ2 file, which should be new-line delimited JSON. The
    document text should be in a string field titled 'body'.
    This is the data format of the Reddit comments corpus.
    """
    with bz2.BZ2File(loc) as compressed:
        for raw_line in compressed:
            record = ujson.loads(raw_line)
            yield record['body']
 def transform_texts(batch_id, input_, out_dir):
    """Tag one batch of raw texts and write them to '<batch_id>.txt'.

    Each text becomes one line of space-separated represent_word() tokens.
    Returns None immediately if the output file already exists.
    """
    target = path.join(out_dir, '%d.txt' % batch_id)
    if path.exists(target):
        return None
    print('Batch', batch_id)
    nlp = spacy.en.English(parser=False, entity=False)
    with io.open(target, 'w', encoding='utf8') as sink:
        for raw_text in input_:
            doc = nlp(raw_text)
            tagged = [represent_word(tok) for tok in doc if not tok.is_space]
            sink.write(' '.join(tagged))
            sink.write('\n')
 def represent_word(word):
    """Return the token as 'text|TAG', true-casing sentence-initial capitals.

    A title-cased token at a sentence start is lower-cased, but only when its
    lower-cased form has the higher `prob` in the vocab.
    """
    text = word.text
    if text.istitle() and is_sent_begin(word):
        lowered = text.lower()
        if word.prob < word.doc.vocab[lowered].prob:
            text = lowered
    return text + '|' + word.tag_
 def is_sent_begin(word):
    """Roughly decide whether `word` starts a sentence, without parsing.

    True for the first token of the doc, and for any token at index 2 or
    later whose preceding token is '.', '!', '?' or '...'.
    """
    # It'd be nice to have heuristics like this in the library, for the times
    # when the accuracy of sentence boundary detection matters little and a
    # full parse is not wanted.
    if word.i == 0:
        return True
    if word.i < 2:
        return False
    return word.nbor(-1).text in ('.', '!', '?', '...')
@plac.annotations(
    in_loc=("Location of input file"),
    out_dir=("Location of input file"),
    n_workers=("Number of workers", "option", "n", int),
    batch_size=("Batch-size for each process", "option", "b", int)
 )
 def main(in_loc, out_dir, n_workers=4, batch_size=100000):
    if not path.exists(out_dir):
        path.join(out_dir)
    texts = partition(batch_size, iter_texts_from_json_bz2(in_loc))
    parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])
 if __name__ == '__main__':
    plac.call(main)
--- a/examples/training/load_ner.py
+++ b/examples/training/load_ner.py
@ -1,22 +0,0 @@
 # Load NER
 from __future__ import unicode_literals
 import spacy
 import pathlib
 from spacy.pipeline import EntityRecognizer
 from spacy.vocab import Vocab
 def load_model(model_dir):
    """Load an 'en' pipeline plus an entity recognizer saved in `model_dir`.

    model_dir: Directory containing 'vocab/strings.json', 'vocab/lexemes.bin'
        and the saved EntityRecognizer data.
    RETURNS (tuple): (nlp, ner).
    """
    model_dir = pathlib.Path(model_dir)
    # load 'en' with parser, entity recognizer and vectors switched off
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    # replace the vocab's strings and lexemes with the ones saved in model_dir
    with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
    ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
    return (nlp, ner)
 (nlp, ner) = load_model('ner')
 doc = nlp.make_doc('Who is Shaka Khan?')
 nlp.tagger(doc)
 ner(doc)
 for word in doc:
    print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)
--- a/examples/training/train_intent_parser.py
+++ b/examples/training/train_intent_parser.py
@ -0,0 +1,157 @@
 #!/usr/bin/env python
 # coding: utf-8
 """Using the parser to recognise your own semantics
 spaCy's parser component can be trained to predict any type of tree
 structure over your input text. You can also predict trees over whole documents
 or chat logs, with connections between the sentence-roots used to annotate
 discourse structure. In this example, we'll build a message parser for a common
 "chat intent": finding local businesses. Our message semantics will have the
 following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION.
 "show me the best hotel in berlin"
 ('show', 'ROOT', 'show')
 ('best', 'QUALITY', 'hotel') --> hotel with QUALITY best
 ('hotel', 'PLACE', 'show') --> show PLACE hotel
 ('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin
 """
 from __future__ import unicode_literals, print_function
 import plac
 import random
 import spacy
 from spacy.gold import GoldParse
 from spacy.tokens import Doc
 from pathlib import Path
 # training data: words, head and dependency labels
 # for no relation, we simply chose an arbitrary dependency label, e.g. '-'
 TRAIN_DATA = [
    (
        ['find', 'a', 'cafe', 'with', 'great', 'wifi'],
        [0, 2, 0, 5, 5, 2],  # index of token head
        ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
    ),
    (
        ['find', 'a', 'hotel', 'near', 'the', 'beach'],
        [0, 2, 0, 5, 5, 2],
        ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
    ),
    (
        ['find', 'me', 'the', 'closest', 'gym', 'that', "'s", 'open', 'late'],
        [0, 0, 4, 4, 0, 6, 4, 6, 6],
        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
    ),
    (
        ['show', 'me', 'the', 'cheapest', 'store', 'that', 'sells', 'flowers'],
        [0, 0, 4, 4, 0, 4, 4, 4],  # attach "flowers" to store!
        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
    ),
    (
        ['find', 'a', 'nice', 'restaurant', 'in', 'london'],
        [0, 3, 3, 0, 3, 3],
        ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
    ),
    (
        ['show', 'me', 'the', 'coolest', 'hostel', 'in', 'berlin'],
        [0, 0, 4, 4, 0, 4, 4],
        ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
    ),
    (
        ['find', 'a', 'good', 'italian', 'restaurant', 'near', 'work'],
        [0, 4, 4, 4, 0, 4, 5],
        ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION']
    )
 ]
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
 def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the parser.

    model: Name of an existing model to load; a blank 'en' model is created
        when None.
    output_dir: If given, the trained model is saved there, loaded back and
        tested again.
    n_iter: Number of passes over TRAIN_DATA.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # add the parser to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'parser' not in nlp.pipe_names:
        parser = nlp.create_pipe('parser')
        nlp.add_pipe(parser, first=True)
    # otherwise, get it, so we can add labels to it
    else:
        parser = nlp.get_pipe('parser')
    # register every dependency label used in the training data
    for _, _, deps in TRAIN_DATA:
        for dep in deps:
            parser.add_label(dep)
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training(lambda: [])
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)  # shuffles the module-level list in place
            losses = {}
            # one update per example, i.e. a batch size of 1
            for words, heads, deps in TRAIN_DATA:
                doc = Doc(nlp.vocab, words=words)
                gold = GoldParse(doc, heads=heads, deps=deps)
                nlp.update([doc], [gold], sgd=optimizer, losses=losses)
            print(losses)
    # test the trained model
    test_model(nlp)
    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        test_model(nlp2)
 def test_model(nlp):
    """Parse three sample queries and print their labelled relations.

    For each text, prints the text and then a list of
    (token, relation, head) triples, leaving out tokens labelled '-'.
    """
    samples = ["find a hotel with good wifi",
               "find me the cheapest gym near work",
               "show me the best hotel in berlin"]
    for parsed in nlp.pipe(samples):
        print(parsed.text)
        relations = [(tok.text, tok.dep_, tok.head.text)
                     for tok in parsed if tok.dep_ != '-']
        print(relations)
 if __name__ == '__main__':
    plac.call(main)
    # Expected output:
    # find a hotel with good wifi
    # [
    #   ('find', 'ROOT', 'find'),
    #   ('hotel', 'PLACE', 'find'),
    #   ('good', 'QUALITY', 'wifi'),
    #   ('wifi', 'ATTRIBUTE', 'hotel')
    # ]
    # find me the cheapest gym near work
    # [
    #   ('find', 'ROOT', 'find'),
    #   ('cheapest', 'QUALITY', 'gym'),
    #   ('gym', 'PLACE', 'find')
    # ]
    # show me the best hotel in berlin
    # [
    #   ('show', 'ROOT', 'show'),
    #   ('best', 'QUALITY', 'hotel'),
    #   ('hotel', 'PLACE', 'show'),
    #   ('berlin', 'LOCATION', 'hotel')
    # ]
--- a/examples/training/train_ner.py
+++ b/examples/training/train_ner.py
@ -1,13 +1,103 @@
 #!/usr/bin/env python
 # coding: utf8
 """
 Example of training spaCy's named entity recognizer, starting off with an
 existing model or a blank model.
 For more details, see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
 Developed for: spaCy 2.0.0a18
 Last updated for: spaCy 2.0.0a18
 """
 from __future__ import unicode_literals, print_function
 import plac
 import random
 from pathlib import Path
-from spacy.lang.en import English
+import spacy
 from spacy.gold import GoldParse, biluo_tags_from_offsets
 # training data
 TRAIN_DATA = [
    ('Who is Shaka Khan?', [(7, 17, 'PERSON')]),
    ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
 ]
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
 def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer.

    model: Name of an existing model to load; a blank 'en' model is created
        when None.
    output_dir: If given, the trained model is saved there, loaded back and
        tested again.
    n_iter: Number of passes over TRAIN_DATA.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    # (an 'ner' component that is already in the pipeline is reused as-is)
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # function that allows begin_training to get the training data
    get_data = lambda: reformat_train_data(nlp.tokenizer, TRAIN_DATA)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training(get_data)
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)  # shuffles the module-level list in place
            losses = {}
            # one update per example, i.e. a batch size of 1
            for raw_text, entity_offsets in TRAIN_DATA:
                doc = nlp.make_doc(raw_text)
                gold = GoldParse(doc, entities=entity_offsets)
                nlp.update(
                    [doc], # Batch of Doc objects
                    [gold], # Batch of GoldParse objects
                    drop=0.5, # Dropout -- make it harder to memorise data
                    sgd=optimizer, # Callable to update weights
                    losses=losses)
            print(losses)
    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
            print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 def reformat_train_data(tokenizer, examples):
-    """Reformat data to match JSON format"""
+    """Reformat data to match JSON format.
    https://alpha.spacy.io/api/annotation#json-input
    tokenizer (Tokenizer): Tokenizer to process the raw text.
    examples (list): The training data.
    RETURNS (list): The reformatted training data."""
    output = []
    for i, (text, entity_offsets) in enumerate(examples):
        doc = tokenizer(text)
@ -21,49 +111,5 @@ def reformat_train_data(tokenizer, examples):
    return output
 def main(model_dir=None):
    train_data = [
        (
            'Who is Shaka Khan?',
            [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
        ),
        (
            'I like London and Berlin.',
            [(len('I like '), len('I like London'), 'LOC'),
            (len('I like London and '), len('I like London and Berlin'), 'LOC')]
        )
    ]
    nlp = English(pipeline=['tensorizer', 'ner'])
    get_data = lambda: reformat_train_data(nlp.tokenizer, train_data)
    optimizer = nlp.begin_training(get_data)
    for itn in range(100):
        random.shuffle(train_data)
        losses = {}
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            nlp.update(
                [doc], # Batch of Doc objects
                [gold], # Batch of GoldParse objects
                drop=0.5, # Dropout -- make it harder to memorise data
                sgd=optimizer, # Callable to update weights
                losses=losses)
        print(losses)
    print("Save to", model_dir)
    nlp.to_disk(model_dir)
    print("Load from", model_dir)
    nlp = spacy.lang.en.English(pipeline=['tensorizer', 'ner'])
    nlp.from_disk(model_dir)
    for raw_text, _ in train_data:
        doc = nlp(raw_text)
        for word in doc:
            print(word.text, word.ent_type_, word.ent_iob_)
 if __name__ == '__main__':
    import plac
    plac.call(main)
    # Who "" 2
    # is "" 2
    # Shaka "" PERSON 3
    # Khan "" PERSON 1
    # ? "" 2
--- a/examples/training/train_ner_standalone.py
+++ b/examples/training/train_ner_standalone.py
@ -1,206 +0,0 @@
 #!/usr/bin/env python
 '''Example of training a named entity recognition system from scratch using spaCy
 This example is written to be self-contained and reasonably transparent.
 To achieve that, it duplicates some of spaCy's internal functionality.
 Specifically, in this example, we don't use spaCy's built-in Language class to
 wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write
 our own simple Pipeline class, so that it's easier to see how the pieces
 interact.
 Input data:
 https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip
 Developed for: spaCy 1.7.1
 Last tested for: spaCy 2.0.0a13
 '''
 from __future__ import unicode_literals, print_function
 import plac
 from pathlib import Path
 import random
 import json
 import tqdm
 from thinc.neural.optimizers import Adam
 from thinc.neural.ops import NumpyOps
 from spacy.vocab import Vocab
 from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
 from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
 from spacy.attrs import *
 from spacy.gold import GoldParse
 from spacy.gold import iob_to_biluo
 from spacy.gold import minibatch
 from spacy.scorer import Scorer
 import spacy.util
 try:
    unicode
 except NameError:
    unicode = str
 spacy.util.set_env_log(True)
 def init_vocab():
    """Create a Vocab with string-based getters for four lexical attributes.

    LOWER and NORM are the lower-cased string, PREFIX is its first character
    and SUFFIX is its last three characters.
    """
    getters = {
        LOWER: lambda string: string.lower(),
        NORM: lambda string: string.lower(),
        PREFIX: lambda string: string[0],
        SUFFIX: lambda string: string[-3:],
    }
    return Vocab(lex_attr_getters=getters)
 class Pipeline(object):
    """Small stand-in for spaCy's Language class, used by this example.

    Wires together a Vocab, a Tokenizer and an entity recognizer and offers
    helpers for training, evaluation and saving/loading.
    """
    def __init__(self, vocab=None, tokenizer=None, entity=None):
        """Store the given components, building a default for any left as None."""
        if vocab is None:
            vocab = init_vocab()
        if tokenizer is None:
            tokenizer = Tokenizer(vocab, {}, None, None, None)
        if entity is None:
            entity = NeuralEntityRecognizer(vocab)
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.entity = entity
        # components applied, in order, by __call__
        self.pipeline = [self.entity]
    def begin_training(self):
        """Call begin_training([]) on every component and return an Adam optimizer."""
        for model in self.pipeline:
            model.begin_training([])
        # second argument is presumably the learning rate — confirm against thinc
        optimizer = Adam(NumpyOps(), 0.001)
        return optimizer
    def __call__(self, input_):
        """Turn `input_` into a Doc, run every component on it and return it."""
        doc = self.make_doc(input_)
        for process in self.pipeline:
            process(doc)
        return doc
    def make_doc(self, input_):
        """Build a Doc from text (tokenized) or from a sequence of words.

        Bytes are decoded as utf8 first; anything that is not a unicode string
        is passed to Doc as its `words`.
        """
        if isinstance(input_, bytes):
            input_ = input_.decode('utf8')
        if isinstance(input_, unicode):
            return self.tokenizer(input_)
        else:
            return Doc(self.vocab, words=input_)
    def make_gold(self, input_, annotations):
        """Return a GoldParse over a fresh Doc of `input_` with the given entities."""
        doc = self.make_doc(input_)
        gold = GoldParse(doc, entities=annotations)
        return gold
    def update(self, inputs, annots, sgd, losses=None, drop=0.):
        """Update the entity recognizer on one batch and return the losses dict.

        inputs: Texts or word sequences, as accepted by make_doc().
        annots: Entity annotations, one per input.
        sgd: Optimizer passed through to the entity recognizer.
        losses: Optional dict passed through to the entity recognizer; a new
            one is created when None.
        drop: Dropout rate passed through to the entity recognizer.
        """
        if losses is None:
            losses = {}
        docs = [self.make_doc(input_) for input_ in inputs]
        # make_gold() builds its own Doc per input, separate from those in `docs`
        golds = [self.make_gold(input_, annot) for input_, annot in
                 zip(inputs, annots)]
        self.entity.update(docs, golds, drop=drop,
                           sgd=sgd, losses=losses)
        return losses
    def evaluate(self, examples):
        """Score predictions against gold for (input, annotation) pairs.

        Returns the `scores` of a Scorer fed with every example.
        """
        scorer = Scorer()
        for input_, annot in examples:
            gold = self.make_gold(input_, annot)
            doc = self(input_)
            scorer.score(doc, gold)
        return scorer.scores
    def to_disk(self, path):
        """Save the vocab to '<path>/vocab' and the entity recognizer to '<path>/ner'.

        Creates `path` if it is missing; raises IOError if it exists but is
        not a directory.
        """
        path = Path(path)
        if not path.exists():
            path.mkdir()
        elif not path.is_dir():
            raise IOError("Can't save pipeline to %s\nNot a directory" % path)
        self.vocab.to_disk(path / 'vocab')
        self.entity.to_disk(path / 'ner')
    def from_disk(self, path):
        """Load the vocab and the entity recognizer back from `path`.

        Raises IOError if `path` does not exist or is not a directory.
        """
        path = Path(path)
        if not path.exists():
            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
        self.vocab = self.vocab.from_disk(path / 'vocab')
        # NOTE(review): self.pipeline is not rebuilt here, so it keeps the old
        # entity object unless from_disk() returns the same instance — confirm.
        self.entity = self.entity.from_disk(path / 'ner')
 def train(nlp, train_examples, dev_examples, nr_epoch=5):
    """Train `nlp` for `nr_epoch` epochs, reporting scores after each one.

    train_examples is shuffled in place every epoch and fed to nlp.update()
    in minibatches of 8; dev_examples is used for the per-epoch evaluation.
    """
    sgd = nlp.begin_training()
    print("Iter", "Loss", "P", "R", "F")
    for epoch in range(1, nr_epoch + 1):
        random.shuffle(train_examples)
        losses = {}
        progress = tqdm.tqdm(train_examples, leave=False)
        for batch in minibatch(progress, size=8):
            inputs, annots = zip(*batch)
            nlp.update(list(inputs), list(annots), sgd, losses=losses)
        scores = nlp.evaluate(dev_examples)
        report_scores(epoch, losses['ner'], scores)
 def report_scores(i, loss, scores):
    """Print one summary line: epoch, integer loss, then P, R and F.

    i: Epoch number.
    loss: Loss value, truncated to an int for display.
    scores: Dict with the keys 'ents_p', 'ents_r' and 'ents_f'.
    """
    metrics = tuple('%.2f' % scores[key]
                    for key in ('ents_p', 'ents_r', 'ents_f'))
    print('Epoch %d: %d %s %s %s' % ((i, int(loss)) + metrics))
 def read_examples(path):
    """Yield (words, BILUO tags) for every sentence in a tab-separated file.

    Sentences are separated by blank lines and leading lines that start with
    '#' are skipped. On each remaining line, column 1 is taken as the word and
    column 2 as its IOB tag; the tags are converted with iob_to_biluo().

    path: Location of the data file, read as UTF-8.
    """
    path = Path(path)
    # Explicit encoding: otherwise the platform default is used, which breaks
    # non-ASCII text on systems that do not default to UTF-8.
    with path.open('r', encoding='utf8') as file_:
        sents = file_.read().strip().split('\n\n')
        for sent in sents:
            sent = sent.strip()
            if not sent:
                continue
            tokens = sent.split('\n')
            # drop the comment header that precedes the token lines
            while tokens and tokens[0].startswith('#'):
                tokens.pop(0)
            words = []
            iob = []
            for token in tokens:
                if token.strip():
                    pieces = token.split('\t')
                    words.append(pieces[1])
                    iob.append(pieces[2])
            yield words, iob_to_biluo(iob)
 def get_labels(examples):
    """Return the sorted, de-duplicated entity labels used in `examples`.

    examples: Iterable of (words, tags) pairs, where each tag is either a
        plain tag without a hyphen (ignored) or '<prefix>-<label>'.
    RETURNS (list): The labels in sorted order.
    """
    labels = set()
    for _, tags in examples:
        for tag in tags:
            # Split on the first hyphen only, so a label that itself contains
            # a hyphen is kept whole instead of being truncated.
            _, sep, label = tag.partition('-')
            if sep:
                labels.add(label)
    return sorted(labels)
@plac.annotations(
    model_dir=("Path to save the model", "positional", None, Path),
    train_loc=("Path to your training data", "positional", None, Path),
    dev_loc=("Path to your development data", "positional", None, Path),
)
def main(model_dir, train_loc, dev_loc, nr_epoch=30):
    """Train an entity recognizer on the given data and save it to disk.

    model_dir (Path): Where the trained pipeline is written.
    train_loc (Path): Training data file, read with read_examples().
    dev_loc (Path): Development data file, read with read_examples().
    nr_epoch (int): Number of training epochs.
    """
    print(model_dir, train_loc, dev_loc)
    training_set = list(read_examples(train_loc))
    # generator: the development file is only read when it is consumed below
    dev_stream = read_examples(dev_loc)
    pipeline = Pipeline()
    for entity_label in get_labels(training_set):
        pipeline.entity.add_label(entity_label)
        print("Add label", entity_label)
    train(pipeline, training_set, list(dev_stream), nr_epoch)
    pipeline.to_disk(model_dir)
 if __name__ == '__main__':
    plac.call(main)
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@ -21,26 +21,114 @@ After training your model, you can save it to a directory. We recommend
 wrapping models as Python packages, for ease of deployment.
 For more details, see the documentation:
-* Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner
+* Training: https://alpha.spacy.io/usage/training
-* Saving and loading models: https://spacy.io/docs/usage/saving-loading
+* NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
-Developed for: spaCy 1.7.6
+Developed for: spaCy 2.0.0a18
-Last updated for: spaCy 2.0.0a13
+Last updated for: spaCy 2.0.0a18
 """
 from __future__ import unicode_literals, print_function
 import plac
 import random
 from pathlib import Path
 import random
 import spacy
 from spacy.gold import GoldParse, minibatch
-from spacy.pipeline import NeuralEntityRecognizer
+
-from spacy.pipeline import TokenVectorEncoder
+
 # new entity label
 LABEL = 'ANIMAL'
 # training data
 TRAIN_DATA = [
    ("Horses are too tall and they pretend to care about your feelings",
     [(0, 6, 'ANIMAL')]),
    ("Do they bite?", []),
    ("horses are too tall and they pretend to care about your feelings",
     [(0, 6, 'ANIMAL')]),
    ("horses pretend to care about your feelings", [(0, 6, 'ANIMAL')]),
    ("they pretend to care about your feelings, those horses",
     [(48, 54, 'ANIMAL')]),
    ("horses?", [(0, 6, 'ANIMAL')])
 ]
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, new_model_name='animal', output_dir=None, n_iter=50):
    """Set up the pipeline and entity recognizer, and train the new entity.

    model (str): Name of an existing model to load; None starts from a
        blank 'en' model.
    new_model_name (str): Name stored in the model meta before saving.
    output_dir (Path): Directory to save to; None skips saving.
    n_iter (int): Number of passes over TRAIN_DATA.
    """
    if model is None:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)

    # Reuse the entity recognizer when the pipeline already has one;
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    ner.add_label(LABEL)   # add new entity label to entity recognizer

    # every pipe except NER is disabled while training
    frozen = [name for name in nlp.pipe_names if name != 'ner']
    with nlp.disable_pipes(*frozen):
        random.seed(0)
        optimizer = nlp.begin_training(lambda: [])
        for _ in range(n_iter):
            losses = {}
            examples = get_gold_parses(nlp.make_doc, TRAIN_DATA)
            for batch in minibatch(examples, size=3):
                docs, golds = zip(*batch)
                nlp.update(docs, golds, losses=losses, sgd=optimizer,
                           drop=0.35)
            print(losses)

    # test the trained model
    test_text = 'Do you like horses?'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    if output_dir is None:
        return

    # save model to output directory
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta['name'] = new_model_name  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # test the saved model
    print("Loading from", output_dir)
    reloaded = spacy.load(output_dir)
    for ent in reloaded(test_text).ents:
        print(ent.label_, ent.text)
 def get_gold_parses(tokenizer, train_data):
-    '''Shuffle and create GoldParse objects'''
+    """Shuffle and create GoldParse objects.
    tokenizer (Tokenizer): Tokenizer to process the raw text.
    train_data (list): The training data.
    YIELDS (tuple): (doc, gold) tuples.
    """
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = tokenizer(raw_text)
@ -48,77 +136,5 @@ def get_gold_parses(tokenizer, train_data):
        yield doc, gold
 def train_ner(nlp, train_data, output_dir):
    """Train for 50 passes over train_data, then optionally save the model.

    nlp: Language object; its meta name is set to 'en_ent_animal'.
    train_data (list): Examples handed to get_gold_parses() on every pass.
    output_dir: Falsy to skip saving; otherwise a path object that is
        created if missing and written to with nlp.to_disk().
    """
    random.seed(0)
    optimizer = nlp.begin_training(lambda: [])
    nlp.meta['name'] = 'en_ent_animal'
    for _ in range(50):
        losses = {}
        examples = get_gold_parses(nlp.make_doc, train_data)
        for batch in minibatch(examples, size=3):
            docs, golds = zip(*batch)
            nlp.update(docs, golds, losses=losses, sgd=optimizer, drop=0.35)
        print(losses)
    if output_dir:
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
 def main(model_name, output_directory=None):
    """Create a blank model, train an 'ANIMAL' entity and print test output.

    model_name: Passed to spacy.blank() to create the initial model.
    output_directory: Optional location to save to; when given, the model is
        loaded back from it and run on the test sentence again.
    """
    print("Creating initial model", model_name)
    nlp = spacy.blank(model_name)
    if output_directory is not None:
        output_directory = Path(output_directory)
    train_data = [
        ("Horses are too tall and they pretend to care about your feelings",
         [(0, 6, 'ANIMAL')]),
        ("Do they bite?", []),
        ("horses are too tall and they pretend to care about your feelings",
         [(0, 6, 'ANIMAL')]),
        ("horses pretend to care about your feelings", [(0, 6, 'ANIMAL')]),
        ("they pretend to care about your feelings, those horses",
         [(48, 54, 'ANIMAL')]),
        ("horses?", [(0, 6, 'ANIMAL')]),
    ]
    nlp.add_pipe(TokenVectorEncoder(nlp.vocab))
    recognizer = NeuralEntityRecognizer(nlp.vocab)
    recognizer.add_label('ANIMAL')
    nlp.add_pipe(recognizer)
    train_ner(nlp, train_data, output_directory)

    # Test that the entity is recognized
    text = 'Do you like horses?'
    print("Ents in 'Do you like horses?':")
    doc = nlp(text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    if not output_directory:
        return
    # run the same sentence through the model loaded back from disk
    print("Loading from", output_directory)
    reloaded = spacy.load(output_directory)
    for ent in reloaded(text).ents:
        print(ent.label_, ent.text)
 if __name__ == '__main__':
    import plac
    plac.call(main)
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@ -1,75 +1,109 @@
 #!/usr/bin/env python
 # coding: utf8
 """
 Example of training spaCy dependency parser, starting off with an existing model
 or a blank model.
 For more details, see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse
 Developed for: spaCy 2.0.0a18
 Last updated for: spaCy 2.0.0a18
 """
 from __future__ import unicode_literals, print_function
-import json
+
-import pathlib
+import plac
 import random
 from pathlib import Path
 import spacy
 from spacy.pipeline import DependencyParser
 from spacy.gold import GoldParse
 from spacy.tokens import Doc
-def train_parser(nlp, train_data, left_labels, right_labels):
+# training data
-    parser = DependencyParser(
+TRAIN_DATA = [
-                nlp.vocab,
+    (
-                left_labels=left_labels,
+        ['They', 'trade',  'mortgage', '-', 'backed', 'securities', '.'],
-                right_labels=right_labels)
+        [1, 1, 4, 4, 5, 1, 1],
-    for itn in range(1000):
+        ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
-        random.shuffle(train_data)
+    ),
-        loss = 0
+    (
-        for words, heads, deps in train_data:
+        ['I', 'like', 'London', 'and', 'Berlin', '.'],
-            doc = Doc(nlp.vocab, words=words)
+        [1, 1, 1, 2, 2, 1],
-            gold = GoldParse(doc, heads=heads, deps=deps)
+        ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
-            loss += parser.update(doc, gold)
+    )
-    parser.model.end_training()
+]
    return parser
-def main(model_dir=None):
+@plac.annotations(
-    if model_dir is not None:
+    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
-        model_dir = pathlib.Path(model_dir)
+    output_dir=("Optional output directory", "option", "o", Path),
-        if not model_dir.exists():
+    n_iter=("Number of training iterations", "option", "n", int))
-            model_dir.mkdir()
+def main(model=None, output_dir=None, n_iter=1000):
-        assert model_dir.is_dir()
+    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
-    nlp = spacy.load('en', tagger=False, parser=False, entity=False, add_vectors=False)
+    # add the parser to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'parser' not in nlp.pipe_names:
        parser = nlp.create_pipe('parser')
        nlp.add_pipe(parser, first=True)
    # otherwise, get it, so we can add labels to it
    else:
        parser = nlp.get_pipe('parser')
-    train_data = [
+    # add labels to the parser
-        (
+    for _, _, deps in TRAIN_DATA:
-            ['They', 'trade',  'mortgage', '-', 'backed', 'securities', '.'],
+        for dep in deps:
-            [1, 1, 4, 4, 5, 1, 1],
+            parser.add_label(dep)
            ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
        ),
        (
            ['I', 'like', 'London', 'and', 'Berlin', '.'],
            [1, 1, 1, 2, 2, 1],
            ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
        )
    ]
    left_labels = set()
    right_labels = set()
    for _, heads, deps in train_data:
        for i, (head, dep) in enumerate(zip(heads, deps)):
            if i < head:
                left_labels.add(dep)
            elif i > head:
                right_labels.add(dep)
    parser = train_parser(nlp, train_data, sorted(left_labels), sorted(right_labels))
-    doc = Doc(nlp.vocab, words=['I', 'like', 'securities', '.'])
+    # get names of other pipes to disable them during training
-    parser(doc)
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
-    for word in doc:
+    with nlp.disable_pipes(*other_pipes):  # only train parser
-        print(word.text, word.dep_, word.head.text)
+        optimizer = nlp.begin_training(lambda: [])
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for words, heads, deps in TRAIN_DATA:
                doc = Doc(nlp.vocab, words=words)
                gold = GoldParse(doc, heads=heads, deps=deps)
                nlp.update([doc], [gold], sgd=optimizer, losses=losses)
            print(losses)
-    if model_dir is not None:
+    # test the trained model
-        with (model_dir / 'config.json').open('w') as file_:
+    test_text = "I like securities."
-            json.dump(parser.cfg, file_)
+    doc = nlp(test_text)
-        parser.model.dump(str(model_dir / 'model'))
+    print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
 if __name__ == '__main__':
-    main()
+    plac.call(main)
-    # I nsubj like
+
-    # like ROOT like
+    # expected result:
-    # securities dobj like
+    # [
-    # . cc securities
+    #   ('I', 'nsubj', 'like'),
    #   ('like', 'ROOT', 'like'),
    #   ('securities', 'dobj', 'like'),
    #   ('.', 'punct', 'like')
    # ]
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@ -1,18 +1,28 @@
-"""A quick example for training a part-of-speech tagger, without worrying
+#!/usr/bin/env python
-about the tokenization, or other language-specific customizations."""
+# coding: utf8
 """
 A simple example for training a part-of-speech tagger with a custom tag map.
 To allow us to update the tag map with our custom one, this example starts off
 with a blank Language class and modifies its defaults.
-from __future__ import unicode_literals
+For more details, see the documentation:
-from __future__ import print_function
+* Training: https://alpha.spacy.io/usage/training
 * POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging
 Developed for: spaCy 2.0.0a18
 Last updated for: spaCy 2.0.0a18
 """
 from __future__ import unicode_literals, print_function
 import plac
 import random
 from pathlib import Path
-from spacy.vocab import Vocab
+import spacy
-from spacy.tagger import Tagger
+from spacy.util import get_lang_class
 from spacy.tokens import Doc
 from spacy.gold import GoldParse
 import random
 # You need to define a mapping from your data's part-of-speech tag names to the
 # Universal Part-of-Speech tag set, as spaCy includes an enum of these tags.
@ -28,54 +38,67 @@ TAG_MAP = {
 # Usually you'll read this in, of course. Data formats vary.
 # Ensure your strings are unicode.
-DATA = [
+TRAIN_DATA = [
-    (
+    (["I", "like", "green", "eggs"], ["N", "V", "J", "N"]),
-        ["I", "like", "green", "eggs"],
+    (["Eat", "blue", "ham"], ["V", "J", "N"])
        ["N", "V", "J", "N"]
    ),
    (
        ["Eat", "blue", "ham"],
        ["V", "J", "N"]
    )
 ]
-def ensure_dir(path):
+@plac.annotations(
-    if not path.exists():
+    lang=("ISO Code of language to use", "option", "l", str),
-        path.mkdir()
+    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
 def main(lang='en', output_dir=None, n_iter=25):
    """Create a new model, set up the pipeline and train the tagger. In order to
    train the tagger with a custom tag map, we're creating a new Language
    instance with a custom vocab.
    """
    lang_cls = get_lang_class(lang)  # get Language class
    lang_cls.Defaults.tag_map.update(TAG_MAP)  # add tag map to defaults
    nlp = lang_cls()  # initialise Language class
    # add the tagger to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    tagger = nlp.create_pipe('tagger')
    nlp.add_pipe(tagger)
-def main(output_dir=None):
+    optimizer = nlp.begin_training(lambda: [])
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for words, tags in TRAIN_DATA:
            doc = Doc(nlp.vocab, words=words)
            gold = GoldParse(doc, tags=tags)
            nlp.update([doc], [gold], sgd=optimizer, losses=losses)
        print(losses)
    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])
    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
-        ensure_dir(output_dir)
+        if not output_dir.exists():
-        ensure_dir(output_dir / "pos")
+            output_dir.mkdir()
-        ensure_dir(output_dir / "vocab")
+        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
-    vocab = Vocab(tag_map=TAG_MAP)
+        # test the saved model
-    # The default_templates argument is where features are specified. See
+        print("Loading from", output_dir)
-    # spacy/tagger.pyx for the defaults.
+        nlp2 = spacy.load(output_dir)
-    tagger = Tagger(vocab)
+        doc = nlp2(test_text)
-    for i in range(25):
+        print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])
        for words, tags in DATA:
            doc = Doc(vocab, words=words)
            gold = GoldParse(doc, tags=tags)
            tagger.update(doc, gold)
        random.shuffle(DATA)
    tagger.model.end_training()
    doc = Doc(vocab, orths_and_spaces=zip(["I", "like", "blue", "eggs"], [True] * 4))
    tagger(doc)
    for word in doc:
        print(word.text, word.tag_, word.pos_)
    if output_dir is not None:
        tagger.model.dump(str(output_dir / 'pos' / 'model'))
        with (output_dir / 'vocab' / 'strings.json').open('w') as file_:
            tagger.vocab.strings.dump(file_)
 if __name__ == '__main__':
    plac.call(main)
-    # I V VERB
+
-    # like V VERB
+    # Expected output:
-    # blue N NOUN
+    # [
-    # eggs N NOUN
+    #   ('I', 'N', 'NOUN'),
    #   ('like', 'V', 'VERB'),
    #   ('blue', 'J', 'ADJ'),
    #   ('eggs', 'N', 'NOUN')
    # ]
--- a/examples/training/train_textcat.py
+++ b/examples/training/train_textcat.py
@ -1,58 +1,119 @@
-'''Train a multi-label convolutional neural network text classifier,
+#!/usr/bin/env python
-using the spacy.pipeline.TextCategorizer component. The model is then added
+# coding: utf8
-to spacy.pipeline, and predictions are available at `doc.cats`.
+"""Train a multi-label convolutional neural network text classifier on the
-'''
+IMDB dataset, using the TextCategorizer component. The dataset will be loaded
-from __future__ import unicode_literals
+automatically via Thinc's built-in dataset loader. The model is added to
 spacy.pipeline, and predictions are available via `doc.cats`.
 For more details, see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * Text classification: https://alpha.spacy.io/usage/text-classification
 Developed for: spaCy 2.0.0a18
 Last updated for: spaCy 2.0.0a18
 """
 from __future__ import unicode_literals, print_function
 import plac
 import random
-import tqdm
+from pathlib import Path
 from thinc.neural.optimizers import Adam
 from thinc.neural.ops import NumpyOps
 import thinc.extra.datasets
-import spacy.lang.en
+import spacy
 from spacy.gold import GoldParse, minibatch
 from spacy.util import compounding
 from spacy.pipeline import TextCategorizer
 # TODO: Remove this once we're not supporting models trained with thinc <6.9.0
 import thinc.neural._classes.layernorm
 thinc.neural._classes.layernorm.set_compat_six_eight(False)
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
 def main(model=None, output_dir=None, n_iter=20):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
-def train_textcat(tokenizer, textcat,
+    # add the text classifier to the pipeline if it doesn't exist
-                  train_texts, train_cats, dev_texts, dev_cats,
+    # nlp.create_pipe works for built-ins that are registered with spaCy
-                  n_iter=20):
+    if 'textcat' not in nlp.pipe_names:
-    '''
+        # textcat = nlp.create_pipe('textcat')
-    Train the TextCategorizer without associated pipeline.
+        textcat = TextCategorizer(nlp.vocab, labels=['POSITIVE'])
-    '''
+        nlp.add_pipe(textcat, last=True)
-    textcat.begin_training()
+    # otherwise, get it, so we can add labels to it
-    optimizer = Adam(NumpyOps(), 0.001)
+    else:
-    train_docs = [tokenizer(text) for text in train_texts]
+        textcat = nlp.get_pipe('textcat')
    # add label to text classifier
    # textcat.add_label('POSITIVE')
    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
    train_docs = [nlp.tokenizer(text) for text in train_texts]
    train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                  zip(train_docs, train_cats)]
    train_data = list(zip(train_docs, train_gold))
-    batch_sizes = compounding(4., 128., 1.001)
+
-    for i in range(n_iter):
+    # get names of other pipes to disable them during training
-        losses = {}
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
-        # Progress bar and minibatching
+    with nlp.disable_pipes(*other_pipes):  # only train textcat
-        batches = minibatch(tqdm.tqdm(train_data, leave=False), size=batch_sizes)
+        optimizer = nlp.begin_training(lambda: [])
-        for batch in batches:
+        print("Training the model...")
-            docs, golds = zip(*batch)
+        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
-            textcat.update(docs, golds, sgd=optimizer, drop=0.2,
+        for i in range(n_iter):
-                losses=losses)
+            losses = {}
-        with textcat.model.use_params(optimizer.averages):
+            # batch up the examples using spaCy's minibatch
-            scores = evaluate(tokenizer, textcat, dev_texts, dev_cats)
+            batches = minibatch(train_data, size=compounding(4., 128., 1.001))
-        yield losses['textcat'], scores
+            for batch in batches:
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))
    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
 def load_data(limit=0, split=0.8):
    """Load a shuffled slice of the IMDB training data and split it in two.

    limit (int): Keep only the last `limit` shuffled examples; with the
        default 0 the slice [-0:] keeps everything.
    split (float): Fraction of the kept examples placed in the first part.
    RETURNS (tuple): ((texts, cats), (texts, cats)) for the two parts.
    """
    # only the first element returned by the loader is used; the second is
    # discarded
    examples, _ = thinc.extra.datasets.imdb()
    random.shuffle(examples)
    examples = examples[-limit:]
    texts, labels = zip(*examples)
    cats = [{'POSITIVE': bool(label)} for label in labels]
    cutoff = int(len(examples) * split)
    first_part = (texts[:cutoff], cats[:cutoff])
    second_part = (texts[cutoff:], cats[cutoff:])
    return first_part, second_part
 def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
-    tp = 1e-8 # True positives
+    tp = 1e-8  # True positives
-    fp = 1e-8 # False positives
+    fp = 1e-8  # False positives
-    fn = 1e-8 # False negatives
+    fn = 1e-8  # False negatives
-    tn = 1e-8 # True negatives
+    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
@ -66,55 +127,10 @@ def evaluate(tokenizer, textcat, texts, cats):
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
-    precis = tp / (tp + fp)
+    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
-    fscore = 2 * (precis * recall) / (precis + recall)
+    f_score = 2 * (precision * recall) / (precision + recall)
-    return {'textcat_p': precis, 'textcat_r': recall, 'textcat_f': fscore}
+    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}
 def load_data(limit=0):
    """Load a shuffled slice of the IMDB training data with an 80/20 split.

    limit (int): Keep only the last `limit` shuffled examples; with the
        default 0 the slice [-0:] keeps everything.
    RETURNS (tuple): ((train_texts, train_cats), (dev_texts, dev_cats)).
    """
    # Partition off part of the train data --- avoid running experiments
    # against test.
    examples, _ = thinc.extra.datasets.imdb()
    random.shuffle(examples)
    examples = examples[-limit:]
    texts, labels = zip(*examples)
    cats = [{'POSITIVE': bool(label)} for label in labels]
    cutoff = int(len(examples) * 0.8)
    train_part = (texts[:cutoff], cats[:cutoff])
    dev_part = (texts[cutoff:], cats[cutoff:])
    return train_part, dev_part
 def main(model_loc=None):
    """Train a text classifier on IMDB data and print scores per iteration.

    model_loc: Optional location; when given, the pipeline is saved there,
        loaded back and run on a sample sentence.
    """
    nlp = spacy.lang.en.English()
    tokenizer = nlp.tokenizer
    textcat = TextCategorizer(tokenizer.vocab, labels=['POSITIVE'])
    print("Load IMDB data")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
    print("Itn.\tLoss\tP\tR\tF")
    row = '{i:d} {loss:.3f} {textcat_p:.3f} {textcat_r:.3f} {textcat_f:.3f}'
    results = train_textcat(tokenizer, textcat, train_texts, train_cats,
                            dev_texts, dev_cats, n_iter=20)
    # train_textcat yields one (loss, scores) pair per iteration
    for i, (loss, scores) in enumerate(results):
        print(row.format(i=i, loss=loss, **scores))
    # How to save, load and use
    nlp.pipeline.append(textcat)
    if model_loc is None:
        return
    nlp.to_disk(model_loc)
    nlp = spacy.load(model_loc)
    doc = nlp(u'This movie sucked!')
    print(doc.cats)
 if __name__ == '__main__':
--- a/examples/twitter_filter.py
+++ b/examples/twitter_filter.py
@ -1,36 +0,0 @@
 # encoding: utf8
 from __future__ import unicode_literals, print_function
 import plac
 import codecs
 import pathlib
 import random
 import twython
 import spacy.en
 import _handler
 class Connection(twython.TwythonStreamer):
    """Twitter streamer that passes each received tweet to _handler."""

    def __init__(self, keys_dir, nlp, query):
        """Read the API credentials from keys_dir and set up the streamer.

        keys_dir: Directory containing key.txt, secret.txt, token.txt and
            token_secret.txt, one credential per file.
        nlp: Object handed through to _handler.handle_tweet with each tweet.
        query: Search term, also handed through to the handler.
        """
        keys_dir = pathlib.Path(keys_dir)

        def read(fn):
            # close each key file deterministically instead of leaving the
            # handle open until it happens to be garbage collected
            with (keys_dir / (fn + '.txt')).open() as file_:
                return file_.read().strip()

        api_key = [read(fn) for fn in ('key', 'secret', 'token', 'token_secret')]
        twython.TwythonStreamer.__init__(self, *api_key)
        self.nlp = nlp
        self.query = query

    def on_success(self, data):
        """Handle one streamed item, then usually reload the handler module."""
        _handler.handle_tweet(self.nlp, data, self.query)
        # the module is reloaded whenever the random draw is >= 0.1
        if random.random() >= 0.1:
            reload(_handler)
 def main(keys_dir, term):
    """Open a filtered Twitter stream for `term` using keys from keys_dir.

    keys_dir: Directory with the credential files read by Connection.
    term: Term to track; English-language statuses only.
    """
    pipeline = spacy.en.English()
    stream = Connection(keys_dir, pipeline, term)
    stream.statuses.filter(track=term, language='en')
 if __name__ == '__main__':
    plac.call(main)
--- a/examples/vectors_fast_text.py
+++ b/examples/vectors_fast_text.py
@ -1,16 +1,19 @@
-'''Load vectors for a language trained using FastText
+#!/usr/bin/env python
-
+# coding: utf8
 """Load vectors for a language trained using fastText
 https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
-'''
+"""
 from __future__ import unicode_literals
 import plac
 import numpy
-import spacy.language
+from spacy.language import Language
@plac.annotations(
    vectors_loc=("Path to vectors", "positional", None, str))
 def main(vectors_loc):
-    nlp = spacy.language.Language()
+    nlp = Language()
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
--- a/setup.py
+++ b/setup.py
@ -30,19 +30,14 @@ MOD_NAMES = [
    'spacy.syntax._state',
    'spacy.syntax._beam_utils',
    'spacy.tokenizer',
    'spacy._cfile',
    'spacy.syntax.parser',
    'spacy.syntax.nn_parser',
    'spacy.syntax.beam_parser',
    'spacy.syntax.nonproj',
    'spacy.syntax.transition_system',
    'spacy.syntax.arc_eager',
    'spacy.syntax._parse_features',
    'spacy.gold',
    'spacy.tokens.doc',
    'spacy.tokens.span',
    'spacy.tokens.token',
    'spacy.cfile',
    'spacy.matcher',
    'spacy.syntax.ner',
    'spacy.symbols',
--- a/spacy/_cfile.pxd
+++ b/spacy/_cfile.pxd
@ -1,26 +0,0 @@
 from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from cymem.cymem cimport Pool
 # C-level declaration of CFile: its attributes and cdef methods.
 cdef class CFile:
    # stdio stream handle
    cdef FILE* fp
    # flag recording whether the file is considered open
    cdef bint is_open
    # cymem memory pool
    cdef Pool mem
    cdef int size # For compatibility with subclass
    cdef int _capacity # For compatibility with subclass
    # `except -1`: a return value of -1 signals that an exception was raised
    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
    # `except *`: callers check for a raised exception after every call
    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
 # C-level declaration of StringCFile, a CFile subclass that adds a byte
 # buffer attribute and re-declares the three cdef methods.
 cdef class StringCFile(CFile):
    # pointer to the byte buffer
    cdef unsigned char* data
    # `except -1`: a return value of -1 signals that an exception was raised
    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
    # `except *`: callers check for a raised exception after every call
    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
--- a/spacy/_cfile.pyx
+++ b/spacy/_cfile.pyx
@ -1,88 +0,0 @@
 from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from libc.string cimport memcpy
 # Thin wrapper around a C stdio FILE* opened with fopen().
 cdef class CFile:
    # Open `loc` with the given mode. `loc` may be a path-like object (anything
    # with as_posix()), a unicode string or bytes. If fopen() fails,
    # `on_open_error` is called when provided; otherwise IOError is raised.
    def __init__(self, loc, mode, on_open_error=None):
        # fopen() needs a byte-string mode
        if isinstance(mode, unicode):
            mode_str = mode.encode('ascii')
        else:
            mode_str = mode
        if hasattr(loc, 'as_posix'):
            loc = loc.as_posix()
        self.mem = Pool()
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
        self.fp = fopen(<char*>bytes_loc, mode_str)
        if self.fp == NULL:
            if on_open_error is not None:
                on_open_error()
            else:
                raise IOError("Could not open binary file %s" % bytes_loc)
        # NOTE(review): this is also reached when fopen() failed and
        # on_open_error() returned normally, leaving fp NULL -- confirm intent.
        self.is_open = True
    # Close the stream on deallocation if it is still marked open.
    def __dealloc__(self):
        if self.is_open:
            fclose(self.fp)
    # Close the stream and mark it closed. fclose() is called unconditionally,
    # without checking is_open first.
    def close(self):
        fclose(self.fp)
        self.is_open = False
    # Read `number` items of `elem_size` bytes into `dest`; raises IOError when
    # fread() returns a different item count.
    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        st = fread(dest, elem_size, number, self.fp)
        if st != number:
            raise IOError
    # Write `number` items of `elem_size` bytes from `src`; raises IOError when
    # fwrite() returns a different item count.
    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
        st = fwrite(src, elem_size, number, self.fp)
        if st != number:
            raise IOError
    # Allocate number * elem_size bytes from `mem`, fill them via read_into()
    # and return the pointer.
    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest
    # Encode `value` as UTF-8 and write the bytes.
    # NOTE(review): calls self.write(), but no `write` method is defined in
    # this class as shown here -- confirm where it is expected to come from.
    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)
 # In-memory counterpart of CFile that reads from and writes to a byte buffer
 # allocated from a cymem Pool.
 cdef class StringCFile:
    # Copy `data` into a freshly allocated buffer of at least 8 bytes.
    # `on_open_error` is accepted but not used in this method.
    def __init__(self, mode, bytes data=b'', on_open_error=None):
        self.mem = Pool()
        # marked open only when the mode contains 'w'
        self.is_open = 'w' in mode
        self._capacity = max(len(data), 8)
        self.size = len(data)
        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
        for i in range(len(data)):
            self.data[i] = data[i]
    # Mark the object closed; the buffer itself is left untouched here.
    def close(self):
        self.is_open = False
    # Return `size` bytes starting `size` bytes before the current data pointer.
    # NOTE(review): read_into() advances self.data, so this presumably assumes
    # a particular read position -- confirm against callers.
    def string_data(self):
        return (self.data-self.size)[:self.size]
    # Copy elem_size * number bytes out of the buffer into `dest` and advance
    # the data pointer by that amount. No bounds check is performed here.
    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        memcpy(dest, self.data, elem_size * number)
        self.data += elem_size * number
    # Append number * elem_size bytes from `src`, growing the buffer first when
    # the write would reach the capacity. The parameters are named in the order
    # (elem_size, number) here; only their product is used.
    cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
        write_size = number * elem_size
        if (self.size + write_size) >= self._capacity:
            # grow to twice the size required after this write
            self._capacity = (self.size + write_size) * 2
            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
        memcpy(&self.data[self.size], src, elem_size * number)
        self.size += write_size
    # Allocate number * elem_size bytes from `mem`, fill them via read_into()
    # and return the pointer.
    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest
    # Encode `value` as UTF-8 and write the bytes.
    # NOTE(review): calls self.write(), but no `write` method is defined in
    # this class as shown here -- confirm where it is expected to come from.
    def write_unicode(self, unicode value):
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        self.write(sizeof(char), len(py_bytes), chars)
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -96,7 +96,6 @@ def _zero_init(model):
@layerize
 def _preprocess_doc(docs, drop=0.):
    keys = [doc.to_array([LOWER]) for doc in docs]
    keys = [a[:, 0] for a in keys]
    ops = Model.ops
    lengths = ops.asarray([arr.shape[0] for arr in keys])
    keys = ops.xp.concatenate(keys)
@ -464,7 +463,6 @@ def zero_init(model):
@layerize
 def preprocess_doc(docs, drop=0.):
    keys = [doc.to_array([LOWER]) for doc in docs]
    keys = [a[:, 0] for a in keys]
    ops = Model.ops
    lengths = ops.asarray([arr.shape[0] for arr in keys])
    keys = ops.xp.concatenate(keys)
--- a/spacy/cfile.pxd
+++ b/spacy/cfile.pxd
@ -1,33 +0,0 @@
 from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from cymem.cymem cimport Pool
 cdef class CFile:
    # C-level declarations for the FILE*-backed reader/writer implemented in
    # cfile.pyx.
    cdef FILE* fp  # Handle returned by fopen()
    cdef unsigned char* data
    cdef int is_open  # Checked before fclose() when the object is deallocated
    cdef Pool mem
    cdef int size # For compatibility with subclass
    cdef int i # For compatibility with subclass
    cdef int _capacity # For compatibility with subclass
    # Read `number` elements of `elem_size` bytes into `dest`.
    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
    # Write `number` elements of `elem_size` bytes from `src`.
    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
    # Allocate a buffer from `mem` and fill it through read_into().
    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
 cdef class StringCFile:
    # C-level declarations for the in-memory variant implemented in cfile.pyx.
    cdef unsigned char* data  # Buffer allocated from `mem`
    cdef int is_open
    cdef Pool mem
    cdef int size # For compatibility with subclass
    cdef int i # For compatibility with subclass
    cdef int _capacity # For compatibility with subclass
    # Copy number * elem_size bytes out of the buffer into `dest`.
    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
    # Append bytes from `src` to the buffer.
    # NOTE(review): the implementation in cfile.pyx names the last two
    # parameters in the opposite order (elem_size, number). Only their
    # product is used there, but the names should probably agree.
    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
    # Allocate a buffer from `mem` and fill it through read_into().
    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
--- a/spacy/cfile.pyx
+++ b/spacy/cfile.pyx
@ -1,103 +0,0 @@
 # coding: utf8
 from __future__ import unicode_literals
 from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy
 cdef class CFile:
    """Thin wrapper around a C stdio FILE*, for reading and writing binary
    data.
    """
    def __init__(self, loc, mode, on_open_error=None):
        """Open the file at `loc` with fopen().
        loc (unicode, bytes or path object): Location of the file. Objects
            with an `as_posix()` method are converted with it first.
        mode (unicode or bytes): Mode string passed to fopen().
        on_open_error (callable): Called without arguments if the file cannot
            be opened. If not given, an IOError is raised instead.
        """
        if isinstance(mode, unicode):
            mode_str = mode.encode('ascii')
        else:
            mode_str = mode
        if hasattr(loc, 'as_posix'):
            loc = loc.as_posix()
        self.mem = Pool()
        cdef bytes bytes_loc = loc.encode('utf8') if isinstance(loc, unicode) else loc
        self.fp = fopen(<char*>bytes_loc, mode_str)
        if self.fp == NULL:
            if on_open_error is not None:
                on_open_error()
            else:
                raise IOError("Could not open binary file %s" % bytes_loc)
        # Only report the file as open if fopen() succeeded. Otherwise a
        # swallowed open error would later lead to fclose(NULL).
        self.is_open = self.fp != NULL
    def __dealloc__(self):
        if self.is_open:
            fclose(self.fp)
    def close(self):
        """Close the file. Closing an already-closed file does nothing."""
        # Guarded so that repeated close() calls never fclose() twice.
        if self.is_open:
            fclose(self.fp)
            self.is_open = False
    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        # Read `number` elements of `elem_size` bytes into `dest`; a short
        # read is reported as an IOError.
        st = fread(dest, elem_size, number, self.fp)
        if st != number:
            raise IOError("Could not read %d elements of size %d from file"
                          % (number, elem_size))
    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
        # Write `number` elements of `elem_size` bytes from `src`; a short
        # write is reported as an IOError.
        st = fwrite(src, elem_size, number, self.fp)
        if st != number:
            raise IOError("Could not write %d elements of size %d to file"
                          % (number, elem_size))
    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        # Allocate space from the caller's pool and fill it from the file.
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest
    def write_unicode(self, unicode value):
        """Write `value` to the file, encoded as UTF-8.
        value (unicode): The text to write.
        """
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        # This class defines no `write` method, so the previous call to
        # self.write() could only fail. Use write_from(src, number, elem_size).
        self.write_from(chars, len(py_bytes), sizeof(char))
 cdef class StringCFile:
    """In-memory stand-in for CFile: reads come from, and writes are appended
    to, a byte buffer allocated from a Pool.
    """
    def __init__(self, bytes data, mode, on_open_error=None):
        """Create the buffer and seed it with `data`.
        data (bytes): Initial contents of the buffer.
        mode (unicode): The object is marked open only if this contains 'w'.
        on_open_error: Accepted but not used by this class.
        """
        self.mem = Pool()
        self.is_open = 1 if 'w' in mode else 0
        # Allocate at least 8 bytes, even when `data` is empty.
        self._capacity = max(len(data), 8)
        self.size = len(data)
        # Offset of the next byte that read_into() will return.
        self.i = 0
        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
        for i in range(len(data)):
            self.data[i] = data[i]
    def __dealloc__(self):
        # Important to override this -- or
        # we try to close a non-existent file pointer!
        pass
    def close(self):
        """Mark the object as closed. The buffer itself is left untouched."""
        self.is_open = False
    def string_data(self):
        """Return a copy of the first `self.size` bytes of the buffer.
        RETURNS (bytes): The data written so far.
        """
        # Slicing the pointer with an explicit length copies exactly
        # self.size bytes (embedded NULs included) into a new bytes object,
        # instead of writing through a pointer into an immutable one.
        return (<char*>self.data)[:self.size]
    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
        # Copy number * elem_size bytes from the current read offset into
        # `dest` and advance the offset.
        cdef size_t read_size = number * elem_size
        # Reading exactly up to the end of the data is allowed. Anything
        # beyond it is an error, as it is for a short read in CFile, rather
        # than silently leaving `dest` unfilled.
        if <size_t>self.i + read_size > <size_t>self.size:
            raise IOError("Could not read %d bytes: only %d left in buffer"
                          % (read_size, self.size - self.i))
        memcpy(dest, &self.data[self.i], read_size)
        self.i += read_size
    cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1:
        # Append number * elem_size bytes from `src`, growing the buffer to
        # twice the required size whenever the write would fill it.
        write_size = number * elem_size
        if (self.size + write_size) >= self._capacity:
            self._capacity = (self.size + write_size) * 2
            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
        memcpy(&self.data[self.size], src, write_size)
        self.size += write_size
    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
        # Allocate space from the caller's pool and fill it via read_into().
        cdef void* dest = mem.alloc(number, elem_size)
        self.read_into(dest, number, elem_size)
        return dest
    def write_unicode(self, unicode value):
        """Append `value` to the buffer, encoded as UTF-8.
        value (unicode): The text to write.
        """
        cdef bytes py_bytes = value.encode('utf8')
        cdef char* chars = <char*>py_bytes
        # This class defines no `write` method, so the previous call to
        # self.write() could only fail. Use write_from(src, elem_size, number).
        self.write_from(chars, sizeof(char), len(py_bytes))
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@ -43,7 +43,7 @@ def package(cmd, input_dir, output_dir, meta_path=None, create_meta=False, force
        prints(meta_path, title="Reading meta.json from file")
        meta = util.read_json(meta_path)
    else:
-        meta = generate_meta()
+        meta = generate_meta(input_dir)
    meta = validate_meta(meta, ['lang', 'name', 'version'])
    model_name = meta['lang'] + '_' + meta['name']
@ -77,7 +77,8 @@ def create_file(file_path, contents):
    file_path.open('w', encoding='utf-8').write(contents)
-def generate_meta():
+def generate_meta(model_path):
    meta = {}
    settings = [('lang', 'Model language', 'en'),
                ('name', 'Model name', 'model'),
                ('version', 'Model version', '0.0.0'),
@ -87,31 +88,21 @@ def generate_meta():
                ('email', 'Author email', False),
                ('url', 'Author website', False),
                ('license', 'License', 'CC BY-NC 3.0')]
-    prints("Enter the package settings for your model.", title="Generating meta.json")
+    nlp = util.load_model_from_path(Path(model_path))
-    meta = {}
+    meta['pipeline'] = nlp.pipe_names
    meta['vectors'] = {'width': nlp.vocab.vectors_length,
                       'entries': len(nlp.vocab.vectors)}
    prints("Enter the package settings for your model. The following "
           "information will be read from your model data: pipeline, vectors.",
           title="Generating meta.json")
    for setting, desc, default in settings:
        response = util.get_raw_input(desc, default)
        meta[setting] = default if response == '' and default else response
    meta['pipeline'] = generate_pipeline()
    if about.__title__ != 'spacy':
        meta['parent_package'] = about.__title__
    return meta
 def generate_pipeline():
    """Ask the user which pipeline components the model uses.
    RETURNS (bool or list): True or False if the user typed exactly 'True' or
        'False', otherwise the comma-separated answer as a list of stripped
        component names.
    """
    instructions = (
        "If set to 'True', the default pipeline is used. If set to 'False', "
        "the pipeline will be disabled. Components should be specified as a "
        "comma-separated list of component names, e.g. tagger, "
        "parser, ner. For more information, see the docs on processing pipelines.")
    prints(instructions, title="Enter your model's pipeline components")
    answer = util.get_raw_input("Pipeline components", True)
    # The two literal answers map onto booleans; anything else is a list.
    if answer == 'True':
        return True
    if answer == 'False':
        return False
    return [name.strip() for name in answer.split(',')]
 def validate_meta(meta, keys):
    for key in keys:
        if key not in meta or meta[key] == '':
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -144,7 +144,10 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
                    file_.write(json_dumps(scorer.scores))
                meta_loc = output_path / ('model%d' % i) / 'meta.json'
                meta['accuracy'] = scorer.scores
-                meta['speed'] = {'nwords': nwords, 'cpu':cpu_wps, 'gpu': gpu_wps}
+                meta['speed'] = {'nwords': nwords, 'cpu': cpu_wps,
                                 'gpu': gpu_wps}
                meta['vectors'] = {'width': nlp.vocab.vectors_length,
                                   'entries': len(nlp.vocab.vectors)}
                meta['lang'] = nlp.lang
                meta['pipeline'] = pipeline
                meta['spacy_version'] = '>=%s' % about.__version__
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@ -3,6 +3,16 @@ from __future__ import unicode_literals
 def explain(term):
    """Get a description for a given POS tag, dependency label or entity type.
    term (unicode): The term to explain.
    RETURNS (unicode): The explanation, or `None` if not found in the glossary.
    EXAMPLE:
        >>> spacy.explain(u'NORP')
        >>> doc = nlp(u'Hello world')
        >>> print([(w.text, w.tag_, spacy.explain(w.tag_)) for w in doc])
    """
    if term in GLOSSARY:
        return GLOSSARY[term]
@ -283,6 +293,7 @@ GLOSSARY = {
    'PRODUCT':      'Objects, vehicles, foods, etc. (not services)',
    'EVENT':        'Named hurricanes, battles, wars, sports events, etc.',
    'WORK_OF_ART':  'Titles of books, songs, etc.',
    'LAW':          'Named documents made into laws.',
    'LANGUAGE':     'Any named language',
    'DATE':         'Absolute or relative dates or periods',
    'TIME':         'Times smaller than a day',
--- a/spacy/language.py
+++ b/spacy/language.py
@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import absolute_import, unicode_literals
 from contextlib import contextmanager
 import copy
 from thinc.neural import Model
 import random
@ -15,10 +16,9 @@ from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .tagger import Tagger
 from .lemmatizer import Lemmatizer
 from .syntax.parser import get_templates
-from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger
+from .pipeline import DependencyParser, Tensorizer, Tagger
-from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer
+from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
 from .compat import Optimizer
 from .compat import json_dumps, izip, copy_reg
@ -75,9 +75,6 @@ class BaseDefaults(object):
    infixes = tuple(TOKENIZER_INFIXES)
    tag_map = dict(TAG_MAP)
    tokenizer_exceptions = {}
    parser_features = get_templates('parser')
    entity_features = get_templates('ner')
    tagger_features = Tagger.feature_templates # TODO -- fix this
    stop_words = set()
    lemma_rules = {}
    lemma_exc = {}
@ -102,9 +99,9 @@ class Language(object):
    factories = {
        'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
        'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
-        'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
+        'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
-        'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
+        'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
-        'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
+        'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
        'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
        'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg)
    }
@ -150,7 +147,7 @@ class Language(object):
    @property
    def meta(self):
        self._meta.setdefault('lang', self.vocab.lang)
-        self._meta.setdefault('name', '')
+        self._meta.setdefault('name', 'model')
        self._meta.setdefault('version', '0.0.0')
        self._meta.setdefault('spacy_version', about.__version__)
        self._meta.setdefault('description', '')
@ -334,6 +331,29 @@ class Language(object):
            doc = proc(doc)
        return doc
    def disable_pipes(self, *names):
        '''Disable one or more pipeline components.
        If used as a context manager, the pipeline will be restored to the
        initial state at the end of the block. Otherwise, the returned
        DisabledPipes object has a `.restore()` method you can use to undo
        your changes.
        *names (unicode): Names of the pipeline components to disable.
        RETURNS (DisabledPipes): The disabled components, in the order given.
        EXAMPLE:
            >>> nlp.add_pipe('parser')
            >>> nlp.add_pipe('tagger')
            >>> with nlp.disable_pipes('parser', 'tagger'):
            >>>     assert not nlp.has_pipe('parser')
            >>> assert nlp.has_pipe('parser')
            >>> disabled = nlp.disable_pipes('parser')
            >>> assert len(disabled) == 1
            >>> assert not nlp.has_pipe('parser')
            >>> disabled.restore()
            >>> assert nlp.has_pipe('parser')
        '''
        return DisabledPipes(self, *names)
    def make_doc(self, text):
        return self.tokenizer(text)
@ -662,6 +682,42 @@ class Language(object):
        return self
 class DisabledPipes(list):
    '''Manager for temporary pipeline disabling.
    The list itself holds whatever `nlp.remove_pipe()` returned for each
    disabled name. It can be used as a context manager, or restored by hand
    with `.restore()`.
    '''
    def __init__(self, nlp, *names):
        '''Remove the named components from `nlp`'s pipeline.
        nlp: The object whose `pipeline` is modified.
        *names (unicode): Names of the components to disable.
        '''
        self.nlp = nlp
        self.names = names
        # Important! Not deep copy -- we just want the container (but we also
        # want to support people providing arbitrarily typed nlp.pipeline
        # objects.)
        self.original_pipeline = copy.copy(nlp.pipeline)
        list.__init__(self)
        try:
            self.extend(nlp.remove_pipe(name) for name in names)
        except Exception:
            # If one of the names can't be removed, the caller never receives
            # this object and so could not call restore(). Undo the removals
            # made so far before re-raising.
            self.nlp.pipeline = self.original_pipeline
            raise
    def __enter__(self):
        return self
    def __exit__(self, *args):
        self.restore()
    def restore(self):
        '''Restore the pipeline to its state when DisabledPipes was created.
        RAISES (ValueError): If the current pipeline contains components that
            are not part of the saved pipeline. The pipeline is left as it is
            in that case.
        '''
        current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
        unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)]
        if unexpected:
            # Don't change the pipeline if we're raising an error.
            self.nlp.pipeline = current
            msg = (
                "Some current components would be lost when restoring "
                "previous pipeline state. If you added components after "
                "calling nlp.disable_pipes(), you should remove them "
                "explicitly with nlp.remove_pipe() before the pipeline is "
                "restored. Names of the new components: %s"
            )
            raise ValueError(msg % unexpected)
        self[:] = []
 def unpickle_language(vocab, meta, bytes_data):
    lang = Language(vocab=vocab)
    lang.from_bytes(bytes_data)
--- a/spacy/pipeline.pxd
+++ b/spacy/pipeline.pxd
@ -1,21 +0,0 @@
 from .syntax.parser cimport Parser
 #from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown
 from .syntax.arc_eager cimport ArcEager
 from .tagger cimport Tagger
 cdef class EntityRecognizer(Parser):
    pass
 cdef class DependencyParser(Parser):
    pass
 #cdef class BeamEntityRecognizer(BeamParser):
 #    pass
 #
 #
 #cdef class BeamDependencyParser(BeamParser):
 #    pass
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@ -26,11 +26,8 @@ from thinc.neural.util import to_categorical
 from thinc.neural._classes.difference import Siamese, CauchySimilarity
 from .tokens.doc cimport Doc
-from .syntax.parser cimport Parser as LinearParser
+from .syntax.nn_parser cimport Parser
 from .syntax.nn_parser cimport Parser as NeuralParser
 from .syntax import nonproj
 from .syntax.parser import get_templates as get_feature_templates
 from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown
 from .syntax.arc_eager cimport ArcEager
 from .tagger import Tagger
@ -86,7 +83,7 @@ class SentenceSegmenter(object):
            yield doc[start : len(doc)]
-class BaseThincComponent(object):
+class Pipe(object):
    name = None
    @classmethod
@ -217,7 +214,7 @@ def _load_cfg(path):
        return {}
-class TokenVectorEncoder(BaseThincComponent):
+class Tensorizer(Pipe):
    """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
    name = 'tensorizer'
@ -329,7 +326,7 @@ class TokenVectorEncoder(BaseThincComponent):
        link_vectors_to_models(self.vocab)
-class NeuralTagger(BaseThincComponent):
+class Tagger(Pipe):
    name = 'tagger'
    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
@ -420,8 +417,6 @@ class NeuralTagger(BaseThincComponent):
                        new_tag_map[tag] = orig_tag_map[tag]
                    else:
                        new_tag_map[tag] = {POS: X}
        if 'SP' not in new_tag_map:
            new_tag_map['SP'] = orig_tag_map.get('SP', {POS: X})
        cdef Vocab vocab = self.vocab
        if new_tag_map:
            vocab.morphology = Morphology(vocab.strings, new_tag_map,
@ -513,7 +508,11 @@ class NeuralTagger(BaseThincComponent):
        return self
-class NeuralLabeller(NeuralTagger):
+class MultitaskObjective(Tagger):
    '''Assist training of a parser or tagger, by training a side-objective.
    Experimental
    '''
    name = 'nn_labeller'
    def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
        self.vocab = vocab
@ -532,7 +531,7 @@ class NeuralLabeller(NeuralTagger):
            self.make_label = target
        else:
            raise ValueError(
-                "NeuralLabeller target should be function or one of "
+                "MultitaskObjective target should be function or one of "
                "['dep', 'tag', 'ent', 'dep_tag_offset', 'ent_tag']")
        self.cfg = dict(cfg)
        self.cfg.setdefault('cnn_maxout_pieces', 2)
@ -622,7 +621,7 @@ class NeuralLabeller(NeuralTagger):
            return '%s-%s' % (tags[i], ents[i])
-class SimilarityHook(BaseThincComponent):
+class SimilarityHook(Pipe):
    """
    Experimental
@ -674,7 +673,7 @@ class SimilarityHook(BaseThincComponent):
            link_vectors_to_models(self.vocab)
-class TextCategorizer(BaseThincComponent):
+class TextCategorizer(Pipe):
    name = 'textcat'
    @classmethod
@ -752,45 +751,7 @@ class TextCategorizer(BaseThincComponent):
            link_vectors_to_models(self.vocab)
-cdef class EntityRecognizer(LinearParser):
+cdef class DependencyParser(Parser):
    """Annotate named entities on Doc objects."""
    TransitionSystem = BiluoPushDown
    feature_templates = get_feature_templates('ner')
    def add_label(self, label):
        LinearParser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
 cdef class BeamEntityRecognizer(BeamParser):
    """Annotate named entities on Doc objects."""
    TransitionSystem = BiluoPushDown
    feature_templates = get_feature_templates('ner')
    def add_label(self, label):
        LinearParser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
 cdef class DependencyParser(LinearParser):
    TransitionSystem = ArcEager
    feature_templates = get_feature_templates('basic')
    def add_label(self, label):
        LinearParser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
    @property
    def postprocesses(self):
        return [nonproj.deprojectivize]
 cdef class NeuralDependencyParser(NeuralParser):
    name = 'parser'
    TransitionSystem = ArcEager
@ -800,17 +761,17 @@ cdef class NeuralDependencyParser(NeuralParser):
    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
        for target in []:
-            labeller = NeuralLabeller(self.vocab, target=target)
+            labeller = MultitaskObjective(self.vocab, target=target)
            tok2vec = self.model[0]
            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
            pipeline.append(labeller)
            self._multitasks.append(labeller)
    def __reduce__(self):
-        return (NeuralDependencyParser, (self.vocab, self.moves, self.model), None, None)
+        return (DependencyParser, (self.vocab, self.moves, self.model), None, None)
-cdef class NeuralEntityRecognizer(NeuralParser):
+cdef class EntityRecognizer(Parser):
    name = 'ner'
    TransitionSystem = BiluoPushDown
@ -818,31 +779,14 @@ cdef class NeuralEntityRecognizer(NeuralParser):
    def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
        for target in []:
-            labeller = NeuralLabeller(self.vocab, target=target)
+            labeller = MultitaskObjective(self.vocab, target=target)
            tok2vec = self.model[0]
            labeller.begin_training(gold_tuples, pipeline=pipeline, tok2vec=tok2vec)
            pipeline.append(labeller)
            self._multitasks.append(labeller)
    def __reduce__(self):
-        return (NeuralEntityRecognizer, (self.vocab, self.moves, self.model), None, None)
+        return (EntityRecognizer, (self.vocab, self.moves, self.model), None, None)
-cdef class BeamDependencyParser(BeamParser):
+__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer']
    TransitionSystem = ArcEager
    feature_templates = get_feature_templates('basic')
    def add_label(self, label):
        Parser.add_label(self, label)
        if isinstance(label, basestring):
            label = self.vocab.strings[label]
    @property
    def postprocesses(self):
        return [nonproj.deprojectivize]
 __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser',
           'BeamEntityRecognizer', 'TokenVectorEnoder']
--- a/spacy/syntax/_parse_features.pxd
+++ b/spacy/syntax/_parse_features.pxd
@ -1,259 +0,0 @@
 from thinc.typedefs cimport atom_t
 from .stateclass cimport StateClass
 from ._state cimport StateC
 cdef int fill_context(atom_t* context, const StateC* state) nogil
 # Context elements
 # Ensure each token's attributes are listed: w, p, c, c6, c4. The order
 # is referenced by incrementing the enum...
 # Tokens are listed in left-to-right order.
 #cdef size_t* SLOTS = [
 #    S2w, S1w,
 #    S0l0w, S0l2w, S0lw,
 #    S0w,
 #    S0r0w, S0r2w, S0rw,
 #    N0l0w, N0l2w, N0lw,
 #    P2w, P1w,
 #    N0w, N1w, N2w, N3w, 0
 #]
 # NB: The order of the enum is _NOT_ arbitrary!!
 cpdef enum:
    S2w
    S2W
    S2p
    S2c
    S2c4
    S2c6
    S2L
    S2_prefix
    S2_suffix
    S2_shape
    S2_ne_iob
    S2_ne_type
    S1w
    S1W
    S1p
    S1c
    S1c4
    S1c6
    S1L
    S1_prefix
    S1_suffix
    S1_shape
    S1_ne_iob
    S1_ne_type
    S1rw
    S1rW
    S1rp
    S1rc
    S1rc4
    S1rc6
    S1rL
    S1r_prefix
    S1r_suffix
    S1r_shape
    S1r_ne_iob
    S1r_ne_type
    S0lw
    S0lW
    S0lp
    S0lc
    S0lc4
    S0lc6
    S0lL
    S0l_prefix
    S0l_suffix
    S0l_shape
    S0l_ne_iob
    S0l_ne_type
    S0l2w
    S0l2W
    S0l2p
    S0l2c
    S0l2c4
    S0l2c6
    S0l2L
    S0l2_prefix
    S0l2_suffix
    S0l2_shape
    S0l2_ne_iob
    S0l2_ne_type
    S0w
    S0W
    S0p
    S0c
    S0c4
    S0c6
    S0L
    S0_prefix
    S0_suffix
    S0_shape
    S0_ne_iob
    S0_ne_type
    S0r2w
    S0r2W
    S0r2p
    S0r2c
    S0r2c4
    S0r2c6
    S0r2L
    S0r2_prefix
    S0r2_suffix
    S0r2_shape
    S0r2_ne_iob
    S0r2_ne_type
    S0rw
    S0rW
    S0rp
    S0rc
    S0rc4
    S0rc6
    S0rL
    S0r_prefix
    S0r_suffix
    S0r_shape
    S0r_ne_iob
    S0r_ne_type
    N0l2w
    N0l2W
    N0l2p
    N0l2c
    N0l2c4
    N0l2c6
    N0l2L
    N0l2_prefix
    N0l2_suffix
    N0l2_shape
    N0l2_ne_iob
    N0l2_ne_type
    N0lw
    N0lW
    N0lp
    N0lc
    N0lc4
    N0lc6
    N0lL
    N0l_prefix
    N0l_suffix
    N0l_shape
    N0l_ne_iob
    N0l_ne_type
    N0w
    N0W
    N0p
    N0c
    N0c4
    N0c6
    N0L
    N0_prefix
    N0_suffix
    N0_shape
    N0_ne_iob
    N0_ne_type
    N1w
    N1W
    N1p
    N1c
    N1c4
    N1c6
    N1L
    N1_prefix
    N1_suffix
    N1_shape
    N1_ne_iob
    N1_ne_type
    N2w
    N2W
    N2p
    N2c
    N2c4
    N2c6
    N2L
    N2_prefix
    N2_suffix
    N2_shape
    N2_ne_iob
    N2_ne_type
    P1w
    P1W
    P1p
    P1c
    P1c4
    P1c6
    P1L
    P1_prefix
    P1_suffix
    P1_shape
    P1_ne_iob
    P1_ne_type
    P2w
    P2W
    P2p
    P2c
    P2c4
    P2c6
    P2L
    P2_prefix
    P2_suffix
    P2_shape
    P2_ne_iob
    P2_ne_type
    E0w
    E0W
    E0p
    E0c
    E0c4
    E0c6
    E0L
    E0_prefix
    E0_suffix
    E0_shape
    E0_ne_iob
    E0_ne_type
    E1w
    E1W
    E1p
    E1c
    E1c4
    E1c6
    E1L
    E1_prefix
    E1_suffix
    E1_shape
    E1_ne_iob
    E1_ne_type
    # Misc features at the end
    dist
    N0lv
    S0lv
    S0rv
    S1lv
    S1rv
    S0_has_head
    S1_has_head
    S2_has_head
    CONTEXT_SIZE
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@ -1,419 +0,0 @@
 """
 Fill an array, context, with every _atomic_ value our features reference.
 We then write the _actual features_ as tuples of the atoms. The machinery
 that translates from the tuples to feature-extractors (which pick the values
 out of "context") is in features/extractor.pyx
 The atomic feature names are listed in a big enum, so that the feature tuples
 can refer to them.
 """
 # coding: utf-8
 from __future__ import unicode_literals
 from libc.string cimport memset
 from itertools import combinations
 from cymem.cymem cimport Pool
 from ..structs cimport TokenC
 from .stateclass cimport StateClass
 from ._state cimport StateC
 cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
    # Populate the 12 consecutive atoms that describe one token, starting at
    # `context`.
    cdef int slot
    if token is NULL:
        # No token in this position: every atom of the group is zeroed.
        for slot in range(12):
            context[slot] = 0
        return
    context[0] = token.lex.orth
    context[1] = token.lemma
    context[2] = token.tag
    context[3] = token.lex.cluster
    # Shortened cluster ids. AND-ing with an all-ones mask keeps only the
    # bits covered by the mask: 15 is 0b1111 (4 bits) and 63 is 0b111111
    # (6 bits). Because the cluster string was read in little-endian, these
    # low bits correspond to the first 4 and first 6 bits of the cluster.
    context[4] = token.lex.cluster & 15
    context[5] = token.lex.cluster & 63
    # The dependency label is only used when the token's head offset is
    # non-zero.
    context[6] = token.dep if token.head != 0 else 0
    context[7] = token.lex.prefix
    context[8] = token.lex.suffix
    context[9] = token.lex.shape
    context[10] = token.ent_iob
    context[11] = token.ent_type
 cdef int fill_context(atom_t* ctxt, const StateC* st) nogil:
    # Fill `ctxt` with the atomic values for parse state `st`: one group of
    # token atoms (written by fill_token) per tracked position, followed by
    # the miscellaneous distance, valency and has-head atoms.
    #
    # Take care to fill every element of context!
    # We could memset, but this makes it very easy to have broken features that
    # make almost no impact on accuracy. If instead they're unset, the impact
    # tends to be dramatic, so we get an obvious regression to fix...
    # Token groups. Each enum name passed here marks the first atom of the
    # group for that position.
    fill_token(&ctxt[S2w], st.S_(2))
    fill_token(&ctxt[S1w], st.S_(1))
    fill_token(&ctxt[S1rw], st.R_(st.S(1), 1))
    fill_token(&ctxt[S0lw], st.L_(st.S(0), 1))
    fill_token(&ctxt[S0l2w], st.L_(st.S(0), 2))
    fill_token(&ctxt[S0w], st.S_(0))
    fill_token(&ctxt[S0r2w], st.R_(st.S(0), 2))
    fill_token(&ctxt[S0rw], st.R_(st.S(0), 1))
    fill_token(&ctxt[N0lw], st.L_(st.B(0), 1))
    fill_token(&ctxt[N0l2w], st.L_(st.B(0), 2))
    fill_token(&ctxt[N0w], st.B_(0))
    fill_token(&ctxt[N1w], st.B_(1))
    fill_token(&ctxt[N2w], st.B_(2))
    # P1/P2 are looked up by index relative to B(0).
    fill_token(&ctxt[P1w], st.safe_get(st.B(0)-1))
    fill_token(&ctxt[P2w], st.safe_get(st.B(0)-2))
    fill_token(&ctxt[E0w], st.E_(0))
    fill_token(&ctxt[E1w], st.E_(1))
    # Distance atom: B(0) - E(0), combined with 5 through min_(). Only set
    # when the stack is non-empty and the state is not at eol().
    if st.stack_depth() >= 1 and not st.eol():
        ctxt[dist] = min_(st.B(0) - st.E(0), 5)
    else:
        ctxt[dist] = 0
    # Valency atoms: n_L / n_R counts, each combined with 5 through min_().
    ctxt[N0lv] = min_(st.n_L(st.B(0)), 5)
    ctxt[S0lv] = min_(st.n_L(st.S(0)), 5)
    ctxt[S0rv] = min_(st.n_R(st.S(0)), 5)
    ctxt[S1lv] = min_(st.n_L(st.S(1)), 5)
    ctxt[S1rv] = min_(st.n_R(st.S(1)), 5)
    # Has-head atoms: 0 when the stack is too shallow for that position,
    # otherwise has_head() + 1, so a present token never yields 0.
    ctxt[S0_has_head] = 0
    ctxt[S1_has_head] = 0
    ctxt[S2_has_head] = 0
    if st.stack_depth() >= 1:
        ctxt[S0_has_head] = st.has_head(st.S(0)) + 1
        if st.stack_depth() >= 2:
            ctxt[S1_has_head] = st.has_head(st.S(1)) + 1
            if st.stack_depth() >= 3:
                ctxt[S2_has_head] = st.has_head(st.S(2)) + 1
 cdef inline int min_(int a, int b) nogil:
    # Return the smaller of `a` and `b`. fill_context() calls this with 5 as
    # the second argument to cap distance and valency atoms. The previous
    # comparison (`a > b`) returned the larger value instead.
    # NOTE(review): this changes the feature values produced, so weights
    # trained against the old behaviour may need retraining -- confirm.
    return a if a < b else b
 ner = (
    (N0W,),
    (P1W,),
    (N1W,),
    (P2W,),
    (N2W,),
    (P1W, N0W,),
    (N0W, N1W),
    (N0_prefix,),
    (N0_suffix,),
    (P1_shape,),
    (N0_shape,),
    (N1_shape,),
    (P1_shape, N0_shape,),
    (N0_shape, P1_shape,),
    (P1_shape, N0_shape, N1_shape),
    (N2_shape,),
    (P2_shape,),
    #(P2_norm, P1_norm, W_norm),
    #(P1_norm, W_norm, N1_norm),
    #(W_norm, N1_norm, N2_norm)
    (P2p,),
    (P1p,),
    (N0p,),
    (N1p,),
    (N2p,),
    (P1p, N0p),
    (N0p, N1p),
    (P2p, P1p, N0p),
    (P1p, N0p, N1p),
    (N0p, N1p, N2p),
    (P2c,),
    (P1c,),
    (N0c,),
    (N1c,),
    (N2c,),
    (P1c, N0c),
    (N0c, N1c),
    (E0W,),
    (E0c,),
    (E0p,),
    (E0W, N0W),
    (E0c, N0W),
    (E0p, N0W),
    (E0p, P1p, N0p),
    (E0c, P1c, N0c),
    (E0w, P1c),
    (E0p, P1p),
    (E0c, P1c),
    (E0p, E1p),
    (E0c, P1p),
    (E1W,),
    (E1c,),
    (E1p,),
    (E0W, E1W),
    (E0W, E1p,),
    (E0p, E1W,),
    (E0p, E1W),
    (P1_ne_iob,),
    (P1_ne_iob, P1_ne_type),
    (N0w, P1_ne_iob, P1_ne_type),
    (N0_shape,),
    (N1_shape,),
    (N2_shape,),
    (P1_shape,),
    (P2_shape,),
    (N0_prefix,),
    (N0_suffix,),
    (P1_ne_iob,),
    (P2_ne_iob,),
    (P1_ne_iob, P2_ne_iob),
    (P1_ne_iob, P1_ne_type),
    (P2_ne_iob, P2_ne_type),
    (N0w, P1_ne_iob, P1_ne_type),
    (N0w, N1w),
 )
 unigrams = (
    (S2W, S2p),
    (S2c6, S2p),
    (S1W, S1p),
    (S1c6, S1p),
    (S0W, S0p),
    (S0c6, S0p),
    (N0W, N0p),
    (N0p,),
    (N0c,),
    (N0c6, N0p),
    (N0L,),
    (N1W, N1p),
    (N1c6, N1p),
    (N2W, N2p),
    (N2c6, N2p),
    (S0r2W, S0r2p),
    (S0r2c6, S0r2p),
    (S0r2L,),
    (S0rW, S0rp),
    (S0rc6, S0rp),
    (S0rL,),
    (S0l2W, S0l2p),
    (S0l2c6, S0l2p),
    (S0l2L,),
    (S0lW, S0lp),
    (S0lc6, S0lp),
    (S0lL,),
    (N0l2W, N0l2p),
    (N0l2c6, N0l2p),
    (N0l2L,),
    (N0lW, N0lp),
    (N0lc6, N0lp),
    (N0lL,),
 )
 s0_n0 = (
    (S0W, S0p, N0W, N0p),
    (S0c, S0p, N0c, N0p),
    (S0c6, S0p, N0c6, N0p),
    (S0c4, S0p, N0c4, N0p),
    (S0p, N0p),
    (S0W, N0p),
    (S0p, N0W),
    (S0W, N0c),
    (S0c, N0W),
    (S0p, N0c),
    (S0c, N0p),
    (S0W, S0rp, N0p),
    (S0p, S0rp, N0p),
    (S0p, N0lp, N0W),
    (S0p, N0lp, N0p),
    (S0L, N0p),
    (S0p, S0rL, N0p),
    (S0p, N0lL, N0p),
    (S0p, S0rv, N0p),
    (S0p, N0lv, N0p),
    (S0c6, S0rL, S0r2L, N0p),
    (S0p, N0lL, N0l2L, N0p),
 )
 s1_s0 = (
    (S1p, S0p),
    (S1p, S0p, S0_has_head),
    (S1W, S0p),
    (S1W, S0p, S0_has_head),
    (S1c, S0p),
    (S1c, S0p, S0_has_head),
    (S1p, S1rL, S0p),
    (S1p, S1rL, S0p, S0_has_head),
    (S1p, S0lL, S0p),
    (S1p, S0lL, S0p, S0_has_head),
    (S1p, S0lL, S0l2L, S0p),
    (S1p, S0lL, S0l2L, S0p, S0_has_head),
    (S1L, S0L, S0W),
    (S1L, S0L, S0p),
    (S1p, S1L, S0L, S0p),
    (S1p, S0p),
 )
 s1_n0 = (
    (S1p, N0p),
    (S1c, N0c),
    (S1c, N0p),
    (S1p, N0c),
    (S1W, S1p, N0p),
    (S1p, N0W, N0p),
    (S1c6, S1p, N0c6, N0p),
    (S1L, N0p),
    (S1p, S1rL, N0p),
    (S1p, S1rp, N0p),
 )
 s0_n1 = (
    (S0p, N1p),
    (S0c, N1c),
    (S0c, N1p),
    (S0p, N1c),
    (S0W, S0p, N1p),
    (S0p, N1W, N1p),
    (S0c6, S0p, N1c6, N1p),
    (S0L, N1p),
    (S0p, S0rL, N1p),
 )
 n0_n1 = (
    (N0W, N0p, N1W, N1p),
    (N0W, N0p, N1p),
    (N0p, N1W, N1p),
    (N0c, N0p, N1c, N1p),
    (N0c6, N0p, N1c6, N1p),
    (N0c, N1c),
    (N0p, N1c),
 )
 tree_shape = (
    (dist,),
    (S0p, S0_has_head, S1_has_head, S2_has_head),
    (S0p, S0lv, S0rv),
    (N0p, N0lv),
 )
 trigrams = (
    (N0p, N1p, N2p),
    (S0p, S0lp, S0l2p),
    (S0p, S0rp, S0r2p),
    (S0p, S1p, S2p),
    (S1p, S0p, N0p),
    (S0p, S0lp, N0p),
    (S0p, N0p, N0lp),
    (N0p, N0lp, N0l2p),
    (S0W, S0p, S0rL, S0r2L),
    (S0p, S0rL, S0r2L),
    (S0W, S0p, S0lL, S0l2L),
    (S0p, S0lL, S0l2L),
    (N0W, N0p, N0lL, N0l2L),
    (N0p, N0lL, N0l2L),
 )
 words = (
    S2w,
    S1w,
    S1rw,
    S0lw,
    S0l2w,
    S0w,
    S0r2w,
    S0rw,
    N0lw,
    N0l2w,
    N0w,
    N1w,
    N2w,
    P1w,
    P2w
 )
 tags = (
    S2p,
    S1p,
    S1rp,
    S0lp,
    S0l2p,
    S0p,
    S0r2p,
    S0rp,
    N0lp,
    N0l2p,
    N0p,
    N1p,
    N2p,
    P1p,
    P2p
 )
 labels = (
    S2L,
    S1L,
    S1rL,
    S0lL,
    S0l2L,
    S0L,
    S0r2L,
    S0rL,
    N0lL,
    N0l2L,
    N0L,
    N1L,
    N2L,
    P1L,
    P2L
 )
--- a/spacy/syntax/beam_parser.pxd
+++ b/spacy/syntax/beam_parser.pxd
@ -1,10 +0,0 @@
 from .parser cimport Parser
 from ..structs cimport TokenC
 from thinc.typedefs cimport weight_t
 # Declaration of the beam-search Parser subclass implemented in
 # beam_parser.pyx.
 cdef class BeamParser(Parser):
    # Width passed to the Beam constructor.
    cdef public int beam_width
    # Passed to the Beam constructor as `min_density` when parsing.
    cdef public weight_t beam_density
    # Exception-aware worker called from parseC; -1 is the error return value.
    cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1
--- a/spacy/syntax/beam_parser.pyx
+++ b/spacy/syntax/beam_parser.pyx
@ -1,239 +0,0 @@
 """
 MALT-style dependency parser
 """
 # cython: profile=True
 # cython: experimental_cpp_class_def=True
 # cython: cdivision=True
 # cython: infer_types=True
 # coding: utf-8
 from __future__ import unicode_literals, print_function
 cimport cython
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport rand
 from libc.math cimport log, exp, isnan, isinf
 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport real_hash64 as hash64
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
 from thinc.linear.features cimport ConjunctionExtracter
 from thinc.structs cimport FeatureC, ExampleC
 from thinc.extra.search cimport Beam, MaxViolation
 from thinc.extra.eg cimport Example
 from thinc.extra.mb cimport Minibatch
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc
 from ..strings cimport StringStore
 from .transition_system cimport TransitionSystem, Transition
 from ..gold cimport GoldParse
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from .parser cimport Parser
 # Module-level debug switch; changed at runtime through set_debug().
 DEBUG = False
 def set_debug(val):
    """Set the module-level DEBUG flag to `val`."""
    global DEBUG
    DEBUG = val
 def get_templates(name):
    """
    Look up the feature templates registered under a template-set name.
    Arguments:
        name (unicode):
            'ner' or 'debug' select a dedicated set; any other value selects
            the full template set.
    Returns:
        The matching templates from the _parse_features module.
    """
    pf = _parse_features
    if name == 'ner':
        return pf.ner
    if name == 'debug':
        return pf.unigrams
    # Any other name: concatenate every template group in this fixed order.
    groups = (pf.s0_n0, pf.s1_n0, pf.s1_s0, pf.s0_n1, pf.n0_n1,
              pf.tree_shape, pf.trigrams)
    templates = pf.unigrams
    for group in groups:
        templates = templates + group
    return templates
 # Defaults used by BeamParser.__init__ when the 'beam_width' /
 # 'beam_density' keyword arguments are not supplied.
 cdef int BEAM_WIDTH = 16
 cdef weight_t BEAM_DENSITY = 0.001
 cdef class BeamParser(Parser):
    """
    Parser subclass that searches over transition sequences with a
    thinc.extra.search.Beam rather than following a single greedy path.
    """
    def __init__(self, *args, **kwargs):
        """
        Create a BeamParser.
        Arguments:
            beam_width (int):
                Optional keyword. Beam width; defaults to BEAM_WIDTH.
            beam_density (float):
                Optional keyword. Passed to the Beam as `min_density` when
                parsing; defaults to BEAM_DENSITY.
        All positional and keyword arguments are also forwarded to
        Parser.__init__.
        """
        self.beam_width = kwargs.get('beam_width', BEAM_WIDTH)
        self.beam_density = kwargs.get('beam_density', BEAM_DENSITY)
        Parser.__init__(self, *args, **kwargs)
    cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil:
        # The beam search works with Python objects, so re-acquire the GIL and
        # delegate to _parseC.
        # NOTE(review): this method declares no exception value, so an error
        # raised inside _parseC presumably cannot propagate to the caller --
        # confirm against the Cython error-return rules.
        with gil:
            self._parseC(tokens, length, nr_feat, self.moves.n_moves)
    cdef int _parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) except -1:
        # Run the beam search over `tokens` and copy the annotations of the
        # state at beam index 0 back into `tokens`. `nr_feat` and `nr_class`
        # are part of the declared signature but are not read here.
        cdef Beam beam = Beam(self.moves.n_moves, self.beam_width, min_density=self.beam_density)
        # TODO: How do we handle new labels here? This increases nr_class
        beam.initialize(self.moves.init_beam_state, length, tokens)
        beam.check_done(_check_final_state, NULL)
        if beam.is_done:
            _cleanup(beam)
            return 0
        while not beam.is_done:
            self._advance_beam(beam, None, False)
        state = <StateClass>beam.at(0)
        self.moves.finalize_state(state.c)
        for i in range(length):
            tokens[i] = state.c._sent[i]
        _cleanup(beam)
    def update(self, Doc tokens, GoldParse gold_parse, itn=0):
        """
        Update the model from one example by advancing a prediction beam and
        a gold-constrained beam side by side.
        Arguments:
            tokens (Doc):
                The example document.
            gold_parse (GoldParse):
                The gold-standard annotations.
            itn (int):
                Iteration number; the gradient cutoff passed to the model is
                0.001 ** (itn + 1).
        Returns:
            The loss of the prediction beam.
        """
        self.moves.preprocess_gold(gold_parse)
        cdef Beam pred = Beam(self.moves.n_moves, self.beam_width)
        pred.initialize(self.moves.init_beam_state, tokens.length, tokens.c)
        pred.check_done(_check_final_state, NULL)
        # Hack for NER
        for i in range(pred.size):
            stcls = <StateClass>pred.at(i)
            self.moves.initialize_state(stcls.c)
        cdef Beam gold = Beam(self.moves.n_moves, self.beam_width, min_density=0.0)
        gold.initialize(self.moves.init_beam_state, tokens.length, tokens.c)
        gold.check_done(_check_final_state, NULL)
        violn = MaxViolation()
        while not pred.is_done and not gold.is_done:
            # We search separately here, to allow for ambiguity in the gold parse.
            self._advance_beam(pred, gold_parse, False)
            self._advance_beam(gold, gold_parse, True)
            violn.check_crf(pred, gold)
            if pred.loss > 0 and pred.min_score > (gold.score + self.model.time):
                break
        else:
            # The non-monotonic oracle makes it difficult to ensure final costs are
            # correct. Therefore do final correction
            for i in range(pred.size):
                if self.moves.is_gold_parse(<StateClass>pred.at(i), gold_parse):
                    pred._states[i].loss = 0.0
                elif pred._states[i].loss == 0.0:
                    pred._states[i].loss = 1.0
            violn.check_crf(pred, gold)
        if pred.size < 1:
            raise Exception("No candidates", tokens.length)
        if gold.size < 1:
            raise Exception("No gold", tokens.length)
        if pred.loss == 0:
            self.model.update_from_histories(self.moves, tokens, [(0.0, [])])
        else:
            # Debug aid: _check_train_integrity(pred, gold, gold_parse, self.moves)
            # Predicted histories followed by gold histories, each paired with
            # the probability recorded by the MaxViolation tracker.
            histories = list(zip(violn.p_probs, violn.p_hist)) + \
                        list(zip(violn.g_probs, violn.g_hist))
            self.model.update_from_histories(self.moves, tokens, histories, min_grad=0.001**(itn+1))
        _cleanup(pred)
        _cleanup(gold)
        return pred.loss
    def _advance_beam(self, Beam beam, GoldParse gold, bint follow_gold):
        """
        Score every non-final state in the beam and advance the beam one step.
        Arguments:
            beam (Beam):
                The beam to advance.
            gold (GoldParse):
                If not None, action costs are computed against it.
            follow_gold (bint):
                If True, actions with cost >= 1 are marked invalid so only
                zero-cost actions can be taken, and states are not
                de-duplicated by hash.
        Raises:
            Exception: if follow_gold is set and no valid action remains.
        """
        cdef atom_t[CONTEXT_SIZE] context
        cdef Pool mem = Pool()
        features = <FeatureC*>mem.alloc(self.model.nr_feat, sizeof(FeatureC))
        for i in range(beam.size):
            stcls = <StateClass>beam.at(i)
            if not stcls.is_final():
                nr_feat = self.model.set_featuresC(context, features, stcls.c)
                self.moves.set_valid(beam.is_valid[i], stcls.c)
                self.model.set_scoresC(beam.scores[i], features, nr_feat)
        if gold is not None:
            n_gold = 0
            for i in range(beam.size):
                stcls = <StateClass>beam.at(i)
                if not stcls.c.is_final():
                    self.moves.set_costs(beam.is_valid[i], beam.costs[i], stcls, gold)
                    if follow_gold:
                        for j in range(self.moves.n_moves):
                            if beam.costs[i][j] >= 1:
                                beam.is_valid[i][j] = 0
                            n_gold += 1 if beam.is_valid[i][j] else 0
            if follow_gold and n_gold == 0:
                raise Exception("No gold")
        if follow_gold:
            beam.advance(_transition_state, NULL, <void*>self.moves.c)
        else:
            beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
        beam.check_done(_check_final_state, NULL)
 # These are passed as callbacks to thinc.search.Beam
 cdef int _transition_state(void* _dest, void* _src, class_t clas, void* _moves) except -1:
    # Beam callback: copy the source state into the destination state, then
    # apply transition number `clas` to the destination.
    cdef StateClass target = <StateClass>_dest
    cdef StateClass origin = <StateClass>_src
    cdef const Transition* table = <const Transition*>_moves
    target.clone(origin)
    table[clas].do(target.c, table[clas].label)
 cdef int _check_final_state(void* _state, void* extra_args) except -1:
    # Beam callback: report whether the given state is final. `extra_args`
    # is unused.
    cdef StateClass state = <StateClass>_state
    return state.is_final()
 def _cleanup(Beam beam):
    """
    Drop one reference to the `content` object of every state and parent
    slot in the beam.
    """
    # Py_XDECREF tolerates NULL, so empty slots are safe to visit.
    # NOTE(review): presumably the beam stores owned references to the state
    # objects as raw pointers -- confirm against the init_beam_state callback.
    for i in range(beam.width):
        Py_XDECREF(<PyObject*>beam._states[i].content)
        Py_XDECREF(<PyObject*>beam._parents[i].content)
 cdef hash_t _hash_state(void* _state, void* _) except 0:
    # Beam callback: hash a state. Every final state hashes to the constant 1;
    # any other state uses its own hash.
    state = <StateClass>_state
    if not state.c.is_final():
        return state.c.hash()
    return 1
 def _check_train_integrity(Beam pred, Beam gold, GoldParse gold_parse, TransitionSystem moves):
    """
    Debugging check for beam training. Prints diagnostics and raises if a
    finished predicted state with non-zero loss is a gold parse, or if a
    finished gold-beam state is not gold.
    Arguments:
        pred (Beam): The prediction beam.
        gold (Beam): The gold-constrained beam.
        gold_parse (GoldParse): The gold-standard annotations.
        moves (TransitionSystem): The transition system used for both beams.
    Raises:
        Exception: on either inconsistency.
    """
    for i in range(pred.size):
        # Only finished states that were charged a loss are suspicious.
        if not pred._states[i].is_done or pred._states[i].loss == 0:
            continue
        state = <StateClass>pred.at(i)
        if moves.is_gold_parse(state, gold_parse) == True:
            for dep in gold_parse.orig_annot:
                print(dep[1], dep[3], dep[4])
            print("Cost", pred._states[i].loss)
            for j in range(gold_parse.length):
                print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep])
            acts = [moves.c[clas].move for clas in pred.histories[i]]
            labels = [moves.c[clas].label for clas in pred.histories[i]]
            print([moves.move_name(move, label) for move, label in zip(acts, labels)])
            raise Exception("Predicted state is gold-standard")
    for i in range(gold.size):
        if not gold._states[i].is_done:
            continue
        state = <StateClass>gold.at(i)
        # NOTE(review): this calls `is_gold` while the loop above calls
        # `is_gold_parse` -- possibly a typo; confirm which method exists.
        if moves.is_gold(state, gold_parse) == False:
            print("Truth")
            for dep in gold_parse.orig_annot:
                print(dep[1], dep[3], dep[4])
            print("Predicted good")
            for j in range(gold_parse.length):
                print(gold_parse.orig_annot[j][1], state.H(j), moves.strings[state.safe_get(j).dep])
            raise Exception("Gold parse is not gold-standard")
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -53,9 +53,6 @@ from .._ml import link_vectors_to_models
 from .._ml import HistoryFeatures
 from ..compat import json_dumps, copy_array
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
 from . import nonproj
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@ -1,24 +0,0 @@
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.typedefs cimport atom_t
 from thinc.structs cimport FeatureC
 from .stateclass cimport StateClass
 from .arc_eager cimport TransitionSystem
 from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
 from ._state cimport StateC
 # Declaration of the AveragedPerceptron subclass used as the parser's model.
 cdef class ParserModel(AveragedPerceptron):
    # Fill `context` from `state`, extract features into `features`, and
    # return the number of features written. Callable without the GIL.
    cdef int set_featuresC(self, atom_t* context, FeatureC* features,
                            const StateC* state) nogil
 # Declaration of the Parser extension type implemented in parser.pyx.
 cdef class Parser:
    cdef readonly Vocab vocab
    cdef readonly ParserModel model
    # The transition system instance that defines the available actions.
    cdef readonly TransitionSystem moves
    # Configuration mapping built in __init__.
    cdef readonly object cfg
    # GIL-free parsing of a raw token array; returns a non-zero status on failure.
    cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@ -1,526 +0,0 @@
 """
 MALT-style dependency parser
 """
 # coding: utf-8
 # cython: infer_types=True
 from __future__ import unicode_literals
 from collections import Counter
 import ujson
 cimport cython
 cimport cython.parallel
 import numpy.random
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport malloc, calloc, free
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
 from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
 from thinc.extra.eg cimport Example
 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport hash64
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
 from .transition_system import OracleError
 from .transition_system cimport TransitionSystem, Transition
 from ..structs cimport TokenC
 from ..tokens.doc cimport Doc
 from ..strings cimport StringStore
 from ..gold cimport GoldParse
 # Selects the weight-update rule in ParserModel.update: update_weight_ftrl
 # when True, plain update_weight otherwise.
 USE_FTRL = True
 # Module-level debug switch; changed at runtime through set_debug().
 DEBUG = False
 def set_debug(val):
    """Set the module-level DEBUG flag to `val`."""
    global DEBUG
    DEBUG = val
 def get_templates(name):
    """
    Look up the feature templates registered under a template-set name.
    Arguments:
        name (unicode):
            'ner', 'debug', or a name starting with 'embed' select a
            dedicated set; any other value selects the full template set.
    Returns:
        The matching templates from the _parse_features module.
    """
    pf = _parse_features
    if name == 'ner':
        return pf.ner
    if name == 'debug':
        return pf.unigrams
    if name.startswith('embed'):
        return (pf.words, pf.tags, pf.labels)
    # Any other name: concatenate every template group in this fixed order.
    groups = (pf.s0_n0, pf.s1_n0, pf.s1_s0, pf.s0_n1, pf.n0_n1,
              pf.tree_shape, pf.trigrams)
    templates = pf.unigrams
    for group in groups:
        templates = templates + group
    return templates
 cdef class ParserModel(AveragedPerceptron):
    # Linear model used by Parser: an AveragedPerceptron extended with
    # state-based feature extraction and two update routines.
    cdef int set_featuresC(self, atom_t* context, FeatureC* features,
            const StateC* state) nogil:
        # Fill the context atoms from the parse state, then extract features
        # from them. Returns the number of features written to `features`.
        fill_context(context, state)
        nr_feat = self.extracter.set_features(features, context)
        return nr_feat
    def update(self, Example eg, itn=0):
        """
        Update the weights from a single scored example.
        Returns 0.0 without changing any weight when the guess is already the
        best-scoring class with cost <= 0, or when no such class exists.
        Otherwise updates the weights and returns the cost of the guess.
        `itn` is not used.
        Original note: does regression on negative cost.
        """
        self.time += 1
        cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
        cdef int guess = eg.guess
        if guess == best or best == -1:
            return 0.0
        cdef FeatureC feat
        cdef int clas
        cdef weight_t gradient
        if USE_FTRL:
            for feat in eg.c.features[:eg.c.nr_feat]:
                for clas in range(eg.c.nr_class):
                    # Touch every valid class scoring at least as high as the
                    # best zero-cost class (which includes that class itself).
                    if eg.c.is_valid[clas] and eg.c.scores[clas] >= eg.c.scores[best]:
                        gradient = eg.c.scores[clas] + eg.c.costs[clas]
                        self.update_weight_ftrl(feat.key, clas, feat.value * gradient)
        else:
            for feat in eg.c.features[:eg.c.nr_feat]:
                self.update_weight(feat.key, guess, feat.value * eg.c.costs[guess])
                self.update_weight(feat.key, best, -feat.value * eg.c.costs[guess])
        return eg.c.costs[guess]
    def update_from_histories(self, TransitionSystem moves, Doc doc, histories, weight_t min_grad=0.0):
        """
        Update the weights from whole transition histories.
        Arguments:
            moves (TransitionSystem):
                Used to replay each history on a fresh state.
            doc (Doc):
                The document the histories were produced for.
            histories:
                Iterable of (gradient, history) pairs, where a history is a
                sequence of class ids. Pairs with an empty history, or with
                abs(gradient) < min_grad, are ignored.
            min_grad (weight_t):
                Cutoff on the absolute gradient.
        Returns:
            None
        """
        cdef Pool mem = Pool()
        features = <FeatureC*>mem.alloc(self.nr_feat, sizeof(FeatureC))
        cdef StateClass stcls
        cdef class_t clas
        self.time += 1
        cdef atom_t[CONTEXT_SIZE] atoms
        histories = [(grad, hist) for grad, hist in histories if abs(grad) >= min_grad and hist]
        if not histories:
            return None
        # One Counter per class id, sized by the largest class id seen.
        gradient = [Counter() for _ in range(max([max(h)+1 for _, h in histories]))]
        for d_loss, history in histories:
            stcls = StateClass.init(doc.c, doc.length)
            moves.initialize_state(stcls.c)
            for clas in history:
                # Accumulate the gradient for the features active before the
                # action is applied, then apply the action.
                nr_feat = self.set_featuresC(atoms, features, stcls.c)
                clas_grad = gradient[clas]
                for feat in features[:nr_feat]:
                    clas_grad[feat.key] += d_loss * feat.value
                moves.c[clas].do(stcls.c, moves.c[clas].label)
        cdef feat_t key
        cdef weight_t d_feat
        for clas, clas_grad in enumerate(gradient):
            for key, d_feat in clas_grad.items():
                if d_feat != 0:
                    self.update_weight_ftrl(key, clas, d_feat)
 cdef class Parser:
    """
    Base class of the DependencyParser and EntityRecognizer.
    """
    @classmethod
    def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
        """
        Load the statistical model from the supplied path.
        Arguments:
            path (Path):
                The path to load from.
            vocab (Vocab):
                The vocabulary. Must be shared by the documents to be processed.
            TransitionSystem:
                Optional transition system class, forwarded to the constructor.
            require (bool):
                Whether to raise an error if the files are not found.
        Returns (Parser):
            The newly constructed object.
        Note that the configuration is always read from `config.json`; any
        extra keyword arguments are replaced by the file's contents.
        """
        with (path / 'config.json').open() as file_:
            cfg = ujson.load(file_)
        # TODO: remove this shim when we don't have to support older data
        if 'labels' in cfg and 'actions' not in cfg:
            cfg['actions'] = cfg.pop('labels')
        # TODO: remove this shim when we don't have to support older data
        for action_name, labels in dict(cfg.get('actions', {})).items():
            # We need this to be sorted
            if isinstance(labels, dict):
                labels = list(sorted(labels.keys()))
            cfg['actions'][action_name] = labels
        self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg)
        if (path / 'model').exists():
            self.model.load(str(path / 'model'))
        elif require:
            raise IOError(
                "Required file %s/model not found when loading" % str(path))
        return self
    def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
        """
        Create a Parser.
        Arguments:
            vocab (Vocab):
                The vocabulary object. Must be shared with documents to be processed.
            TransitionSystem:
                The transition system class; defaults to `self.TransitionSystem`.
            model (thinc.linear.AveragedPerceptron):
                The statistical model. Currently not read: a new ParserModel
                is always built from the 'features' configuration.
        Returns (Parser):
            The newly constructed object.
        """
        if TransitionSystem is None:
            TransitionSystem = self.TransitionSystem
        self.vocab = vocab
        cfg['actions'] = TransitionSystem.get_actions(**cfg)
        self.moves = TransitionSystem(vocab.strings, cfg['actions'])
        # TODO: Remove this when we no longer need to support old-style models
        if isinstance(cfg.get('features'), basestring):
            cfg['features'] = get_templates(cfg['features'])
        elif 'features' not in cfg:
            cfg['features'] = self.feature_templates
        self.model = ParserModel(cfg['features'])
        self.model.l1_penalty = cfg.get('L1', 0.0)
        self.model.learn_rate = cfg.get('learn_rate', 0.001)
        self.cfg = cfg
        # TODO: This is a pretty hacky fix to the problem of adding more
        # labels. The issue is they come in out of order, if labels are
        # added during training
        for label in cfg.get('extra_labels', []):
            self.add_label(label)
    def __reduce__(self):
        # NOTE(review): this passes the `moves` *instance* where __init__
        # expects a transition system class -- confirm pickling round-trips.
        return (Parser, (self.vocab, self.moves, self.model), None, None)
    def __call__(self, Doc tokens):
        """
        Apply the parser to a document, setting the annotations onto the Doc
        object in place.
        Arguments:
            tokens (Doc): The document to be processed.
        Returns:
            None
        Raises:
            ParserStateError: if no valid action was available in some state.
        """
        cdef int nr_feat = self.model.nr_feat
        with nogil:
            status = self.parseC(tokens.c, tokens.length, nr_feat)
        # Check for KeyboardInterrupt etc. Untested
        PyErr_CheckSignals()
        if status != 0:
            raise ParserStateError(tokens)
        self.moves.finalize_doc(tokens)
    def pipe(self, stream, int batch_size=1000, int n_threads=2):
        """
        Process a stream of documents.
        Arguments:
            stream: The sequence of documents to process.
            batch_size (int):
                The number of documents to accumulate into a working set.
            n_threads (int):
                The number of threads with which to work on the buffer in parallel.
        Yields (Doc): Documents, in order.
        """
        cdef Pool mem = Pool()
        cdef TokenC** doc_ptr = <TokenC**>mem.alloc(batch_size, sizeof(TokenC*))
        cdef int* lengths = <int*>mem.alloc(batch_size, sizeof(int))
        cdef Doc doc
        cdef int i
        cdef int nr_feat = self.model.nr_feat
        cdef int status
        queue = []
        for doc in stream:
            doc_ptr[len(queue)] = doc.c
            lengths[len(queue)] = doc.length
            queue.append(doc)
            if len(queue) == batch_size:
                with nogil:
                    for i in cython.parallel.prange(batch_size, num_threads=n_threads):
                        status = self.parseC(doc_ptr[i], lengths[i], nr_feat)
                        if status != 0:
                            with gil:
                                raise ParserStateError(queue[i])
                PyErr_CheckSignals()
                for doc in queue:
                    self.moves.finalize_doc(doc)
                    yield doc
                queue = []
        # Flush the final, possibly partial, batch.
        batch_size = len(queue)
        with nogil:
            for i in cython.parallel.prange(batch_size, num_threads=n_threads):
                status = self.parseC(doc_ptr[i], lengths[i], nr_feat)
                if status != 0:
                    with gil:
                        raise ParserStateError(queue[i])
        PyErr_CheckSignals()
        for doc in queue:
            self.moves.finalize_doc(doc)
            yield doc
    cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil:
        # Greedily parse `tokens`. Returns 0 on success, in which case the
        # annotations are copied back into `tokens`. Returns 1 if some state
        # had no valid action; nothing is copied back in that case.
        state = new StateC(tokens, length)
        # NB: This can change self.moves.n_moves!
        # I think this causes memory errors if called by .pipe()
        self.moves.initialize_state(state)
        nr_class = self.moves.n_moves
        cdef ExampleC eg
        eg.nr_feat = nr_feat
        eg.nr_atom = CONTEXT_SIZE
        eg.nr_class = nr_class
        eg.features = <FeatureC*>calloc(nr_feat, sizeof(FeatureC))
        eg.atoms = <atom_t*>calloc(CONTEXT_SIZE, sizeof(atom_t))
        eg.scores = <weight_t*>calloc(nr_class, sizeof(weight_t))
        eg.is_valid = <int*>calloc(nr_class, sizeof(int))
        cdef int i
        cdef int status = 0
        while not state.is_final():
            eg.nr_feat = self.model.set_featuresC(eg.atoms, eg.features, state)
            self.moves.set_valid(eg.is_valid, state)
            self.model.set_scoresC(eg.scores, eg.features, eg.nr_feat)
            guess = VecVec.arg_max_if_true(eg.scores, eg.is_valid, eg.nr_class)
            if guess < 0:
                # Record the failure and fall through to the shared cleanup
                # below, so the state and the scratch buffers are released on
                # this path too (an early return here used to leak them).
                status = 1
                break
            action = self.moves.c[guess]
            action.do(state, action.label)
            # Reset the scratch buffers for the next step.
            memset(eg.scores, 0, sizeof(eg.scores[0]) * eg.nr_class)
            for i in range(eg.nr_class):
                eg.is_valid[i] = 1
        if status == 0:
            self.moves.finalize_state(state)
            for i in range(length):
                tokens[i] = state._sent[i]
        del state
        free(eg.features)
        free(eg.atoms)
        free(eg.scores)
        free(eg.is_valid)
        return status
    def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0):
        """
        Update the statistical model.
        Arguments:
            tokens (Doc):
                The example document for the update.
            gold (GoldParse):
                The gold-standard annotations, to calculate the loss.
            itn (int):
                Iteration number. Not used by this method.
            drop (double):
                Feature dropout rate, used unless the configuration defines
                a 'dropout' entry.
        Returns (float):
            The loss on this example.
        """
        self.moves.preprocess_gold(gold)
        cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
        self.moves.initialize_state(stcls.c)
        cdef Example eg = Example(
                nr_class=self.moves.n_moves,
                nr_atom=CONTEXT_SIZE,
                nr_feat=self.model.nr_feat)
        cdef weight_t loss = 0
        cdef Transition action
        cdef double dropout_rate = self.cfg.get('dropout', drop)
        while not stcls.is_final():
            eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features,
                                                    stcls.c)
            dropout(eg.c.features, eg.c.nr_feat, dropout_rate)
            self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
            self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat)
            guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
            self.model.update(eg)
            # Follow the model's own guess, whatever its cost.
            action = self.moves.c[guess]
            action.do(stcls.c, action.label)
            loss += eg.costs[guess]
            eg.fill_scores(0, eg.c.nr_class)
            eg.fill_costs(0, eg.c.nr_class)
            eg.fill_is_valid(1, eg.c.nr_class)
        self.moves.finalize_state(stcls.c)
        return loss
    def step_through(self, Doc doc, GoldParse gold=None):
        """
        Set up a stepwise state, to introspect and control the transition sequence.
        Arguments:
            doc (Doc): The document to step through.
            gold (GoldParse): Optional gold parse
        Returns (StepwiseState):
            A state object, to step through the annotation process.
        """
        return StepwiseState(self, doc, gold=gold)
    def from_transition_sequence(self, Doc doc, sequence):
        """Control the annotations on a document by specifying a transition sequence
        to follow.
        Arguments:
            doc (Doc): The document to annotate.
            sequence: A sequence of action names, as unicode strings.
        Returns: None
        """
        with self.step_through(doc) as stepwise:
            for transition in sequence:
                stepwise.transition(transition)
    def add_label(self, label):
        """
        Add `label` to every action type of the transition system. The label
        is recorded under cfg['extra_labels'] once per action type that
        actually added it.
        """
        # Doesn't set label into serializer -- subclasses override it to do that.
        for action in self.moves.action_types:
            added = self.moves.add_action(action, label)
            if added:
                # Important that the labels be stored as a list! We need the
                # order, or the model goes out of synch
                self.cfg.setdefault('extra_labels', []).append(label)
 cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1:
    # Apply dropout to the first `nr_feat` features in place. `prob` is the
    # probability of zeroing a feature. Surviving features are rescaled by
    # 1 / (1 - prob), because a feature survives with probability (1 - prob);
    # this keeps the expected value of each feature unchanged.
    # Does nothing when `prob` is outside (0, 1) or there are no features.
    # Returns 0; -1 is reserved for signalling a Python exception.
    if prob <= 0 or prob >= 1.:
        return 0
    if nr_feat <= 0:
        # Nothing to do, and indexing an empty buffer below would fail.
        return 0
    cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat)
    cdef double* probs = &py_probs[0]
    cdef double keep_prob = 1. - prob
    for i in range(nr_feat):
        if probs[i] >= prob:
            feats[i].value /= keep_prob
        else:
            feats[i].value = 0.
    return 0
 cdef class StepwiseState:
    """
    Context manager for driving a parse one transition at a time, so the
    transition sequence can be inspected or controlled. Leaving the `with`
    block calls finish().
    """
    cdef readonly StateClass stcls
    cdef readonly Example eg
    cdef readonly Doc doc
    cdef readonly GoldParse gold
    cdef readonly Parser parser
    def __init__(self, Parser parser, Doc doc, GoldParse gold=None):
        self.parser = parser
        self.doc = doc
        if gold is None:
            # No gold parse supplied: build one from the document itself.
            self.gold = GoldParse(doc)
        else:
            self.gold = gold
            self.parser.moves.preprocess_gold(self.gold)
        self.stcls = StateClass.init(doc.c, doc.length)
        self.parser.moves.initialize_state(self.stcls.c)
        self.eg = Example(
            nr_class=self.parser.moves.n_moves,
            nr_atom=CONTEXT_SIZE,
            nr_feat=self.parser.model.nr_feat)
    def __enter__(self):
        return self
    def __exit__(self, type, value, traceback):
        self.finish()
    @property
    def is_final(self):
        return self.stcls.is_final()
    @property
    def stack(self):
        return self.stcls.stack
    @property
    def queue(self):
        return self.stcls.queue
    @property
    def heads(self):
        # One head index per token, in token order.
        head_indices = []
        for i in range(self.stcls.c.length):
            head_indices.append(self.stcls.H(i))
        return head_indices
    @property
    def deps(self):
        # One dependency label string per token, in token order.
        dep_names = []
        for i in range(self.stcls.c.length):
            dep_names.append(self.doc.vocab.strings[self.stcls.c._sent[i].dep])
        return dep_names
    @property
    def costs(self):
        """
        Map the name of every currently valid action to its cost.
        """
        if not self.gold:
            raise ValueError("Can't set costs: No GoldParse provided")
        self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs,
                self.stcls, self.gold)
        action_costs = {}
        for i in range(self.parser.moves.n_moves):
            if self.eg.c.is_valid[i]:
                candidate = self.parser.moves.c[i]
                label = self.parser.moves.move_name(candidate.move, candidate.label)
                action_costs[label] = self.eg.c.costs[i]
        return action_costs
    def predict(self):
        """
        Score the current state and return the name of the guessed action.
        """
        self.eg.reset()
        self.eg.c.nr_feat = self.parser.model.set_featuresC(
            self.eg.c.atoms, self.eg.c.features, self.stcls.c)
        self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls.c)
        self.parser.model.set_scoresC(
            self.eg.c.scores, self.eg.c.features, self.eg.c.nr_feat)
        cdef Transition action = self.parser.moves.c[self.eg.guess]
        return self.parser.moves.move_name(action.move, action.label)
    def transition(self, action_name=None):
        """
        Apply one action to the state. `None` and '_' mean "use the model's
        prediction"; a bare 'L' or 'R' picks the best-scoring class of that
        move type; any other value is looked up by name.
        """
        if action_name is None:
            action_name = self.predict()
        move_ids = {'S': 0, 'D': 1, 'L': 2, 'R': 3}
        if action_name == 'L' or action_name == 'R':
            # Refresh the scores, then choose the label for the given move.
            self.predict()
            move = move_ids[action_name]
            best = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c,
                                 self.eg.c.nr_class)
            action = self.parser.moves.c[best]
        else:
            if action_name == '_':
                action_name = self.predict()
            action = self.parser.moves.lookup_transition(action_name)
        action.do(self.stcls.c, action.label)
    def finish(self):
        """
        Write the parse held in the state back onto the document.
        """
        if self.stcls.is_final():
            self.parser.moves.finalize_state(self.stcls.c)
        self.doc.set_parse(self.stcls.c._sent)
        self.parser.moves.finalize_doc(self.doc)
 class ParserStateError(ValueError):
    """Raised when the parser reaches a state with no valid actions."""
    def __init__(self, doc):
        # Everything except the trailing repr() of the text is fixed wording.
        preamble = (
            "Error analysing doc -- no valid actions available. This should "
            "never happen, so please report the error on the issue tracker. "
            "Here's the thread to do so --- reopen it if it's closed:\n"
            "https://github.com/spacy-io/spaCy/issues/429\n"
            "Please include the text that the parser failed on, which is:\n"
        )
        super(ParserStateError, self).__init__(preamble + repr(doc.text))
 cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs, int n) nogil:
    # Return the index of the highest-scoring class whose cost is <= 0, or -1
    # if there is no such class. Ties keep the earliest index.
    cdef int winner = -1
    for idx in range(n):
        if costs[idx] <= 0 and (winner == -1 or scores[idx] > scores[winner]):
            winner = idx
    return winner
 cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions,
                       int nr_class) except -1:
    # Return the index of the best-scoring class whose move type equals
    # `move`, or -1 if no class has that move type. Ties keep the latest index.
    cdef int winner = -1
    cdef weight_t top_score = 0
    cdef int idx
    for idx in range(nr_class):
        if actions[idx].move != move:
            continue
        if winner == -1 or scores[idx] >= top_score:
            winner = idx
            top_score = scores[idx]
    return winner
--- a/spacy/tests/doc/test_add_entities.py
+++ b/spacy/tests/doc/test_add_entities.py
@ -10,7 +10,8 @@ import pytest
 def test_doc_add_entities_set_ents_iob(en_vocab):
    text = ["This", "is", "a", "lion"]
    doc = get_doc(en_vocab, text)
-    ner = EntityRecognizer(en_vocab, features=[(2,), (3,)])
+    ner = EntityRecognizer(en_vocab)
    ner.begin_training([])
    ner(doc)
    assert len(list(doc.ents)) == 0
--- a/spacy/tests/parser/test_add_label.py
+++ b/spacy/tests/parser/test_add_label.py
@ -9,7 +9,7 @@ from ...attrs import NORM
 from ...gold import GoldParse
 from ...vocab import Vocab
 from ...tokens import Doc
-from ...pipeline import NeuralDependencyParser
+from ...pipeline import DependencyParser
 numpy.random.seed(0)
@ -21,7 +21,7 @@ def vocab():
@pytest.fixture
 def parser(vocab):
-    parser = NeuralDependencyParser(vocab)
+    parser = DependencyParser(vocab)
    parser.cfg['token_vector_width'] = 8
    parser.cfg['hidden_width'] = 30
    parser.cfg['hist_size'] = 0
--- a/spacy/tests/parser/test_neural_parser.py
+++ b/spacy/tests/parser/test_neural_parser.py
@ -6,7 +6,7 @@ import numpy
 from ..._ml import chain, Tok2Vec, doc2feats
 from ...vocab import Vocab
-from ...pipeline import TokenVectorEncoder
+from ...pipeline import Tensorizer
 from ...syntax.arc_eager import ArcEager
 from ...syntax.nn_parser import Parser
 from ...tokens.doc import Doc
--- a/spacy/tests/parser/test_preset_sbd.py
+++ b/spacy/tests/parser/test_preset_sbd.py
@ -8,7 +8,7 @@ from ...attrs import NORM
 from ...gold import GoldParse
 from ...vocab import Vocab
 from ...tokens import Doc
-from ...pipeline import NeuralDependencyParser
+from ...pipeline import DependencyParser
@pytest.fixture
 def vocab():
@ -16,7 +16,7 @@ def vocab():
@pytest.fixture
 def parser(vocab):
-    parser = NeuralDependencyParser(vocab)
+    parser = DependencyParser(vocab)
    parser.cfg['token_vector_width'] = 4
    parser.cfg['hidden_width'] = 32
    #parser.add_label('right')
--- a/spacy/tests/parser/test_to_from_bytes_disk.py
+++ b/spacy/tests/parser/test_to_from_bytes_disk.py
@ -1,11 +1,11 @@
 import pytest
-from ...pipeline import NeuralDependencyParser
+from ...pipeline import DependencyParser
@pytest.fixture
 def parser(en_vocab):
-    parser = NeuralDependencyParser(en_vocab)
+    parser = DependencyParser(en_vocab)
    parser.add_label('nsubj')
    parser.model, cfg = parser.Model(parser.moves.n_moves)
    parser.cfg.update(cfg)
@ -14,7 +14,7 @@ def parser(en_vocab):
@pytest.fixture
 def blank_parser(en_vocab):
-    parser = NeuralDependencyParser(en_vocab)
+    parser = DependencyParser(en_vocab)
    return parser
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@ -82,3 +82,21 @@ def test_remove_pipe(nlp, name):
    assert not len(nlp.pipeline)
    assert removed_name == name
    assert removed_component == new_pipe
@pytest.mark.parametrize('name', ['my_component'])
 def test_disable_pipes_method(nlp, name):
    # Plain-call usage: disable_pipes() returns an object exposing restore().
    nlp.add_pipe(new_pipe, name=name)
    assert nlp.has_pipe(name)
    disabled = nlp.disable_pipes(name)
    # While disabled, the component is no longer reported by has_pipe().
    assert not nlp.has_pipe(name)
    disabled.restore()
@pytest.mark.parametrize('name', ['my_component'])
 def test_disable_pipes_context(nlp, name):
    # Context-manager usage: the component is gone inside the block only.
    nlp.add_pipe(new_pipe, name=name)
    assert nlp.has_pipe(name)
    with nlp.disable_pipes(name):
        assert not nlp.has_pipe(name)
    # Leaving the block must bring the component back.
    assert nlp.has_pipe(name)
--- a/spacy/tests/serialize/test_serialize_parser_ner.py
+++ b/spacy/tests/serialize/test_serialize_parser_ner.py
@ -2,8 +2,8 @@
 from __future__ import unicode_literals
 from ..util import make_tempdir
-from ...pipeline import NeuralDependencyParser as DependencyParser
+from ...pipeline import DependencyParser
-from ...pipeline import NeuralEntityRecognizer as EntityRecognizer
+from ...pipeline import EntityRecognizer
 import pytest
--- a/spacy/tests/serialize/test_serialize_tagger.py
+++ b/spacy/tests/serialize/test_serialize_tagger.py
@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 from ..util import make_tempdir
-from ...pipeline import NeuralTagger as Tagger
+from ...pipeline import Tagger
 import pytest
--- a/spacy/tests/serialize/test_serialize_tensorizer.py
+++ b/spacy/tests/serialize/test_serialize_tensorizer.py
@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 from ..util import make_tempdir
-from ...pipeline import TokenVectorEncoder as Tensorizer
+from ...pipeline import Tensorizer
 import pytest
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -15,7 +15,6 @@ from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
 from .typedefs cimport attr_t
 from .cfile cimport CFile
 from .tokens.token cimport Token
 from .attrs cimport PROB, LANG
 from .structs cimport SerializedLexemeC
--- a/website/_includes/_mixins.jade
+++ b/website/_includes/_mixins.jade
@ -181,7 +181,7 @@ mixin codepen(slug, height, default_tab)
    alt_file - [string] alternative file path used in footer and link button
    height   - [integer] height of code preview in px
-mixin github(repo, file, alt_file, height, language)
+mixin github(repo, file, height, alt_file, language)
    - var branch = ALPHA ? "develop" : "master"
    - var height = height || 250
--- a/website/_includes/_page_models.jade
+++ b/website/_includes/_page_models.jade
@ -38,7 +38,7 @@ for id in CURRENT_MODELS
                +cell #[+label Size]
                +cell #[+tag=comps.size] #[span(data-tpl=id data-tpl-key="size") #[em n/a]]
-            each label in ["Pipeline", "Sources", "Author", "License"]
+            each label in ["Pipeline", "Vectors", "Sources", "Author", "License"]
                - var field = label.toLowerCase()
                +row
                    +cell.u-nowrap
--- a/website/api/_annotation/_training.jade
+++ b/website/api/_annotation/_training.jade
@ -13,7 +13,9 @@ p
    |  that are part of an entity are set to the entity label, prefixed by the
    |  BILUO marker. For example #[code "B-ORG"] describes the first token of
    |  a multi-token #[code ORG] entity and #[code "U-PERSON"] a single
-    |  token representing a #[code PERSON] entity
+    |  token representing a #[code PERSON] entity. The
    |  #[+api("goldparse#biluo_tags_from_offsets") #[code biluo_tags_from_offsets]]
    |  function can help you convert entity offsets to the right format.
 +code("Example structure").
    [{
--- a/website/api/_top-level/_spacy.jade
+++ b/website/api/_top-level/_spacy.jade
@ -136,7 +136,7 @@ p
    |  #[+src(gh("spacy", "spacy/glossary.py")) #[code glossary.py]].
 +aside-code("Example").
-    spacy.explain('NORP')
+    spacy.explain(u'NORP')
    # Nationalities or religious or political groups
    doc = nlp(u'Hello world')
--- a/website/api/dependencyparser.jade
+++ b/website/api/dependencyparser.jade
@ -2,4 +2,5 @@
 include ../_includes/_mixins
 //- This class inherits from Pipe, so this page uses the template in pipe.jade.
 !=partial("pipe", { subclass: "DependencyParser", short: "parser", pipeline_id: "parser" })
--- a/website/api/entityrecognizer.jade
+++ b/website/api/entityrecognizer.jade
@ -2,4 +2,5 @@
 include ../_includes/_mixins
 //- This class inherits from Pipe, so this page uses the template in pipe.jade.
 !=partial("pipe", { subclass: "EntityRecognizer", short: "ner", pipeline_id: "ner" })
--- a/website/api/language.jade
+++ b/website/api/language.jade
@ -441,6 +441,37 @@ p
        +cell tuple
        +cell A #[code (name, component)] tuple of the removed component.
 +h(2, "disable_pipes") Language.disable_pipes
    +tag contextmanager
    +tag-new(2)
 p
    |  Disable one or more pipeline components. If used as a context manager,
    |  the pipeline will be restored to the initial state at the end of the
    |  block. Otherwise, a #[code DisabledPipes] object is returned, that has a
    |  #[code .restore()] method you can use to undo your changes.
 +aside-code("Example").
    with nlp.disable_pipes('tagger', 'parser'):
        optimizer = nlp.begin_training(gold_tuples)
    disabled = nlp.disable_pipes('tagger', 'parser')
    optimizer = nlp.begin_training(gold_tuples)
    disabled.restore()
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code *disabled]
        +cell unicode
        +cell Names of pipeline components to disable.
    +row("foot")
        +cell returns
        +cell #[code DisabledPipes]
        +cell
            |  The disabled pipes that can be restored by calling the object's
            |  #[code .restore()] method.
 +h(2, "to_disk") Language.to_disk
    +tag method
    +tag-new(2)
--- a/website/api/pipe.jade
+++ b/website/api/pipe.jade
@ -304,6 +304,21 @@ p Modify the pipe's model, to use the given parameter values.
            |  The parameter values to use in the model. At the end of the
            |  context, the original parameters are restored.
 +h(2, "add_label") #{CLASSNAME}.add_label
    +tag method
 p Add a new label to the pipe.
 +aside-code("Example").
    #{VARNAME} = #{CLASSNAME}(nlp.vocab)
    #{VARNAME}.add_label('MY_LABEL')
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code label]
        +cell unicode
        +cell The label to add.
 +h(2, "to_disk") #{CLASSNAME}.to_disk
    +tag method
--- a/website/api/tagger.jade
+++ b/website/api/tagger.jade
@ -2,4 +2,5 @@
 include ../_includes/_mixins
 //- This class inherits from Pipe, so this page uses the template in pipe.jade.
 !=partial("pipe", { subclass: "Tagger", pipeline_id: "tagger" })
--- a/website/api/tensorizer.jade
+++ b/website/api/tensorizer.jade
@ -2,4 +2,5 @@
 include ../_includes/_mixins
 //- This class inherits from Pipe, so this page uses the template in pipe.jade.
 !=partial("pipe", { subclass: "Tensorizer", pipeline_id: "tensorizer" })
--- a/website/api/textcategorizer.jade
+++ b/website/api/textcategorizer.jade
@ -16,4 +16,5 @@ p
    |  before a logistic activation is applied elementwise. The value of each
    |  output neuron is the probability that some class is present.
 //- This class inherits from Pipe, so this page uses the template in pipe.jade.
 !=partial("pipe", { subclass: "TextCategorizer", short: "textcat", pipeline_id: "textcat" })
--- a/website/assets/js/main.js
+++ b/website/assets/js/main.js
@ -140,6 +140,10 @@ class ModelLoader {
        else return ({ ok: res.ok })
    }
    /**
     * Format a number with a separator between groups of three digits,
     * e.g. 1234567 -> "1,234,567".
     * NOTE: every run of digits is grouped, including digits after a decimal
     * point, so pass integers.
     * @param {number|string} num - Value to format (converted via toString()).
     * @param {string} separator - String inserted between digit groups.
     * @returns {string} The formatted number.
     */
    convertNumber(num, separator = ',') {
        return num.toString().replace(/\B(?=(\d{3})+(?!\d))/g, separator);
    }
    getModels(compat) {
        this.compat = compat;
        for (let modelId of this.modelIds) {
@ -159,7 +163,7 @@ class ModelLoader {
        const template = new Templater(modelId);
        template.get('table').removeAttribute('data-loading');
        template.get('error').style.display = 'block';
-        for (let key of ['sources', 'pipeline', 'author', 'license']) {
+        for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) {
            template.get(key).parentElement.parentElement.style.display = 'none';
        }
    }
@ -167,13 +171,14 @@ class ModelLoader {
    /**
     * Update model details in tables. Currently quite hacky :(
     */
-    render({ lang, name, version, sources, pipeline, url, author, license, accuracy, size, description, notes }) {
+    render({ lang, name, version, sources, pipeline, vectors, url, author, license, accuracy, size, description, notes }) {
        const modelId = `${lang}_${name}`;
        const model = `${modelId}-${version}`;
        const template = new Templater(modelId);
        const getSources = s => (s instanceof Array) ? s.join(', ') : s;
        const getPipeline = p => p.map(comp => `<code>${comp}</code>`).join(', ');
        const getVectors = v => `${this.convertNumber(v.entries)} (${v.width} dimensions)`;
        const getLink = (t, l) => `<a href="${l}" target="_blank">${t}</a>`;
        const keys = { version, size, description, notes }
@ -182,6 +187,8 @@ class ModelLoader {
        if (sources) template.fill('sources', getSources(sources));
        if (pipeline && pipeline.length) template.fill('pipeline', getPipeline(pipeline), true);
        else template.get('pipeline').parentElement.parentElement.style.display = 'none';
        if (vectors) template.fill('vectors', getVectors(vectors));
        else template.get('vectors').parentElement.parentElement.style.display = 'none';
        if (author) template.fill('author', url ? getLink(author, url) : author, true);
        if (license) template.fill('license', this.licenses[license] ? getLink(license, this.licenses[license]) : license, true);
--- a/website/usage/_data.json
+++ b/website/usage/_data.json
@ -106,7 +106,7 @@
            "How Pipelines Work": "pipelines",
            "Custom Components": "custom-components",
            "Developing Extensions": "extensions",
-            "Multi-threading": "multithreading",
+            "Multi-Threading": "multithreading",
            "Serialization": "serialization"
        }
    },
@ -196,9 +196,10 @@
        "teaser": "Full code examples you can modify and run.",
        "next": "resources",
        "menu": {
            "Information Extraction": "information-extraction",
            "Pipeline": "pipeline",
            "Matching": "matching",
            "Training": "training",
            "Vectors & Similarity": "vectors",
            "Deep Learning": "deep-learning"
        }
    }
--- a/website/usage/_processing-pipelines/_custom-components.jade
+++ b/website/usage/_processing-pipelines/_custom-components.jade
@ -234,7 +234,7 @@ p
    |  when you customise spaCy's tokenization rules. When you call #[code nlp]
    |  on a text, the custom pipeline component is applied to the #[code Doc]
-+github("spacy", "examples/pipeline/custom_component_entities.py", false, 500)
+github("spacy", "examples/pipeline/custom_component_entities.py", 500)
 p
    |  Wrapping this functionality in a
@ -255,7 +255,7 @@ p
    |  #[code Token] – for example, the capital, latitude/longitude coordinates
    |  and even the country flag.
-+github("spacy", "examples/pipeline/custom_component_countries_api.py", false, 500)
+github("spacy", "examples/pipeline/custom_component_countries_api.py", 500)
 p
    |  In this case, all data can be fetched on initialisation in one request.
--- a/website/usage/_processing-pipelines/_multithreading.jade
+++ b/website/usage/_processing-pipelines/_multithreading.jade
@ -38,3 +38,16 @@ p
        |  the generator in two, and then #[code izip] the extra stream to the
        |  document stream. Here's
        |  #[+a(gh("spacy") + "/issues/172#issuecomment-183963403") an example].
 +h(3, "multi-processing-example") Example: Multi-processing with Joblib
 p
    |  This example shows how to use multiple cores to process text using
    |  spaCy and #[+a("https://pythonhosted.org/joblib/") Joblib]. We're
    |  exporting part-of-speech-tagged, true-cased, (very roughly)
    |  sentence-separated text, with each "sentence" on a newline, and
    |  spaces between tokens. Data is loaded from the IMDB movie reviews
    |  dataset and will be loaded automatically via Thinc's built-in dataset
    |  loader.
 +github("spacy", "examples/pipeline/multi_processing.py", 500)
--- a/website/usage/_training/_ner.jade
+++ b/website/usage/_training/_ner.jade
@ -24,38 +24,108 @@ p
    |  #[strong experiment on your own data] to find a solution that works best
    |  for you.
-+h(3, "example-new-entity-type") Example: Training an additional entity type
+h(3, "example-train-ner") Updating the Named Entity Recognizer
 p
-    |  This script shows how to add a new entity type to an existing pre-trained
+    |  This example shows how to update spaCy's entity recognizer
-    |  NER model. To keep the example short and simple, only a few sentences are
+    |  with your own examples, starting off with an existing, pre-trained
    |  model, or from scratch using a blank #[code Language] class. To do
    |  this, you'll need #[strong example texts] and the
    |  #[strong character offsets] and #[strong labels] of each entity contained
    |  in the texts.
    +github("spacy", "examples/training/train_ner.py", 500)
 +h(4) Step by step guide
 +list("numbers")
    +item
        |  #[strong Reformat the training data] to match spaCy's
        |  #[+a("/api/annotation#json-input") JSON format]. The built-in
        |  #[+api("goldparse#biluo_tags_from_offsets") #[code biluo_tags_from_offsets]]
        |  function can help you with this.
    +item
        |  #[strong Load the model] you want to start with, or create an
        |  #[strong empty model] using
        |  #[+api("spacy#blank") #[code spacy.blank]] with the ID of your
        |  language. If you're using a blank model, don't forget to add the
        |  entity recognizer to the pipeline. If you're using an existing model,
        |  make sure to disable all other pipeline components during training
        |  using #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
        |  This way, you'll only be training the entity recognizer.
    +item
        |  #[strong Shuffle and loop over] the examples and create a
        |  #[code Doc] and #[code GoldParse] object for each example.
    +item
        |  For each example, #[strong update the model]
        |  by calling #[+api("language#update") #[code nlp.update]], which steps
        |  through the words of the input. At each word, it makes a
        |  #[strong prediction]. It then consults the annotations provided on the
        |  #[code GoldParse] instance, to see whether it was
        |  right. If it was wrong, it adjusts its weights so that the correct
        |  action will score higher next time.
    +item
        |  #[strong Save] the trained model using
        |  #[+api("language#to_disk") #[code nlp.to_disk]].
    +item
        |  #[strong Test] the model to make sure the entities in the training
        |  data are recognised correctly.
 +h(3, "example-new-entity-type") Training an additional entity type
 p
    |  This script shows how to add a new entity type #[code ANIMAL] to an
    |  existing pre-trained NER model, or an empty #[code Language] class. To
    |  keep the example short and simple, only a few sentences are
    |  provided as examples. In practice, you'll need many more — a few hundred
    |  would be a good start. You will also likely need to mix in examples of
    |  other entity types, which might be obtained by running the entity
    |  recognizer over unlabelled sentences, and adding their annotations to the
    |  training set.
-p
+github("spacy", "examples/training/train_new_entity_type.py", 500)
    |  The actual training is performed by looping over the examples, and
    |  calling #[+api("language#update") #[code nlp.update()]]. The
    |  #[code update] method steps through the words of the input. At each word,
    |  it makes a prediction. It then consults the annotations provided on the
    |  #[+api("goldparse") #[code GoldParse]] instance, to see whether it was
    |  right. If it was wrong, it adjusts its weights so that the correct
    |  action will score higher next time.
-+github("spacy", "examples/training/train_new_entity_type.py")
+h(4) Step by step guide
-+h(3, "example-ner-from-scratch") Example: Training an NER system from scratch
+list("numbers")
    +item
        |  Create #[code Doc] and #[code GoldParse] objects for
        |  #[strong each example in your training data].
-p
+    +item
-    |  This example is written to be self-contained and reasonably transparent.
+        |  #[strong Load the model] you want to start with, or create an
-    |  To achieve that, it duplicates some of spaCy's internal functionality.
+        |  #[strong empty model] using
-    |  Specifically, in this example, we don't use spaCy's built-in
+        |  #[+api("spacy#blank") #[code spacy.blank]] with the ID of your
-    |  #[+api("language") #[code Language]] class to wire together the
+        |  language. If you're using a blank model, don't forget to add the
-    |  #[+api("vocab") #[code Vocab]], #[+api("tokenizer") #[code Tokenizer]]
+        |  entity recognizer to the pipeline. If you're using an existing model,
-    |  and #[+api("entityrecognizer") #[code EntityRecognizer]]. Instead, we
+        |  make sure to disable all other pipeline components during training
-    |  write our own simle #[code Pipeline] class, so that it's easier to see
+        |  using #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
-    |  how the pieces interact.
+        |  This way, you'll only be training the entity recognizer.
-+github("spacy", "examples/training/train_ner_standalone.py")
+    +item
        |  #[strong Add the new entity label] to the entity recognizer using the
        |  #[+api("entityrecognizer#add_label") #[code add_label]] method. You
        |  can access the entity recognizer in the pipeline via
        |  #[code nlp.get_pipe('ner')].
    +item
        |  #[strong Loop over] the examples and call
        |  #[+api("language#update") #[code nlp.update]], which steps through
        |  the words of the input. At each word, it makes a
        |  #[strong prediction]. It then consults the annotations provided on the
        |  #[code GoldParse] instance, to see whether it was right. If it was
        |  wrong, it adjusts its weights so that the correct action will score
        |  higher next time.
    +item
        |  #[strong Save] the trained model using
        |  #[+api("language#to_disk") #[code nlp.to_disk]].
    +item
        |  #[strong Test] the model to make sure the new entity is recognised
        |  correctly.
--- a/website/usage/_training/_tagger-parser.jade
+++ b/website/usage/_training/_tagger-parser.jade
@ -1,6 +1,195 @@
 //- 💫 DOCS > USAGE > TRAINING > TAGGER & PARSER
-+under-construction
+h(3, "example-train-parser") Updating the Dependency Parser
 p
    |  This example shows how to train spaCy's dependency parser, starting off
    |  with an existing model or a blank model. You'll need a set of
    |  #[strong training examples] and the respective #[strong heads] and
    |  #[strong dependency label] for each token of the example texts.
 +github("spacy", "examples/training/train_parser.py", 500)
 +h(4) Step by step guide
 +list("numbers")
    +item
        |  #[strong Load the model] you want to start with, or create an
        |  #[strong empty model] using
        |  #[+api("spacy#blank") #[code spacy.blank]] with the ID of your
        |  language. If you're using a blank model, don't forget to add the
        |  parser to the pipeline. If you're using an existing model,
        |  make sure to disable all other pipeline components during training
        |  using #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
        |  This way, you'll only be training the parser.
    +item
        |  #[strong Add the dependency labels] to the parser using the
        |  #[+api("dependencyparser#add_label") #[code add_label]] method. If
        |  you're starting off with a pre-trained spaCy model, this is usually
        |  not necessary – but it doesn't hurt either, just to be safe.
    +item
        |  #[strong Shuffle and loop over] the examples and create a
        |  #[code Doc] and #[code GoldParse] object for each example. Make sure
        |  to pass in the #[code heads] and #[code deps] when you create the
        |  #[code GoldParse].
    +item
        |  For each example, #[strong update the model]
        |  by calling #[+api("language#update") #[code nlp.update]], which steps
        |  through the words of the input. At each word, it makes a
        |  #[strong prediction]. It then consults the annotations provided on the
        |  #[code GoldParse] instance, to see whether it was
        |  right. If it was wrong, it adjusts its weights so that the correct
        |  action will score higher next time.
    +item
        |  #[strong Save] the trained model using
        |  #[+api("language#to_disk") #[code nlp.to_disk]].
    +item
        |  #[strong Test] the model to make sure the parser works as expected.
 +h(3, "example-train-tagger") Updating the Part-of-speech Tagger
 p
    |  In this example, we're training spaCy's part-of-speech tagger with a
    |  custom tag map. We start off with a blank #[code Language] class, update
    |  its defaults with our custom tags and then train the tagger. You'll need
    |  a set of #[strong training examples] and the respective
    |  #[strong custom tags], as well as a dictionary mapping those tags to the
    |  #[+a("http://universaldependencies.github.io/docs/u/pos/index.html") Universal Dependencies scheme].
 +github("spacy", "examples/training/train_tagger.py", 500)
 +h(4) Step by step guide
 +list("numbers")
    +item
        |  #[strong Create] a new #[code Language] class and before initialising
        |  it, update the #[code tag_map] in its #[code Defaults] with your
        |  custom tags.
    +item
        |  #[strong Create a new tagger] component and add it to the pipeline.
    +item
        |  #[strong Shuffle and loop over] the examples and create a
        |  #[code Doc] and #[code GoldParse] object for each example. Make sure
        |  to pass in the #[code tags] when you create the #[code GoldParse].
    +item
        |  For each example, #[strong update the model]
        |  by calling #[+api("language#update") #[code nlp.update]], which steps
        |  through the words of the input. At each word, it makes a
        |  #[strong prediction]. It then consults the annotations provided on the
        |  #[code GoldParse] instance, to see whether it was
        |  right. If it was wrong, it adjusts its weights so that the correct
        |  action will score higher next time.
    +item
        |  #[strong Save] the trained model using
        |  #[+api("language#to_disk") #[code nlp.to_disk]].
    +item
        |  #[strong Test] the model to make sure the tagger works as expected.
 +h(3, "intent-parser") Training a parser for custom semantics
 p
    |  spaCy's parser component can be trained to predict any type
    |  of tree structure over your input text – including
    |  #[strong semantic relations] that are not syntactic dependencies. This
    |  can be useful for #[strong conversational applications], which need to
    |  predict trees over whole documents or chat logs, with connections between
    |  the sentence roots used to annotate discourse structure. For example, you
    |  can train spaCy's parser to label intents and their targets, like
    |  attributes, quality, time and locations. The result could look like this:
 +codepen("991f245ef90debb78c8fc369294f75ad", 300)
 +code.
    doc = nlp(u"find a hotel with good wifi")
    print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-'])
    # [('find', 'ROOT', 'find'), ('hotel', 'PLACE', 'find'),
    #  ('good', 'QUALITY', 'wifi'), ('wifi', 'ATTRIBUTE', 'hotel')]
 p
    |  The above tree attaches "wifi" to "hotel" and assigns the dependency
    |  label #[code ATTRIBUTE]. This may not be a correct syntactic dependency –
    |  but in this case, it expresses exactly what we need: the user is looking
    |  for a hotel with the attribute "wifi" of the quality "good". This query
    |  can then be processed by your application and used to trigger the
    |  respective action – e.g. search the database for hotels with high ratings
    |  for their wifi offerings.
 +aside("Tip: merge phrases and entities")
    |  To achieve even better accuracy, try merging multi-word tokens and
    |  entities specific to your domain into one token before parsing your text.
    |  You can do this by running the entity recognizer or
    |  #[+a("/usage/linguistic-features#rule-based-matching") rule-based matcher]
    |  to find relevant spans, and merging them using
    |  #[+api("span#merge") #[code Span.merge]]. You could even add your own
    |  custom #[+a("/usage/processing-pipelines#custom-components") pipeline component]
    |  to do this automatically – just make sure to add it #[code before='parser'].
 p
    |  The following example shows a full implementation of a training
    |  loop for a custom message parser for a common "chat intent": finding
    |  local businesses. Our message semantics will have the following types
    |  of relations: #[code ROOT], #[code PLACE], #[code QUALITY],
    |  #[code ATTRIBUTE], #[code TIME] and #[code LOCATION].
 +github("spacy", "examples/training/train_intent_parser.py", 500)
 +h(4) Step by step guide
 +list("numbers")
    +item
        |  #[strong Create the training data] consisting of words, their heads
        |  and their dependency labels in order. A token's head is the index
        |  of the token it is attached to. The heads don't need to be
        |  syntactically correct – they should express the
        |  #[strong semantic relations] you want the parser to learn. For words
        |  that shouldn't receive a label, you can choose an arbitrary
        |  placeholder, for example #[code -].
    +item
        |  #[strong Load the model] you want to start with, or create an
        |  #[strong empty model] using
        |  #[+api("spacy#blank") #[code spacy.blank]] with the ID of your
        |  language. If you're using a blank model, don't forget to add the
        |  parser to the pipeline. If you're using an existing model,
        |  make sure to disable all other pipeline components during training
        |  using #[+api("language#disable_pipes") #[code nlp.disable_pipes]].
        |  This way, you'll only be training the parser.
    +item
        |  #[strong Add the dependency labels] to the parser using the
        |  #[+api("dependencyparser#add_label") #[code add_label]] method.
    +item
        |  #[strong Shuffle and loop over] the examples and create a
        |  #[code Doc] and #[code GoldParse] object for each example. Make sure
        |  to pass in the #[code heads] and #[code deps] when you create the
        |  #[code GoldParse].
    +item
        |  For each example, #[strong update the model]
        |  by calling #[+api("language#update") #[code nlp.update]], which steps
        |  through the words of the input. At each word, it makes a
        |  #[strong prediction]. It then consults the annotations provided on the
        |  #[code GoldParse] instance, to see whether it was
        |  right. If it was wrong, it adjusts its weights so that the correct
        |  action will score higher next time.
    +item
        |  #[strong Save] the trained model using
        |  #[+api("language#to_disk") #[code nlp.to_disk]].
    +item
        |  #[strong Test] the model to make sure the parser works as expected.
 +h(3, "training-json") JSON format for training
--- a/website/usage/_training/_textcat.jade
+++ b/website/usage/_training/_textcat.jade
@ -1,13 +1,62 @@
 //- 💫 DOCS > USAGE > TRAINING > TEXT CLASSIFICATION
-+under-construction
+h(3, "example-textcat") Adding a text classifier to a spaCy model
 +h(3, "example-textcat") Example: Training spaCy's text classifier
    +tag-new(2)
 p
-    |  This example shows how to use and train spaCy's new
+    |  This example shows how to train a multi-label convolutional neural
-    |  #[+api("textcategorizer") #[code TextCategorizer]] pipeline component
+    |  network text classifier on IMDB movie reviews, using spaCy's new
-    |  on IMDB movie reviews.
+    |  #[+api("textcategorizer") #[code TextCategorizer]] component. The
    |  dataset will be loaded automatically via Thinc's built-in dataset
    |  loader. Predictions are available via
    |  #[+api("doc#attributes") #[code Doc.cats]].
-+github("spacy", "examples/training/train_textcat.py")
+github("spacy", "examples/training/train_textcat.py", 500)
 +h(4) Step by step guide
 +list("numbers")
    +item
        |  #[strong Load the model] you want to start with, or create an
        |  #[strong empty model] using
        |  #[+api("spacy#blank") #[code spacy.blank]] with the ID of your
        |  language. If you're using an existing model, make sure to disable all
        |  other pipeline components during training using
        |  #[+api("language#disable_pipes") #[code nlp.disable_pipes]]. This
        |  way, you'll only be training the text classifier.
    +item
        |  #[strong Add the text classifier] to the pipeline, and add the labels
        |  you want to train – for example, #[code POSITIVE].
    +item
        |  #[strong Load and pre-process the dataset], shuffle the data and
        |  split off a part of it to hold back for evaluation. This way, you'll
        |  be able to see results on each training iteration.
    +item
        |  #[strong Loop over] the training examples, partition them into
        |  batches and create #[code Doc] and #[code GoldParse] objects for each
        |  example in the batch.
    +item
        |  #[strong Update the model] by calling
        |  #[+api("language#update") #[code nlp.update]], which steps
        |  through the examples and makes a #[strong prediction]. It then
        |  consults the annotations provided on the #[code GoldParse] instance,
        |  to see whether it was right. If it was wrong, it adjusts its weights
        |  so that the correct prediction will score higher next time.
    +item
        |  Optionally, you can also #[strong evaluate the text classifier] on
        |  each iteration, by checking how it performs on the development data
        |  held back from the dataset. This lets you print the
        |  #[strong precision], #[strong recall] and #[strong F-score].
    +item
        |  #[strong Save] the trained model using
        |  #[+api("language#to_disk") #[code nlp.to_disk]].
    +item
        |  #[strong Test] the model to make sure the text classifier works as
        |  expected.
--- a/website/usage/examples.jade
+++ b/website/usage/examples.jade
@ -2,6 +2,37 @@
 include ../_includes/_mixins
 +section("information-extraction")
    +h(3, "phrase-matcher") Using spaCy's phrase matcher
        +tag-new(2)
    p
        |  This example shows how to use the new
        |  #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find
        |  entities from a large terminology list.
    +github("spacy", "examples/information_extraction/phrase_matcher.py")
    +h(3, "entity-relations") Extracting entity relations
    p
        |  A simple example of extracting relations between phrases and
        |  entities using spaCy's named entity recognizer and the dependency
        |  parse. Here, we extract money and currency values (entities labelled
        |  as #[code MONEY]) and then check the dependency tree to find the
        |  noun phrase they are referring to – for example: "$9.4 million"
        |  &rarr; "Net income".
    +github("spacy", "examples/information_extraction/entity_relations.py")
    +h(3, "subtrees") Navigating the parse tree and subtrees
    p
        |  This example shows how to navigate the parse tree including subtrees
        |  attached to a word.
    +github("spacy", "examples/information_extraction/parse_subtrees.py")
 +section("pipeline")
    +h(3, "custom-components-entities") Custom pipeline components and attribute extensions
        +tag-new(2)
@ -40,27 +71,29 @@ include ../_includes/_mixins
    +github("spacy", "examples/pipeline/custom_attr_methods.py")
-+section("matching")
+    +h(3, "multi-processing") Multi-processing with Joblib
    +h(3, "matcher") Using spaCy's rule-based matcher
    p
-        |  This example shows how to use spaCy's rule-based
+        |  This example shows how to use multiple cores to process text using
-        |  #[+api("matcher") #[code Matcher]] to find and label entities across
+        |  spaCy and #[+a("https://pythonhosted.org/joblib/") Joblib]. We're
-        |  documents.
+        |  exporting part-of-speech-tagged, true-cased, (very roughly)
        |  sentence-separated text, with each "sentence" on a newline, and
        |  spaces between tokens. The data comes from the IMDB movie reviews
        |  dataset and is loaded automatically via Thinc's built-in dataset
        |  loader.
-    +github("spacy", "examples/matcher_example.py")
+    +github("spacy", "examples/pipeline/multi_processing.py")
    +h(3, "phrase-matcher") Using spaCy's phrase matcher
        +tag-new(2)
    p
        |  This example shows how to use the new
        |  #[+api("phrasematcher") #[code PhraseMatcher]] to efficiently find
        |  entities from a large terminology list.
    +github("spacy", "examples/phrase_matcher.py")
 +section("training")
    +h(3, "training-ner") Training spaCy's Named Entity Recognizer
    p
        |  This example shows how to update spaCy's entity recognizer
        |  with your own examples, starting off with an existing, pre-trained
        |  model, or from scratch using a blank #[code Language] class.
    +github("spacy", "examples/training/train_ner.py")
    +h(3, "new-entity-type") Training an additional entity type
    p
@ -71,25 +104,63 @@ include ../_includes/_mixins
    +github("spacy", "examples/training/train_new_entity_type.py")
-    +h(3, "ner-standalone") Training an NER system from scratch
+    +h(3, "parser") Training spaCy's Dependency Parser
    p
-        |  This example is written to be self-contained and reasonably
+        |  This example shows how to update spaCy's dependency parser,
-        |  transparent. To achieve that, it duplicates some of spaCy's internal
+        |  starting off with an existing, pre-trained model, or from scratch
-        |  functionality.
+        |  using a blank #[code Language] class.
-    +github("spacy", "examples/training/train_ner_standalone.py")
+    +github("spacy", "examples/training/train_parser.py")
    +h(3, "tagger") Training spaCy's Part-of-speech Tagger
    p
        |  In this example, we're training spaCy's part-of-speech tagger with a
        |  custom tag map, mapping our own tags to the
        |  #[+a("http://universaldependencies.github.io/docs/u/pos/index.html") Universal Dependencies scheme].
    +github("spacy", "examples/training/train_tagger.py")
    +h(3, "intent-parser") Training a custom parser for chat intent semantics
    p
        |  spaCy's parser component can be trained to predict any type
        |  of tree structure over your input text. You can also predict trees
        |  over whole documents or chat logs, with connections between the
        |  sentence-roots used to annotate discourse structure. In this example,
        |  we'll build a message parser for a common "chat intent": finding
        |  local businesses. Our message semantics will have the following types
        |  of relations: #[code ROOT], #[code PLACE], #[code QUALITY],
        |  #[code ATTRIBUTE], #[code TIME] and #[code LOCATION].
    +github("spacy", "examples/training/train_intent_parser.py")
    +h(3, "textcat") Training spaCy's text classifier
        +tag-new(2)
    p
-        |  This example shows how to use and train spaCy's new
+        |  This example shows how to train a multi-label convolutional neural
-        |  #[+api("textcategorizer") #[code TextCategorizer]] pipeline component
+        |  network text classifier on IMDB movie reviews, using spaCy's new
-        |  on IMDB movie reviews.
+        |  #[+api("textcategorizer") #[code TextCategorizer]] component. The
        |  dataset will be loaded automatically via Thinc's built-in dataset
        |  loader. Predictions are available via
        |  #[+api("doc#attributes") #[code Doc.cats]].
    +github("spacy", "examples/training/train_textcat.py")
 +section("vectors")
    +h(3, "fasttext") Loading pre-trained fastText vectors
    p
        |  This simple snippet is all you need to be able to use Facebook's
        |  #[+a("https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md") fastText vectors]
        |  (294 languages, pre-trained on Wikipedia) with spaCy. Once they're
        |  loaded, the vectors will be available via spaCy's built-in
        |  #[code similarity()] methods.
    +github("spacy", "examples/vectors_fast_text.py")
 +section("deep-learning")
    +h(3, "keras") Text classification with Keras
@ -108,4 +179,4 @@ include ../_includes/_mixins
        |  parameters, and was implemented using #[+a("https://keras.io") Keras]
        |  and spaCy.
-    +github("spacy", "examples/keras_parikh_entailment/__main__.py", "examples/keras_parikh_entailment")
+    +github("spacy", "examples/keras_parikh_entailment/__main__.py", false, "examples/keras_parikh_entailment")
--- a/website/usage/text-classification.jade
+++ b/website/usage/text-classification.jade
@ -2,8 +2,4 @@
 include ../_includes/_mixins
-+under-construction
+include _training/_textcat
 +h(2, "example") Example
 +github("spacy", "examples/training/train_textcat.py")
		`@ -1,5 +0,0 @@`
			`An example of inventory counting using the spaCy NLP library. Meant to show how to instantiate spaCy's English class, and allow reusability by reloading the main module.`

			`In the future, a better implementation of this library would be to apply machine learning to each query and learn what to classify as the quantitative statement (55 kgs OF), vs the actual item of count (how likely is a preposition object to be the item of count if x,y,z qualifications appear in the statement).`