Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-09-16 09:02:35 +03:00 · 2017-05-28 08:12:05 -05:00 · 2017-05-28 08:12:05 -05:00 · 8a24c60c1e
commit 8a24c60c1e
parent bc97bc292c fe11564b8e
58 changed files with 787 additions and 719 deletions
--- a/spacy/init.py
+++ b/spacy/init.py
@ -1,9 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 import importlib
 from .compat import basestring_
 from .cli.info import info as cli_info
 from .glossary import explain
 from .deprecated import resolve_load_name
@ -12,14 +9,7 @@ from . import util
 def load(name, **overrides):
    name = resolve_load_name(name, **overrides)
-    model_path = util.resolve_model_path(name)
+    return util.load_model(name)
    meta = util.parse_package_meta(model_path)
    if 'lang' not in meta:
        raise IOError('No language setting found in model meta.')
    cls = util.get_lang_class(meta['lang'])
    overrides['meta'] = meta
    overrides['path'] = model_path
    return cls(**overrides)
 def info(model=None, markdown=False):
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -150,6 +150,9 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
        else:
            int_key = IDS[name.upper()]
        if strings_map is not None and isinstance(value, basestring):
            if hasattr(strings_map, 'add'):
                value = strings_map.add(value)
            else:
                value = strings_map[value]
        inty_attrs[int_key] = value
    return inty_attrs
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@ -20,8 +20,14 @@ def info(cmd, model=None, markdown=False):
    prints details in Markdown for easy copy-pasting to GitHub issues.
    """
    if model:
-        model_path = util.resolve_model_path(model)
+        if util.is_package(model):
-        meta = util.parse_package_meta(model_path)
+            model_path = util.get_package_path(model)
        else:
            model_path = util.get_data_path() / model
        meta_path = model_path / 'meta.json'
        if not meta_path.is_file():
            prints(meta_path, title="Can't find model meta.json", exits=1)
        meta = read_json(meta_path)
        if model_path.resolve() != model_path:
            meta['link'] = path2str(model_path)
            meta['source'] = path2str(model_path.resolve())
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@ -21,7 +21,7 @@ def link(cmd, origin, link_name, force=False):
    directory. Linking models allows loading them via spacy.load(link_name).
    """
    if util.is_package(origin):
-        model_path = util.get_model_package_path(origin)
+        model_path = util.get_package_path(model)
    else:
        model_path = Path(origin)
    if not model_path.exists():
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@ -1,13 +1,14 @@
 from cymem.cymem cimport Pool
 from .structs cimport TokenC
 from .typedefs cimport attr_t
 from .syntax.transition_system cimport Transition
 cdef struct GoldParseC:
    int* tags
    int* heads
-    int* labels
+    attr_t* labels
    int** brackets
    Transition* ner
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -384,7 +384,7 @@ cdef class GoldParse:
        # These are filled by the tagger/parser/entity recogniser
        self.c.tags = <int*>self.mem.alloc(len(doc), sizeof(int))
        self.c.heads = <int*>self.mem.alloc(len(doc), sizeof(int))
-        self.c.labels = <int*>self.mem.alloc(len(doc), sizeof(int))
+        self.c.labels = <attr_t*>self.mem.alloc(len(doc), sizeof(attr_t))
        self.c.ner = <Transition*>self.mem.alloc(len(doc), sizeof(Transition))
        self.words = [None] * len(doc)
--- a/spacy/lang/en/init.py
+++ b/spacy/lang/en/init.py
@ -35,4 +35,4 @@ class English(Language):
    Defaults = EnglishDefaults
-__all__ = ['English', 'EnglishDefaults']
+__all__ = ['English']
--- a/spacy/lang/xx/init.py
+++ b/spacy/lang/xx/init.py
@ -0,0 +1,26 @@
 # coding: utf8
 from __future__ import unicode_literals
 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ...language import Language
 from ...attrs import LANG
 from ...util import update_exc
 class MultiLanguageDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters[LANG] = lambda text: 'xx'
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
 class MultiLanguage(Language):
    """Language class to be used for models that support multiple languages.
    This module allows models to specify their language ID as 'xx'.
    """
    lang = 'xx'
    Defaults = MultiLanguageDefaults
 __all__ = ['MultiLanguage']
--- a/spacy/language.py
+++ b/spacy/language.py
@ -215,7 +215,9 @@ class Language(object):
        grads = {}
        def get_grads(W, dW, key=None):
            grads[key] = (W, dW)
-        for proc in self.pipeline[1:]:
+        pipes = list(self.pipeline[1:])
        random.shuffle(pipes)
        for proc in pipes:
            if not hasattr(proc, 'update'):
                continue
            tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -27,7 +27,7 @@ cdef class Lexeme:
    cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil:
        cdef SerializedLexemeC lex_data
        buff = <const unsigned char*>&lex.flags
-        end = <const unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
+        end = <const unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
        for i in range(sizeof(lex_data.data)):
            lex_data.data[i] = buff[i]
        return lex_data
@ -35,7 +35,7 @@ cdef class Lexeme:
    @staticmethod
    cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil:
        buff = <unsigned char*>&lex.flags
-        end = <unsigned char*>&lex.l2_norm + sizeof(lex.l2_norm)
+        end = <unsigned char*>&lex.sentiment + sizeof(lex.sentiment)
        for i in range(sizeof(lex_data.data)):
            buff[i] = lex_data.data[i]
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -35,11 +35,11 @@ cdef class Lexeme:
    tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
    tag).
    """
-    def __init__(self, Vocab vocab, int orth):
+    def __init__(self, Vocab vocab, attr_t orth):
        """Create a Lexeme object.
        vocab (Vocab): The parent vocabulary
-        orth (int): The orth id of the lexeme.
+        orth (uint64): The orth id of the lexeme.
        Returns (Lexeme): The newly constructd object.
        """
        self.vocab = vocab
@ -51,7 +51,7 @@ cdef class Lexeme:
        if isinstance(other, Lexeme):
            a = self.orth
            b = other.orth
-        elif isinstance(other, int):
+        elif isinstance(other, long):
            a = self.orth
            b = other
        elif isinstance(other, str):
@ -109,7 +109,7 @@ cdef class Lexeme:
    def to_bytes(self):
        lex_data = Lexeme.c_to_bytes(self.c)
        start = <const char*>&self.c.flags
-        end = <const char*>&self.c.l2_norm + sizeof(self.c.l2_norm)
+        end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
        assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
        byte_string = b'\0' * sizeof(lex_data.data)
        byte_chars = <char*>byte_string
@ -136,12 +136,7 @@ cdef class Lexeme:
        RETURNS (bool): Whether a word vector is associated with the object.
        """
        def __get__(self):
-            cdef int i
+            return self.vocab.has_vector(self.c.orth)
            for i in range(self.vocab.vectors_length):
                if self.c.vector[i] != 0:
                    return True
            else:
                return False
    property vector_norm:
        """The L2 norm of the lexeme's vector representation.
@ -149,10 +144,8 @@ cdef class Lexeme:
        RETURNS (float): The L2 norm of the vector representation.
        """
        def __get__(self):
-            return self.c.l2_norm
+            vector = self.vector
-
+            return numpy.sqrt((vector**2).sum())
        def __set__(self, float value):
            self.c.l2_norm = value
    property vector:
        """A real-valued meaning representation.
@ -169,27 +162,16 @@ cdef class Lexeme:
                    "model doesn't include word vectors. For more info, see "
                    "the documentation: \n%s\n" % about.__docs_models__
                )
-
+            return self.vocab.get_vector(self.c.orth)
            vector_view = <float[:length,]>self.c.vector
            return numpy.asarray(vector_view)
        def __set__(self, vector):
            assert len(vector) == self.vocab.vectors_length
-            cdef float value
+            self.vocab.set_vector(self.c.orth, vector)
            cdef double norm = 0.0
            for i, value in enumerate(vector):
                self.c.vector[i] = value
                norm += value * value
            self.c.l2_norm = sqrt(norm)
    property rank:
        def __get__(self):
            return self.c.id
    property repvec:
        def __get__(self):
            raise AttributeError("lex.repvec has been renamed to lex.vector")
    property sentiment:
        def __get__(self):
            return self.c.sentiment
@ -210,31 +192,31 @@ cdef class Lexeme:
    property lower:
        def __get__(self): return self.c.lower
-        def __set__(self, int x): self.c.lower = x
+        def __set__(self, attr_t x): self.c.lower = x
    property norm:
        def __get__(self): return self.c.norm
-        def __set__(self, int x): self.c.norm = x
+        def __set__(self, attr_t x): self.c.norm = x
    property shape:
        def __get__(self): return self.c.shape
-        def __set__(self, int x): self.c.shape = x
+        def __set__(self, attr_t x): self.c.shape = x
    property prefix:
        def __get__(self): return self.c.prefix
-        def __set__(self, int x): self.c.prefix = x
+        def __set__(self, attr_t x): self.c.prefix = x
    property suffix:
        def __get__(self): return self.c.suffix
-        def __set__(self, int x): self.c.suffix = x
+        def __set__(self, attr_t x): self.c.suffix = x
    property cluster:
        def __get__(self): return self.c.cluster
-        def __set__(self, int x): self.c.cluster = x
+        def __set__(self, attr_t x): self.c.cluster = x
    property lang:
        def __get__(self): return self.c.lang
-        def __set__(self, int x): self.c.lang = x
+        def __set__(self, attr_t x): self.c.lang = x
    property prob:
        def __get__(self): return self.c.prob
@ -270,7 +252,7 @@ cdef class Lexeme:
    property is_oov:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_OOV)
-        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_OOV, x)
+        def __set__(self, attr_t x): Lexeme.c_set_flag(self.c, IS_OOV, x)
    property is_stop:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_STOP)
@ -320,7 +302,6 @@ cdef class Lexeme:
        def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x)
    property like_url:
        def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL)
        def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x)
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -154,7 +154,7 @@ def _convert_strings(token_specs, string_store):
            if isinstance(attr, basestring):
                attr = attrs.IDS.get(attr.upper())
            if isinstance(value, basestring):
-                value = string_store[value]
+                value = string_store.add(value)
            if isinstance(value, bool):
                value = int(value)
            if attr is not None:
@ -381,7 +381,7 @@ cdef class Matcher:
    def _normalize_key(self, key):
        if isinstance(key, basestring):
-            return self.vocab.strings[key]
+            return self.vocab.strings.add(key)
        else:
            return key
@ -469,7 +469,7 @@ cdef class PhraseMatcher:
            self(doc)
            yield doc
-    def accept_match(self, Doc doc, int ent_id, int label, int start, int end):
+    def accept_match(self, Doc doc, attr_t ent_id, attr_t label, int start, int end):
        assert (end - start) < self.max_length
        cdef int i, j
        for i in range(self.max_length):
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -48,7 +48,7 @@ cdef class Morphology:
            self.tag_map[tag_str] = dict(attrs)
            attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
            self.rich_tags[i].id = i
-            self.rich_tags[i].name = self.strings[tag_str]
+            self.rich_tags[i].name = self.strings.add(tag_str)
            self.rich_tags[i].morph = 0
            self.rich_tags[i].pos = attrs[POS]
            self.reverse_index[self.rich_tags[i].name] = i
@ -59,10 +59,12 @@ cdef class Morphology:
    cdef int assign_tag(self, TokenC* token, tag) except -1:
        if isinstance(tag, basestring):
-            tag_id = self.reverse_index[self.strings[tag]]
+            tag = self.strings.add(tag)
-        else:
+        if tag in self.reverse_index:
            tag_id = self.reverse_index[tag]
            self.assign_tag_id(token, tag_id)
        else:
            token.tag = tag
    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
        if tag_id >= self.n_tags:
@ -73,7 +75,7 @@ cdef class Morphology:
        # the statistical model fails.
        # Related to Issue #220
        if Lexeme.c_check_flag(token.lex, IS_SPACE):
-            tag_id = self.reverse_index[self.strings['SP']]
+            tag_id = self.reverse_index[self.strings.add('SP')]
        rich_tag = self.rich_tags[tag_id]
        analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
        if analysis is NULL:
@ -104,7 +106,7 @@ cdef class Morphology:
            tag (unicode): The part-of-speech tag to key the exception.
            orth (unicode): The word-form to key the exception.
        """
-        tag = self.strings[tag_str]
+        tag = self.strings.add(tag_str)
        tag_id = self.reverse_index[tag]
        orth = self.strings[orth_str]
        cdef RichTagC rich_tag = self.rich_tags[tag_id]
@ -140,14 +142,14 @@ cdef class Morphology:
    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
        cdef unicode py_string = self.strings[orth]
        if self.lemmatizer is None:
-            return self.strings[py_string.lower()]
+            return self.strings.add(py_string.lower())
        if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
-            return self.strings[py_string.lower()]
+            return self.strings.add(py_string.lower())
        cdef set lemma_strings
        cdef unicode lemma_string
        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
        lemma_string = sorted(lemma_strings)[0]
-        lemma = self.strings[lemma_string]
+        lemma = self.strings.add(lemma_string)
        return lemma
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@ -228,6 +228,7 @@ class NeuralTagger(object):
                idx += 1
        correct = self.model.ops.xp.array(correct, dtype='i')
        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
        d_scores /= d_scores.shape[0]
        loss = (d_scores**2).sum()
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
        return float(loss), d_scores
@ -292,6 +293,7 @@ class NeuralLabeller(NeuralTagger):
                idx += 1
        correct = self.model.ops.xp.array(correct, dtype='i')
        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
        d_scores /= d_scores.shape[0]
        loss = (d_scores**2).sum()
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
        return float(loss), d_scores
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@ -1,4 +1,5 @@
 from libc.stdint cimport int64_t
 from libcpp.vector cimport vector
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
@ -8,6 +9,9 @@ from .typedefs cimport attr_t, hash_t
 cpdef hash_t hash_string(unicode string) except 0
 cdef hash_t hash_utf8(char* utf8_string, int length) nogil
 cdef unicode decode_Utf8Str(const Utf8Str* string)
 ctypedef union Utf8Str:
@ -17,13 +21,11 @@ ctypedef union Utf8Str:
 cdef class StringStore:
    cdef Pool mem
    cdef Utf8Str* c
    cdef int64_t size
    cdef bint is_frozen
    cdef vector[hash_t] keys
    cdef public PreshMap _map
    cdef public PreshMap _oov
    cdef int64_t _resize_at
    cdef const Utf8Str* intern_unicode(self, unicode py_string)
    cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length)
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -11,6 +11,9 @@ from libc.stdint cimport uint32_t
 import ujson
 import dill
 from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 from .typedefs cimport hash_t
 from . import util
@ -28,7 +31,7 @@ cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil:
    return hash32(utf8_string, length, 1)
-cdef unicode _decode(const Utf8Str* string):
+cdef unicode decode_Utf8Str(const Utf8Str* string):
    cdef int i, length
    if string.s[0] < sizeof(string.s) and string.s[0] != 0:
        return string.s[1:string.s[0]+1].decode('utf8')
@ -45,10 +48,10 @@ cdef unicode _decode(const Utf8Str* string):
        return string.p[i:length + i].decode('utf8')
-cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
+cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *:
    cdef int n_length_bytes
    cdef int i
-    cdef Utf8Str string
+    cdef Utf8Str* string = <Utf8Str*>mem.alloc(1, sizeof(Utf8Str))
    cdef uint32_t ulength = length
    if length < sizeof(string.s):
        string.s[0] = <unsigned char>length
@ -73,7 +76,7 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
 cdef class StringStore:
-    """Map strings to and from integer IDs."""
+    """Lookup strings by 64-bit hash"""
    def __init__(self, strings=None, freeze=False):
        """Create the StringStore.
@ -83,70 +86,66 @@ cdef class StringStore:
        self.mem = Pool()
        self._map = PreshMap()
        self._oov = PreshMap()
        self._resize_at = 10000
        self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
        self.size = 1
        self.is_frozen = freeze
        if strings is not None:
            for string in strings:
-                _ = self[string]
+                self.add(string)
-    property size:
+    def __getitem__(self, object string_or_id):
-        def __get__(self):
+        """Retrieve a string from a given hash ID, or vice versa.
-            return self.size -1
+
        string_or_id (bytes or unicode or uint64): The value to encode.
        Returns (unicode or uint64): The value to be retrieved.
        """
        if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
            return 0
        elif string_or_id == 0:
            return u''
        elif string_or_id in SYMBOLS_BY_STR:
            return SYMBOLS_BY_STR[string_or_id]
        cdef hash_t key
        if isinstance(string_or_id, unicode):
            key = hash_string(string_or_id)
            return key
        elif isinstance(string_or_id, bytes):
            key = hash_utf8(string_or_id, len(string_or_id))
            return key
        else:
            if string_or_id < len(SYMBOLS_BY_INT):
                return SYMBOLS_BY_INT[string_or_id]
            key = string_or_id
            utf8str = <Utf8Str*>self._map.get(key)
            if utf8str is NULL:
                raise KeyError(string_or_id)
            else:
                return decode_Utf8Str(utf8str)
    def add(self, string):
        if isinstance(string, unicode):
            if string in SYMBOLS_BY_STR:
                return SYMBOLS_BY_STR[string]
            key = hash_string(string)
            self.intern_unicode(string)
        elif isinstance(string, bytes):
            if string in SYMBOLS_BY_STR:
                return SYMBOLS_BY_STR[string]
            key = hash_utf8(string, len(string))
            self._intern_utf8(string, len(string))
        else:
            raise TypeError(
                "Can only add unicode or bytes. Got type: %s" % type(string))
        return key
    def __len__(self):
        """The number of strings in the store.
        RETURNS (int): The number of strings in the store.
        """
-        return self.size-1
+        return self.keys.size()
-    def __getitem__(self, object string_or_id):
+    def __contains__(self, string not None):
        """Retrieve a string from a given integer ID, or vice versa.
        string_or_id (bytes or unicode or int): The value to encode.
        Returns (unicode or int): The value to be retrieved.
        """
        if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
            return 0
        elif string_or_id == 0:
            return u''
        cdef bytes byte_string
        cdef const Utf8Str* utf8str
        cdef uint64_t int_id
        cdef uint32_t oov_id
        if isinstance(string_or_id, (int, long)):
            int_id = string_or_id
            oov_id = string_or_id
            if int_id < <uint64_t>self.size:
                return _decode(&self.c[int_id])
            else:
                utf8str = <Utf8Str*>self._oov.get(oov_id)
                if utf8str is not NULL:
                    return _decode(utf8str)
                else:
                    raise IndexError(string_or_id)
        else:
            if isinstance(string_or_id, bytes):
                byte_string = <bytes>string_or_id
            elif isinstance(string_or_id, unicode):
                byte_string = (<unicode>string_or_id).encode('utf8')
            else:
                raise TypeError(type(string_or_id))
            utf8str = self._intern_utf8(byte_string, len(byte_string))
            if utf8str is NULL:
                # TODO: We need to use 32 bit here, for compatibility with the
                # vocabulary values. This makes birthday paradox probabilities
                # pretty bad.
                # We could also get unlucky here, and hash into a value that
                # collides with the 'real' strings.
                return hash32_utf8(byte_string, len(byte_string))
            else:
                return utf8str - self.c
    def __contains__(self, unicode string not None):
        """Check whether a string is in the store.
        string (unicode): The string to check.
@ -154,7 +153,11 @@ cdef class StringStore:
        """
        if len(string) == 0:
            return True
-        cdef hash_t key = hash_string(string)
+        if string in SYMBOLS_BY_STR:
            return True
        if isinstance(string, unicode):
            string = string.encode('utf8')
        cdef hash_t key = hash_utf8(string, len(string))
        return self._map.get(key) is not NULL
    def __iter__(self):
@ -163,16 +166,15 @@ cdef class StringStore:
        YIELDS (unicode): A string in the store.
        """
        cdef int i
-        for i in range(self.size):
+        cdef hash_t key
-            yield _decode(&self.c[i]) if i > 0 else u''
+        for i in range(self.keys.size()):
            key = self.keys[i]
            utf8str = <Utf8Str*>self._map.get(key)
            yield decode_Utf8Str(utf8str)
        # TODO: Iterate OOV here?
    def __reduce__(self):
-        strings = [""]
+        strings = list(self)
        for i in range(1, self.size):
            string = &self.c[i]
            py_string = _decode(string)
            strings.append(py_string)
        return (StringStore, (strings,), None, None, None)
    def to_disk(self, path):
@ -230,11 +232,9 @@ cdef class StringStore:
        self.mem = Pool()
        self._map = PreshMap()
        self._oov = PreshMap()
-        self._resize_at = 10000
+        self.keys.clear()
        self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
        self.size = 1
        for string in strings:
-            _ = self[string]
+            self.add(string)
        self.is_frozen = freeze
    cdef const Utf8Str* intern_unicode(self, unicode py_string):
@ -258,39 +258,11 @@ cdef class StringStore:
            key32 = hash32_utf8(utf8_string, length)
            # Important: Make the OOV store own the memory. That way it's trivial
            # to flush them all.
-            value = <Utf8Str*>self._oov.mem.alloc(1, sizeof(Utf8Str))
+            value = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
            value[0] = _allocate(self._oov.mem, <unsigned char*>utf8_string, length)
            self._oov.set(key32, value)
            return NULL
-        if self.size == self._resize_at:
+        value = _allocate(self.mem, <unsigned char*>utf8_string, length)
-            self._realloc()
+        self._map.set(key, value)
-        self.c[self.size] = _allocate(self.mem, <unsigned char*>utf8_string, length)
+        self.keys.push_back(key)
-        self._map.set(key, <void*>&self.c[self.size])
+        return value
        self.size += 1
        return &self.c[self.size-1]
    def _realloc(self):
        # We want to map straight to pointers, but they'll be invalidated if
        # we resize our array. So, first we remap to indices, then we resize,
        # then we can acquire the new pointers.
        cdef Pool tmp_mem = Pool()
        keys = <key_t*>tmp_mem.alloc(self.size, sizeof(key_t))
        cdef key_t key
        cdef void* value
        cdef const Utf8Str ptr
        cdef int i = 0
        cdef size_t offset
        while map_iter(self._map.c_map, &i, &key, &value):
            # Find array index with pointer arithmetic
            offset = ((<Utf8Str*>value) - self.c)
            keys[offset] = key
        self._resize_at *= 2
        cdef size_t new_size = self._resize_at * sizeof(Utf8Str)
        self.c = <Utf8Str*>self.mem.realloc(self.c, new_size)
        self._map = PreshMap(self.size)
        for i in range(self.size):
            if keys[i]:
                self._map.set(keys[i], &self.c[i])
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@ -5,8 +5,6 @@ from .parts_of_speech cimport univ_pos_t
 cdef struct LexemeC:
    float* vector
    flags_t flags
    attr_t lang
@ -25,11 +23,10 @@ cdef struct LexemeC:
    float prob
    float sentiment
    float l2_norm
 cdef struct SerializedLexemeC:
-    unsigned char[4*13 + 8] data
+    unsigned char[8 + 8*10 + 4 + 4] data
    #    sizeof(flags_t)  # flags
    #    + sizeof(attr_t) # lang
    #    + sizeof(attr_t) # id
@ -50,7 +47,7 @@ cdef struct Entity:
    hash_t id
    int start
    int end
-    int label
+    attr_t label
 cdef struct TokenC:
@ -58,12 +55,12 @@ cdef struct TokenC:
    uint64_t morph
    univ_pos_t pos
    bint spacy
-    int tag
+    attr_t tag
    int idx
-    int lemma
+    attr_t lemma
-    int sense
+    attr_t sense
    int head
-    int dep
+    attr_t dep
    bint sent_start
    uint32_t l_kids
@ -72,5 +69,5 @@ cdef struct TokenC:
    uint32_t r_edge
    int ent_iob
-    int ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
+    attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth..
    hash_t ent_id
--- a/spacy/syntax/arc_eager.pxd
+++ b/spacy/syntax/arc_eager.pxd
@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
 from .stateclass cimport StateClass
 from ..typedefs cimport attr_t
 from .transition_system cimport TransitionSystem, Transition
 from ..gold cimport GoldParseC
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -99,7 +99,7 @@ cdef bint arc_is_gold(const GoldParseC* gold, int head, int child) nogil:
        return False
-cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) nogil:
+cdef bint label_is_gold(const GoldParseC* gold, int head, int child, attr_t label) nogil:
    if gold.labels[child] == -1:
        return True
    elif label == -1:
@ -116,16 +116,16 @@ cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil:
 cdef class Shift:
    @staticmethod
-    cdef bint is_valid(const StateC* st, int label) nogil:
+    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        return st.buffer_length() >= 2 and not st.shifted[st.B(0)] and not st.B_(0).sent_start
    @staticmethod
-    cdef int transition(StateC* st, int label) nogil:
+    cdef int transition(StateC* st, attr_t label) nogil:
        st.push()
        st.fast_forward()
    @staticmethod
-    cdef weight_t cost(StateClass st, const GoldParseC* gold, int label) nogil:
+    cdef weight_t cost(StateClass st, const GoldParseC* gold, attr_t label) nogil:
        return Shift.move_cost(st, gold) + Shift.label_cost(st, gold, label)
    @staticmethod
@ -133,17 +133,17 @@ cdef class Shift:
        return push_cost(s, gold, s.B(0))
    @staticmethod
-    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        return 0
 cdef class Reduce:
    @staticmethod
-    cdef bint is_valid(const StateC* st, int label) nogil:
+    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        return st.stack_depth() >= 2
    @staticmethod
-    cdef int transition(StateC* st, int label) nogil:
+    cdef int transition(StateC* st, attr_t label) nogil:
        if st.has_head(st.S(0)):
            st.pop()
        else:
@ -151,7 +151,7 @@ cdef class Reduce:
        st.fast_forward()
    @staticmethod
-    cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        return Reduce.move_cost(s, gold) + Reduce.label_cost(s, gold, label)
    @staticmethod
@ -170,23 +170,23 @@ cdef class Reduce:
        return cost
    @staticmethod
-    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        return 0
 cdef class LeftArc:
    @staticmethod
-    cdef bint is_valid(const StateC* st, int label) nogil:
+    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        return not st.B_(0).sent_start
    @staticmethod
-    cdef int transition(StateC* st, int label) nogil:
+    cdef int transition(StateC* st, attr_t label) nogil:
        st.add_arc(st.B(0), st.S(0), label)
        st.pop()
        st.fast_forward()
    @staticmethod
-    cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        return LeftArc.move_cost(s, gold) + LeftArc.label_cost(s, gold, label)
    @staticmethod
@ -204,23 +204,23 @@ cdef class LeftArc:
            return cost + pop_cost(s, gold, s.S(0)) + arc_cost(s, gold, s.B(0), s.S(0))
    @staticmethod
-    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        return arc_is_gold(gold, s.B(0), s.S(0)) and not label_is_gold(gold, s.B(0), s.S(0), label)
 cdef class RightArc:
    @staticmethod
-    cdef bint is_valid(const StateC* st, int label) nogil:
+    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        return not st.B_(0).sent_start
    @staticmethod
-    cdef int transition(StateC* st, int label) nogil:
+    cdef int transition(StateC* st, attr_t label) nogil:
        st.add_arc(st.S(0), st.B(0), label)
        st.push()
        st.fast_forward()
    @staticmethod
-    cdef inline weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef inline weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        return RightArc.move_cost(s, gold) + RightArc.label_cost(s, gold, label)
    @staticmethod
@ -233,13 +233,13 @@ cdef class RightArc:
            return push_cost(s, gold, s.B(0)) + arc_cost(s, gold, s.S(0), s.B(0))
    @staticmethod
-    cdef weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        return arc_is_gold(gold, s.S(0), s.B(0)) and not label_is_gold(gold, s.S(0), s.B(0), label)
 cdef class Break:
    @staticmethod
-    cdef bint is_valid(const StateC* st, int label) nogil:
+    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int i
        if not USE_BREAK:
            return False
@ -251,12 +251,12 @@ cdef class Break:
            return True
    @staticmethod
-    cdef int transition(StateC* st, int label) nogil:
+    cdef int transition(StateC* st, attr_t label) nogil:
        st.set_break(st.B_(0).l_edge)
        st.fast_forward()
    @staticmethod
-    cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        return Break.move_cost(s, gold) + Break.label_cost(s, gold, label)
    @staticmethod
@ -281,7 +281,7 @@ cdef class Break:
            return cost + 1
    @staticmethod
-    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef inline weight_t label_cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        return 0
 cdef int _get_root(int word, const GoldParseC* gold) nogil:
@ -295,9 +295,7 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil:
 cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
    cdef StateClass st = StateClass.init(<const TokenC*>tokens, length)
    # Ensure sent_start is set to 0 throughout
    for i in range(st.c.length):
        st.c._sent[i].sent_start = False
        st.c._sent[i].l_edge = i
        st.c._sent[i].r_edge = i
    st.fast_forward()
@ -371,7 +369,7 @@ cdef class ArcEager(TransitionSystem):
                if label.upper() == 'ROOT':
                    label = 'ROOT'
                gold.c.heads[i] = gold.heads[i]
-                gold.c.labels[i] = self.strings[label]
+                gold.c.labels[i] = self.strings.add(label)
        return gold
    cdef Transition lookup_transition(self, object name) except *:
@ -386,14 +384,14 @@ cdef class ArcEager(TransitionSystem):
            if self.c[i].move == move and self.c[i].label == label:
                return self.c[i]
-    def move_name(self, int move, int label):
+    def move_name(self, int move, attr_t label):
        label_str = self.strings[label]
        if label_str:
            return MOVE_NAMES[move] + '-' + label_str
        else:
            return MOVE_NAMES[move]
-    cdef Transition init_transition(self, int clas, int move, int label) except *:
+    cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
        # TODO: Apparent Cython bug here when we try to use the Transition()
        # constructor with the function pointers
        cdef Transition t
@ -426,9 +424,7 @@ cdef class ArcEager(TransitionSystem):
        return t
    cdef int initialize_state(self, StateC* st) nogil:
        # Ensure sent_start is set to 0 throughout
        for i in range(st.length):
            st._sent[i].sent_start = False
            st._sent[i].l_edge = i
            st._sent[i].r_edge = i
        st.fast_forward()
@ -473,7 +469,7 @@ cdef class ArcEager(TransitionSystem):
        label_cost_funcs[RIGHT] = RightArc.label_cost
        label_cost_funcs[BREAK] = Break.label_cost
-        cdef int* labels = gold.c.labels
+        cdef attr_t* labels = gold.c.labels
        cdef int* heads = gold.c.heads
        n_gold = 0
--- a/spacy/syntax/ner.pxd
+++ b/spacy/syntax/ner.pxd
@ -1,6 +1,7 @@
 from .transition_system cimport TransitionSystem
 from .transition_system cimport Transition
 from ..gold cimport GoldParseC
 from ..typedefs cimport attr_t
 cdef class BiluoPushDown(TransitionSystem):
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@ -100,7 +100,7 @@ cdef class BiluoPushDown(TransitionSystem):
        def __get__(self):
            return (BEGIN, IN, LAST, UNIT, OUT)
-    def move_name(self, int move, int label):
+    def move_name(self, int move, attr_t label):
        if move == OUT:
            return 'O'
        elif move == MISSING:
@ -132,7 +132,7 @@ cdef class BiluoPushDown(TransitionSystem):
            if label_str.startswith('!'):
                label_str = label_str[1:]
                move_str = 'x'
-            label = self.strings[label_str]
+            label = self.strings.add(label_str)
        else:
            move_str = name
            label = 0
@ -145,7 +145,7 @@ cdef class BiluoPushDown(TransitionSystem):
        else:
            raise KeyError(name)
-    cdef Transition init_transition(self, int clas, int move, int label) except *:
+    cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
        # TODO: Apparent Cython bug here when we try to use the Transition()
        # constructor with the function pointers
        cdef Transition t
@ -194,21 +194,21 @@ cdef class BiluoPushDown(TransitionSystem):
 cdef class Missing:
    @staticmethod
-    cdef bint is_valid(const StateC* st, int label) nogil:
+    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        return False
    @staticmethod
-    cdef int transition(StateC* s, int label) nogil:
+    cdef int transition(StateC* s, attr_t label) nogil:
        pass
    @staticmethod
-    cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        return 9000
 cdef class Begin:
    @staticmethod
-    cdef bint is_valid(const StateC* st, int label) nogil:
+    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        # Ensure we don't clobber preset entities. If no entity preset,
        # ent_iob is 0
        cdef int preset_ent_iob = st.B_(0).ent_iob
@ -232,14 +232,14 @@ cdef class Begin:
            return label != 0 and not st.entity_is_open()
    @staticmethod
-    cdef int transition(StateC* st, int label) nogil:
+    cdef int transition(StateC* st, attr_t label) nogil:
        st.open_ent(label)
        st.set_ent_tag(st.B(0), 3, label)
        st.push()
        st.pop()
    @staticmethod
-    cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        cdef int g_act = gold.ner[s.B(0)].move
        cdef int g_tag = gold.ner[s.B(0)].label
@ -261,7 +261,7 @@ cdef class Begin:
 cdef class In:
    @staticmethod
-    cdef bint is_valid(const StateC* st, int label) nogil:
+    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int preset_ent_iob = st.B_(0).ent_iob
        if preset_ent_iob == 2:
            return False
@ -277,17 +277,17 @@ cdef class In:
        return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
    @staticmethod
-    cdef int transition(StateC* st, int label) nogil:
+    cdef int transition(StateC* st, attr_t label) nogil:
        st.set_ent_tag(st.B(0), 1, label)
        st.push()
        st.pop()
    @staticmethod
-    cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        move = IN
        cdef int next_act = gold.ner[s.B(1)].move if s.B(0) < s.c.length else OUT
        cdef int g_act = gold.ner[s.B(0)].move
-        cdef int g_tag = gold.ner[s.B(0)].label
+        cdef attr_t g_tag = gold.ner[s.B(0)].label
        cdef bint is_sunk = _entity_is_sunk(s, gold.ner)
        if g_act == MISSING:
@ -313,24 +313,24 @@ cdef class In:
 cdef class Last:
    @staticmethod
-    cdef bint is_valid(const StateC* st, int label) nogil:
+    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        if st.B_(1).ent_iob == 1:
            return False
        return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label
    @staticmethod
-    cdef int transition(StateC* st, int label) nogil:
+    cdef int transition(StateC* st, attr_t label) nogil:
        st.close_ent()
        st.set_ent_tag(st.B(0), 1, label)
        st.push()
        st.pop()
    @staticmethod
-    cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        move = LAST
        cdef int g_act = gold.ner[s.B(0)].move
-        cdef int g_tag = gold.ner[s.B(0)].label
+        cdef attr_t g_tag = gold.ner[s.B(0)].label
        if g_act == MISSING:
            return 0
@ -355,7 +355,7 @@ cdef class Last:
 cdef class Unit:
    @staticmethod
-    cdef bint is_valid(const StateC* st, int label) nogil:
+    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int preset_ent_iob = st.B_(0).ent_iob
        if preset_ent_iob == 2:
            return False
@ -368,7 +368,7 @@ cdef class Unit:
        return label != 0 and not st.entity_is_open()
    @staticmethod
-    cdef int transition(StateC* st, int label) nogil:
+    cdef int transition(StateC* st, attr_t label) nogil:
        st.open_ent(label)
        st.close_ent()
        st.set_ent_tag(st.B(0), 3, label)
@ -376,9 +376,9 @@ cdef class Unit:
        st.pop()
    @staticmethod
-    cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        cdef int g_act = gold.ner[s.B(0)].move
-        cdef int g_tag = gold.ner[s.B(0)].label
+        cdef attr_t g_tag = gold.ner[s.B(0)].label
        if g_act == MISSING:
            return 0
@ -398,7 +398,7 @@ cdef class Unit:
 cdef class Out:
    @staticmethod
-    cdef bint is_valid(const StateC* st, int label) nogil:
+    cdef bint is_valid(const StateC* st, attr_t label) nogil:
        cdef int preset_ent_iob = st.B_(0).ent_iob
        if preset_ent_iob == 3:
            return False
@ -407,15 +407,15 @@ cdef class Out:
        return not st.entity_is_open()
    @staticmethod
-    cdef int transition(StateC* st, int label) nogil:
+    cdef int transition(StateC* st, attr_t label) nogil:
        st.set_ent_tag(st.B(0), 2, 0)
        st.push()
        st.pop()
    @staticmethod
-    cdef weight_t cost(StateClass s, const GoldParseC* gold, int label) nogil:
+    cdef weight_t cost(StateClass s, const GoldParseC* gold, attr_t label) nogil:
        cdef int g_act = gold.ner[s.B(0)].move
-        cdef int g_tag = gold.ner[s.B(0)].label
+        cdef attr_t g_tag = gold.ner[s.B(0)].label
        if g_act == MISSING or g_act == ISNT:
            return 0
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -428,7 +428,7 @@ cdef class Parser:
        cuda_stream = get_cuda_stream()
-        states, golds, max_length = self._init_gold_batch(docs, golds)
+        states, golds, max_steps = self._init_gold_batch(docs, golds)
        state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream,
                                                      0.0)
        todo = [(s, g) for (s, g) in zip(states, golds)
@ -439,6 +439,7 @@ cdef class Parser:
        backprops = []
        d_tokvecs = state2vec.ops.allocate(tokvecs.shape)
        cdef float loss = 0.
        n_steps = 0
        while todo:
            states, golds = zip(*todo)
@ -450,7 +451,7 @@ cdef class Parser:
            scores, bp_scores = vec2scores.begin_update(vector, drop=drop)
            d_scores = self.get_batch_loss(states, golds, scores)
-            d_vector = bp_scores(d_scores, sgd=sgd)
+            d_vector = bp_scores(d_scores / d_scores.shape[0], sgd=sgd)
            if drop != 0:
                d_vector *= mask
@ -468,7 +469,8 @@ cdef class Parser:
            todo = [st for st in todo if not st[0].is_final()]
            if losses is not None:
                losses[self.name] += (d_scores**2).sum()
-            if len(backprops) >= (max_length * 2):
+            n_steps += 1
            if n_steps >= max_steps:
                break
        self._make_updates(d_tokvecs,
            backprops, sgd, cuda_stream)
@ -483,7 +485,8 @@ cdef class Parser:
            StateClass state
            Transition action
        whole_states = self.moves.init_batch(whole_docs)
-        max_length = max(5, min(20, min([len(doc) for doc in whole_docs])))
+        max_length = max(5, min(50, min([len(doc) for doc in whole_docs])))
        max_moves = 0
        states = []
        golds = []
        for doc, state, gold in zip(whole_docs, whole_states, whole_golds):
@ -494,16 +497,20 @@ cdef class Parser:
            start = 0
            while start < len(doc):
                state = state.copy()
                n_moves = 0
                while state.B(0) < start and not state.is_final():
                    action = self.moves.c[oracle_actions.pop(0)]
                    action.do(state.c, action.label)
                    n_moves += 1
                has_gold = self.moves.has_gold(gold, start=start,
                                               end=start+max_length)
                if not state.is_final() and has_gold:
                    states.append(state)
                    golds.append(gold)
                    max_moves = max(max_moves, n_moves)
                start += min(max_length, len(doc)-start)
-        return states, golds, max_length
+            max_moves = max(max_moves, len(oracle_actions))
        return states, golds, max_moves
    def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None):
        # Tells CUDA to block, so our async copies complete.
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@ -1,6 +1,7 @@
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
 from ..typedefs cimport attr_t
 from ..structs cimport TokenC
 from ..gold cimport GoldParse
 from ..gold cimport GoldParseC
@ -13,20 +14,22 @@ from ._state cimport StateC
 cdef struct Transition:
    int clas
    int move
-    int label
+    attr_t label
    weight_t score
-    bint (*is_valid)(const StateC* state, int label) nogil
+    bint (*is_valid)(const StateC* state, attr_t label) nogil
-    weight_t (*get_cost)(StateClass state, const GoldParseC* gold, int label) nogil
+    weight_t (*get_cost)(StateClass state, const GoldParseC* gold, attr_t label) nogil
-    int (*do)(StateC* state, int label) nogil
+    int (*do)(StateC* state, attr_t label) nogil
-ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
+ctypedef weight_t (*get_cost_func_t)(StateClass state, const GoldParseC* gold,
        attr_tlabel) nogil
 ctypedef weight_t (*move_cost_func_t)(StateClass state, const GoldParseC* gold) nogil
-ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC* gold, int label) nogil
+ctypedef weight_t (*label_cost_func_t)(StateClass state, const GoldParseC*
        gold, attr_t label) nogil
-ctypedef int (*do_func_t)(StateC* state, int label) nogil
+ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
 ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
@ -36,7 +39,7 @@ cdef class TransitionSystem:
    cdef Transition* c
    cdef readonly int n_moves
    cdef int _size
-    cdef public int root_label
+    cdef public attr_t root_label
    cdef public freqs
    cdef init_state_t init_beam_state
@ -45,7 +48,7 @@ cdef class TransitionSystem:
    cdef Transition lookup_transition(self, object name) except *
-    cdef Transition init_transition(self, int clas, int move, int label) except *
+    cdef Transition init_transition(self, int clas, int move, attr_t label) except *
    cdef int set_valid(self, int* output, const StateC* st) nogil
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@ -99,7 +99,7 @@ cdef class TransitionSystem:
    cdef Transition lookup_transition(self, object name) except *:
        raise NotImplementedError
-    cdef Transition init_transition(self, int clas, int move, int label) except *:
+    cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
        raise NotImplementedError
    def is_valid(self, StateClass stcls, move_name):
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@ -204,6 +204,7 @@ def test_doc_api_right_edge(en_tokenizer):
    assert doc[6].right_edge.text == ','
@pytest.mark.xfail
@pytest.mark.parametrize('text,vectors', [
    ("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
 ])
--- a/spacy/tests/doc/test_noun_chunks.py
+++ b/spacy/tests/doc/test_noun_chunks.py
@ -20,7 +20,7 @@ def test_doc_noun_chunks_not_nested(en_tokenizer):
    tokens.from_array(
        [HEAD, DEP],
        numpy.asarray([[1, nsubj], [0, root], [4, amod], [3, nmod], [-1, cc],
-                       [-2, conj], [-5, dobj]], dtype='int32'))
+                       [-2, conj], [-5, dobj]], dtype='uint64'))
    tokens.noun_chunks_iterator = english_noun_chunks
    word_occurred = {}
    for chunk in tokens.noun_chunks:
--- a/spacy/tests/doc/test_token_api.py
+++ b/spacy/tests/doc/test_token_api.py
@ -68,6 +68,7 @@ def test_doc_token_api_is_properties(en_vocab):
    assert doc[5].like_email
@pytest.mark.xfail
@pytest.mark.parametrize('text,vectors', [
    ("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"])
 ])
--- a/spacy/tests/regression/test_issue615.py
+++ b/spacy/tests/regression/test_issue615.py
@ -15,7 +15,9 @@ def test_issue615(en_tokenizer):
        # Get Span objects
        spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches]
        for ent_id, label, span in spans:
-            span.merge('NNP' if label else span.root.tag_, span.text, doc.vocab.strings[label])
+            span.merge(tag='NNP' if label else span.root.tag_, lemma=span.text,
                label=label)
            doc.ents = doc.ents + ((label, span.start, span.end),)
    text = "The golf club is broken"
    pattern = [{'ORTH': "golf"}, {'ORTH': "club"}]
@ -25,6 +27,7 @@ def test_issue615(en_tokenizer):
    matcher = Matcher(doc.vocab)
    matcher.add(label, merge_phrases, pattern)
    match = matcher(doc)
    print(match)
    entities = list(doc.ents)
    assert entities != [] #assertion 1
--- a/spacy/tests/regression/test_issue834.py
+++ b/spacy/tests/regression/test_issue834.py
@ -1,5 +1,6 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import pytest
 word2vec_str = """, -0.046107 -0.035951 -0.560418
@ -8,6 +9,7 @@ de -0.648927 -0.400976 -0.527124
 \u00A0 -1.499184 -0.184280 -0.598371"""
@pytest.mark.xfail
 def test_issue834(en_vocab, text_file):
    """Test that no-break space (U+00A0) is detected as space by the load_vectors function."""
    text_file.write(word2vec_str)
--- a/spacy/tests/stringstore/test_freeze_string_store.py
+++ b/spacy/tests/stringstore/test_freeze_string_store.py
@ -7,6 +7,7 @@ from __future__ import unicode_literals
 import pytest
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["a", "b", "c"]])
 def test_stringstore_freeze_oov(stringstore, text):
    assert stringstore[text[0]] == 1
--- a/spacy/tests/stringstore/test_stringstore.py
+++ b/spacy/tests/stringstore/test_stringstore.py
@ -8,69 +8,65 @@ import pytest
@pytest.mark.parametrize('text1,text2,text3', [(b'Hello', b'goodbye', b'hello')])
 def test_stringstore_save_bytes(stringstore, text1, text2, text3):
-    i = stringstore[text1]
+    key = stringstore.add(text1)
-    assert i == 1
+    assert stringstore[text1] == key
-    assert stringstore[text1] == 1
+    assert stringstore[text2] != key
-    assert stringstore[text2] != i
+    assert stringstore[text3] != key
    assert stringstore[text3] != i
    assert i == 1
@pytest.mark.parametrize('text1,text2,text3', [('Hello', 'goodbye', 'hello')])
 def test_stringstore_save_unicode(stringstore, text1, text2, text3):
-    i = stringstore[text1]
+    key = stringstore.add(text1)
-    assert i == 1
+    assert stringstore[text1] == key
-    assert stringstore[text1] == 1
+    assert stringstore[text2] != key
-    assert stringstore[text2] != i
+    assert stringstore[text3] != key
    assert stringstore[text3] != i
    assert i == 1
@pytest.mark.parametrize('text', [b'A'])
 def test_stringstore_retrieve_id(stringstore, text):
-    i = stringstore[text]
+    key = stringstore.add(text)
-    assert stringstore.size == 1
+    assert len(stringstore) == 1
-    assert stringstore[1] == text.decode('utf8')
+    assert stringstore[key] == text.decode('utf8')
-    with pytest.raises(IndexError):
+    with pytest.raises(KeyError):
-        stringstore[2]
+        stringstore[20000]
@pytest.mark.parametrize('text1,text2', [(b'0123456789', b'A')])
 def test_stringstore_med_string(stringstore, text1, text2):
-    store = stringstore[text1]
+    store = stringstore.add(text1)
    assert stringstore[store] == text1.decode('utf8')
-    dummy = stringstore[text2]
+    dummy = stringstore.add(text2)
    assert stringstore[text1] == store
 def test_stringstore_long_string(stringstore):
    text = "INFORMATIVE](http://www.google.com/search?as_q=RedditMonkey&amp;hl=en&amp;num=50&amp;btnG=Google+Search&amp;as_epq=&amp;as_oq=&amp;as_eq=&amp;lr=&amp;as_ft=i&amp;as_filetype=&amp;as_qdr=all&amp;as_nlo=&amp;as_nhi=&amp;as_occt=any&amp;as_dt=i&amp;as_sitesearch=&amp;as_rights=&amp;safe=off"
-    store = stringstore[text]
+    store = stringstore.add(text)
    assert stringstore[store] == text
@pytest.mark.parametrize('factor', [254, 255, 256])
 def test_stringstore_multiply(stringstore, factor):
    text = 'a' * factor
-    store = stringstore[text]
+    store = stringstore.add(text)
    assert stringstore[store] == text
 def test_stringstore_massive_strings(stringstore):
    text = 'a' * 511
-    store = stringstore[text]
+    store = stringstore.add(text)
    assert stringstore[store] == text
    text2 = 'z' * 512
-    store = stringstore[text2]
+    store = stringstore.add(text2)
    assert stringstore[store] == text2
    text3 = '1' * 513
-    store = stringstore[text3]
+    store = stringstore.add(text3)
    assert stringstore[store] == text3
@pytest.mark.parametrize('text', ["qqqqq"])
 def test_stringstore_to_bytes(stringstore, text):
-    store = stringstore[text]
+    store = stringstore.add(text)
    serialized = stringstore.to_bytes()
    new_stringstore = StringStore().from_bytes(serialized)
    assert new_stringstore[store] == text
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@ -10,8 +10,11 @@ import numpy
 def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given vocab, words and annotations."""
    pos = pos or [''] * len(words)
    tags = tags or [''] * len(words)
    heads = heads or [0] * len(words)
    deps = deps or [''] * len(words)
    for value in (deps+tags+pos):
        vocab.strings.add(value)
    doc = Doc(vocab, words=words)
    attrs = doc.to_array([POS, HEAD, DEP])
--- a/spacy/tests/vectors/test_similarity.py
+++ b/spacy/tests/vectors/test_similarity.py
@ -16,7 +16,7 @@ def vectors():
 def vocab(en_vocab, vectors):
    return add_vecs_to_vocab(en_vocab, vectors)
-
+@pytest.mark.xfail
 def test_vectors_similarity_LL(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    lex1 = vocab[word1]
@ -30,6 +30,7 @@ def test_vectors_similarity_LL(vocab, vectors):
    assert numpy.isclose(lex2.similarity(lex2), lex1.similarity(lex1))
@pytest.mark.xfail
 def test_vectors_similarity_TT(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = get_doc(vocab, words=[word1, word2])
@ -42,18 +43,21 @@ def test_vectors_similarity_TT(vocab, vectors):
    assert numpy.isclose(doc[1].similarity(doc[0]), doc[0].similarity(doc[1]))
@pytest.mark.xfail
 def test_vectors_similarity_TD(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = get_doc(vocab, words=[word1, word2])
    assert doc.similarity(doc[0]) == doc[0].similarity(doc)
@pytest.mark.xfail
 def test_vectors_similarity_DS(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = get_doc(vocab, words=[word1, word2])
    assert doc.similarity(doc[:2]) == doc[:2].similarity(doc)
@pytest.mark.xfail
 def test_vectors_similarity_TS(vocab, vectors):
    [(word1, vec1), (word2, vec2)] = vectors
    doc = get_doc(vocab, words=[word1, word2])
--- a/spacy/tests/vectors/test_vectors.py
+++ b/spacy/tests/vectors/test_vectors.py
@ -22,6 +22,7 @@ def tokenizer_v(vocab):
    return Tokenizer(vocab, {}, None, None, None)
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["apple and orange"])
 def test_vectors_token_vector(tokenizer_v, vectors, text):
    doc = tokenizer_v(text)
@ -29,6 +30,7 @@ def test_vectors_token_vector(tokenizer_v, vectors, text):
    assert vectors[1] == (doc[2].text, list(doc[2].vector))
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["apple", "orange"])
 def test_vectors_lexeme_vector(vocab, text):
    lex = vocab[text]
@ -36,6 +38,7 @@ def test_vectors_lexeme_vector(vocab, text):
    assert lex.vector_norm
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
 def test_vectors_doc_vector(vocab, text):
    doc = get_doc(vocab, text)
@ -43,6 +46,7 @@ def test_vectors_doc_vector(vocab, text):
    assert doc.vector_norm
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
 def test_vectors_span_vector(vocab, text):
    span = get_doc(vocab, text)[0:2]
@ -50,6 +54,7 @@ def test_vectors_span_vector(vocab, text):
    assert span.vector_norm
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["apple orange"])
 def test_vectors_token_token_similarity(tokenizer_v, text):
    doc = tokenizer_v(text)
@ -57,6 +62,7 @@ def test_vectors_token_token_similarity(tokenizer_v, text):
    assert 0.0 < doc[0].similarity(doc[1]) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
 def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
    token = tokenizer_v(text1)
@ -65,6 +71,7 @@ def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
    assert 0.0 < token.similarity(lex) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
 def test_vectors_token_span_similarity(vocab, text):
    doc = get_doc(vocab, text)
@ -72,6 +79,7 @@ def test_vectors_token_span_similarity(vocab, text):
    assert 0.0 < doc[0].similarity(doc[1:3]) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
 def test_vectors_token_doc_similarity(vocab, text):
    doc = get_doc(vocab, text)
@ -79,6 +87,7 @@ def test_vectors_token_doc_similarity(vocab, text):
    assert 0.0 < doc[0].similarity(doc) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
 def test_vectors_lexeme_span_similarity(vocab, text):
    doc = get_doc(vocab, text)
@ -87,6 +96,7 @@ def test_vectors_lexeme_span_similarity(vocab, text):
    assert 0.0 < doc.similarity(doc[1:3]) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
 def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
    lex1 = vocab[text1]
@ -95,6 +105,7 @@ def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
    assert 0.0 < lex1.similarity(lex2) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
 def test_vectors_lexeme_doc_similarity(vocab, text):
    doc = get_doc(vocab, text)
@ -103,6 +114,7 @@ def test_vectors_lexeme_doc_similarity(vocab, text):
    assert 0.0 < lex.similarity(doc) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
 def test_vectors_span_span_similarity(vocab, text):
    doc = get_doc(vocab, text)
@ -110,6 +122,7 @@ def test_vectors_span_span_similarity(vocab, text):
    assert 0.0 < doc[0:2].similarity(doc[1:3]) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
 def test_vectors_span_doc_similarity(vocab, text):
    doc = get_doc(vocab, text)
@ -117,6 +130,7 @@ def test_vectors_span_doc_similarity(vocab, text):
    assert 0.0 < doc[0:2].similarity(doc) < 1.0
@pytest.mark.xfail
@pytest.mark.parametrize('text1,text2', [
    (["apple", "and", "apple", "pie"], ["orange", "juice"])])
 def test_vectors_doc_doc_similarity(vocab, text1, text2):
--- a/spacy/tests/vocab/test_add_vectors.py
+++ b/spacy/tests/vocab/test_add_vectors.py
@ -5,6 +5,7 @@ import numpy
 import pytest
@pytest.mark.xfail
@pytest.mark.parametrize('text', ["Hello"])
 def test_vocab_add_vector(en_vocab, text):
    en_vocab.resize_vectors(10)
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -11,7 +11,6 @@ import struct
 import dill
 from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t
 from libc.math cimport sqrt
 from .span cimport Span
@ -21,6 +20,7 @@ from .token cimport Token
 from .printers import parse_tree
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
 from ..attrs import intify_attrs
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
@ -494,8 +494,8 @@ cdef class Doc:
        cdef np.ndarray[attr_t, ndim=2] output
        # Make an array from the attributes --- otherwise our inner loop is Python
        # dict iteration.
-        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.int32)
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
-        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.int32)
+        output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
        for i in range(self.length):
            for j, feature in enumerate(attr_ids):
                output[i, j] = get_token_attr(&self.c[i], feature)
@ -640,7 +640,7 @@ cdef class Doc:
        """
        if self.length != 0:
            raise ValueError("Cannot load into non-empty Doc")
-        cdef int[:, :] attrs
+        cdef attr_t[:, :] attrs
        cdef int i, start, end, has_space
        fields = dill.loads(data)
        text, attrs = fields[:2]
@ -679,17 +679,15 @@ cdef class Doc:
        if len(args) == 3:
            # TODO: Warn deprecation
            tag, lemma, ent_type = args
-            attributes[TAG] = self.vocab.strings[tag]
+            attributes[TAG] = tag
-            attributes[LEMMA] = self.vocab.strings[lemma]
+            attributes[LEMMA] = lemma
-            attributes[ENT_TYPE] = self.vocab.strings[ent_type]
+            attributes[ENT_TYPE] = ent_type
        elif not args:
            # TODO: This code makes little sense overall. We're still
            # ignoring most of the attributes?
            if "label" in attributes and 'ent_type' not in attributes:
                if type(attributes["label"]) == int:
                    attributes[ENT_TYPE] = attributes["label"]
                else:
-                    attributes[ENT_TYPE] = self.vocab.strings[attributes["label"]]
+                    attributes[ENT_TYPE] = self.vocab.strings.add(attributes["label"])
            if 'ent_type' in attributes:
                attributes[ENT_TYPE] = attributes['ent_type']
        elif args:
@ -699,6 +697,12 @@ cdef class Doc:
                "Arguments supplied:\n%s\n"
                "Keyword arguments:%s\n" % (len(args), repr(args), repr(attributes)))
        # More deprecated attribute handling =/
        if 'label' in attributes:
            attributes['ent_type'] = attributes.pop('label')
        attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
        cdef int start = token_by_start(self.c, self.length, start_idx)
        if start == -1:
            return None
@ -708,13 +712,6 @@ cdef class Doc:
        # Currently we have the token index, we want the range-end index
        end += 1
        cdef Span span = self[start:end]
        tag = self.vocab.strings[attributes.get(TAG, span.root.tag)]
        lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)]
        ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)]
        ent_id = attributes.get('ent_id', span.root.ent_id)
        if isinstance(ent_id, basestring):
            ent_id = self.vocab.strings[ent_id]
        # Get LexemeC for newly merged token
        new_orth = ''.join([t.text_with_ws for t in span])
        if span[-1].whitespace_:
@ -723,18 +720,11 @@ cdef class Doc:
        # House the new merged token where it starts
        cdef TokenC* token = &self.c[start]
        token.spacy = self.c[end-1].spacy
-        if tag in self.vocab.morphology.tag_map:
+        for attr_name, attr_value in attributes.items():
-            self.vocab.morphology.assign_tag(token, tag)
+            if attr_name == TAG:
                self.vocab.morphology.assign_tag(token, attr_value) 
            else:
-            token.tag = self.vocab.strings[tag]
+                Token.set_struct_attr(token, attr_name, attr_value)
        token.lemma = self.vocab.strings[lemma]
        if ent_type == 'O':
            token.ent_iob = 2
            token.ent_type = 0
        else:
            token.ent_iob = 3
            token.ent_type = self.vocab.strings[ent_type]
        token.ent_id = ent_id
        # Begin by setting all the head indices to absolute token positions
        # This is easier to work with for now than the offsets
        # Before thinking of something simpler, beware the case where a dependency
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -21,14 +21,14 @@ from .. import about
 cdef class Span:
    """A slice from a Doc object."""
-    def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
+    def __cinit__(self, Doc doc, int start, int end, attr_t label=0, vector=None,
                  vector_norm=None):
        """Create a `Span` object from the slice `doc[start : end]`.
        doc (Doc): The parent document.
        start (int): The index of the first token of the span.
        end (int): The index of the first token after the span.
-        label (int): A label to attach to the Span, e.g. for named entities.
+        label (uint64): A label to attach to the Span, e.g. for named entities.
        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
        RETURNS (Span): The newly constructed object.
        """
@ -377,7 +377,7 @@ cdef class Span:
    property ent_id:
        """An (integer) entity ID. Usually assigned by patterns in the `Matcher`.
-        RETURNS (int): The entity ID.
+        RETURNS (uint64): The entity ID.
        """
        def __get__(self):
            return self.root.ent_id
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -202,11 +202,11 @@ cdef class Token:
    property lemma:
        """Base form of the word, with no inflectional suffixes.
-        RETURNS (int): Token lemma.
+        RETURNS (uint64): Token lemma.
        """
        def __get__(self):
            return self.c.lemma
-        def __set__(self, int lemma):
+        def __set__(self, attr_t lemma):
            self.c.lemma = lemma
    property pos:
@ -216,13 +216,13 @@ cdef class Token:
    property tag:
        def __get__(self):
            return self.c.tag
-        def __set__(self, int tag):
+        def __set__(self, attr_t tag):
            self.vocab.morphology.assign_tag(self.c, tag)
    property dep:
        def __get__(self):
            return self.c.dep
-        def __set__(self, int label):
+        def __set__(self, attr_t label):
            self.c.dep = label
    property has_vector:
@ -234,12 +234,7 @@ cdef class Token:
        def __get__(self):
            if 'has_vector' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['has_vector'](self)
-            cdef int i
+            return self.vocab.has_vector(self.lex.c.orth)
            for i in range(self.vocab.vectors_length):
                if self.c.lex.vector[i] != 0:
                    return True
            else:
                return False
    property vector:
        """A real-valued meaning representation.
@ -250,16 +245,7 @@ cdef class Token:
        def __get__(self):
            if 'vector' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['vector'](self)
-            cdef int length = self.vocab.vectors_length
+            return self.vocab.get_vector(self.c.lex.orth)
            if length == 0:
                raise ValueError(
                    "Word vectors set to length 0. This may be because you "
                    "don't have a model installed or loaded, or because your "
                    "model doesn't include word vectors. For more info, see "
                    "the documentation: \n%s\n" % about.__docs_models__
                )
            vector_view = <float[:length,]>self.c.lex.vector
            return numpy.asarray(vector_view)
    property vector_norm:
        """The L2 norm of the token's vector representation.
@ -269,7 +255,8 @@ cdef class Token:
        def __get__(self):
            if 'vector_norm' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['vector_norm'](self)
-            return self.c.lex.l2_norm
+            vector = self.vector 
            return numpy.sqrt((vector ** 2).sum())
    property n_lefts:
        def __get__(self):
@ -516,16 +503,18 @@ cdef class Token:
    property ent_type:
        """Named entity type.
-        RETURNS (int): Named entity type.
+        RETURNS (uint64): Named entity type.
        """
        def __get__(self):
            return self.c.ent_type
        def __set__(self, ent_type):
            self.c.ent_type = ent_type
    property ent_iob:
        """IOB code of named entity tag. `1="I", 2="O", 3="B"`. 0 means no tag
        is assigned.
-        RETURNS (int): IOB code of named entity tag.
+        RETURNS (uint64): IOB code of named entity tag.
        """
        def __get__(self):
            return self.c.ent_iob
@ -537,6 +526,8 @@ cdef class Token:
        """
        def __get__(self):
            return self.vocab.strings[self.c.ent_type]
        def __set__(self, ent_type):
            self.c.ent_type = self.vocab.strings.add(ent_type)
    property ent_iob_:
        """IOB code of named entity tag. "B" means the token begins an entity,
@ -553,7 +544,7 @@ cdef class Token:
        """ID of the entity the token is an instance of, if any. Usually
        assigned by patterns in the Matcher.
-        RETURNS (int): ID of the entity.
+        RETURNS (uint64): ID of the entity.
        """
        def __get__(self):
            return self.c.ent_id
@ -571,7 +562,7 @@ cdef class Token:
            return self.vocab.strings[self.c.ent_id]
        def __set__(self, name):
-            self.c.ent_id = self.vocab.strings[name]
+            self.c.ent_id = self.vocab.strings.add(name)
    property whitespace_:
        def __get__(self):
@ -613,7 +604,7 @@ cdef class Token:
        def __get__(self):
            return self.vocab.strings[self.c.lemma]
        def __set__(self, unicode lemma_):
-            self.c.lemma = self.vocab.strings[lemma_]
+            self.c.lemma = self.vocab.strings.add(lemma_)
    property pos_:
        def __get__(self):
@ -623,13 +614,13 @@ cdef class Token:
        def __get__(self):
            return self.vocab.strings[self.c.tag]
        def __set__(self, tag):
-            self.tag = self.vocab.strings[tag]
+            self.tag = self.vocab.strings.add(tag)
    property dep_:
        def __get__(self):
            return self.vocab.strings[self.c.dep]
        def __set__(self, unicode label):
-            self.c.dep = self.vocab.strings[label]
+            self.c.dep = self.vocab.strings.add(label)
    property is_oov:
        def __get__(self): return Lexeme.c_check_flag(self.c.lex, IS_OOV)
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@ -4,7 +4,7 @@ from libc.stdint cimport uint8_t
 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
-ctypedef int32_t attr_t
+ctypedef uint64_t attr_t
 ctypedef uint64_t flags_t
 ctypedef uint16_t len_t
 ctypedef uint16_t tag_t
--- a/spacy/util.py
+++ b/spacy/util.py
@ -78,27 +78,86 @@ def ensure_path(path):
        return path
-def resolve_model_path(name):
+def load_model(name):
-    """Resolve a model name or string to a model path.
+    """Load a model from a shortcut link, package or data path.
    name (unicode): Package name, shortcut link or model path.
-    RETURNS (Path): Path to model data directory.
+    RETURNS (Language): `Language` class with the loaded model.
    """
    data_path = get_data_path()
    if not data_path or not data_path.exists():
        raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
    if isinstance(name, basestring_):
-        if (data_path / name).exists(): # in data dir or shortcut link
+        if (data_path / name).exists(): # in data dir or shortcut
-            return (data_path / name)
+            return load_model_from_path(data_path / name)
-        if is_package(name): # installed as a package
+        if is_package(name): # installed as package
-            return get_model_package_path(name)
+            return load_model_from_pkg(name)
-        if Path(name).exists(): # path to model
+        if Path(name).exists(): # path to model data directory
-            return Path(name)
+            return load_data_from_path(Path(name))
-    elif hasattr(name, 'exists'): # Path or Path-like object
+    elif hasattr(name, 'exists'): # Path or Path-like to model data
-        return name
+        return load_data_from_path(name)
    raise IOError("Can't find model '%s'" % name)
 def load_model_from_init_py(init_file):
    """Helper function to use in the `load()` method of a model package's
    __init__.py.
    init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
    RETURNS (Language): `Language` class with loaded model.
    """
    model_path = Path(init_file).parent
    return load_data_from_path(model_path, package=True)
 def load_model_from_path(model_path):
    """Import and load a model package from its file path.
    path (unicode or Path): Path to package directory.
    RETURNS (Language): `Language` class with loaded model.
    """
    model_path = ensure_path(model_path)
    spec = importlib.util.spec_from_file_location('model', model_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.load()
 def load_model_from_pkg(name):
    """Import and load a model package.
    name (unicode): Name of model package installed via pip.
    RETURNS (Language): `Language` class with loaded model.
    """
    module = importlib.import_module(name)
    return module.load()
 def load_data_from_path(model_path, package=False):
    """Initialie a `Language` class with a loaded model from a model data path.
    model_path (unicode or Path): Path to model data directory.
    package (bool): Does the path point to the parent package directory?
    RETURNS (Language): `Language` class with loaded model.
    """
    model_path = ensure_path(model_path)
    meta_path = model_path / 'meta.json'
    if not meta_path.is_file():
        raise IOError("Could not read meta.json from %s" % location)
    meta = read_json(location)
    for setting in ['lang', 'name', 'version']:
        if setting not in meta:
            raise IOError('No %s setting found in model meta.json' % setting)
    if package:
        model_data_path = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
        model_path = model_path / model_data_path
    if not model_path.exists():
        raise ValueError("Can't find model directory: %s" % path2str(model_path))
    cls = get_lang_class(meta['lang'])
    nlp = cls(pipeline=meta.get('pipeline', True))
    return nlp.from_disk(model_path)
 def is_package(name):
    """Check if string maps to a package installed via pip.
@ -112,36 +171,16 @@ def is_package(name):
    return False
-def get_model_package_path(package_name):
+def get_package_path(name):
-    """Get path to a model package installed via pip.
+    """Get the path to an installed package.
-    package_name (unicode): Name of installed package.
+    name (unicode): Package name.
-    RETURNS (Path): Path to model data directory.
+    RETURNS (Path): Path to installed package.
    """
    # Here we're importing the module just to find it. This is worryingly
    # indirect, but it's otherwise very difficult to find the package.
    # Python's installation and import rules are very complicated.
    pkg = importlib.import_module(package_name)
-    package_path = Path(pkg.__file__).parent.parent
+    return Path(pkg.__file__).parent
    meta = parse_package_meta(package_path / package_name)
    model_name = '%s-%s' % (package_name, meta['version'])
    return package_path / package_name / model_name
 def parse_package_meta(package_path, require=True):
    """Check if a meta.json exists in a package and return its contents.
    package_path (Path): Path to model package directory.
    require (bool): If True, raise error if no meta.json is found.
    RETURNS (dict or None): Model meta.json data or None.
    """
    location = package_path / 'meta.json'
    if location.is_file():
        return read_json(location)
    elif require:
        raise IOError("Could not read meta.json from %s" % location)
    else:
        return None
 def is_in_jupyter():
@ -177,10 +216,13 @@ def get_async(stream, numpy_array):
 def itershuffle(iterable, bufsize=1000):
    """Shuffle an iterator. This works by holding `bufsize` items back
-    and yielding them sometime later. Obviously, this is not unbiased --
+    and yielding them sometime later. Obviously, this is not unbiased –
    but should be good enough for batching. Larger bufsize means less bias.
    From https://gist.github.com/andres-erbsen/1307752
    iterable (iterable): Iterator to shuffle.
    bufsize (int): Items to hold back.
    YIELDS (iterable): The shuffled iterator.
    """
    iterable = iter(iterable)
    buf = []
@ -315,17 +357,16 @@ def normalize_slice(length, start, stop, step=None):
 def compounding(start, stop, compound):
-    '''Yield an infinite series of compounding values. Each time the
+    """Yield an infinite series of compounding values. Each time the
    generator is called, a value is produced by multiplying the previous
    value by the compound rate.
-    EXAMPLE
+    EXAMPLE:
      >>> sizes = compounding(1., 10., 1.5)
      >>> assert next(sizes) == 1.
      >>> assert next(sizes) == 1 * 1.5
      >>> assert next(sizes) == 1.5 * 1.5
-    '''
+    """
    def clip(value):
        return max(value, stop) if (start>stop) else min(value, stop)
    curr = float(start)
@ -335,7 +376,7 @@ def compounding(start, stop, compound):
 def decaying(start, stop, decay):
-    '''Yield an infinite series of linearly decaying values.'''
+    """Yield an infinite series of linearly decaying values."""
    def clip(value):
        return max(value, stop) if (start>stop) else min(value, stop)
    nr_upd = 1.
@ -344,12 +385,6 @@ def decaying(start, stop, decay):
        nr_upd += 1
 def check_renamed_kwargs(renamed, kwargs):
    for old, new in renamed.items():
        if old in kwargs:
            raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
 def read_json(location):
    """Open and load JSON from file.
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -26,15 +26,6 @@ from . import attrs
 from . import symbols
 DEF MAX_VEC_SIZE = 100000
 cdef float[MAX_VEC_SIZE] EMPTY_VEC
 memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 EMPTY_LEXEME.vector = EMPTY_VEC
 cdef class Vocab:
    """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
    instance also provides access to the `StringStore`, and owns underlying
@ -53,8 +44,6 @@ cdef class Vocab:
            vice versa.
        RETURNS (Vocab): The newly constructed vocab object.
        """
        util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
        lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
        tag_map = tag_map if tag_map is not None else {}
        if lemmatizer in (None, True, False):
@ -66,7 +55,7 @@ cdef class Vocab:
        self.strings = StringStore()
        if strings:
            for string in strings:
-                self.strings[string]
+                self.strings.add(string)
        # Load strings in a special order, so that we have an onset number for
        # the vocabulary. This way, when words are added in order, the orth ID
        # is the frequency rank of the word, plus a certain offset. The structural
@ -77,7 +66,7 @@ cdef class Vocab:
        # Need to rethink this.
        for name in symbols.NAMES + list(sorted(tag_map.keys())):
            if name:
-                _ = self.strings[name]
+                self.strings.add(name)
        self.lex_attr_getters = lex_attr_getters
        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
@ -176,15 +165,14 @@ cdef class Vocab:
            mem = self.mem
        cdef bint is_oov = mem is not self.mem
        lex = <LexemeC*>mem.alloc(sizeof(LexemeC), 1)
-        lex.orth = self.strings[string]
+        lex.orth = self.strings.add(string)
        lex.length = len(string)
        lex.id = self.length
        lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
        if self.lex_attr_getters is not None:
            for attr, func in self.lex_attr_getters.items():
                value = func(string)
                if isinstance(value, unicode):
-                    value = self.strings[value]
+                    value = self.strings.add(value)
                if attr == PROB:
                    lex.prob = value
                elif value is not None:
@ -239,7 +227,7 @@ cdef class Vocab:
        """
        cdef attr_t orth
        if type(id_or_string) == unicode:
-            orth = self.strings[id_or_string]
+            orth = self.strings.add(id_or_string)
        else:
            orth = id_or_string
        return Lexeme(self, orth)
@ -258,6 +246,26 @@ cdef class Vocab:
                Token.set_struct_attr(token, attr_id, value)
        return tokens
    def get_vector(self, orth):
        """Retrieve a vector for a word in the vocabulary.
        Words can be looked up by string or int ID.
        RETURNS:
            A word vector. Size and shape determed by the
            vocab.vectors instance. Usually, a numpy ndarray
            of shape (300,) and dtype float32.
        RAISES: If no vectors data is loaded, ValueError is raised.
        """
        raise NotImplementedError
    def has_vector(self, orth):
        """Check whether a word has a vector. Returns False if no
        vectors have been loaded. Words can be looked up by string
        or int ID."""
        raise NotImplementedError
    def to_disk(self, path):
        """Save the current state to a directory.
@ -271,9 +279,6 @@ cdef class Vocab:
        with strings_loc.open('w', encoding='utf8') as file_:
            self.strings.dump(file_)
        # TODO: pickle
        # self.dump(path / 'lexemes.bin')
    def from_disk(self, path):
        """Loads state from a directory. Modifies the object in place and
        returns it.
@ -286,7 +291,7 @@ cdef class Vocab:
        with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
            strings_list = ujson.load(file_)
        for string in strings_list:
-            self.strings[string]
+            self.strings.add(string)
        self.load_lexemes(path / 'lexemes.bin')
    def to_bytes(self, **exclude):
@ -346,7 +351,6 @@ cdef class Vocab:
                lex_data.data[j] = bytes_ptr[i+j]
            Lexeme.c_from_bytes(lexeme, lex_data)
            lexeme.vector = EMPTY_VEC
            py_str = self.strings[lexeme.orth]
            assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
            key = hash_string(py_str)
@ -354,172 +358,6 @@ cdef class Vocab:
            self._by_orth.set(lexeme.orth, lexeme)
            self.length += 1
    # Deprecated --- delete these once stable
    def dump_vectors(self, out_loc):
        """Save the word vectors to a binary file.
        loc (Path): The path to save to.
        """
        cdef int32_t vec_len = self.vectors_length
        cdef int32_t word_len
        cdef bytes word_str
        cdef char* chars
        cdef Lexeme lexeme
        cdef CFile out_file = CFile(out_loc, 'wb')
        for lexeme in self:
            word_str = lexeme.orth_.encode('utf8')
            vec = lexeme.c.vector
            word_len = len(word_str)
            out_file.write_from(&word_len, 1, sizeof(word_len))
            out_file.write_from(&vec_len, 1, sizeof(vec_len))
            chars = <char*>word_str
            out_file.write_from(chars, word_len, sizeof(char))
            out_file.write_from(vec, vec_len, sizeof(float))
        out_file.close()
    def load_vectors(self, file_):
        """Load vectors from a text-based file.
        file_ (buffer): The file to read from. Entries should be separated by
            newlines, and each entry should be whitespace delimited. The first value of the entry
            should be the word string, and subsequent entries should be the values of the
            vector.
        RETURNS (int): The length of the vectors loaded.
        """
        cdef LexemeC* lexeme
        cdef attr_t orth
        cdef int32_t vec_len = -1
        cdef double norm = 0.0
        whitespace_pattern = re.compile(r'\s', re.UNICODE)
        for line_num, line in enumerate(file_):
            pieces = line.split()
            word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
            if vec_len == -1:
                vec_len = len(pieces)
            elif vec_len != len(pieces):
                raise VectorReadError.mismatched_sizes(file_, line_num,
                                                        vec_len, len(pieces))
            orth = self.strings[word_str]
            lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
            lexeme.vector = <float*>self.mem.alloc(vec_len, sizeof(float))
            for i, val_str in enumerate(pieces):
                lexeme.vector[i] = float(val_str)
            norm = 0.0
            for i in range(vec_len):
                norm += lexeme.vector[i] * lexeme.vector[i]
            lexeme.l2_norm = sqrt(norm)
        self.vectors_length = vec_len
        return vec_len
    def load_vectors_from_bin_loc(self, loc):
        """Load vectors from the location of a binary file.
        loc (unicode): The path of the binary file to load from.
        RETURNS (int): The length of the vectors loaded.
        """
        cdef CFile file_ = CFile(loc, b'rb')
        cdef int32_t word_len
        cdef int32_t vec_len = 0
        cdef int32_t prev_vec_len = 0
        cdef float* vec
        cdef Address mem
        cdef attr_t string_id
        cdef bytes py_word
        cdef vector[float*] vectors
        cdef int line_num = 0
        cdef Pool tmp_mem = Pool()
        while True:
            try:
                file_.read_into(&word_len, sizeof(word_len), 1)
            except IOError:
                break
            file_.read_into(&vec_len, sizeof(vec_len), 1)
            if prev_vec_len != 0 and vec_len != prev_vec_len:
                raise VectorReadError.mismatched_sizes(loc, line_num,
                                                       vec_len, prev_vec_len)
            if 0 >= vec_len >= MAX_VEC_SIZE:
                raise VectorReadError.bad_size(loc, vec_len)
            chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
            vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
            string_id = self.strings[chars[:word_len]]
            # Insert words into vocab to add vector.
            self.get_by_orth(self.mem, string_id)
            while string_id >= vectors.size():
                vectors.push_back(EMPTY_VEC)
            assert vec != NULL
            vectors[string_id] = vec
            line_num += 1
        cdef LexemeC* lex
        cdef size_t lex_addr
        cdef double norm = 0.0
        cdef int i
        for orth, lex_addr in self._by_orth.items():
            lex = <LexemeC*>lex_addr
            if lex.lower < vectors.size():
                lex.vector = vectors[lex.lower]
                norm = 0.0
                for i in range(vec_len):
                    norm += lex.vector[i] * lex.vector[i]
                lex.l2_norm = sqrt(norm)
            else:
                lex.vector = EMPTY_VEC
        self.vectors_length = vec_len
        return vec_len
    def resize_vectors(self, int new_size):
        """Set vectors_length to a new size, and allocate more memory for the
        `Lexeme` vectors if necessary. The memory will be zeroed.
        new_size (int): The new size of the vectors.
        """
        cdef hash_t key
        cdef size_t addr
        if new_size > self.vectors_length:
            for key, addr in self._by_hash.items():
                lex = <LexemeC*>addr
                lex.vector = <float*>self.mem.realloc(lex.vector,
                                        new_size * sizeof(lex.vector[0]))
        self.vectors_length = new_size
 def write_binary_vectors(in_loc, out_loc):
    cdef CFile out_file = CFile(out_loc, 'wb')
    cdef Address mem
    cdef int32_t word_len
    cdef int32_t vec_len
    cdef char* chars
    with bz2.BZ2File(in_loc, 'r') as file_:
        for line in file_:
            pieces = line.split()
            word = pieces.pop(0)
            mem = Address(len(pieces), sizeof(float))
            vec = <float*>mem.ptr
            for i, val_str in enumerate(pieces):
                vec[i] = float(val_str)
            word_len = len(word)
            vec_len = len(pieces)
            out_file.write_from(&word_len, 1, sizeof(word_len))
            out_file.write_from(&vec_len, 1, sizeof(vec_len))
            chars = <char*>word
            out_file.write_from(chars, len(word), sizeof(char))
            out_file.write_from(vec, vec_len, sizeof(float))
 def pickle_vocab(vocab):
    sstore = vocab.strings
@ -567,21 +405,3 @@ class LookupError(Exception):
            "ID of orth: {orth_id}".format(
                query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
        )
 class VectorReadError(Exception):
    @classmethod
    def mismatched_sizes(cls, loc, line_num, prev_size, curr_size):
        return cls(
            "Error reading word vectors from %s on line %d.\n"
            "All vectors must be the same size.\n"
            "Prev size: %d\n"
            "Curr size: %d" % (loc, line_num, prev_size, curr_size))
    @classmethod
    def bad_size(cls, loc, size):
        return cls(
            "Error reading word vectors from %s.\n"
            "Vector size: %d\n"
            "Max size: %d\n"
            "Min size: 1\n" % (loc, size, MAX_VEC_SIZE))
--- a/website/assets/img/docs/architecture.svg
+++ b/website/assets/img/docs/architecture.svg
@ -1,9 +1,9 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-1 -1 907 737" width="906" height="746">
    <style>
-        .svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
+        .svg__architecture__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
-        .svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro" }
+        .svg__architecture__text-medium { fill: #1a1e23; font: 17px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
-        .svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro" }
+        .svg__architecture__text-small { fill: #1a1e23; font: bold 14px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
-        .svg__architecture__text-code {  fill: #1a1e23; font: 600 12px "Source Code Pro" }
+        .svg__architecture__text-code {  fill: #1a1e23; font: 600 12px "Source Code Pro", Monaco, "Courier New", monospace }
    </style>
    <ellipse cx="404" cy="203" fill="#dae8fc" stroke="#09a3d5" stroke-width="2" rx="74.8" ry="49.8"/>
    <text class="svg__architecture__text-large" transform="translate(362.5 206.5)" width="81" height="40">Language</text>
--- a/website/assets/img/docs/language_data.svg
+++ b/website/assets/img/docs/language_data.svg
@ -1,8 +1,8 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="931" height="456" viewBox="-1 -1 932 480" preserveAspectRatio="xMinYMin meet">
    <style>
-        .svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro" }
+        .svg__langdata__text-large { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
-        .svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
+        .svg__langdata__text-small { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
-        .svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro" }
+        .svg__langdata__text-tiny { fill: #1a1e23; font: bold 16px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
    </style>
    <path fill="none" stroke="#b85450" stroke-width="3" stroke-miterlimit="10" d="M610 404h-69.8" stroke-dasharray="1 6" stroke-linecap="round"/>
    <path fill="#b85450" stroke="#b85450" stroke-width="2" stroke-miterlimit="10" d="M534.2 404l8-4-2 4 2 4z"/>
--- a/website/assets/img/docs/pipeline.svg
+++ b/website/assets/img/docs/pipeline.svg
@ -1,8 +1,8 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 923 200" width="923" height="200">
    <style>
-        .svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro" }
+        .svg__pipeline__text { fill: #1a1e23; font: 20px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
-        .svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro" }
+        .svg__pipeline__text-small { fill: #1a1e23; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
-        .svg__pipeline__text-code {  fill: #1a1e23; font: 600 16px "Source Code Pro" }
+        .svg__pipeline__text-code { fill: #1a1e23; font: 600 16px "Source Code Pro", Monaco, "Courier New", monospace }
    </style>
    <rect width="601" height="127" x="159" y="21" fill="none" stroke="#09a3d5" stroke-width="3" rx="19.1" stroke-dasharray="3 6" ry="19.1"/>
    <path fill="#e1d5e7" stroke="#9673a6" stroke-width="2" d="M801 55h120v60H801z"/>
--- a/website/assets/img/docs/tokenization.svg
+++ b/website/assets/img/docs/tokenization.svg
@ -0,0 +1,123 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" width="600" height="380" viewBox="-20 -10 550 400">
    <style>
        .svg__tokenization__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
        .svg__tokenization__text-small { fill: #fff; font: 600 13px "Source Code Pro", Monaco, "Courier New", monospace }
    </style>
    <path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M71 39v12H16v11M71 39v12h20v11"/>
    <path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M1 1h140v38.2H1z"/>
    <text class="svg__tokenization__text" dy="1em" width="43" height="19" transform="translate(48.5 9.5)">“Let’s</text>
    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 39v23"/>
    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 1h50v38.2h-50z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(164.5 9.5)" width="19" height="19">go</text>
    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 39v23"/>
    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 1h50v38.2h-50z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(226.5 9.5)" width="15" height="19">to</text>
    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 39v23"/>
    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 1h141v38.2H270z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(320.5 9.5)" width="38" height="19">N.Y.!”</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 100v20"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 62h30v38.2H1z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(11.5 70.5)" width="7" height="19">“</text>
    <path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M91 100v11H66v9M91 100v11h29v9"/>
    <path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M41 62h100v38.2H41z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(72.5 70.5)" width="35" height="19">Let’s</text>
    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M175 100v20"/>
    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M150 62h50v38.2h-50z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(164.5 70.5)" width="19" height="19">go</text>
    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M235 100v20"/>
    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M210 62h50v38.2h-50z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(226.5 70.5)" width="15" height="19">to</text>
    <path fill="none" stroke="#666" stroke-width="2" stroke-miterlimit="10" d="M341 100v20"/>
    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M270 62h141v38.2H270z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(320.5 70.5)" width="38" height="19">N.Y.!”</text>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 120h30v38H1z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(11.5 128.5)" width="7" height="19">“</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 158v24"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 120h50v38H41z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(53.5 128.5)" width="23" height="19">Let</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 158v24"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 120h50v38h-50z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(164.5 128.5)" width="19" height="19">go</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 158v24"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 120h50v38h-50z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(226.5 128.5)" width="15" height="19">to</text>
    <path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M341 158v13h-20v11M341 158v13h55v11"/>
    <path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 120h141v38H270z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(320.5 128.5)" width="38" height="19">N.Y.!”</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 158v24"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 120h40v38h-40z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(113.5 128.5)" width="11" height="19">’s</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 220v23"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 181.8h30V220H1z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(11.5 190.5)" width="7" height="19">“</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 220v23"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 181.8h50V220H41z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(53.5 190.5)" width="23" height="19">Let</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 220v23"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 181.8h50V220h-50z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(164.5 190.5)" width="19" height="19">go</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 220v23"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 181.8h50V220h-50z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(226.5 190.5)" width="15" height="19">to</text>
    <path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M321 220v11h-20v12M321 220v11h34v12"/>
    <path fill="#f8cecc" stroke="#c00" stroke-width="2" d="M270 181.8h101V220H270z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(304.5 190.5)" width="30" height="19">N.Y.!</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 220v23"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 181.8h40V220h-40z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(113.5 190.5)" width="11" height="19">’s</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 220v23"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 181.8h30V220h-30z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(391.5 190.5)" width="7" height="19">”</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M16 281v23"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 242.7h30V281H1z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(11.5 251.5)" width="7" height="19">“</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M66 281v23"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 242.7h50V281H41z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(53.5 251.5)" width="23" height="19">Let</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M175 281v20-17 20"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 242.7h50V281h-50z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(164.5 251.5)" width="19" height="19">go</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M235 281v23"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 242.7h50V281h-50z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(226.5 251.5)" width="15" height="19">to</text>
    <path fill="none" stroke="#c00" stroke-width="2" stroke-miterlimit="10" d="M301 281v23"/>
    <path fill="#f8cecc" stroke="#b85450" stroke-width="2" d="M270 242.7h61V281h-61z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(286.5 251.5)" width="26" height="19">N.Y.</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M120 281v23"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 242.7h40V281h-40z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(113.5 251.5)" width="11" height="19">’s</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M396 281v23"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 242.7h30V281h-30z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(391.5 251.5)" width="7" height="19">”</text>
    <path fill="none" stroke="#82b366" stroke-width="2" stroke-miterlimit="10" d="M355 281v23"/>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 242.7h30V281h-30z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(351.5 251.5)" width="5" height="19">!</text>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M1 304h30v38H1z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(11.5 312.5)" width="7" height="19">“</text>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M41 304h50v38H41z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(53.5 312.5)" width="23" height="19">Let</text>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M150 304h50v38h-50z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(164.5 312.5)" width="19" height="19">go</text>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M210 304h50v38h-50z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(226.5 312.5)" width="15" height="19">to</text>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M270 304h61v38h-61z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(286.5 312.5)" width="26" height="19">N.Y.</text>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M100 304h40v38h-40z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(113.5 312.5)" width="11" height="19">’s</text>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M381 304h30v38h-30z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(391.5 312.5)" width="7" height="19">”</text>
    <path fill="#d5e8d4" stroke="#82b366" stroke-width="2" d="M340 304h30v38h-30z"/>
    <text class="svg__tokenization__text" dy="1em" transform="translate(351.5 312.5)" width="5" height="19">!</text>
    <rect width="104" height="19" x="437" y="72" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
    <text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 74.5)" width="65" height="12">EXCEPTION</text>
    <rect width="104" height="19" x="437" y="11" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
    <text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 13.5)" width="43" height="12">PREFIX</text>
    <rect width="104" height="19" x="437" y="130" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
    <text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 132.5)" width="43" height="12">SUFFIX</text>
    <rect width="104" height="19" x="437" y="191" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
    <text class="svg__tokenization__text-small" dy="0.9em" transform="translate(466.5 193.5)" width="43" height="12">SUFFIX</text>
    <rect width="104" height="19" x="437" y="252" fill="#c00" stroke="#c00" stroke-width="2" rx="2.9" ry="2.9"/>
    <text class="svg__tokenization__text-small" dy="0.9em" transform="translate(455.5 254.5)" width="65" height="12">EXCEPTION</text>
    <rect width="104" height="19" x="437" y="313" fill="#82b366" stroke="#82b366" stroke-width="2" rx="2.9" ry="2.9"/>
    <text class="svg__tokenization__text-small" dy="0.9em" transform="translate(473.5 315.5)" width="29" height="12">DONE</text>
 </svg>
--- a/website/assets/img/docs/vocab_stringstore.svg
+++ b/website/assets/img/docs/vocab_stringstore.svg
@ -1,9 +1,9 @@
 <svg class="o-svg" xmlns="http://www.w3.org/2000/svg" viewBox="-10 -10 582 365" width="572" height="355">
    <style>
-        .svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro" }
+        .svg__vocab__text { fill: #1a1e23; font: 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif }
-        .svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro"; text-transform: uppercase }
+        .svg__vocab__text-large { fill: #fff; font: bold 18px "Source Sans Pro", Tahoma, Helvetica, Arial, sans-serif; text-transform: uppercase }
-        .svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro" }
+        .svg__vocab__text-box { fill: #fff; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
-        .svg__vocab__text-code {  fill: #1a1e23; font: bold 12px "Source Code Pro" }
+        .svg__vocab__text-code {  fill: #1a1e23; font: bold 12px "Source Code Pro", Monaco, "Courier New", monospace }
    </style>
    <rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
    <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
--- a/website/docs/api/_data.json
+++ b/website/docs/api/_data.json
@ -158,7 +158,8 @@
    "binder": {
        "title": "Binder",
-        "tag": "class"
+        "tag": "class",
        "source": "spacy/tokens/binder.pyx"
    },
    "annotation": {
--- a/website/docs/api/language-models.jade
+++ b/website/docs/api/language-models.jade
@ -2,7 +2,10 @@
 include ../../_includes/_mixins
-p spaCy currently supports the following languages and capabilities:
+p
    |  spaCy currently provides models for the following languages and
    |  capabilities:
 +aside-code("Download language models", "bash").
    python -m spacy download en
@ -22,12 +25,16 @@ p spaCy currently supports the following languages and capabilities:
    +row
        +cell French #[code fr]
-        each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ]
+        each icon in [ "pro", "con", "con", "pro", "con", "pro", "pro", "con" ]
            +cell.u-text-center #[+procon(icon)]
-+h(2, "available") Available models
+    +row
        +cell Spanish #[code es]
        each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ]
            +cell.u-text-center #[+procon(icon)]
-include ../usage/_models-list
+p
    +button("/docs/usage/models", true, "primary") See available models
 +h(2, "alpha-support") Alpha tokenization support
@ -52,9 +59,35 @@ p
    |  #[+a("https://github.com/mocobeta/janome") Janome].
 +table([ "Language", "Code", "Source" ])
-    each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
+    each language, code in { it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", da: "Danish", hu: "Hungarian", pl: "Polish", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" }
        +row
            +cell #{language}
            +cell #[code=code]
            +cell
                +src(gh("spaCy", "spacy/lang/" + code)) lang/#{code}
 +h(2, "multi-language") Multi-language support
    +tag-new(2)
 p
    |  As of v2.0, spaCy supports models trained on more than one language. This
    |  is especially useful for named entity recognition. The language ID used
    |  for multi-language or language-neutral models is #[code xx]. The
    |  language class, a generic subclass containing only the base language data,
    |  can be found in #[+src(gh("spaCy", "spacy/lang/xx")) lang/xx].
 p
    |  To load your model with the neutral, multi-language class, simply set
    |  #[code "language": "xx"] in your
    |  #[+a("/docs/usage/saving-loading#models-generating") model package]'s
    |  meta.json. You can also import the class directly, or call
    |  #[+api("util#get_lang_class") #[code util.get_lang_class()]] for
    |  lazy-loading.
 +code("Standard import").
    from spacy.lang.xx import MultiLanguage
    nlp = MultiLanguage()
 +code("With lazy-loading").
    from spacy.util import get_lang_class
    nlp = get_lang_class('xx')
--- a/website/docs/api/spacy.jade
+++ b/website/docs/api/spacy.jade
@ -11,8 +11,13 @@ p
    |  the name of an installed
    |  #[+a("/docs/usage/saving-loading#generating") model package], a unicode
    |  path or a #[code Path]-like object. spaCy will try resolving the load
-    |  argument in this order. The #[code Language] class to initialise will be
+    |  argument in this order. If a model is loaded from a shortcut link or
-    |  determined based on the model's settings.
+    |  package name, spaCy will assume it's a Python package and import it and
    |  call the model's own #[code load()] method. If a model is loaded from a
    |  path, spaCy will assume it's a data directory, read the language and
    |  pipeline settings off the meta.json and initialise the #[code Language]
    |  class. The data will be loaded in via
    |  #[+api("language#from_disk") #[code Language.from_disk()]].
 +aside-code("Example").
    nlp = spacy.load('en') # shortcut link
@ -20,7 +25,7 @@ p
    nlp = spacy.load('/path/to/en') # unicode path
    nlp = spacy.load(Path('/path/to/en')) # pathlib Path
-    nlp = spacy.load('en', disable['parser', 'tagger'])
+    nlp = spacy.load('en', disable=['parser', 'tagger'])
 +table(["Name", "Type", "Description"])
    +row
--- a/website/docs/api/util.jade
+++ b/website/docs/api/util.jade
@ -1,12 +1,10 @@
-//- 💫 DOCS > API > ANNOTATION SPECS
+//- 💫 DOCS > API > UTIL
 include ../../_includes/_mixins
 p
    |  spaCy comes with a small collection of utility functions located in
    |  #[+src(gh("spaCy", "spacy/util.py")) spacy/util.py].
 +infobox("Important note")
    |  Because utility functions are mostly intended for
    |  #[strong internal use within spaCy], their behaviour may change with
    |  future releases. The functions documented on this page should be safe
@ -74,15 +72,23 @@ p
        +cell #[code Language]
        +cell Language class.
-+h(2, "resolve_model_path") util.resolve_model_path
+h(2, "load_model") util.load_model
    +tag function
    +tag-new(2)
-p Resolve a model name or string to a model path.
+p
    |  Load a model from a shortcut link, package or data path. If called with a
    |  shortcut link or package name, spaCy will assume the model is a Python
    |  package and import and call its #[code load()] method. If called with a
    |  path, spaCy will assume it's a data directory, read the language and
    |  pipeline settings from the meta.json and initialise a #[code Language]
    |  class. The model data will then be loaded in via
    |  #[+api("language#from_disk") #[code Language.from_disk()]].
 +aside-code("Example").
-    model_path = util.resolve_model_path('en')
+    nlp = util.load_model('en')
-    model_path = util.resolve_model_path('/path/to/en')
+    nlp = util.load_model('en_core_web_sm')
    nlp = util.load_model('/path/to/data')
 +table(["Name", "Type", "Description"])
    +row
@ -92,8 +98,33 @@ p Resolve a model name or string to a model path.
    +footrow
        +cell returns
-        +cell #[code Path]
+        +cell #[code Language]
-        +cell Path to model data directory.
+        +cell #[code Language] class with the loaded model.
 +h(2, "load_model_from_init_py") util.load_model_from_init_py
    +tag function
    +tag-new(2)
 p
    |  A helper function to use in the #[code load()] method of a model package's
    |  #[+src(gh("spacy-dev-resources", "templates/model/en_model_name/__init__.py")) __init__.py].
 +aside-code("Example").
    from spacy.util import load_model_from_init_py
    def load():
        return load_model_from_init_py(__file__)
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code init_file]
        +cell unicode
        +cell Path to model's __init__.py, i.e. #[code __file__].
    +footrow
        +cell returns
        +cell #[code Language]
        +cell #[code Language] class with the loaded model.
 +h(2, "is_package") util.is_package
    +tag function
@ -117,16 +148,18 @@ p
        +cell #[code bool]
        +cell #[code True] if installed package, #[code False] if not.
-+h(2, "get_model_package_path") util.get_model_package_path
+h(2, "get_package_path") util.get_package_path
    +tag function
    +tag-new(2)
 p
-    |  Get path to a #[+a("/docs/usage/models") model package] installed via pip.
+    |  Get path to an installed package. Mainly used to resolve the location of
-    |  Currently imports the package to find it and parse its meta data.
+    |  #[+a("/docs/usage/models") model packages]. Currently imports the package
    |  to find its path.
 +aside-code("Example").
-    util.get_model_package_path('en_core_web_sm')
+    util.get_package_path('en_core_web_sm')
-    # /usr/lib/python3.6/site-packages/en_core_web_sm/en_core_web_sm-1.2.0
+    # /usr/lib/python3.6/site-packages/en_core_web_sm
 +table(["Name", "Type", "Description"])
    +row
@ -137,37 +170,8 @@ p
    +footrow
        +cell returns
        +cell #[code Path]
        +cell Path to model data directory.
 +h(2, "parse_package_meta") util.parse_package_meta
    +tag function
 p
    |  Check if a #[code meta.json] exists in a model package and return its
    |  contents.
 +aside-code("Example").
    if util.is_package('en_core_web_sm'):
        path = util.get_model_package_path('en_core_web_sm')
        meta = util.parse_package_meta(path, require=True)
        # {'name': 'core_web_sm', 'lang': 'en', ...}
 +table(["Name", "Type", "Description"])
    +row
        +cell #[code package_path]
        +cell #[code Path]
        +cell Path to model package directory.
    +row
        +cell #[code require]
        +cell #[code bool]
        +cell If #[code True], raise error if no #[code meta.json] is found.
    +footrow
        +cell returns
        +cell dict / #[code None]
        +cell Model meta data or #[code None].
 +h(2, "is_in_jupyter") util.is_in_jupyter
    +tag function
    +tag-new(2)
--- a/website/docs/usage/_spacy-101/_similarity.jade
+++ b/website/docs/usage/_spacy-101/_similarity.jade
@ -5,7 +5,7 @@ p
    |  #[strong how similar they are]. Predicting similarity is useful for
    |  building recommendation systems or flagging duplicates. For example, you
    |  can suggest a user content that's similar to what they're currently
-    |  looking at, or label a support ticket as a duplicate, if it's very
+    |  looking at, or label a support ticket as a duplicate if it's very
    |  similar to an already existing one.
 p
--- a/website/docs/usage/_spacy-101/_tokenization.jade
+++ b/website/docs/usage/_spacy-101/_tokenization.jade
@ -16,3 +16,47 @@ p
    +row
        for cell in ["Apple", "is", "looking", "at", "buying", "U.K.", "startup", "for", "$", "1", "billion"]
            +cell=cell
 p
    |  Fist, the raw text is split on whitespace characters, similar to
    |  #[code text.split(' ')]. Then, the tokenizer processes the text from
    |  left to right. On each substring, it performs two checks:
 +list("numbers")
    +item
        |  #[strong Does the substring match a tokenizer exception rule?] For
        |  example, "don't" does not contain whitespace, but should be split
        |  into two tokens, "do" and "n't", while "U.K." should always
        |  remain one token.
    +item
        |  #[strong Can a prefix, suffix or infixes be split off?]. For example
        |  punctuation like commas, periods, hyphens or quotes.
 p
    |  If there's a match, the rule is applied and the tokenizer continues its
    |  loop, starting with the newly split substrings. This way, spaCy can split
    |  #[strong complex, nested tokens] like combinations of abbreviations and
    |  multiple punctuation marks.
 +aside
    |  #[strong Tokenizer exception:] Special-case rule to split a string into
    |  several tokens or prevent a token from being split when punctuation rules
    |  are applied.#[br]
    |  #[strong Prefix:] Character(s) at the beginning, e.g.
    |  #[code $], #[code (], #[code “], #[code ¿].#[br]
    |  #[strong Suffix:] Character(s) at the end, e.g.
    |  #[code km], #[code &#41;], #[code ”], #[code !].#[br]
    |  #[strong Infix:] Character(s) in between, e.g.
    |  #[code -], #[code --], #[code /], #[code …].#[br]
 +image
    include ../../../assets/img/docs/tokenization.svg
    .u-text-right
        +button("/assets/img/docs/tokenization.svg", false, "secondary").u-text-tag View large graphic
 p
    |  While punctuation rules are usually pretty general, tokenizer exceptions
    |  strongly depend on the specifics of the individual language. This is
    |  why each #[+a("/docs/api/language-models") available language] has its
    |  own subclass like #[code English] or #[code German], that loads in lists
    |  of hard-coded data and exception rules.
--- a/website/docs/usage/_spacy-101/_vocab-stringstore.jade
+++ b/website/docs/usage/_spacy-101/_vocab-stringstore.jade
@ -89,4 +89,6 @@ p
 p
    |  Even though both #[code Doc] objects contain the same words, the internal
-    |  integer IDs are very different.
+    |  integer IDs are very different. The same applies for all other strings,
    |  like the annotation scheme. To avoid mismatched IDs, spaCy will always
    |  export the vocab if you save a #[code Doc] or #[code nlp] object.
--- a/website/docs/usage/language-processing-pipeline.jade
+++ b/website/docs/usage/language-processing-pipeline.jade
@ -144,7 +144,7 @@ p
 +table(["Argument", "Type", "Description"])
    +row
        +cell #[code vocab]
-        +cell #[coce Vocab]
+        +cell #[code Vocab]
        +cell
            |  Shared data between components, including strings, morphology,
            |  vectors etc.
--- a/website/docs/usage/lightning-tour.jade
+++ b/website/docs/usage/lightning-tour.jade
@ -139,6 +139,8 @@ p
    new_doc = Doc(Vocab()).from_disk('/moby_dick.bin')
 +infobox
    |  #[strong API:] #[+api("language") #[code Language]],
    |  #[+api("doc") #[code Doc]]
    |  #[strong Usage:] #[+a("/docs/usage/saving-loading") Saving and loading]
 +h(2, "rule-matcher") Match text with token rules
--- a/website/docs/usage/rule-based-matching.jade
+++ b/website/docs/usage/rule-based-matching.jade
@ -345,7 +345,7 @@ p
    |  account and check the #[code subtree] for intensifiers like "very", to
    |  increase the sentiment score. At some point, you might also want to train
    |  a sentiment model. However, the approach described in this example is
-    |  very useful for #[strong bootstrapping rules to gather training data].
+    |  very useful for #[strong bootstrapping rules to collect training data].
    |  It's also an incredibly fast way to gather first insights into your data
    |  – with about 1 million tweets, you'd be looking at a processing time of
    |  #[strong under 1 minute].
--- a/website/docs/usage/spacy-101.jade
+++ b/website/docs/usage/spacy-101.jade
@ -65,7 +65,7 @@ p
    |  spaCy provides a variety of linguistic annotations to give you insights
    |  into a text's grammatical structure. This includes the word types,
    |  i.e. the parts of speech, and how the words are related to each other.
-    |  For example, if you're analysing text, it makes a #[em huge] difference
+    |  For example, if you're analysing text, it makes a huge difference
    |  whether a noun is the subject of a sentence, or the object – or whether
    |  "google" is used as a verb, or refers to the website or company in a
    |  specific context.
@ -94,9 +94,10 @@ p
 include _spacy-101/_tokenization
 +infobox
-    |  To learn more about how spaCy's tokenizer and its rules work in detail,
+    |  To learn more about how spaCy's tokenization rules work in detail,
-    |  how to #[strong customise] it and how to #[strong add your own tokenizer]
+    |  how to #[strong customise and replace] the default tokenizer and how to
-    |  to a processing pipeline, see the usage guide on
+    |  #[strong add language-specific data], see the usage guides on
    |  #[+a("/docs/usage/adding-languages") adding languages] and
    |  #[+a("/docs/usage/customizing-tokenizer") customising the tokenizer].
 +h(3, "annotations-pos-deps") Part-of-speech tags and dependencies
@ -118,9 +119,11 @@ include _spacy-101/_named-entities
 +infobox
    |  To learn more about entity recognition in spaCy, how to
-    |  #[strong add your own entities] to a document and how to train and update
+    |  #[strong add your own entities] to a document and how to
-    |  the entity predictions of a model, see the usage guide on
+    |  #[strong train and update] the entity predictions of a model, see the
-    |  #[+a("/docs/usage/entity-recognition") named entity recognition].
+    |  usage guides on
    |  #[+a("/docs/usage/entity-recognition") named entity recognition] and
    |  #[+a("/docs/usage/training-ner") training the named entity recognizer].
 +h(2, "vectors-similarity") Word vectors and similarity
    +tag-model("vectors")
--- a/website/docs/usage/v2.jade
+++ b/website/docs/usage/v2.jade
@ -20,19 +20,18 @@ p
    nlp = Language(pipeline=['my_factory', mycomponent])
 p
-    |  It's now much easier to customise the pipeline with your own components.
+    |  It's now much easier to #[strong customise the pipeline] with your own
-    |  Components are functions that receive a #[code Doc] object, modify and
+    |  components, functions that receive a #[code Doc] object, modify and
-    |  return it. If your component is stateful, you'll want to create a new one
+    |  return it. If your component is stateful, you can define and register a
-    |  for each pipeline. You can do that by defining and registering a factory
+    |  factory which receives the shared #[code Vocab] object and returns a
-    |  which receives the shared #[code Vocab] object and returns a component.
+    |  component. spaCy's default components can be added to your pipeline by
-
+    |  using their string IDs. This way, you won't have to worry about finding
-p
+    |  and implementing them – simply add #[code "tagger"] to the pipeline,
    |  spaCy's default components – the vectorizer, tagger, parser and entity
    |  recognizer, can be added to your pipeline by using their string IDs.
    |  This way, you won't have to worry about finding and implementing them –
    |  to use the default tagger, simply add #[code "tagger"] to the pipeline,
    |  and spaCy will know what to do.
 +image
    include ../../assets/img/docs/pipeline.svg
 +infobox
    |  #[strong API:] #[+api("language") #[code Language]]
    |  #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
@ -96,11 +95,10 @@ p
    |  #[code Language] class, or load a model that initialises one. This allows
    |  languages to contain more custom data, e.g. lemmatizer lookup tables, or
    |  complex regular expressions. The language data has also been tidied up
-    |  and simplified. It's now also possible to overwrite the functions that
+    |  and simplified. spaCy now also supports simple lookup-based lemmatization.
-    |  compute lexical attributes like #[code like_num], and supply
+
-    |  language-specific syntax iterators, e.g. to determine noun chunks. spaCy
+image
-    |  now also supports simple lookup-based lemmatization. The data is stored
+    include ../../assets/img/docs/language_data.svg
    |  in a dictionary mapping a string to its lemma.
 +infobox
    |  #[strong API:] #[+api("language") #[code Language]]
@ -111,13 +109,10 @@ p
 +aside-code("Example").
    from spacy.matcher import Matcher
    from spacy.attrs import LOWER, IS_PUNCT
    matcher = Matcher(nlp.vocab)
-    matcher.add('HelloWorld', None,
+    matcher.add('HEARTS', None, [{'ORTH': '❤️', 'OP': '+'}])
                [{LOWER: 'hello'}, {IS_PUNCT: True}, {LOWER: 'world'}],
                [{LOWER: 'hello'}, {LOWER: 'world'}])
    assert len(matcher) == 1
-    assert 'HelloWorld' in matcher
+    assert 'HEARTS' in matcher
 p
    |  Patterns can now be added to the matcher by calling
@ -157,28 +152,8 @@ p
        +cell #[+api("language#to_disk") #[code Language.to_disk]]
    +row
-        +cell #[code Tokenizer.load]
+        +cell #[code Language.create_make_doc]
-        +cell
+        +cell #[+api("language#attributes") #[code Language.tokenizer]]
            |  #[+api("tokenizer#from_disk") #[code Tokenizer.from_disk]]
            |  #[+api("tokenizer#from_bytes") #[code Tokenizer.from_bytes]]
    +row
        +cell #[code Tagger.load]
        +cell
            |  #[+api("tagger#from_disk") #[code Tagger.from_disk]]
            |  #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
    +row
        +cell #[code DependencyParser.load]
        +cell
            |  #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
            |  #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
    +row
        +cell #[code EntityRecognizer.load]
        +cell
            |  #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
            |  #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
    +row
        +cell
@ -212,6 +187,28 @@ p
            |  #[+api("stringstore#to_disk") #[code StringStore.to_disk]]
            |  #[+api("stringstore#to_bytes") #[code StringStore.to_bytes]]
    +row
        +cell #[code Tokenizer.load]
        +cell -
    +row
        +cell #[code Tagger.load]
        +cell
            |  #[+api("tagger#from_disk") #[code Tagger.from_disk]]
            |  #[+api("tagger#from_bytes") #[code Tagger.from_bytes]]
    +row
        +cell #[code DependencyParser.load]
        +cell
            |  #[+api("dependencyparser#from_disk") #[code DependencyParser.from_disk]]
            |  #[+api("dependencyparser#from_bytes") #[code DependencyParser.from_bytes]]
    +row
        +cell #[code EntityRecognizer.load]
        +cell
            |  #[+api("entityrecognizer#from_disk") #[code EntityRecognizer.from_disk]]
            |  #[+api("entityrecognizer#from_bytes") #[code EntityRecognizer.from_bytes]]
    +row
        +cell #[code Matcher.load]
        +cell -
@ -232,7 +229,7 @@ p
    +row
        +cell #[code Doc.read_bytes]
-        +cell
+        +cell #[+api("binder") #[code Binder]]
    +row
        +cell #[code Token.is_ancestor_of]
`@ -35,4 +35,4 @@ class English(Language):`
	`Defaults = EnglishDefaults`	`Defaults = EnglishDefaults`


	`__all__ = ['English', 'EnglishDefaults']`	`__all__ = ['English']`