Tidy up rest

2025-08-02 03:10:22 +03:00 · 2017-10-27 21:07:59 +02:00 · 2017-10-27 21:07:59 +02:00 · d96e72f656
commit d96e72f656
parent a8e10f94e4
14 changed files with 233 additions and 261 deletions
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -8,11 +8,9 @@ from thinc.t2t import ExtractWindow, ParametricAttention
 from thinc.t2v import Pooling, sum_pool
 from thinc.misc import Residual
 from thinc.misc import LayerNorm as LN
-
 from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
 from thinc.api import FeatureExtracter, with_getitem, flatten_add_lengths
 from thinc.api import uniqued, wrap, noop
-
 from thinc.linear.linear import LinearModel
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@ -101,17 +101,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
    """
    Normalize a dictionary of attributes, converting them to ints.

-    Arguments:
-        stringy_attrs (dict):
-            Dictionary keyed by attribute string names. Values can be ints or strings.
-
-        strings_map (StringStore):
-            Defaults to None. If provided, encodes string values into ints.
-
-    Returns:
-        inty_attrs (dict):
-            Attributes dictionary with keys and optionally values converted to
-            ints.
+    stringy_attrs (dict): Dictionary keyed by attribute string names. Values
+        can be ints or strings.
+    strings_map (StringStore): Defaults to None. If provided, encodes string
+        values into ints.
+    RETURNS (dict): Attributes dictionary with keys and optionally values
+        converted to ints.
    """
    inty_attrs = {}
    if _do_deprecated:
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -2,7 +2,6 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function

-import io
 import re
 import ujson
 import random
@ -10,9 +9,8 @@ import cytoolz
 import itertools

 from .syntax import nonproj
-from .util import ensure_path
-from . import util
 from .tokens import Doc
+from . import util


 def tags_to_entities(tags):
@ -310,7 +308,7 @@ def _corrupt(c, noise_level):


 def read_json_file(loc, docs_filter=None, limit=None):
-    loc = ensure_path(loc)
+    loc = util.ensure_path(loc)
    if loc.is_dir():
        for filename in loc.iterdir():
            yield from read_json_file(loc / filename, limit=limit)
--- a/spacy/language.py
+++ b/spacy/language.py
@ -1,22 +1,22 @@
 # coding: utf8
 from __future__ import absolute_import, unicode_literals
-from contextlib import contextmanager
-import copy

-from thinc.neural import Model
-from thinc.neural.optimizers import Adam
 import random
 import ujson
-from collections import OrderedDict
 import itertools
 import weakref
 import functools
+from collections import OrderedDict
+from contextlib import contextmanager
+from copy import copy
+from thinc.neural import Model
+from thinc.neural.optimizers import Adam

 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
-from .pipeline import DependencyParser, Tensorizer, Tagger
-from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
+from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
+from .pipeline import SimilarityHook, TextCategorizer
 from .compat import json_dumps, izip
 from .scorer import Scorer
 from ._ml import link_vectors_to_models
@ -649,7 +649,7 @@ class Language(object):
        serializers = OrderedDict((
            ('vocab', lambda: self.vocab.to_bytes()),
            ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
-            ('meta', lambda: ujson.dumps(self.meta))
+            ('meta', lambda: json_dumps(self.meta))
        ))
        for i, (name, proc) in enumerate(self.pipeline):
            if name in disable:
@ -689,7 +689,7 @@ class DisabledPipes(list):
        # Important! Not deep copy -- we just want the container (but we also
        # want to support people providing arbitrarily typed nlp.pipeline
        # objects.)
-        self.original_pipeline = copy.copy(nlp.pipeline)
+        self.original_pipeline = copy(nlp.pipeline)
        list.__init__(self)
        self.extend(nlp.remove_pipe(name) for name in names)

--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@ -4,12 +4,6 @@
 from __future__ import unicode_literals

 import ujson
-
-from .typedefs cimport attr_t
-from .typedefs cimport hash_t
-from .attrs cimport attr_id_t
-from .structs cimport TokenC
-
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 from libcpp.vector cimport vector
@ -17,14 +11,15 @@ from libcpp.pair cimport pair
 from murmurhash.mrmr cimport hash64
 from libc.stdint cimport int32_t

-from .attrs cimport ID, NULL_ATTR, ENT_TYPE
-from . import attrs
-from .tokens.doc cimport get_token_attr
-from .tokens.doc cimport Doc
+from .typedefs cimport attr_t
+from .typedefs cimport hash_t
+from .structs cimport TokenC
+from .tokens.doc cimport Doc, get_token_attr
 from .vocab cimport Vocab

+from .attrs import IDS
+from .attrs cimport attr_id_t, ID, NULL_ATTR
 from .attrs import FLAG61 as U_ENT
-
 from .attrs import FLAG60 as B2_ENT
 from .attrs import FLAG59 as B3_ENT
 from .attrs import FLAG58 as B4_ENT
@ -34,7 +29,6 @@ from .attrs import FLAG55 as B7_ENT
 from .attrs import FLAG54 as B8_ENT
 from .attrs import FLAG53 as B9_ENT
 from .attrs import FLAG52 as B10_ENT
-
 from .attrs import FLAG51 as I3_ENT
 from .attrs import FLAG50 as I4_ENT
 from .attrs import FLAG49 as I5_ENT
@ -43,7 +37,6 @@ from .attrs import FLAG47 as I7_ENT
 from .attrs import FLAG46 as I8_ENT
 from .attrs import FLAG45 as I9_ENT
 from .attrs import FLAG44 as I10_ENT
-
 from .attrs import FLAG43 as L2_ENT
 from .attrs import FLAG42 as L3_ENT
 from .attrs import FLAG41 as L4_ENT
@ -153,7 +146,7 @@ cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
 def _convert_strings(token_specs, string_store):
    # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
    operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
-            '?': (ZERO_ONE,), '1': (ONE,)}
+                 '?': (ZERO_ONE,), '1': (ONE,)}
    tokens = []
    op = ONE
    for spec in token_specs:
@ -168,10 +161,10 @@ def _convert_strings(token_specs, string_store):
                if value in operators:
                    ops = operators[value]
                else:
-                    raise KeyError(
-                        "Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
+                    msg = "Unknown operator '%s'. Options: %s"
+                    raise KeyError(msg % (value, ', '.join(operators.keys())))
            if isinstance(attr, basestring):
-                attr = attrs.IDS.get(attr.upper())
+                attr = IDS.get(attr.upper())
            if isinstance(value, basestring):
                value = string_store.add(value)
            if isinstance(value, bool):
@ -186,7 +179,7 @@ def _convert_strings(token_specs, string_store):
 def merge_phrase(matcher, doc, i, matches):
    """Callback to merge a phrase on match."""
    ent_id, label, start, end = matches[i]
-    span = doc[start : end]
+    span = doc[start:end]
    span.merge(ent_type=label, ent_id=ent_id)


@ -233,13 +226,13 @@ cdef class Matcher:
        return self._normalize_key(key) in self._patterns

    def add(self, key, on_match, *patterns):
-        """Add a match-rule to the matcher. A match-rule consists of: an ID key,
-        an on_match callback, and one or more patterns.
+        """Add a match-rule to the matcher. A match-rule consists of: an ID
+        key, an on_match callback, and one or more patterns.

        If the key exists, the patterns are appended to the previous ones, and
-        the previous on_match callback is replaced. The `on_match` callback will
-        receive the arguments `(matcher, doc, i, matches)`. You can also set
-        `on_match` to `None` to not perform any actions.
+        the previous on_match callback is replaced. The `on_match` callback
+        will receive the arguments `(matcher, doc, i, matches)`. You can also
+        set `on_match` to `None` to not perform any actions.

        A pattern consists of one or more `token_specs`, where a `token_spec`
        is a dictionary mapping attribute IDs to values, and optionally a
@ -253,8 +246,8 @@ cdef class Matcher:
        The + and * operators are usually interpreted "greedily", i.e. longer
        matches are returned where possible. However, if you specify two '+'
        and '*' patterns in a row and their matches overlap, the first
-        operator will behave non-greedily. This quirk in the semantics
-        makes the matcher more efficient, by avoiding the need for back-tracking.
+        operator will behave non-greedily. This quirk in the semantics makes
+        the matcher more efficient, by avoiding the need for back-tracking.

        key (unicode): The match ID.
        on_match (callable): Callback executed on match.
@ -268,7 +261,6 @@ cdef class Matcher:
        key = self._normalize_key(key)
        self._patterns.setdefault(key, [])
        self._callbacks[key] = on_match
-
        for pattern in patterns:
            specs = _convert_strings(pattern, self.vocab.strings)
            self.patterns.push_back(init_pattern(self.mem, key, specs))
@ -315,9 +307,9 @@ cdef class Matcher:
        """Match a stream of documents, yielding them in turn.

        docs (iterable): A stream of documents.
-        batch_size (int): The number of documents to accumulate into a working set.
+        batch_size (int): Number of documents to accumulate into a working set.
        n_threads (int): The number of threads with which to work on the buffer
-            in parallel, if the `Matcher` implementation supports multi-threading.
+            in parallel, if the implementation supports multi-threading.
        YIELDS (Doc): Documents, in order.
        """
        for doc in docs:
@ -325,7 +317,7 @@ cdef class Matcher:
            yield doc

    def __call__(self, Doc doc):
-        """Find all token sequences matching the supplied patterns on the `Doc`.
+        """Find all token sequences matching the supplied pattern.

        doc (Doc): The document to match over.
        RETURNS (list): A list of `(key, start, end)` tuples,
@ -342,8 +334,8 @@ cdef class Matcher:
        for token_i in range(doc.length):
            token = &doc.c[token_i]
            q = 0
-            # Go over the open matches, extending or finalizing if able. Otherwise,
-            # we over-write them (q doesn't advance)
+            # Go over the open matches, extending or finalizing if able.
+            # Otherwise, we over-write them (q doesn't advance)
            for state in partials:
                action = get_action(state.second, token)
                if action == PANIC:
@ -356,8 +348,8 @@ cdef class Matcher:

                if action == REPEAT:
                    # Leave the state in the queue, and advance to next slot
-                    # (i.e. we don't overwrite -- we want to greedily match more
-                    # pattern.
+                    # (i.e. we don't overwrite -- we want to greedily match
+                    # more pattern).
                    q += 1
                elif action == REJECT:
                    pass
@ -366,8 +358,8 @@ cdef class Matcher:
                    partials[q].second += 1
                    q += 1
                elif action in (ACCEPT, ACCEPT_PREV):
-                    # TODO: What to do about patterns starting with ZERO? Need to
-                    # adjust the start position.
+                    # TODO: What to do about patterns starting with ZERO? Need
+                    # to adjust the start position.
                    start = state.first
                    end = token_i+1 if action == ACCEPT else token_i
                    ent_id = state.second[1].attrs[0].value
@ -388,8 +380,8 @@ cdef class Matcher:
                    state.second = pattern
                    partials.push_back(state)
                elif action == ADVANCE:
-                    # TODO: What to do about patterns starting with ZERO? Need to
-                    # adjust the start position.
+                    # TODO: What to do about patterns starting with ZERO? Need
+                    # to adjust the start position.
                    state.first = token_i
                    state.second = pattern + 1
                    partials.push_back(state)
@ -413,7 +405,6 @@ cdef class Matcher:
            on_match = self._callbacks.get(ent_id)
            if on_match is not None:
                on_match(self, doc, i, matches)
-        # TODO: only return (match_id, start, end)
        return matches

    def _normalize_key(self, key):
@ -441,7 +432,8 @@ def get_bilou(length):
    elif length == 8:
        return [B8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, I8_ENT, L8_ENT]
    elif length == 9:
-        return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, L9_ENT]
+        return [B9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT, I9_ENT,
+                L9_ENT]
    elif length == 10:
        return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
                I10_ENT, I10_ENT, L10_ENT]
@ -454,10 +446,8 @@ cdef class PhraseMatcher:
    cdef Vocab vocab
    cdef Matcher matcher
    cdef PreshMap phrase_ids
-
    cdef int max_length
    cdef attr_t* _phrase_key
-
    cdef public object _callbacks
    cdef public object _patterns

@ -470,7 +460,8 @@ cdef class PhraseMatcher:
        self.phrase_ids = PreshMap()
        abstract_patterns = []
        for length in range(1, max_length):
-            abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
+            abstract_patterns.append([{tag: True}
+                                      for tag in get_bilou(length)])
        self.matcher.add('Candidate', None, *abstract_patterns)
        self._callbacks = {}

@ -496,8 +487,8 @@ cdef class PhraseMatcher:
        return (self.__class__, (self.vocab,), None, None)

    def add(self, key, on_match, *docs):
-        """Add a match-rule to the matcher. A match-rule consists of: an ID key,
-        an on_match callback, and one or more patterns.
+        """Add a match-rule to the matcher. A match-rule consists of: an ID
+        key, an on_match callback, and one or more patterns.

        key (unicode): The match ID.
        on_match (callable): Callback executed on match.
@ -513,7 +504,6 @@ cdef class PhraseMatcher:
                raise ValueError(msg % (len(doc), self.max_length))
        cdef hash_t ent_id = self.matcher._normalize_key(key)
        self._callbacks[ent_id] = on_match
-
        cdef int length
        cdef int i
        cdef hash_t phrase_hash
@ -553,9 +543,9 @@ cdef class PhraseMatcher:
        """Match a stream of documents, yielding them in turn.

        docs (iterable): A stream of documents.
-        batch_size (int): The number of documents to accumulate into a working set.
+        batch_size (int): Number of documents to accumulate into a working set.
        n_threads (int): The number of threads with which to work on the buffer
-            in parallel, if the `Matcher` implementation supports multi-threading.
+            in parallel, if the implementation supports multi-threading.
        YIELDS (Doc): Documents, in order.
        """
        for doc in stream:
@ -569,7 +559,8 @@ cdef class PhraseMatcher:
            self._phrase_key[i] = 0
        for i, j in enumerate(range(start, end)):
            self._phrase_key[i] = doc.c[j].lex.orth
-        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
+        cdef hash_t key = hash64(self._phrase_key,
+                                 self.max_length * sizeof(attr_t), 0)
        ent_id = <hash_t>self.phrase_ids.get(key)
        if ent_id == 0:
            return None
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -4,17 +4,15 @@ from __future__ import unicode_literals

 from libc.string cimport memset

-from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
 from .attrs cimport POS, IS_SPACE
+from .attrs import LEMMA, intify_attrs
+from .parts_of_speech cimport SPACE
 from .parts_of_speech import IDS as POS_IDS
 from .lexeme cimport Lexeme
-from .attrs import LEMMA, intify_attrs


 def _normalize_props(props):
-    """
-    Transform deprecated string keys to correct names.
-    """
+    """Transform deprecated string keys to correct names."""
    out = {}
    for key, value in props.items():
        if key == POS:
@ -77,7 +75,8 @@ cdef class Morphology:
    cdef int assign_untagged(self, TokenC* token) except -1:
        """Set morphological attributes on a token without a POS tag. Uses
        the lemmatizer's lookup() method, which looks up the string in the
-        table provided by the language data as lemma_lookup (if available)."""
+        table provided by the language data as lemma_lookup (if available).
+        """
        if token.lemma == 0:
            orth_str = self.strings[token.lex.orth]
            lemma = self.lemmatizer.lookup(orth_str)
@ -95,11 +94,10 @@ cdef class Morphology:
    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
        if tag_id > self.n_tags:
            raise ValueError("Unknown tag ID: %s" % tag_id)
-        # TODO: It's pretty arbitrary to put this logic here. I guess the justification
-        # is that this is where the specific word and the tag interact. Still,
-        # we should have a better way to enforce this rule, or figure out why
-        # the statistical model fails.
-        # Related to Issue #220
+        # TODO: It's pretty arbitrary to put this logic here. I guess the
+        # justification is that this is where the specific word and the tag
+        # interact. Still, we should have a better way to enforce this rule, or
+        # figure out why the statistical model fails. Related to Issue #220
        if Lexeme.c_check_flag(token.lex, IS_SPACE):
            tag_id = self.reverse_index[self.strings.add('_SP')]
        rich_tag = self.rich_tags[tag_id]
@ -123,14 +121,13 @@ cdef class Morphology:
        else:
            flags[0] &= ~(one << flag_id)

-    def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
-        """
-        Add a special-case rule to the morphological analyser. Tokens whose
+    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
+                         force=False):
+        """Add a special-case rule to the morphological analyser. Tokens whose
        tag and orth match the rule will receive the specified properties.

-        Arguments:
-            tag (unicode): The part-of-speech tag to key the exception.
-            orth (unicode): The word-form to key the exception.
+        tag (unicode): The part-of-speech tag to key the exception.
+        orth (unicode): The word-form to key the exception.
        """
        self.exc[(tag_str, orth_str)] = dict(attrs)
        tag = self.strings.add(tag_str)
@ -144,10 +141,9 @@ cdef class Morphology:
        elif force:
            memset(cached, 0, sizeof(cached[0]))
        else:
-            msg = ("Conflicting morphology exception for (%s, %s). Use force=True "
-                   "to overwrite.")
-            msg = msg % (tag_str, orth_str)
-            raise ValueError(msg)
+            raise ValueError(
+                "Conflicting morphology exception for (%s, %s). Use "
+                "force=True to overwrite." % (tag_str, orth_str))

        cached.tag = rich_tag
        # TODO: Refactor this to take arbitrary attributes.
@ -218,7 +214,7 @@ IDS = {
    "Definite_two": Definite_two,
    "Definite_def": Definite_def,
    "Definite_red": Definite_red,
-    "Definite_cons": Definite_cons, # U20
+    "Definite_cons": Definite_cons,  # U20
    "Definite_ind": Definite_ind,
    "Degree_cmp": Degree_cmp,
    "Degree_comp": Degree_comp,
@ -227,7 +223,7 @@ IDS = {
    "Degree_sup": Degree_sup,
    "Degree_abs": Degree_abs,
    "Degree_com": Degree_com,
-    "Degree_dim ": Degree_dim, # du
+    "Degree_dim ": Degree_dim,  # du
    "Gender_com": Gender_com,
    "Gender_fem": Gender_fem,
    "Gender_masc": Gender_masc,
@ -242,15 +238,15 @@ IDS = {
    "Negative_neg": Negative_neg,
    "Negative_pos": Negative_pos,
    "Negative_yes": Negative_yes,
-    "Polarity_neg": Polarity_neg, # U20
-    "Polarity_pos": Polarity_pos, # U20
+    "Polarity_neg": Polarity_neg,  # U20
+    "Polarity_pos": Polarity_pos,  # U20
    "Number_com": Number_com,
    "Number_dual": Number_dual,
    "Number_none": Number_none,
    "Number_plur": Number_plur,
    "Number_sing": Number_sing,
-    "Number_ptan ": Number_ptan, # bg
-    "Number_count ": Number_count, # bg
+    "Number_ptan ": Number_ptan,  # bg
+    "Number_count ": Number_count,  # bg
    "NumType_card": NumType_card,
    "NumType_dist": NumType_dist,
    "NumType_frac": NumType_frac,
@ -276,7 +272,7 @@ IDS = {
    "PronType_rel": PronType_rel,
    "PronType_tot": PronType_tot,
    "PronType_clit": PronType_clit,
-    "PronType_exc ": PronType_exc, # es, ca, it, fa,
+    "PronType_exc ": PronType_exc,  # es, ca, it, fa,
    "Reflex_yes": Reflex_yes,
    "Tense_fut": Tense_fut,
    "Tense_imp": Tense_imp,
@ -292,19 +288,19 @@ IDS = {
    "VerbForm_partPres": VerbForm_partPres,
    "VerbForm_sup": VerbForm_sup,
    "VerbForm_trans": VerbForm_trans,
-    "VerbForm_conv": VerbForm_conv, # U20
-    "VerbForm_gdv ": VerbForm_gdv, # la,
+    "VerbForm_conv": VerbForm_conv,  # U20
+    "VerbForm_gdv ": VerbForm_gdv,  # la,
    "Voice_act": Voice_act,
    "Voice_cau": Voice_cau,
    "Voice_pass": Voice_pass,
-    "Voice_mid ": Voice_mid, # gkc,
-    "Voice_int ": Voice_int, # hb,
-    "Abbr_yes ": Abbr_yes, # cz, fi, sl, U,
-    "AdpType_prep ": AdpType_prep, # cz, U,
-    "AdpType_post ": AdpType_post, # U,
-    "AdpType_voc ": AdpType_voc, # cz,
-    "AdpType_comprep ": AdpType_comprep, # cz,
-    "AdpType_circ ": AdpType_circ, # U,
+    "Voice_mid ": Voice_mid,  # gkc,
+    "Voice_int ": Voice_int,  # hb,
+    "Abbr_yes ": Abbr_yes,  # cz, fi, sl, U,
+    "AdpType_prep ": AdpType_prep,  # cz, U,
+    "AdpType_post ": AdpType_post,  # U,
+    "AdpType_voc ": AdpType_voc,  # cz,
+    "AdpType_comprep ": AdpType_comprep,  # cz,
+    "AdpType_circ ": AdpType_circ,  # U,
    "AdvType_man": AdvType_man,
    "AdvType_loc": AdvType_loc,
    "AdvType_tim": AdvType_tim,
@ -314,122 +310,122 @@ IDS = {
    "AdvType_sta": AdvType_sta,
    "AdvType_ex": AdvType_ex,
    "AdvType_adadj": AdvType_adadj,
-    "ConjType_oper ": ConjType_oper, # cz, U,
-    "ConjType_comp ": ConjType_comp, # cz, U,
-    "Connegative_yes ": Connegative_yes, # fi,
-    "Derivation_minen ": Derivation_minen, # fi,
-    "Derivation_sti ": Derivation_sti, # fi,
-    "Derivation_inen ": Derivation_inen, # fi,
-    "Derivation_lainen ": Derivation_lainen, # fi,
-    "Derivation_ja ": Derivation_ja, # fi,
-    "Derivation_ton ": Derivation_ton, # fi,
-    "Derivation_vs ": Derivation_vs, # fi,
-    "Derivation_ttain ": Derivation_ttain, # fi,
-    "Derivation_ttaa ": Derivation_ttaa, # fi,
-    "Echo_rdp ": Echo_rdp, # U,
-    "Echo_ech ": Echo_ech, # U,
-    "Foreign_foreign ": Foreign_foreign, # cz, fi, U,
-    "Foreign_fscript ": Foreign_fscript, # cz, fi, U,
-    "Foreign_tscript ": Foreign_tscript, # cz, U,
-    "Foreign_yes ": Foreign_yes, # sl,
-    "Gender_dat_masc ": Gender_dat_masc, # bq, U,
-    "Gender_dat_fem ": Gender_dat_fem, # bq, U,
-    "Gender_erg_masc ": Gender_erg_masc, # bq,
-    "Gender_erg_fem ": Gender_erg_fem, # bq,
-    "Gender_psor_masc ": Gender_psor_masc, # cz, sl, U,
-    "Gender_psor_fem ": Gender_psor_fem, # cz, sl, U,
-    "Gender_psor_neut ": Gender_psor_neut, # sl,
-    "Hyph_yes ": Hyph_yes, # cz, U,
-    "InfForm_one ": InfForm_one, # fi,
-    "InfForm_two ": InfForm_two, # fi,
-    "InfForm_three ": InfForm_three, # fi,
-    "NameType_geo ": NameType_geo, # U, cz,
-    "NameType_prs ": NameType_prs, # U, cz,
-    "NameType_giv ": NameType_giv, # U, cz,
-    "NameType_sur ": NameType_sur, # U, cz,
-    "NameType_nat ": NameType_nat, # U, cz,
-    "NameType_com ": NameType_com, # U, cz,
-    "NameType_pro ": NameType_pro, # U, cz,
-    "NameType_oth ": NameType_oth, # U, cz,
-    "NounType_com ": NounType_com, # U,
-    "NounType_prop ": NounType_prop, # U,
-    "NounType_class ": NounType_class, # U,
-    "Number_abs_sing ": Number_abs_sing, # bq, U,
-    "Number_abs_plur ": Number_abs_plur, # bq, U,
-    "Number_dat_sing ": Number_dat_sing, # bq, U,
-    "Number_dat_plur ": Number_dat_plur, # bq, U,
-    "Number_erg_sing ": Number_erg_sing, # bq, U,
-    "Number_erg_plur ": Number_erg_plur, # bq, U,
-    "Number_psee_sing ": Number_psee_sing, # U,
-    "Number_psee_plur ": Number_psee_plur, # U,
-    "Number_psor_sing ": Number_psor_sing, # cz, fi, sl, U,
-    "Number_psor_plur ": Number_psor_plur, # cz, fi, sl, U,
-    "NumForm_digit ": NumForm_digit, # cz, sl, U,
-    "NumForm_roman ": NumForm_roman, # cz, sl, U,
-    "NumForm_word ": NumForm_word, # cz, sl, U,
-    "NumValue_one ": NumValue_one, # cz, U,
-    "NumValue_two ": NumValue_two, # cz, U,
-    "NumValue_three ": NumValue_three, # cz, U,
-    "PartForm_pres ": PartForm_pres, # fi,
-    "PartForm_past ": PartForm_past, # fi,
-    "PartForm_agt ": PartForm_agt, # fi,
-    "PartForm_neg ": PartForm_neg, # fi,
-    "PartType_mod ": PartType_mod, # U,
-    "PartType_emp ": PartType_emp, # U,
-    "PartType_res ": PartType_res, # U,
-    "PartType_inf ": PartType_inf, # U,
-    "PartType_vbp ": PartType_vbp, # U,
-    "Person_abs_one ": Person_abs_one, # bq, U,
-    "Person_abs_two ": Person_abs_two, # bq, U,
-    "Person_abs_three ": Person_abs_three, # bq, U,
-    "Person_dat_one ": Person_dat_one, # bq, U,
-    "Person_dat_two ": Person_dat_two, # bq, U,
-    "Person_dat_three ": Person_dat_three, # bq, U,
-    "Person_erg_one ": Person_erg_one, # bq, U,
-    "Person_erg_two ": Person_erg_two, # bq, U,
-    "Person_erg_three ": Person_erg_three, # bq, U,
-    "Person_psor_one ": Person_psor_one, # fi, U,
-    "Person_psor_two ": Person_psor_two, # fi, U,
-    "Person_psor_three ": Person_psor_three, # fi, U,
-    "Polite_inf ": Polite_inf, # bq, U,
-    "Polite_pol ": Polite_pol, # bq, U,
-    "Polite_abs_inf ": Polite_abs_inf, # bq, U,
-    "Polite_abs_pol ": Polite_abs_pol, # bq, U,
-    "Polite_erg_inf ": Polite_erg_inf, # bq, U,
-    "Polite_erg_pol ": Polite_erg_pol, # bq, U,
-    "Polite_dat_inf ": Polite_dat_inf, # bq, U,
-    "Polite_dat_pol ": Polite_dat_pol, # bq, U,
-    "Prefix_yes ": Prefix_yes, # U,
-    "PrepCase_npr ": PrepCase_npr, # cz,
-    "PrepCase_pre ": PrepCase_pre, # U,
-    "PunctSide_ini ": PunctSide_ini, # U,
-    "PunctSide_fin ": PunctSide_fin, # U,
-    "PunctType_peri ": PunctType_peri, # U,
-    "PunctType_qest ": PunctType_qest, # U,
-    "PunctType_excl ": PunctType_excl, # U,
-    "PunctType_quot ": PunctType_quot, # U,
-    "PunctType_brck ": PunctType_brck, # U,
-    "PunctType_comm ": PunctType_comm, # U,
-    "PunctType_colo ": PunctType_colo, # U,
-    "PunctType_semi ": PunctType_semi, # U,
-    "PunctType_dash ": PunctType_dash, # U,
-    "Style_arch ": Style_arch, # cz, fi, U,
-    "Style_rare ": Style_rare, # cz, fi, U,
-    "Style_poet ": Style_poet, # cz, U,
-    "Style_norm ": Style_norm, # cz, U,
-    "Style_coll ": Style_coll, # cz, U,
-    "Style_vrnc ": Style_vrnc, # cz, U,
-    "Style_sing ": Style_sing, # cz, U,
-    "Style_expr ": Style_expr, # cz, U,
-    "Style_derg ": Style_derg, # cz, U,
-    "Style_vulg ": Style_vulg, # cz, U,
-    "Style_yes ": Style_yes, # fi, U,
-    "StyleVariant_styleShort ": StyleVariant_styleShort, # cz,
-    "StyleVariant_styleBound ": StyleVariant_styleBound, # cz, sl,
-    "VerbType_aux ": VerbType_aux, # U,
-    "VerbType_cop ": VerbType_cop, # U,
-    "VerbType_mod ": VerbType_mod, # U,
-    "VerbType_light ": VerbType_light, # U,
+    "ConjType_oper ": ConjType_oper,  # cz, U,
+    "ConjType_comp ": ConjType_comp,  # cz, U,
+    "Connegative_yes ": Connegative_yes,  # fi,
+    "Derivation_minen ": Derivation_minen,  # fi,
+    "Derivation_sti ": Derivation_sti,  # fi,
+    "Derivation_inen ": Derivation_inen,  # fi,
+    "Derivation_lainen ": Derivation_lainen,  # fi,
+    "Derivation_ja ": Derivation_ja,  # fi,
+    "Derivation_ton ": Derivation_ton,  # fi,
+    "Derivation_vs ": Derivation_vs,  # fi,
+    "Derivation_ttain ": Derivation_ttain,  # fi,
+    "Derivation_ttaa ": Derivation_ttaa,  # fi,
+    "Echo_rdp ": Echo_rdp,  # U,
+    "Echo_ech ": Echo_ech,  # U,
+    "Foreign_foreign ": Foreign_foreign,  # cz, fi, U,
+    "Foreign_fscript ": Foreign_fscript,  # cz, fi, U,
+    "Foreign_tscript ": Foreign_tscript,  # cz, U,
+    "Foreign_yes ": Foreign_yes,  # sl,
+    "Gender_dat_masc ": Gender_dat_masc,  # bq, U,
+    "Gender_dat_fem ": Gender_dat_fem,  # bq, U,
+    "Gender_erg_masc ": Gender_erg_masc,  # bq,
+    "Gender_erg_fem ": Gender_erg_fem,  # bq,
+    "Gender_psor_masc ": Gender_psor_masc,  # cz, sl, U,
+    "Gender_psor_fem ": Gender_psor_fem,  # cz, sl, U,
+    "Gender_psor_neut ": Gender_psor_neut,  # sl,
+    "Hyph_yes ": Hyph_yes,  # cz, U,
+    "InfForm_one ": InfForm_one,  # fi,
+    "InfForm_two ": InfForm_two,  # fi,
+    "InfForm_three ": InfForm_three,  # fi,
+    "NameType_geo ": NameType_geo,  # U, cz,
+    "NameType_prs ": NameType_prs,  # U, cz,
+    "NameType_giv ": NameType_giv,  # U, cz,
+    "NameType_sur ": NameType_sur,  # U, cz,
+    "NameType_nat ": NameType_nat,  # U, cz,
+    "NameType_com ": NameType_com,  # U, cz,
+    "NameType_pro ": NameType_pro,  # U, cz,
+    "NameType_oth ": NameType_oth,  # U, cz,
+    "NounType_com ": NounType_com,  # U,
+    "NounType_prop ": NounType_prop,  # U,
+    "NounType_class ": NounType_class,  # U,
+    "Number_abs_sing ": Number_abs_sing,  # bq, U,
+    "Number_abs_plur ": Number_abs_plur,  # bq, U,
+    "Number_dat_sing ": Number_dat_sing,  # bq, U,
+    "Number_dat_plur ": Number_dat_plur,  # bq, U,
+    "Number_erg_sing ": Number_erg_sing,  # bq, U,
+    "Number_erg_plur ": Number_erg_plur,  # bq, U,
+    "Number_psee_sing ": Number_psee_sing,  # U,
+    "Number_psee_plur ": Number_psee_plur,  # U,
+    "Number_psor_sing ": Number_psor_sing,  # cz, fi, sl, U,
+    "Number_psor_plur ": Number_psor_plur,  # cz, fi, sl, U,
+    "NumForm_digit ": NumForm_digit,  # cz, sl, U,
+    "NumForm_roman ": NumForm_roman,  # cz, sl, U,
+    "NumForm_word ": NumForm_word,  # cz, sl, U,
+    "NumValue_one ": NumValue_one,  # cz, U,
+    "NumValue_two ": NumValue_two,  # cz, U,
+    "NumValue_three ": NumValue_three,  # cz, U,
+    "PartForm_pres ": PartForm_pres,  # fi,
+    "PartForm_past ": PartForm_past,  # fi,
+    "PartForm_agt ": PartForm_agt,  # fi,
+    "PartForm_neg ": PartForm_neg,  # fi,
+    "PartType_mod ": PartType_mod,  # U,
+    "PartType_emp ": PartType_emp,  # U,
+    "PartType_res ": PartType_res,  # U,
+    "PartType_inf ": PartType_inf,  # U,
+    "PartType_vbp ": PartType_vbp,  # U,
+    "Person_abs_one ": Person_abs_one,  # bq, U,
+    "Person_abs_two ": Person_abs_two,  # bq, U,
+    "Person_abs_three ": Person_abs_three,  # bq, U,
+    "Person_dat_one ": Person_dat_one,  # bq, U,
+    "Person_dat_two ": Person_dat_two,  # bq, U,
+    "Person_dat_three ": Person_dat_three,  # bq, U,
+    "Person_erg_one ": Person_erg_one,  # bq, U,
+    "Person_erg_two ": Person_erg_two,  # bq, U,
+    "Person_erg_three ": Person_erg_three,  # bq, U,
+    "Person_psor_one ": Person_psor_one,  # fi, U,
+    "Person_psor_two ": Person_psor_two,  # fi, U,
+    "Person_psor_three ": Person_psor_three,  # fi, U,
+    "Polite_inf ": Polite_inf,  # bq, U,
+    "Polite_pol ": Polite_pol,  # bq, U,
+    "Polite_abs_inf ": Polite_abs_inf,  # bq, U,
+    "Polite_abs_pol ": Polite_abs_pol,  # bq, U,
+    "Polite_erg_inf ": Polite_erg_inf,  # bq, U,
+    "Polite_erg_pol ": Polite_erg_pol,  # bq, U,
+    "Polite_dat_inf ": Polite_dat_inf,  # bq, U,
+    "Polite_dat_pol ": Polite_dat_pol,  # bq, U,
+    "Prefix_yes ": Prefix_yes,  # U,
+    "PrepCase_npr ": PrepCase_npr,  # cz,
+    "PrepCase_pre ": PrepCase_pre,  # U,
+    "PunctSide_ini ": PunctSide_ini,  # U,
+    "PunctSide_fin ": PunctSide_fin,  # U,
+    "PunctType_peri ": PunctType_peri,  # U,
+    "PunctType_qest ": PunctType_qest,  # U,
+    "PunctType_excl ": PunctType_excl,  # U,
+    "PunctType_quot ": PunctType_quot,  # U,
+    "PunctType_brck ": PunctType_brck,  # U,
+    "PunctType_comm ": PunctType_comm,  # U,
+    "PunctType_colo ": PunctType_colo,  # U,
+    "PunctType_semi ": PunctType_semi,  # U,
+    "PunctType_dash ": PunctType_dash,  # U,
+    "Style_arch ": Style_arch,  # cz, fi, U,
+    "Style_rare ": Style_rare,  # cz, fi, U,
+    "Style_poet ": Style_poet,  # cz, U,
+    "Style_norm ": Style_norm,  # cz, U,
+    "Style_coll ": Style_coll,  # cz, U,
+    "Style_vrnc ": Style_vrnc,  # cz, U,
+    "Style_sing ": Style_sing,  # cz, U,
+    "Style_expr ": Style_expr,  # cz, U,
+    "Style_derg ": Style_derg,  # cz, U,
+    "Style_vulg ": Style_vulg,  # cz, U,
+    "Style_yes ": Style_yes,  # fi, U,
+    "StyleVariant_styleShort ": StyleVariant_styleShort,  # cz,
+    "StyleVariant_styleBound ": StyleVariant_styleBound,  # cz, sl,
+    "VerbType_aux ": VerbType_aux,  # U,
+    "VerbType_cop ": VerbType_cop,  # U,
+    "VerbType_mod ": VerbType_mod,  # U,
+    "VerbType_light ": VerbType_light,  # U,
 }


--- a/spacy/parts_of_speech.pyx
+++ b/spacy/parts_of_speech.pyx
@ -8,7 +8,7 @@ IDS = {
    "ADP": ADP,
    "ADV": ADV,
    "AUX": AUX,
-    "CONJ": CONJ, # U20
+    "CONJ": CONJ,  # U20
    "CCONJ": CCONJ,
    "DET": DET,
    "INTJ": INTJ,
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@ -85,7 +85,6 @@ class Scorer(object):

    def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
        assert len(tokens) == len(gold)
-
        gold_deps = set()
        gold_tags = set()
        gold_ents = set(tags_to_entities([annot[-1]
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -4,19 +4,15 @@ from __future__ import unicode_literals, absolute_import

 cimport cython
 from libc.string cimport memcpy
-from libc.stdint cimport uint64_t, uint32_t
-from murmurhash.mrmr cimport hash64, hash32
-from preshed.maps cimport map_iter, key_t
 from libc.stdint cimport uint32_t
+from murmurhash.mrmr cimport hash64, hash32
 import ujson
-import dill

 from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
-
 from .typedefs cimport hash_t
-from . import util
 from .compat import json_dumps
+from . import util


 cpdef hash_t hash_string(unicode string) except 0:
@ -195,7 +191,7 @@ cdef class StringStore:
        """Save the current state to a directory.

        path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist. Paths may be either strings or `Path`-like objects.
+            it doesn't exist. Paths may be either strings or Path-like objects.
        """
        path = util.ensure_path(path)
        strings = list(self)
@ -225,7 +221,7 @@ cdef class StringStore:
        **exclude: Named attributes to prevent from being serialized.
        RETURNS (bytes): The serialized form of the `StringStore` object.
        """
-        return ujson.dumps(list(self))
+        return json_dumps(list(self))

    def from_bytes(self, bytes_data, **exclude):
        """Load state from a binary string.
--- a/spacy/symbols.pyx
+++ b/spacy/symbols.pyx
@ -1,8 +1,8 @@
 # coding: utf8
 #cython: optimize.unpack_method_calls=False
-
 from __future__ import unicode_literals

+
 IDS = {
    "": NIL,
    "IS_ALPHA": IS_ALPHA,
@ -464,9 +464,11 @@ IDS = {
    "LAW": LAW
 }

+
 def sort_nums(x):
    # Sort key: return the second element of the pair `x`. Passed as `key=`
    # to sorted(IDS.items(), ...) below, so entries are ordered by value.
    return x[1]

+
 NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
 # Unfortunate hack here, to work around problem with long cpdef enum
 # (which is generating an enormous amount of C++ in Cython 0.24+)
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -8,12 +8,11 @@ from cython.operator cimport preincrement as preinc
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 import regex as re
-
-from .strings cimport hash_string
-from . import util
 cimport cython

 from .tokens.doc cimport Doc
+from .strings cimport hash_string
+from . import util


 cdef class Tokenizer:
@ -21,7 +20,7 @@ cdef class Tokenizer:
    boundaries.
    """
    def __init__(self, Vocab vocab, rules=None, prefix_search=None,
-            suffix_search=None, infix_finditer=None, token_match=None):
+                 suffix_search=None, infix_finditer=None, token_match=None):
        """Create a `Tokenizer`, to create `Doc` objects given unicode text.

        vocab (Vocab): A storage container for lexical types.
@ -74,9 +73,8 @@ cdef class Tokenizer:
        RETURNS (Doc): A container for linguistic annotations.
        """
        if len(string) >= (2 ** 30):
-            raise ValueError(
-                "String is too long: %d characters. Max is 2**30." % len(string)
-            )
+            msg = "String is too long: %d characters. Max is 2**30."
+            raise ValueError(msg % len(string))
        cdef int length = len(string)
        cdef Doc doc = Doc(self.vocab)
        if length == 0:
@ -122,8 +120,8 @@ cdef class Tokenizer:
        """Tokenize a stream of texts.

        texts: A sequence of unicode texts.
-        batch_size (int): The number of texts to accumulate in an internal buffer.
-        n_threads (int): The number of threads to use, if the implementation
+        batch_size (int): Number of texts to accumulate in an internal buffer.
+        n_threads (int): Number of threads to use, if the implementation
            supports multi-threading. The default tokenizer is single-threaded.
        YIELDS (Doc): A sequence of Doc objects, in order.
        """
@ -232,8 +230,8 @@ cdef class Tokenizer:
                if not matches:
                    tokens.push_back(self.vocab.get(tokens.mem, string), False)
                else:
-                    # let's say we have dyn-o-mite-dave
-                    # the regex finds the start and end positions of the hyphens
+                    # let's say we have dyn-o-mite-dave - the regex finds the
+                    # start and end positions of the hyphens
                    start = 0
                    for match in matches:
                        infix_start = match.start()
@ -293,8 +291,8 @@ cdef class Tokenizer:
        return list(self.infix_finditer(string))

    def find_prefix(self, unicode string):
-        """Find the length of a prefix that should be segmented from the string,
-        or None if no prefix rules match.
+        """Find the length of a prefix that should be segmented from the
+        string, or None if no prefix rules match.

        string (unicode): The string to segment.
        RETURNS (int): The length of the prefix if present, otherwise `None`.
@ -305,8 +303,8 @@ cdef class Tokenizer:
        return (match.end() - match.start()) if match is not None else 0

    def find_suffix(self, unicode string):
-        """Find the length of a suffix that should be segmented from the string,
-        or None if no suffix rules match.
+        """Find the length of a suffix that should be segmented from the
+        string, or None if no suffix rules match.

        string (unicode): The string to segment.
        RETURNS (int): The length of the suffix if present, otherwise `None`.
@ -326,8 +324,8 @@ cdef class Tokenizer:

        string (unicode): The string to specially tokenize.
        token_attrs (iterable): A sequence of dicts, where each dict describes
-            a token and its attributes. The `ORTH` fields of the attributes must
-            exactly match the string when they are concatenated.
+            a token and its attributes. The `ORTH` fields of the attributes
+            must exactly match the string when they are concatenated.
        """
        substrings = list(substrings)
        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
@ -343,7 +341,7 @@ cdef class Tokenizer:
        """Save the current state to a directory.

        path (unicode or Path): A path to a directory, which will be created if
-            it doesn't exist. Paths may be either strings or `Path`-like objects.
+            it doesn't exist. Paths may be either strings or Path-like objects.
        """
        with path.open('wb') as file_:
            file_.write(self.to_bytes(**exclude))
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -476,7 +476,7 @@ cdef class Span:
        """
        # TODO: implement
        def __get__(self):
-            raise NotImplementedError()
+            raise NotImplementedError

    property n_rights:
        """RETURNS (int): The number of rightward immediate children of the
@ -484,7 +484,7 @@ cdef class Span:
        """
        # TODO: implement
        def __get__(self):
-            raise NotImplementedError()
+            raise NotImplementedError

    property subtree:
        """Tokens that descend from tokens in the span, but fall outside it.
--- a/spacy/typedefs.pyx
+++ b/spacy/typedefs.pyx
@ -1 +0,0 @@
-
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -17,8 +17,8 @@ from .compat import copy_reg, basestring_
 from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs
 from .vectors import Vectors
-from . import util
 from ._ml import link_vectors_to_models
+from . import util


 cdef class Vocab: