Tidy up and fix formatting and imports

2026-01-09 18:21:14 +03:00 · 2017-04-15 13:05:15 +02:00 · 2017-04-15 13:05:15 +02:00 · 0739ae7b76
commit 0739ae7b76
parent fefe6684cd
15 changed files with 251 additions and 230 deletions
--- a/spacy/compat.py
+++ b/spacy/compat.py
@ -3,7 +3,7 @@ from __future__ import unicode_literals

 import six
 import sys
-import json
+import ujson

 try:
    import cPickle as pickle
@ -28,14 +28,14 @@ if is_python2:
    unicode_ = unicode
    basestring_ = basestring
    input_ = raw_input
-    json_dumps = lambda data: json.dumps(data, indent=2).decode('utf8')
+    json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8')

 elif is_python3:
    bytes_ = bytes
    unicode_ = str
    basestring_ = str
    input_ = input
-    json_dumps = lambda data: json.dumps(data, indent=2)
+    json_dumps = lambda data: ujson.dumps(data, indent=2)


 def symlink_to(orig, dest):
--- a/spacy/deprecated.py
+++ b/spacy/deprecated.py
@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from pathlib import Path

 from . import about
--- a/spacy/syntax/_parse_features.pyx
+++ b/spacy/syntax/_parse_features.pyx
@ -7,17 +7,17 @@ out of "context") is in features/extractor.pyx
 The atomic feature names are listed in a big enum, so that the feature tuples
 can refer to them.
 """
-from libc.string cimport memset
+# coding: utf-8
+from __future__ import unicode_literals

+from libc.string cimport memset
 from itertools import combinations
+from cymem.cymem cimport Pool

 from ..structs cimport TokenC
-
 from .stateclass cimport StateClass
 from ._state cimport StateC

-from cymem.cymem cimport Pool
-

 cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
    if token is NULL:
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@ -1,29 +1,26 @@
 # cython: profile=True
 # cython: cdivision=True
 # cython: infer_types=True
+# coding: utf-8
 from __future__ import unicode_literals
+
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
-
 import ctypes
-import os
-
-from ..structs cimport TokenC
+from libc.stdint cimport uint32_t
+from libc.string cimport memcpy
+from cymem.cymem cimport Pool

+from .stateclass cimport StateClass
+from ._state cimport StateC, is_space_token
+from .nonproj import PseudoProjectivity
+from .nonproj import is_nonproj_tree
 from .transition_system cimport do_func_t, get_cost_func_t
 from .transition_system cimport move_cost_func_t, label_cost_func_t
 from ..gold cimport GoldParse
 from ..gold cimport GoldParseC
 from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE
 from ..lexeme cimport Lexeme
-
-from libc.stdint cimport uint32_t
-from libc.string cimport memcpy
-
-from cymem.cymem cimport Pool
-from .stateclass cimport StateClass
-from ._state cimport StateC, is_space_token
-from .nonproj import PseudoProjectivity
-from .nonproj import is_nonproj_tree
+from ..structs cimport TokenC


 DEF NON_MONOTONIC = True
--- a/spacy/syntax/beam_parser.pyx
+++ b/spacy/syntax/beam_parser.pyx
@ -1,50 +1,34 @@
+"""
+MALT-style dependency parser
+"""
 # cython: profile=True
 # cython: experimental_cpp_class_def=True
 # cython: cdivision=True
 # cython: infer_types=True
-"""
-MALT-style dependency parser
-"""
-from __future__ import unicode_literals
+# coding: utf-8
+
+from __future__ import unicode_literals, print_function
 cimport cython

 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
-
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport rand
 from libc.math cimport log, exp, isnan, isinf
-import random
-import os.path
-from os import path
-import shutil
-import json
-import math
-
 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport real_hash64 as hash64
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
-
-
-from util import Config
-
 from thinc.linear.features cimport ConjunctionExtracter
 from thinc.structs cimport FeatureC, ExampleC
-
-from thinc.extra.search cimport Beam
-from thinc.extra.search cimport MaxViolation
+from thinc.extra.search cimport Beam, MaxViolation
 from thinc.extra.eg cimport Example
 from thinc.extra.mb cimport Minibatch

 from ..structs cimport TokenC
-
 from ..tokens.doc cimport Doc
 from ..strings cimport StringStore
-
 from .transition_system cimport TransitionSystem, Transition
-
 from ..gold cimport GoldParse
-
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
@ -266,4 +250,3 @@ def is_gold(StateClass state, GoldParse gold, StringStore strings):
        id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]]
        truth.add((id_, head, dep))
    return truth == predicted
-
--- a/spacy/syntax/iterators.pyx
+++ b/spacy/syntax/iterators.pyx
@ -1,9 +1,14 @@
-from spacy.parts_of_speech cimport NOUN, PROPN, PRON
+# coding: utf-8
+from __future__ import unicode_literals
+
+from ..parts_of_speech cimport NOUN, PROPN, PRON


 def english_noun_chunks(obj):
-    '''Detect base noun phrases from a dependency parse.
-    Works on both Doc and Span.'''
+    """
+    Detect base noun phrases from a dependency parse.
+    Works on both Doc and Span.
+    """
    labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj',
              'attr', 'ROOT', 'root']
    doc = obj.doc # Ensure works on both Doc and Span.
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@ -1,17 +1,16 @@
+# coding: utf-8
 from __future__ import unicode_literals

-from .transition_system cimport Transition
-from .transition_system cimport do_func_t
-
-from ..structs cimport TokenC, Entity
-
 from thinc.typedefs cimport weight_t
-from ..gold cimport GoldParseC
-from ..gold cimport GoldParse
-from ..attrs cimport ENT_TYPE, ENT_IOB

 from .stateclass cimport StateClass
 from ._state cimport StateC
+from .transition_system cimport Transition
+from .transition_system cimport do_func_t
+from ..structs cimport TokenC, Entity
+from ..gold cimport GoldParseC
+from ..gold cimport GoldParse
+from ..attrs cimport ENT_TYPE, ENT_IOB


 cdef enum:
--- a/spacy/syntax/nonproj.pyx
+++ b/spacy/syntax/nonproj.pyx
@ -1,8 +1,9 @@
+# coding: utf-8
 from __future__ import unicode_literals
 from copy import copy

 from ..tokens.doc cimport Doc
-from spacy.attrs import DEP, HEAD
+from ..attrs import DEP, HEAD


 def ancestors(tokenid, heads):
@ -201,5 +202,3 @@ class PseudoProjectivity:
                filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts))
            filtered.append((raw_text, filtered_sents))
        return filtered
-
-
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@ -1,56 +1,44 @@
-# cython: infer_types=True
 """
 MALT-style dependency parser
 """
+# coding: utf-8
+# cython: infer_types=True
 from __future__ import unicode_literals
+
+from collections import Counter
+import ujson
+
 cimport cython
 cimport cython.parallel

 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals
-
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport malloc, calloc, free
-
-import os.path
-from collections import Counter
-from os import path
-import shutil
-import json
-import sys
-from .nonproj import PseudoProjectivity
-
-from cymem.cymem cimport Pool, Address
-from murmurhash.mrmr cimport hash64
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
 from thinc.linear.avgtron cimport AveragedPerceptron
 from thinc.linalg cimport VecVec
-from thinc.structs cimport SparseArrayC
+from thinc.structs cimport SparseArrayC, FeatureC, ExampleC
+from thinc.extra.eg cimport Example
+from cymem.cymem cimport Pool, Address
+from murmurhash.mrmr cimport hash64
 from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get

-from thinc.structs cimport FeatureC
-from thinc.structs cimport ExampleC
-from thinc.extra.eg cimport Example
-
-from util import Config
-
-from ..structs cimport TokenC
-
-from ..tokens.doc cimport Doc
-from ..strings cimport StringStore
-
-from .transition_system import OracleError
-from .transition_system cimport TransitionSystem, Transition
-
-from ..gold cimport GoldParse
-
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 from ._state cimport StateC
+from .nonproj import PseudoProjectivity
+from .transition_system import OracleError
+from .transition_system cimport TransitionSystem, Transition
+from ..structs cimport TokenC
+from ..tokens.doc cimport Doc
+from ..strings cimport StringStore
+from ..gold cimport GoldParse
+

 USE_FTRL = False
 DEBUG = False
@ -80,7 +68,9 @@ cdef class ParserModel(AveragedPerceptron):
        return nr_feat

    def update(self, Example eg, itn=0):
-        '''Does regression on negative cost. Sort of cute?'''
+        """
+        Does regression on negative cost. Sort of cute?
+        """
        self.time += 1
        cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class)
        cdef int guess = eg.guess
@ -132,10 +122,13 @@ cdef class ParserModel(AveragedPerceptron):


 cdef class Parser:
-    """Base class of the DependencyParser and EntityRecognizer."""
+    """
+    Base class of the DependencyParser and EntityRecognizer.
+    """
    @classmethod
    def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg):
-        """Load the statistical model from the supplied path.
+        """
+        Load the statistical model from the supplied path.

        Arguments:
            path (Path):
@ -148,7 +141,7 @@ cdef class Parser:
            The newly constructed object.
        """
        with (path / 'config.json').open() as file_:
-            cfg = json.load(file_)
+            cfg = ujson.load(file_)
        # TODO: remove this shim when we don't have to support older data
        if 'labels' in cfg and 'actions' not in cfg:
            cfg['actions'] = cfg.pop('labels')
@ -168,7 +161,8 @@ cdef class Parser:
        return self

    def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg):
-        """Create a Parser.
+        """
+        Create a Parser.

        Arguments:
            vocab (Vocab):
@ -198,7 +192,8 @@ cdef class Parser:
        return (Parser, (self.vocab, self.moves, self.model), None, None)

    def __call__(self, Doc tokens):
-        """Apply the entity recognizer, setting the annotations onto the Doc object.
+        """
+        Apply the entity recognizer, setting the annotations onto the Doc object.

        Arguments:
            doc (Doc): The document to be processed.
@ -215,7 +210,8 @@ cdef class Parser:
        self.moves.finalize_doc(tokens)

    def pipe(self, stream, int batch_size=1000, int n_threads=2):
-        """Process a stream of documents.
+        """
+        Process a stream of documents.

        Arguments:
            stream: The sequence of documents to process.
@ -303,7 +299,8 @@ cdef class Parser:
        return 0

    def update(self, Doc tokens, GoldParse gold, itn=0):
-        """Update the statistical model.
+        """
+        Update the statistical model.

        Arguments:
            doc (Doc):
@ -342,7 +339,8 @@ cdef class Parser:
        return loss

    def step_through(self, Doc doc, GoldParse gold=None):
-        """Set up a stepwise state, to introspect and control the transition sequence.
+        """
+        Set up a stepwise state, to introspect and control the transition sequence.

        Arguments:
            doc (Doc): The document to step through.
@ -426,7 +424,9 @@ cdef class StepwiseState:

    @property
    def costs(self):
-        '''Find the action-costs for the current state'''
+        """
+        Find the action-costs for the current state.
+        """
        self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs,
                self.stcls, self.gold)
        costs = {}
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@ -1,5 +1,9 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
 from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t
+
 from ..vocab cimport EMPTY_LEXEME
 from ..structs cimport Entity
 from ..lexeme cimport Lexeme
@ -28,6 +32,6 @@ cdef class StateClass:
        top = words[self.S(0)] + '_%d' % self.S_(0).head
        second = words[self.S(1)] + '_%d' % self.S_(1).head
        third = words[self.S(2)] + '_%d' % self.S_(2).head
-        n0 = words[self.B(0)] 
-        n1 = words[self.B(1)] 
+        n0 = words[self.B(0)]
+        n1 = words[self.B(1)]
        return ' '.join((third, second, top, '|', n0, n1))
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@ -1,4 +1,8 @@
 # cython: infer_types=True
+# coding: utf-8
+from __future__ import unicode_literals
+
+from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
 from cymem.cymem cimport Pool
 from thinc.typedefs cimport weight_t
 from collections import defaultdict
@ -6,7 +10,6 @@ from collections import defaultdict
 from ..structs cimport TokenC
 from .stateclass cimport StateClass
 from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB
-from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF


 cdef weight_t MIN_SCORE = -90000
--- a/spacy/syntax/util.py
+++ b/spacy/syntax/util.py
@ -1,18 +0,0 @@
-from os import path
-import json
-
-class Config(object):
-    def __init__(self, **kwargs):
-        for key, value in kwargs.items():
-            setattr(self, key, value)
-
-    def get(self, attr, default=None):
-        return self.__dict__.get(attr, default)
-
-    @classmethod
-    def write(cls, model_dir, name, **kwargs):
-        open(path.join(model_dir, '%s.json' % name), 'w').write(json.dumps(kwargs))
-
-    @classmethod
-    def read(cls, model_dir, name):
-        return cls(**json.load(open(path.join(model_dir, '%s.json' % name))))
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -1,15 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 cimport cython
+cimport numpy as np
+import numpy
+import numpy.linalg
+import struct
+
 from libc.string cimport memcpy, memset
 from libc.stdint cimport uint32_t
 from libc.math cimport sqrt

-import numpy
-import numpy.linalg
-import struct
-cimport numpy as np
-import six
-import warnings
-
+from .span cimport Span
+from .token cimport Token
 from ..lexeme cimport Lexeme
 from ..lexeme cimport EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
@ -19,11 +22,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN
 from ..parts_of_speech cimport univ_pos_t
 from ..lexeme cimport Lexeme
-from .span cimport Span
-from .token cimport Token
 from ..serialize.bits cimport BitArray
 from ..util import normalize_slice
 from ..syntax.iterators import CHUNKERS
+from ..compat import is_config


 DEF PADDING = 5
@ -76,7 +78,7 @@ cdef class Doc:

    """
    def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None):
-        '''
+        """
        Create a Doc object.

        Aside: Implementation
@ -97,7 +99,7 @@ cdef class Doc:
                A list of boolean values, of the same length as words. True
                means that the word is followed by a space, False means it is not.
                If None, defaults to [True]*len(words)
-        '''
+        """
        self.vocab = vocab
        size = 20
        self.mem = Pool()
@ -158,7 +160,7 @@ cdef class Doc:
            self.is_parsed = True

    def __getitem__(self, object i):
-        '''
+        """
        doc[i]
            Get the Token object at position i, where i is an integer.
            Negative indexing is supported, and follows the usual Python
@ -172,7 +174,7 @@ cdef class Doc:
            are not supported, as `Span` objects must be contiguous (cannot have gaps).
            You can use negative indices and open-ended ranges, which have their
            normal Python semantics.
-        '''
+        """
        if isinstance(i, slice):
            start, stop = normalize_slice(len(self), i.start, i.stop, i.step)
            return Span(self, start, stop, label=0)
@ -186,7 +188,7 @@ cdef class Doc:
            return Token.cinit(self.vocab, &self.c[i], i, self)

    def __iter__(self):
-        '''
+        """
        for token in doc
            Iterate over `Token`  objects, from which the annotations can
            be easily accessed. This is the main way of accessing Token
@ -194,7 +196,7 @@ cdef class Doc:
            Python. If faster-than-Python speeds are required, you can
            instead access the annotations as a numpy array, or access the
            underlying C data directly from Cython.
-        '''
+        """
        cdef int i
        for i in range(self.length):
            if self._py_tokens[i] is not None:
@ -203,10 +205,10 @@ cdef class Doc:
                yield Token.cinit(self.vocab, &self.c[i], i, self)

    def __len__(self):
-        '''
+        """
        len(doc)
            The number of tokens in the document.
-        '''
+        """
        return self.length

    def __unicode__(self):
@ -216,7 +218,7 @@ cdef class Doc:
        return u''.join([t.text_with_ws for t in self]).encode('utf-8')

    def __str__(self):
-        if six.PY3:
+        if is_config(python3=True):
            return self.__unicode__()
        return self.__bytes__()

@ -228,7 +230,8 @@ cdef class Doc:
        return self

    def similarity(self, other):
-        '''Make a semantic similarity estimate. The default estimate is cosine
+        """
+        Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.

        Arguments:
@ -237,7 +240,7 @@ cdef class Doc:

        Return:
            score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
        if 'similarity' in self.user_hooks:
            return self.user_hooks['similarity'](self, other)
        if self.vector_norm == 0 or other.vector_norm == 0:
@ -245,9 +248,9 @@ cdef class Doc:
        return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

    property has_vector:
-        '''
+        """
        A boolean value indicating whether a word vector is associated with the object.
-        '''
+        """
        def __get__(self):
            if 'has_vector' in self.user_hooks:
                return self.user_hooks['has_vector'](self)
@ -255,11 +258,11 @@ cdef class Doc:
            return any(token.has_vector for token in self)

    property vector:
-        '''
+        """
        A real-valued meaning representation. Defaults to an average of the token vectors.

        Type: numpy.ndarray[ndim=1, dtype='float32']
-        '''
+        """
        def __get__(self):
            if 'vector' in self.user_hooks:
                return self.user_hooks['vector'](self)
@ -294,17 +297,21 @@ cdef class Doc:
        return self.text

    property text:
-        '''A unicode representation of the document text.'''
+        """
+        A unicode representation of the document text.
+        """
        def __get__(self):
            return u''.join(t.text_with_ws for t in self)

    property text_with_ws:
-        '''An alias of Doc.text, provided for duck-type compatibility with Span and Token.'''
+        """
+        An alias of Doc.text, provided for duck-type compatibility with Span and Token.
+        """
        def __get__(self):
            return self.text

    property ents:
-        '''
+        """
        Yields named-entity `Span` objects, if the entity recognizer
        has been applied to the document. Iterate over the span to get
        individual Token objects, or access the label:
@ -318,7 +325,7 @@ cdef class Doc:
            assert ents[0].label_ == 'PERSON'
            assert ents[0].orth_ == 'Best'
            assert ents[0].text == 'Mr. Best'
-        '''
+        """
        def __get__(self):
            cdef int i
            cdef const TokenC* token
@ -382,13 +389,13 @@ cdef class Doc:
                    self.c[start].ent_iob = 3

    property noun_chunks:
-        '''
+        """
        Yields base noun-phrase #[code Span] objects, if the document
        has been syntactically parsed. A base noun phrase, or
        'NP chunk', is a noun phrase that does not permit other NPs to
        be nested within it – so no NP-level coordination, no prepositional
-        phrases, and no relative clauses. For example:
-        '''
+        phrases, and no relative clauses.
+        """
        def __get__(self):
            if not self.is_parsed:
                raise ValueError(
@ -496,7 +503,8 @@ cdef class Doc:
        return output

    def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
-        """Produce a dict of {attribute (int): count (ints)} frequencies, keyed
+        """
+        Produce a dict of {attribute (int): count (ints)} frequencies, keyed
        by the values of the given attribute ID.

        Example:
@ -563,8 +571,9 @@ cdef class Doc:
            self.c[i] = parsed[i]

    def from_array(self, attrs, array):
-        '''Write to a `Doc` object, from an `(M, N)` array of attributes.
-        '''
+        """
+        Write to a `Doc` object, from an `(M, N)` array of attributes.
+        """
        cdef int i, col
        cdef attr_id_t attr_id
        cdef TokenC* tokens = self.c
@ -603,19 +612,23 @@ cdef class Doc:
        return self

    def to_bytes(self):
-        '''Serialize, producing a byte string.'''
+        """
+        Serialize, producing a byte string.
+        """
        byte_string = self.vocab.serializer.pack(self)
        cdef uint32_t length = len(byte_string)
        return struct.pack('I', length) + byte_string

    def from_bytes(self, data):
-        '''Deserialize, loading from bytes.'''
+        """
+        Deserialize, loading from bytes.
+        """
        self.vocab.serializer.unpack_into(data[4:], self)
        return self

    @staticmethod
    def read_bytes(file_):
-        '''
+        """
        A static method, used to read serialized #[code Doc] objects from
        a file. For example:

@ -630,7 +643,7 @@ cdef class Doc:
                for byte_string in Doc.read_bytes(file_):
                    docs.append(Doc(nlp.vocab).from_bytes(byte_string))
            assert len(docs) == 2
-        '''
+        """
        keep_reading = True
        while keep_reading:
            try:
@ -644,7 +657,8 @@ cdef class Doc:
            yield n_bytes_str + data

    def merge(self, int start_idx, int end_idx, *args, **attributes):
-        """Retokenize the document, such that the span at doc.text[start_idx : end_idx]
+        """
+        Retokenize the document, such that the span at doc.text[start_idx : end_idx]
        is merged into a single token. If start_idx and end_idx do not mark start
        and end token boundaries, the document remains unchanged.

@ -658,7 +672,6 @@ cdef class Doc:
            token (Token):
                The newly merged token, or None if the start and end indices did
                not fall at token boundaries.
-
        """
        cdef unicode tag, lemma, ent_type
        if len(args) == 3:
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -1,26 +1,31 @@
+# coding: utf8
 from __future__ import unicode_literals
 from collections import defaultdict
+
+cimport numpy as np
 import numpy
 import numpy.linalg
-cimport numpy as np
 from libc.math cimport sqrt
-import six

+from .doc cimport token_by_start, token_by_end
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
 from ..parts_of_speech cimport univ_pos_t
 from ..util import normalize_slice
-from .doc cimport token_by_start, token_by_end
 from ..attrs cimport IS_PUNCT, IS_SPACE
 from ..lexeme cimport Lexeme
+from ..compat import is_config


 cdef class Span:
-    """A slice from a Doc object."""
+    """
+    A slice from a Doc object.
+    """
    def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None,
                  vector_norm=None):
-        '''Create a Span object from the slice doc[start : end]
+        """
+        Create a Span object from the slice doc[start : end]

        Arguments:
            doc (Doc): The parent document.
@ -30,7 +35,7 @@ cdef class Span:
            vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
        Returns:
            Span The newly constructed object.
-        '''
+        """
        if not (0 <= start <= end <= len(doc)):
            raise IndexError

@ -68,7 +73,7 @@ cdef class Span:
        return self.end - self.start

    def __repr__(self):
-        if six.PY3:
+        if is_config(python3=True):
            return self.text
        return self.text.encode('utf-8')

@ -89,7 +94,8 @@ cdef class Span:
            yield self.doc[i]

    def merge(self, *args, **attributes):
-        """Retokenize the document, such that the span is merged into a single token.
+        """
+        Retokenize the document, such that the span is merged into a single token.

        Arguments:
            **attributes:
@ -102,7 +108,8 @@ cdef class Span:
        return self.doc.merge(self.start_char, self.end_char, *args, **attributes)

    def similarity(self, other):
-        '''Make a semantic similarity estimate. The default estimate is cosine
+        """
+        Make a semantic similarity estimate. The default estimate is cosine
        similarity using an average of word vectors.

        Arguments:
@ -111,7 +118,7 @@ cdef class Span:

        Return:
            score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
        if 'similarity' in self.doc.user_span_hooks:
            self.doc.user_span_hooks['similarity'](self, other)
        if self.vector_norm == 0.0 or other.vector_norm == 0.0:
@ -133,11 +140,12 @@ cdef class Span:
            self.end = end + 1

    property sent:
-        '''The sentence span that this span is a part of.
+        """
+        The sentence span that this span is a part of.

        Returns:
            Span The sentence this is part of.
-        '''
+        """
        def __get__(self):
            if 'sent' in self.doc.user_span_hooks:
                return self.doc.user_span_hooks['sent'](self)
@ -198,13 +206,13 @@ cdef class Span:
            return u''.join([t.text_with_ws for t in self])

    property noun_chunks:
-        '''
+        """
        Yields base noun-phrase #[code Span] objects, if the document
        has been syntactically parsed. A base noun phrase, or
        'NP chunk', is a noun phrase that does not permit other NPs to
        be nested within it – so no NP-level coordination, no prepositional
        phrases, and no relative clauses. For example:
-        '''
+        """
        def __get__(self):
            if not self.doc.is_parsed:
                raise ValueError(
@ -223,17 +231,16 @@ cdef class Span:
                yield span

    property root:
-        """The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered.
+        """
+        The token within the span that's highest in the parse tree. If there's a
+        tie, the earliest is preferred.

        Returns:
            Token: The root token.

-        i.e. has the
-        shortest path to the root of the sentence (or is the root itself).
-
-        If multiple words are equally high in the tree, the first word is taken.
-
-        For example:
+        i.e. the token that has the shortest path to the root of the sentence
+        (or is the root itself). If multiple words are equally high in the
+        tree, the first word is taken. For example:

        >>> toks = nlp(u'I like New York in Autumn.')

@ -303,7 +310,8 @@ cdef class Span:
                return self.doc[root]

    property lefts:
-        """Tokens that are to the left of the span, whose head is within the Span.
+        """
+        Tokens that are to the left of the span, whose head is within the Span.

        Yields: Token A left-child of a token of the span.
        """
@ -314,7 +322,8 @@ cdef class Span:
                        yield left

    property rights:
-        """Tokens that are to the right of the Span, whose head is within the Span.
+        """
+        Tokens that are to the right of the Span, whose head is within the Span.

        Yields: Token A right-child of a token of the span.
        """
@ -325,7 +334,8 @@ cdef class Span:
                        yield right

    property subtree:
-        """Tokens that descend from tokens in the span, but fall outside it.
+        """
+        Tokens that descend from tokens in the span, but fall outside it.

        Yields: Token A descendant of a token within the span.
        """
@ -337,7 +347,9 @@ cdef class Span:
                yield from word.subtree

    property ent_id:
-        '''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        An (integer) entity ID. Usually assigned by patterns in the Matcher.
+        """
        def __get__(self):
            return self.root.ent_id

@ -345,9 +357,11 @@ cdef class Span:
            # TODO
            raise NotImplementedError(
                "Can't yet set ent_id from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/spacy-io/spaCy")
+                "tracker: http://github.com/explosion/spaCy/issues")
    property ent_id_:
-        '''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        A (string) entity ID. Usually assigned by patterns in the Matcher.
+        """
        def __get__(self):
            return self.root.ent_id_

@ -355,7 +369,7 @@ cdef class Span:
            # TODO
            raise NotImplementedError(
                "Can't yet set ent_id_ from Span. Vote for this feature on the issue "
-                "tracker: http://github.com/spacy-io/spaCy")
+                "tracker: http://github.com/explosion/spaCy/issues")

    property orth_:
        def __get__(self):
@ -397,5 +411,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
            raise RuntimeError(
                "Array bounds exceeded while searching for root word. This likely "
                "means the parse tree is in an invalid state. Please report this "
-                "issue here: http://github.com/honnibal/spaCy/")
+                "issue here: http://github.com/explosion/spaCy/issues")
    return n
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -1,5 +1,5 @@
-# coding: utf8
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals

 from libc.string cimport memcpy
@ -8,20 +8,15 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free
 from cython.view cimport array as cvarray
 cimport numpy as np
 np.import_array()
-
 import numpy
-import six
-

 from ..typedefs cimport hash_t
 from ..lexeme cimport Lexeme
 from .. import parts_of_speech
-
 from ..attrs cimport LEMMA
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP
 from ..parts_of_speech cimport CCONJ, PUNCT
-
 from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from ..attrs cimport IS_BRACKET
 from ..attrs cimport IS_QUOTE
@ -29,12 +24,13 @@ from ..attrs cimport IS_LEFT_PUNCT
 from ..attrs cimport IS_RIGHT_PUNCT
 from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from ..attrs cimport IS_OOV
-
 from ..lexeme cimport Lexeme
+from ..compat import is_config


 cdef class Token:
-    """An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
+    """
+    An individual token --- i.e. a word, punctuation symbol, whitespace, etc.
    """
    def __cinit__(self, Vocab vocab, Doc doc, int offset):
        self.vocab = vocab
@ -46,7 +42,9 @@ cdef class Token:
        return hash((self.doc, self.i))

    def __len__(self):
-        '''Number of unicode characters in token.text'''
+        """
+        Number of unicode characters in token.text.
+        """
        return self.c.lex.length

    def __unicode__(self):
@ -56,7 +54,7 @@ cdef class Token:
        return self.text.encode('utf8')

    def __str__(self):
-        if six.PY3:
+        if is_config(python3=True):
            return self.__unicode__()
        return self.__bytes__()

@ -83,27 +81,30 @@ cdef class Token:
            raise ValueError(op)

    cpdef bint check_flag(self, attr_id_t flag_id) except -1:
-        '''Check the value of a boolean flag.
+        """
+        Check the value of a boolean flag.

        Arguments:
            flag_id (int): The ID of the flag attribute.
        Returns:
            is_set (bool): Whether the flag is set.
-        '''
+        """
        return Lexeme.c_check_flag(self.c.lex, flag_id)

    def nbor(self, int i=1):
-        '''Get a neighboring token.
+        """
+        Get a neighboring token.

        Arguments:
            i (int): The relative position of the token to get. Defaults to 1.
        Returns:
            neighbor (Token): The token at position self.doc[self.i+i]
-        '''
+        """
        return self.doc[self.i+i]

    def similarity(self, other):
-        '''Compute a semantic similarity estimate. Defaults to cosine over vectors.
+        """
+        Compute a semantic similarity estimate. Defaults to cosine over vectors.

        Arguments:
            other:
@ -111,7 +112,7 @@ cdef class Token:
                Token and Lexeme objects.
        Returns:
            score (float): A scalar similarity score. Higher is more similar.
-        '''
+        """
        if 'similarity' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['similarity'](self)
        if self.vector_norm == 0 or other.vector_norm == 0:
@ -209,9 +210,9 @@ cdef class Token:
            self.c.dep = label

    property has_vector:
-        '''
+        """
        A boolean value indicating whether a word vector is associated with the object.
-        '''
+        """
        def __get__(self):
            if 'has_vector' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['has_vector'](self)
@ -223,11 +224,11 @@ cdef class Token:
                return False

    property vector:
-        '''
+        """
        A real-valued meaning representation.

        Type: numpy.ndarray[ndim=1, dtype='float32']
-        '''
+        """
        def __get__(self):
            if 'vector' in self.doc.user_token_hooks:
                return self.doc.user_token_hooks['vector'](self)
@ -245,6 +246,7 @@ cdef class Token:
    property repvec:
        def __get__(self):
            raise AttributeError("repvec was renamed to vector in v0.100")
+
    property has_repvec:
        def __get__(self):
            raise AttributeError("has_repvec was renamed to has_vector in v0.100")
@ -265,7 +267,8 @@ cdef class Token:

    property lefts:
        def __get__(self):
-            """The leftward immediate children of the word, in the syntactic
+            """
+            The leftward immediate children of the word, in the syntactic
            dependency parse.
            """
            cdef int nr_iter = 0
@ -282,8 +285,10 @@ cdef class Token:

    property rights:
        def __get__(self):
-            """The rightward immediate children of the word, in the syntactic
-            dependency parse."""
+            """
+            The rightward immediate children of the word, in the syntactic
+            dependency parse.
+            """
            cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i)
            tokens = []
            cdef int nr_iter = 0
@ -300,19 +305,21 @@ cdef class Token:
                yield t

    property children:
-        '''A sequence of the token's immediate syntactic children.
+        """
+        A sequence of the token's immediate syntactic children.

        Yields: Token A child token such that child.head==self
-        '''
+        """
        def __get__(self):
            yield from self.lefts
            yield from self.rights

    property subtree:
-        '''A sequence of all the token's syntactic descendents.
+        """
+        A sequence of all the token's syntactic descendents.

        Yields: Token A descendent token such that self.is_ancestor(descendent)
-        '''
+        """
        def __get__(self):
            for word in self.lefts:
                yield from word.subtree
@ -321,26 +328,29 @@ cdef class Token:
                yield from word.subtree

    property left_edge:
-        '''The leftmost token of this token's syntactic descendents.
+        """
+        The leftmost token of this token's syntactic descendents.

        Returns: Token The first token such that self.is_ancestor(token)
-        '''
+        """
        def __get__(self):
            return self.doc[self.c.l_edge]

    property right_edge:
-        '''The rightmost token of this token's syntactic descendents.
+        """
+        The rightmost token of this token's syntactic descendents.

        Returns: Token The last token such that self.is_ancestor(token)
-        '''
+        """
        def __get__(self):
            return self.doc[self.c.r_edge]

    property ancestors:
-        '''A sequence of this token's syntactic ancestors.
+        """
+        A sequence of this token's syntactic ancestors.

        Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self)
-        '''
+        """
        def __get__(self):
            cdef const TokenC* head_ptr = self.c
            # guard against infinite loop, no token can have
@ -356,25 +366,29 @@ cdef class Token:
        return self.is_ancestor(descendant)

    def is_ancestor(self, descendant):
-        '''Check whether this token is a parent, grandparent, etc. of another
+        """
+        Check whether this token is a parent, grandparent, etc. of another
        in the dependency tree.

        Arguments:
            descendant (Token): Another token.
        Returns:
            is_ancestor (bool): Whether this token is the ancestor of the descendant.
-        '''
+        """
        if self.doc is not descendant.doc:
            return False
        return any( ancestor.i == self.i for ancestor in descendant.ancestors )

    property head:
-        '''The syntactic parent, or "governor", of this token.
+        """
+        The syntactic parent, or "governor", of this token.

        Returns: Token
-        '''
+        """
        def __get__(self):
-            """The token predicted by the parser to be the head of the current token."""
+            """
+            The token predicted by the parser to be the head of the current token.
+            """
            return self.doc[self.i + self.c.head]
        def __set__(self, Token new_head):
            # this function sets the head of self to new_head
@ -467,10 +481,11 @@ cdef class Token:
            self.c.head = rel_newhead_i

    property conjuncts:
-        '''A sequence of coordinated tokens, including the token itself.
+        """
+        A sequence of coordinated tokens, including the token itself.

        Yields: Token A coordinated token
-        '''
+        """
        def __get__(self):
            """Get a list of conjoined words."""
            cdef Token word
@ -501,7 +516,9 @@ cdef class Token:
            return iob_strings[self.c.ent_iob]

    property ent_id:
-        '''An (integer) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        An (integer) entity ID. Usually assigned by patterns in the Matcher.
+        """
        def __get__(self):
            return self.c.ent_id

@ -509,7 +526,9 @@ cdef class Token:
            self.c.ent_id = key

    property ent_id_:
-        '''A (string) entity ID. Usually assigned by patterns in the Matcher.'''
+        """
+        A (string) entity ID. Usually assigned by patterns in the Matcher.
+        """
        def __get__(self):
            return self.vocab.strings[self.c.ent_id]