Mirror of https://github.com/explosion/spaCy.git

Commit: d24589aa72
Parent: 561f2a3eb4

Clean up imports, unused code, whitespace, docstrings
@@ -1,27 +1,13 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals

 import json
 from pathlib import Path

 from .util import set_lang_class, get_lang_class, parse_package_meta
 from .deprecated import resolve_model_name
 from .cli import info

-from . import en
-from . import de
-from . import zh
-from . import es
-from . import it
-from . import hu
-from . import fr
-from . import pt
-from . import nl
-from . import sv
-from . import fi
-from . import bn
-from . import he
-
 from .about import *
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he


 set_lang_class(en.English.lang, en.English)
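Note: the consolidated import above feeds spaCy's language registry; each module's Language subclass is registered under its language ID (see the LANGUAGES dict and set_lang_class in the util.py hunks below). A minimal sketch of that registry pattern, simplified from what this diff shows (the lookup error handling is assumed, not shown here):

LANGUAGES = {}

def set_lang_class(name, cls):
    # 'name' is the language ID, e.g. 'en'; 'cls' is the Language subclass
    LANGUAGES[name] = cls

def get_lang_class(name):
    # failure behaviour is an assumption for illustration
    if name not in LANGUAGES:
        raise RuntimeError('Language not supported: %s' % name)
    return LANGUAGES[name]

# registered once per language at import time, as in the hunk above:
# set_lang_class(en.English.lang, en.English)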
@@ -1,3 +1,7 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
 IDS = {
     "": NULL_ATTR,
     "IS_ALPHA": IS_ALPHA,
@@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy
 
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import io
 from pathlib import Path
 
 from .converters import conllu2json
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 
 import json
 from ...gold import read_json_file, merge_sents
 from ... import util
 
-
@@ -1,7 +1,6 @@
+# coding: utf8
 from __future__ import unicode_literals
 
 import pip
 import requests
-import os
-import subprocess
 
@@ -5,8 +5,6 @@ import json
 from pathlib import Path
 
 from ..scorer import Scorer
-from ..tagger import Tagger
-from ..syntax.parser import Parser
 from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json
 from .. import util
@@ -60,7 +58,6 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
     print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
 
     with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
-        loss = 0
         for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
             for doc, gold in epoch:
                 trainer.update(doc, gold)
@@ -1,4 +1,5 @@
+from pathlib import Path
 
 from . import about
 from . import util
 from .cli import download
@@ -11,12 +11,6 @@ from ..deprecated import fix_glove_vectors_loading
 from .language_data import *
 
 
-try:
-    basestring
-except NameError:
-    basestring = str
-
-
 class English(Language):
     lang = 'en'
 
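The block removed above is the standard Python 2/3 compatibility shim: Python 3 has no basestring type, so the NameError is caught and the name is aliased to str. For reference, the idiom in isolation:

try:
    basestring          # defined on Python 2 only
except NameError:
    basestring = str    # Python 3: all strings are unicode str

# afterwards, isinstance(x, basestring) works on both major versions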
@@ -1,13 +1,11 @@
 # cython: profile=True
+# coding: utf8
 from __future__ import unicode_literals, print_function
 
 import io
-import json
 import re
-import os
-from os import path
-
-import ujson as json
+import ujson
+from pathlib import Path
 
 from .syntax import nonproj
 
@@ -303,7 +301,8 @@ cdef class GoldParse:
         self.heads = proj_heads
 
     def __len__(self):
-        """Get the number of gold-standard tokens.
+        """
+        Get the number of gold-standard tokens.
 
         Returns (int): The number of gold-standard tokens.
         """
@@ -1,9 +1,7 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
-import pathlib
+# coding: utf8
+from __future__ import absolute_import, unicode_literals
 from contextlib import contextmanager
 import shutil
-
 import ujson
 
 
@@ -21,19 +19,18 @@ from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .tagger import Tagger
 from .matcher import Matcher
-from . import attrs
-from . import orth
-from . import util
-from . import language_data
 from .lemmatizer import Lemmatizer
 from .train import Trainer
-
+from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
-from .attrs import IS_STOP
+from . import attrs
+from . import orth
+from . import util
+from . import language_data
 
 
 class BaseDefaults(object):
@@ -150,25 +147,15 @@ class BaseDefaults(object):
         return pipeline
 
     token_match = language_data.TOKEN_MATCH
-
     prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-
     suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-
     infixes = tuple(language_data.TOKENIZER_INFIXES)
-
     tag_map = dict(language_data.TAG_MAP)
-
     tokenizer_exceptions = {}
-
     parser_features = get_templates('parser')
-
     entity_features = get_templates('ner')
-
     tagger_features = Tagger.feature_templates # TODO -- fix this
-
     stop_words = set()
-
     lemma_rules = {}
     lemma_exc = {}
     lemma_index = {}
@@ -313,7 +300,8 @@ class Language(object):
         self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]
 
     def __call__(self, text, tag=True, parse=True, entity=True):
-        """Apply the pipeline to some text. The text can span multiple sentences,
+        """
+        Apply the pipeline to some text. The text can span multiple sentences,
         and can contain arbtrary whitespace. Alignment into the original string
         is preserved.
 
@@ -373,7 +361,7 @@ class Language(object):
         }
 
         self.setup_directory(path, **configs)
-
+
         strings_loc = path / 'vocab' / 'strings.json'
         with strings_loc.open('w', encoding='utf8') as file_:
             self.vocab.strings.dump(file_)
@@ -397,4 +385,3 @@ class Language(object):
             # to taking nlp.path
             if path is not None:
                 self.save_to_directory(path)
-
@@ -1,13 +1,8 @@
-from __future__ import unicode_literals, print_function
-import codecs
-import pathlib
-
-import ujson as json
+# coding: utf8
+from __future__ import unicode_literals
 
 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
-from .symbols import VerbForm_inf, VerbForm_none
-from .symbols import Number_sing
-from .symbols import Degree_pos
+from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
 
 
 class Lemmatizer(object):
@@ -1,4 +1,7 @@
 # cython: embedsignature=True
+# coding: utf8
+from __future__ import unicode_literals, print_function
+
 from libc.math cimport sqrt
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
@@ -9,14 +12,11 @@ from cython.view cimport array as cvarray
 cimport numpy as np
 np.import_array()
 
-
-
 from libc.string cimport memset
+import numpy
 
 from .orth cimport word_shape
 from .typedefs cimport attr_t, flags_t
-import numpy
-
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from .attrs cimport IS_BRACKET
@@ -1,5 +1,6 @@
 # cython: profile=True
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals
 
 from .typedefs cimport attr_t
@@ -164,7 +165,7 @@ def _convert_strings(token_specs, string_store):
 def merge_phrase(matcher, doc, i, matches):
     '''Callback to merge a phrase on match'''
     ent_id, label, start, end = matches[i]
-    span = doc[start : end]
+    span = doc[start : end]
     span.merge(ent_type=label, ent_id=ent_id)
 
 
@@ -1,13 +1,9 @@
 # cython: infer_types
+# coding: utf8
 from __future__ import unicode_literals
 
 from libc.string cimport memset
 
-try:
-    import ujson as json
-except ImportError:
-    import json
-
 from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
 from .attrs cimport POS, IS_SPACE
 from .parts_of_speech import IDS as POS_IDS
@@ -1,8 +0,0 @@
-class RegexMerger(object):
-    def __init__(self, regexes):
-        self.regexes = regexes
-
-    def __call__(self, tokens):
-        for tag, entity_type, regex in self.regexes:
-            for m in regex.finditer(tokens.string):
-                tokens.merge(m.start(), m.end(), tag, m.group(), entity_type)
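The file deleted above held a small regex-driven token merger. A hypothetical usage sketch (the rule values and the tokens object are illustrative only, not taken from spaCy):

import re

# each rule is a (tag, entity_type, regex) triple; every regex match
# in the document's text gets merged into a single token
merger = RegexMerger([
    ('NNP', 'PERSON', re.compile(r'Dr\.\s\w+')),
])
merger(tokens)  # 'tokens' is a parsed Doc-like object with .string and .merge()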
@@ -1,6 +1,7 @@
-# coding: utf8
+# cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals
 
 import unicodedata
 import re
 
@@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals
 
 
@@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from .syntax.parser cimport Parser
 from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown
@@ -36,7 +39,7 @@ cdef class BeamEntityRecognizer(BeamParser):
     TransitionSystem = BiluoPushDown
 
     feature_templates = get_feature_templates('ner')
-
+
     def add_label(self, label):
         Parser.add_label(self, label)
         if isinstance(label, basestring):
@@ -1,6 +1,5 @@
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
+# coding: utf8
+from __future__ import division, print_function, unicode_literals
 
 from .gold import tags_to_entities
 
@@ -1,12 +1,11 @@
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals, absolute_import
 
 cimport cython
 from libc.string cimport memcpy
 from libc.stdint cimport uint64_t, uint32_t
-
 from murmurhash.mrmr cimport hash64, hash32
-
 from preshed.maps cimport map_iter, key_t
 
 from .typedefs cimport hash_t
@@ -154,11 +153,11 @@ cdef class StringStore:
             raise TypeError(type(string_or_id))
         utf8str = self._intern_utf8(byte_string, len(byte_string))
         if utf8str is NULL:
-            # TODO: We need to use 32 bit here, for compatibility with the
+            # TODO: We need to use 32 bit here, for compatibility with the
             # vocabulary values. This makes birthday paradox probabilities
            # pretty bad.
             # We could also get unlucky here, and hash into a value that
-            # collides with the 'real' strings.
+            # collides with the 'real' strings.
             return hash32_utf8(byte_string, len(byte_string))
         else:
             return utf8str - self.c
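The TODO above concerns out-of-vocabulary strings: when a byte string was never interned, the store falls back to a 32-bit hash, which can collide with a real table index or with another hash. A plain-Python illustration of that trade-off, with zlib.crc32 standing in for the C-level hash32_utf8 (an assumption for illustration):

import zlib

def resolve(index_by_string, byte_string):
    # interned strings resolve to their stable table index
    if byte_string in index_by_string:
        return index_by_string[byte_string]
    # unknown strings fall back to a 32-bit hash; across many distinct
    # strings, birthday-paradox collisions become non-negligible
    return zlib.crc32(byte_string) & 0xffffffff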
@@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals
 
 IDS = {
@@ -1,5 +1,7 @@
-import json
-import pathlib
+# coding: utf8
+from __future__ import unicode_literals
 
 from collections import defaultdict
+
 from cymem.cymem cimport Pool
+
@@ -12,7 +14,6 @@ from thinc.linalg cimport VecVec
 from .tokens.doc cimport Doc
-from .attrs cimport TAG
 from .gold cimport GoldParse
 
 from .attrs cimport *
 
 
@@ -1,7 +1,7 @@
 # cython: embedsignature=True
+# coding: utf8
 from __future__ import unicode_literals
 
 import pathlib
-
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
@@ -111,7 +111,7 @@ cdef class Tokenizer:
                      self.token_match)
 
         return (self.__class__, args, None, None)
-
+
     cpdef Doc tokens_from_list(self, list strings):
         return Doc(self.vocab, words=strings)
         #raise NotImplementedError(
@@ -276,7 +276,7 @@ cdef class Tokenizer:
         cache_hit = self._try_cache(hash_string(string), tokens)
         if cache_hit:
             pass
-        elif self.token_match and self.token_match(string):
+        elif self.token_match and self.token_match(string):
             # We're always saying 'no' to spaces here -- the caller will
             # fix up the outermost one, with reference to the original.
             # See Issue #859
@@ -377,7 +377,7 @@ cdef class Tokenizer:
         """
         for chunk, substrings in sorted(special_cases.items()):
             self.add_special_case(chunk, substrings)
-
+
     def add_special_case(self, unicode string, substrings):
         """
         Add a special-case tokenization rule.
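For orientation, a hedged usage sketch of add_special_case (the exact attribute keys per substring vary across spaCy versions; ORTH-only dicts are assumed here, not taken from this diff):

from spacy.attrs import ORTH

# split "don't" into two tokens; each dict describes one substring
tokenizer.add_special_case(u"don't", [{ORTH: u"do"}, {ORTH: u"n't"}])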
@@ -1,11 +1,10 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
+# coding: utf8
+from __future__ import absolute_import, unicode_literals
 
 import random
 import tqdm
-from .gold import GoldParse
+from .gold import GoldParse, merge_sents
 from .scorer import Scorer
-from .gold import merge_sents
 
 
 class Trainer(object):
@@ -1,11 +1,10 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function
 import os
 
 import io
 import json
 import re
-import os.path
-import pathlib
+from pathlib import Path
 import sys
 import textwrap
@@ -23,7 +22,7 @@ except NameError: # Python 3
 
 
 LANGUAGES = {}
-_data_path = pathlib.Path(__file__).parent / 'data'
+_data_path = Path(__file__).parent / 'data'
 
 
 def set_lang_class(name, cls):
@@ -163,8 +162,8 @@ def is_python2():
 
 
 def parse_package_meta(package_path, package, require=True):
-    location = os.path.join(str(package_path), package, 'meta.json')
-    if os.path.isfile(location):
+    location = package_path / package / 'meta.json'
+    if location.is_file():
         with io.open(location, encoding='utf8') as f:
             meta = json.load(f)
             return meta
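The rewrite above is part of this commit's wider os.path-to-pathlib migration (compare the _data_path hunk earlier): '/' composes paths and is_file() replaces os.path.isfile. A standalone sketch of the idiom, with the function name assumed for illustration:

import io
import json
from pathlib import Path

def read_meta(package_path, package):
    # Path objects compose with '/' and can test for existence directly
    location = Path(package_path) / package / 'meta.json'
    if location.is_file():
        with io.open(str(location), encoding='utf8') as f:
            return json.load(f)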
@@ -209,10 +208,9 @@ def print_markdown(data, **kwargs):
     which will be converted to a list of tuples."""
 
     def excl_value(value):
-        # don't print value if it contains absolute path of directory
-        # (i.e. personal info that shouldn't need to be shared)
-        # other conditions can be included here if necessary
-        if str(pathlib.Path(__file__).parent) in value:
+        # don't print value if it contains absolute path of directory (i.e.
+        # personal info). Other conditions can be included here if necessary.
+        if unicode_(Path(__file__).parent) in value:
             return True
 
     if type(data) == dict:
@@ -1,10 +1,6 @@
+# coding: utf8
 from __future__ import unicode_literals
 
-from libc.string cimport memset
-from libc.stdint cimport int32_t
-from libc.math cimport sqrt
-
 from pathlib import Path
 import bz2
 import ujson as json
-import re
@@ -14,28 +10,28 @@ try:
 except ImportError:
     import pickle
 
+from libc.string cimport memset
+from libc.stdint cimport int32_t
+from libc.math cimport sqrt
+from cymem.cymem cimport Address
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
 from .typedefs cimport attr_t
 from .cfile cimport CFile, StringCFile
+from .lemmatizer import Lemmatizer
+from .attrs import intify_attrs
 from .tokens.token cimport Token
 
+from . import attrs
+from . import symbols
+
-from cymem.cymem cimport Address
 from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
+from . import util
 
 
-try:
-    import copy_reg
-except ImportError:
-    import copyreg as copy_reg
-from .lemmatizer import Lemmatizer
-from .attrs import intify_attrs
-from . import util
-from . import attrs
-from . import symbols
 
 
 DEF MAX_VEC_SIZE = 100000