Clean up imports, unused code, whitespace, docstrings

ines 2017-04-15 12:05:47 +02:00
parent 561f2a3eb4
commit d24589aa72
27 changed files with 77 additions and 127 deletions

View File

@@ -1,27 +1,13 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
-import json
+from __future__ import unicode_literals

 from pathlib import Path

 from .util import set_lang_class, get_lang_class, parse_package_meta
 from .deprecated import resolve_model_name
 from .cli import info
-from . import en
-from . import de
-from . import zh
-from . import es
-from . import it
-from . import hu
-from . import fr
-from . import pt
-from . import nl
-from . import sv
-from . import fi
-from . import bn
-from . import he
-from .about import *
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he

 set_lang_class(en.English.lang, en.English)
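The consolidated one-line import feeds the language registry defined in util.py (visible as the `LANGUAGES = {}` and `def set_lang_class(name, cls):` context lines further down this commit). A minimal sketch of how such a registry pair can work — the `get_lang_class` body and error message here are illustrative, not copied from spaCy:

```python
# Illustrative registry sketch; spaCy's real versions live in spacy/util.py.
LANGUAGES = {}

def set_lang_class(name, cls):
    # Register a Language subclass under its ISO code, e.g. 'en' or 'de'.
    LANGUAGES[name] = cls

def get_lang_class(name):
    # Resolve a code back to its class; unknown codes fail loudly.
    if name not in LANGUAGES:
        raise RuntimeError('Language not supported: %s' % name)
    return LANGUAGES[name]
```

Registering every class eagerly at import time is what lets later lookups by language code stay a plain dict access.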

View File

@@ -1,3 +1,7 @@
+# coding: utf8
+from __future__ import unicode_literals

 IDS = {
     "": NULL_ATTR,
     "IS_ALPHA": IS_ALPHA,

View File

@@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals

 from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy

View File

@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-import io
 from pathlib import Path

 from .converters import conllu2json

View File

@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 import json
-from ...gold import read_json_file, merge_sents
 from ... import util

View File

@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-import pip
 import requests
 import os
 import subprocess

View File

@@ -5,8 +5,6 @@ import json
 from pathlib import Path

 from ..scorer import Scorer
-from ..tagger import Tagger
-from ..syntax.parser import Parser
 from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json
 from .. import util
@@ -60,7 +58,6 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
     print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")

     with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
-        loss = 0
         for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
             for doc, gold in epoch:
                 trainer.update(doc, gold)
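The removed `loss = 0` was dead code: nothing in the loop read or updated it. The loop that remains is the whole training protocol — `epochs()` yields one iterable of `(doc, gold)` pairs per pass, and `update()` consumes them. A toy, runnable sketch of that protocol (the class internals are invented for illustration, only the interface mirrors the diff):

```python
import random

class ToyTrainer(object):
    """Stand-in with the same interface as the trainer used above."""
    def __init__(self, train_data):
        self.train_data = list(train_data)

    def epochs(self, n_iter, augment_data=None):
        # One epoch = one freshly shuffled pass over the training data.
        for _ in range(n_iter):
            data = list(self.train_data)
            random.shuffle(data)
            yield data

    def update(self, doc, gold):
        # A real trainer would compute a loss here and adjust weights.
        pass

trainer = ToyTrainer([('doc1', 'gold1'), ('doc2', 'gold2')])
for itn, epoch in enumerate(trainer.epochs(5)):
    for doc, gold in epoch:
        trainer.update(doc, gold)
```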

View File

@@ -1,4 +1,5 @@
 from pathlib import Path
+
 from . import about
 from . import util
 from .cli import download

View File

@@ -11,12 +11,6 @@ from ..deprecated import fix_glove_vectors_loading
 from .language_data import *

-try:
-    basestring
-except NameError:
-    basestring = str

 class English(Language):
     lang = 'en'
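The deleted `try`/`except NameError` block was the stock Python 2/3 `basestring` shim, which only earns its keep if later code type-checks strings. A small self-contained example of the pattern it enabled (the `is_string` helper is hypothetical, not from this file):

```python
try:
    basestring            # Python 2: str and unicode both subclass this
except NameError:
    basestring = str      # Python 3: plain str covers every case

def is_string(value):
    # Hypothetical helper: works identically on Python 2 and 3.
    return isinstance(value, basestring)

assert is_string('hello') and not is_string(42)
```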

View File

@@ -1,13 +1,11 @@
 # cython: profile=True
+# coding: utf8
 from __future__ import unicode_literals, print_function

 import io
-import json
 import re
-import os
-from os import path
-import ujson as json
+import ujson
+from pathlib import Path

 from .syntax import nonproj
@@ -303,7 +301,8 @@ cdef class GoldParse:
         self.heads = proj_heads

     def __len__(self):
-        """Get the number of gold-standard tokens.
+        """
+        Get the number of gold-standard tokens.

         Returns (int): The number of gold-standard tokens.
         """

View File

@@ -1,9 +1,7 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
-import pathlib
+# coding: utf8
+from __future__ import absolute_import, unicode_literals

 from contextlib import contextmanager
 import shutil
 import ujson
@@ -21,19 +19,18 @@ from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .tagger import Tagger
 from .matcher import Matcher
-from . import attrs
-from . import orth
-from . import util
-from . import language_data
 from .lemmatizer import Lemmatizer
 from .train import Trainer
+from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
-from .attrs import IS_STOP
+from . import attrs
+from . import orth
+from . import util
+from . import language_data

 class BaseDefaults(object):
@@ -150,25 +147,15 @@ class BaseDefaults(object):
         return pipeline

     token_match = language_data.TOKEN_MATCH
-
     prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-
     suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-
     infixes = tuple(language_data.TOKENIZER_INFIXES)
-
     tag_map = dict(language_data.TAG_MAP)
-
     tokenizer_exceptions = {}
-
     parser_features = get_templates('parser')
-
     entity_features = get_templates('ner')
-
     tagger_features = Tagger.feature_templates  # TODO -- fix this
-
     stop_words = set()
-
     lemma_rules = {}
     lemma_exc = {}
     lemma_index = {}
@@ -313,7 +300,8 @@ class Language(object):
         self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]

     def __call__(self, text, tag=True, parse=True, entity=True):
-        """Apply the pipeline to some text. The text can span multiple sentences,
+        """
+        Apply the pipeline to some text. The text can span multiple sentences,
         and can contain arbtrary whitespace. Alignment into the original string
         is preserved.
@@ -397,4 +385,3 @@ class Language(object):
         # to taking nlp.path
         if path is not None:
             self.save_to_directory(path)
-
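The reformatted docstring sits on top of the pipeline pattern shown in the first context line of that hunk (`self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]`): the document is created once, then annotated in place by each component. A schematic sketch with invented helper names, not spaCy's actual control flow:

```python
def apply_pipeline(make_doc, pipeline, text):
    # make_doc tokenizes; alignment into the original string is preserved.
    doc = make_doc(text)
    for proc in pipeline:
        if proc is not None:
            proc(doc)   # e.g. tagger, parser, entity recognizer annotate in place
    return doc

# Toy usage: a whitespace "tokenizer" and an empty pipeline.
doc = apply_pipeline(lambda t: t.split(), [], 'a toy example')
```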

View File

@@ -1,13 +1,8 @@
-from __future__ import unicode_literals, print_function
-import codecs
-import pathlib
-import ujson as json
+# coding: utf8
+from __future__ import unicode_literals

 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
-from .symbols import VerbForm_inf, VerbForm_none
-from .symbols import Number_sing
-from .symbols import Degree_pos
+from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos

 class Lemmatizer(object):

View File

@@ -1,4 +1,7 @@
 # cython: embedsignature=True
+# coding: utf8
+from __future__ import unicode_literals, print_function

 from libc.math cimport sqrt
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
@@ -9,14 +12,11 @@ from cython.view cimport array as cvarray
 cimport numpy as np
 np.import_array()

 from libc.string cimport memset
-import numpy

 from .orth cimport word_shape
 from .typedefs cimport attr_t, flags_t
+import numpy

 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from .attrs cimport IS_BRACKET

View File

@@ -1,5 +1,6 @@
 # cython: profile=True
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals

 from .typedefs cimport attr_t

View File

@@ -1,13 +1,9 @@
 # cython: infer_types
+# coding: utf8
 from __future__ import unicode_literals

 from libc.string cimport memset

-try:
-    import ujson as json
-except ImportError:
-    import json

 from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
 from .attrs cimport POS, IS_SPACE
 from .parts_of_speech import IDS as POS_IDS

View File

@ -1,8 +0,0 @@
class RegexMerger(object):
def __init__(self, regexes):
self.regexes = regexes
def __call__(self, tokens):
for tag, entity_type, regex in self.regexes:
for m in regex.finditer(tokens.string):
tokens.merge(m.start(), m.end(), tag, m.group(), entity_type)
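The file is deleted outright. For context, `RegexMerger` collapsed regex matches into single tokens; a hypothetical use, taking the class body shown above plus a minimal stand-in for the `tokens` object (only the `.string` attribute and `.merge()` method the class actually touched):

```python
import re

class FakeTokens(object):
    # Stand-in exposing just what RegexMerger.__call__ relied on.
    def __init__(self, string):
        self.string = string
    def merge(self, start, end, tag, lemma, ent_type):
        print('merge %d:%d -> %s/%s' % (start, end, tag, ent_type))

# Merge occurrences of "New York" into one NNP token with entity type GPE.
merger = RegexMerger([('NNP', 'GPE', re.compile('New York'))])
merger(FakeTokens('I moved to New York last year'))
```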

View File

@ -1,6 +1,7 @@
# coding: utf8
# cython: infer_types=True # cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import unicodedata import unicodedata
import re import re

View File

@@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals

View File

@@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals

 from .syntax.parser cimport Parser
 from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown

View File

@ -1,6 +1,5 @@
from __future__ import division # coding: utf8
from __future__ import print_function from __future__ import division, print_function, unicode_literals
from __future__ import unicode_literals
from .gold import tags_to_entities from .gold import tags_to_entities

View File

@@ -1,12 +1,11 @@
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals, absolute_import

 cimport cython
 from libc.string cimport memcpy
 from libc.stdint cimport uint64_t, uint32_t
 from murmurhash.mrmr cimport hash64, hash32
 from preshed.maps cimport map_iter, key_t

 from .typedefs cimport hash_t

View File

@@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals

 IDS = {

View File

@ -1,5 +1,7 @@
import json import json
import pathlib # coding: utf8
from __future__ import unicode_literals
from collections import defaultdict from collections import defaultdict
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
@ -12,7 +14,6 @@ from thinc.linalg cimport VecVec
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
from .attrs cimport TAG from .attrs cimport TAG
from .gold cimport GoldParse from .gold cimport GoldParse
from .attrs cimport * from .attrs cimport *

View File

@@ -1,7 +1,7 @@
 # cython: embedsignature=True
+# coding: utf8
 from __future__ import unicode_literals
-import pathlib

 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc

View File

@@ -1,11 +1,10 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
+# coding: utf8
+from __future__ import absolute_import, unicode_literals

 import random
 import tqdm

-from .gold import GoldParse
+from .gold import GoldParse, merge_sents
 from .scorer import Scorer
-from .gold import merge_sents

 class Trainer(object):

View File

@@ -1,11 +1,10 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function

-import os
 import io
 import json
 import re
-import os.path
-import pathlib
+from pathlib import Path
 import sys
 import textwrap
@@ -23,7 +22,7 @@ except NameError: # Python 3
 LANGUAGES = {}
-_data_path = pathlib.Path(__file__).parent / 'data'
+_data_path = Path(__file__).parent / 'data'

 def set_lang_class(name, cls):
@@ -163,8 +162,8 @@ def is_python2():
 def parse_package_meta(package_path, package, require=True):
-    location = os.path.join(str(package_path), package, 'meta.json')
-    if os.path.isfile(location):
+    location = package_path / package / 'meta.json'
+    if location.is_file():
         with io.open(location, encoding='utf8') as f:
             meta = json.load(f)
         return meta
@@ -209,10 +208,9 @@ def print_markdown(data, **kwargs):
     which will be converted to a list of tuples."""

     def excl_value(value):
-        # don't print value if it contains absolute path of directory
-        # (i.e. personal info that shouldn't need to be shared)
-        # other conditions can be included here if necessary
-        if str(pathlib.Path(__file__).parent) in value:
+        # don't print value if it contains absolute path of directory (i.e.
+        # personal info). Other conditions can be included here if necessary.
+        if unicode_(Path(__file__).parent) in value:
             return True

     if type(data) == dict:
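The `parse_package_meta` rewrite above is the standard os.path-to-pathlib translation: `os.path.join` becomes the `/` operator and `os.path.isfile` becomes `.is_file()`. The same pattern as a self-contained sketch (the function and file names here are illustrative, not spaCy's API):

```python
import io
import json
from pathlib import Path

def read_meta(package_path, package):
    # '/' joins path segments; .is_file() replaces os.path.isfile().
    location = Path(package_path) / package / 'meta.json'
    if location.is_file():
        # str() keeps io.open happy on older Pythons that reject Path objects.
        with io.open(str(location), encoding='utf8') as f:
            return json.load(f)
    return None
```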

View File

@ -1,10 +1,6 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from libc.string cimport memset
from libc.stdint cimport int32_t
from libc.math cimport sqrt
from pathlib import Path
import bz2 import bz2
import ujson as json import ujson as json
import re import re
@ -14,28 +10,28 @@ try:
except ImportError: except ImportError:
import pickle import pickle
from libc.string cimport memset
from libc.stdint cimport int32_t
from libc.math cimport sqrt
from cymem.cymem cimport Address
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .strings cimport hash_string from .strings cimport hash_string
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .cfile cimport CFile, StringCFile from .cfile cimport CFile, StringCFile
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .tokens.token cimport Token from .tokens.token cimport Token
from . import attrs
from . import symbols
from cymem.cymem cimport Address
from .serialize.packer cimport Packer from .serialize.packer cimport Packer
from .attrs cimport PROB, LANG from .attrs cimport PROB, LANG
from . import util
try: try:
import copy_reg import copy_reg
except ImportError: except ImportError:
import copyreg as copy_reg import copyreg as copy_reg
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from . import util
from . import attrs
from . import symbols
DEF MAX_VEC_SIZE = 100000 DEF MAX_VEC_SIZE = 100000
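The import shuffle leaves the two `try`/`except ImportError` blocks untouched — the standard shim for modules renamed between Python 2 and 3. Standalone, the pattern looks like this (the `cPickle` half is implied by the `except ImportError: import pickle` context above, but its exact wording in the file is an assumption):

```python
try:
    import cPickle as pickle    # Python 2: C implementation of pickle
except ImportError:
    import pickle               # Python 3: pickle is already the fast version

try:
    import copy_reg             # Python 2 module name
except ImportError:
    import copyreg as copy_reg  # Python 3 name, aliased so callers never care
```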