mirror of https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00

commit d24589aa72 (parent 561f2a3eb4)

    Clean up imports, unused code, whitespace, docstrings
@@ -1,27 +1,13 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
 
 import json
 from pathlib import Path
 
 from .util import set_lang_class, get_lang_class, parse_package_meta
 from .deprecated import resolve_model_name
 from .cli import info
-
-from . import en
-from . import de
-from . import zh
-from . import es
-from . import it
-from . import hu
-from . import fr
-from . import pt
-from . import nl
-from . import sv
-from . import fi
-from . import bn
-from . import he
-
 from .about import *
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he
 
 
 set_lang_class(en.English.lang, en.English)
@@ -1,3 +1,7 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
 IDS = {
     "": NULL_ATTR,
     "IS_ALPHA": IS_ALPHA,
@@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy
 
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
-import io
 from pathlib import Path
 
 from .converters import conllu2json
@@ -2,7 +2,6 @@
 from __future__ import unicode_literals
 
 import json
 from ...gold import read_json_file, merge_sents
 from ... import util
 
-
@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals
 
 import pip
 import requests
-import os
 import subprocess
@@ -5,8 +5,6 @@ import json
 from pathlib import Path
 
 from ..scorer import Scorer
-from ..tagger import Tagger
-from ..syntax.parser import Parser
 from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json
 from .. import util

@@ -60,7 +58,6 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
     print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
 
     with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
-        loss = 0
         for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
             for doc, gold in epoch:
                 trainer.update(doc, gold)
@@ -1,4 +1,5 @@
 from pathlib import Path
 
 from . import about
 from . import util
+from .cli import download
@@ -11,12 +11,6 @@ from ..deprecated import fix_glove_vectors_loading
 from .language_data import *
 
 
-try:
-    basestring
-except NameError:
-    basestring = str
-
-
 class English(Language):
     lang = 'en'
 
@@ -1,13 +1,11 @@
 # cython: profile=True
+# coding: utf8
 from __future__ import unicode_literals, print_function
 
 import io
-import json
 import re
-import os
-from os import path
 
-import ujson as json
+import ujson
 from pathlib import Path
 
 from .syntax import nonproj

@@ -303,7 +301,8 @@ cdef class GoldParse:
         self.heads = proj_heads
 
     def __len__(self):
-        """Get the number of gold-standard tokens.
+        """
+        Get the number of gold-standard tokens.
 
         Returns (int): The number of gold-standard tokens.
         """
@@ -1,9 +1,7 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
-import pathlib
+# coding: utf8
+from __future__ import absolute_import, unicode_literals
 from contextlib import contextmanager
 import shutil
 
 import ujson
 
-

@@ -21,19 +19,18 @@ from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .tagger import Tagger
 from .matcher import Matcher
-from . import attrs
-from . import orth
-from . import util
-from . import language_data
 from .lemmatizer import Lemmatizer
 from .train import Trainer
-
-from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
+from .attrs import IS_STOP
+from . import attrs
+from . import orth
+from . import util
+from . import language_data
 
 
 class BaseDefaults(object):

@@ -150,25 +147,15 @@ class BaseDefaults(object):
         return pipeline
 
     token_match = language_data.TOKEN_MATCH
-
     prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-
     suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-
     infixes = tuple(language_data.TOKENIZER_INFIXES)
-
     tag_map = dict(language_data.TAG_MAP)
-
     tokenizer_exceptions = {}
-
     parser_features = get_templates('parser')
-
     entity_features = get_templates('ner')
-
     tagger_features = Tagger.feature_templates # TODO -- fix this
-
     stop_words = set()
-
     lemma_rules = {}
     lemma_exc = {}
     lemma_index = {}

@@ -313,7 +300,8 @@ class Language(object):
         self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]
 
     def __call__(self, text, tag=True, parse=True, entity=True):
-        """Apply the pipeline to some text. The text can span multiple sentences,
+        """
+        Apply the pipeline to some text. The text can span multiple sentences,
         and can contain arbtrary whitespace. Alignment into the original string
         is preserved.
 

@@ -373,7 +361,7 @@ class Language(object):
         }
 
         self.setup_directory(path, **configs)
-
+
         strings_loc = path / 'vocab' / 'strings.json'
         with strings_loc.open('w', encoding='utf8') as file_:
             self.vocab.strings.dump(file_)

@@ -397,4 +385,3 @@ class Language(object):
         # to taking nlp.path
         if path is not None:
             self.save_to_directory(path)
-
@@ -1,13 +1,8 @@
-from __future__ import unicode_literals, print_function
-import codecs
-import pathlib
-
-import ujson as json
+# coding: utf8
+from __future__ import unicode_literals
 
 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
-from .symbols import VerbForm_inf, VerbForm_none
-from .symbols import Number_sing
-from .symbols import Degree_pos
+from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
 
 
 class Lemmatizer(object):
@@ -1,4 +1,7 @@
 # cython: embedsignature=True
+# coding: utf8
+from __future__ import unicode_literals, print_function
+
 from libc.math cimport sqrt
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool

@@ -9,14 +12,11 @@ from cython.view cimport array as cvarray
 cimport numpy as np
 np.import_array()
 
-
-
 from libc.string cimport memset
-import numpy
-
 from .orth cimport word_shape
 from .typedefs cimport attr_t, flags_t
+import numpy
 
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from .attrs cimport IS_BRACKET
@@ -1,5 +1,6 @@
 # cython: profile=True
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals
 
 from .typedefs cimport attr_t

@@ -164,7 +165,7 @@ def _convert_strings(token_specs, string_store):
 def merge_phrase(matcher, doc, i, matches):
     '''Callback to merge a phrase on match'''
     ent_id, label, start, end = matches[i]
-    span = doc[start : end]
+    span = doc[start : end]
     span.merge(ent_type=label, ent_id=ent_id)
 
 
@@ -1,13 +1,9 @@
 # cython: infer_types
+# coding: utf8
 from __future__ import unicode_literals
 
 from libc.string cimport memset
 
-try:
-    import ujson as json
-except ImportError:
-    import json
-
 from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
 from .attrs cimport POS, IS_SPACE
 from .parts_of_speech import IDS as POS_IDS
@@ -1,8 +0,0 @@
-class RegexMerger(object):
-    def __init__(self, regexes):
-        self.regexes = regexes
-
-    def __call__(self, tokens):
-        for tag, entity_type, regex in self.regexes:
-            for m in regex.finditer(tokens.string):
-                tokens.merge(m.start(), m.end(), tag, m.group(), entity_type)
@@ -1,6 +1,7 @@
-# coding: utf8
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals
 
 import unicodedata
 import re
+
@@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals
 
 
@@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals
+
 from .syntax.parser cimport Parser
 from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown

@@ -36,7 +39,7 @@ cdef class BeamEntityRecognizer(BeamParser):
     TransitionSystem = BiluoPushDown
 
     feature_templates = get_feature_templates('ner')
-
+
     def add_label(self, label):
         Parser.add_label(self, label)
         if isinstance(label, basestring):
@@ -1,6 +1,5 @@
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
+# coding: utf8
+from __future__ import division, print_function, unicode_literals
 
 from .gold import tags_to_entities
 
@@ -1,12 +1,11 @@
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals, absolute_import
 
 cimport cython
 from libc.string cimport memcpy
 from libc.stdint cimport uint64_t, uint32_t
 
 from murmurhash.mrmr cimport hash64, hash32
-
 from preshed.maps cimport map_iter, key_t
-
 from .typedefs cimport hash_t

@@ -154,11 +153,11 @@ cdef class StringStore:
             raise TypeError(type(string_or_id))
         utf8str = self._intern_utf8(byte_string, len(byte_string))
         if utf8str is NULL:
-            # TODO: We need to use 32 bit here, for compatibility with the
+            # TODO: We need to use 32 bit here, for compatibility with the
             # vocabulary values. This makes birthday paradox probabilities
             # pretty bad.
             # We could also get unlucky here, and hash into a value that
-            # collides with the 'real' strings.
+            # collides with the 'real' strings.
             return hash32_utf8(byte_string, len(byte_string))
         else:
             return utf8str - self.c
@@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals
 
 IDS = {
@@ -1,5 +1,7 @@
-import json
-import pathlib
+# coding: utf8
+from __future__ import unicode_literals
+
+
 from collections import defaultdict
 
 from cymem.cymem cimport Pool

@@ -12,7 +14,6 @@ from thinc.linalg cimport VecVec
 from .tokens.doc cimport Doc
-from .attrs cimport TAG
 from .gold cimport GoldParse
 
 from .attrs cimport *
 
 
@@ -1,7 +1,7 @@
 # cython: embedsignature=True
+# coding: utf8
 from __future__ import unicode_literals
 
 import pathlib
-
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc

@@ -111,7 +111,7 @@ cdef class Tokenizer:
                     self.token_match)
 
         return (self.__class__, args, None, None)
-
+
     cpdef Doc tokens_from_list(self, list strings):
         return Doc(self.vocab, words=strings)
         #raise NotImplementedError(

@@ -276,7 +276,7 @@ cdef class Tokenizer:
         cache_hit = self._try_cache(hash_string(string), tokens)
         if cache_hit:
             pass
-        elif self.token_match and self.token_match(string):
+        elif self.token_match and self.token_match(string):
            # We're always saying 'no' to spaces here -- the caller will
            # fix up the outermost one, with reference to the original.
            # See Issue #859

@@ -377,7 +377,7 @@ cdef class Tokenizer:
         """
         for chunk, substrings in sorted(special_cases.items()):
             self.add_special_case(chunk, substrings)
-
+
     def add_special_case(self, unicode string, substrings):
         """
         Add a special-case tokenization rule.
@@ -1,11 +1,10 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
+# coding: utf8
+from __future__ import absolute_import, unicode_literals
 
 import random
 import tqdm
-from .gold import GoldParse
+from .gold import GoldParse, merge_sents
 from .scorer import Scorer
-from .gold import merge_sents
 
 
 class Trainer(object):
@@ -1,11 +1,10 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function
 import os
 
 import io
 import json
 import re
-import os.path
-import pathlib
+from pathlib import Path
 import sys
 import textwrap

@@ -23,7 +22,7 @@ except NameError: # Python 3
 
 
 LANGUAGES = {}
-_data_path = pathlib.Path(__file__).parent / 'data'
+_data_path = Path(__file__).parent / 'data'
 
 
 def set_lang_class(name, cls):

@@ -163,8 +162,8 @@ def is_python2():
 
 
 def parse_package_meta(package_path, package, require=True):
-    location = os.path.join(str(package_path), package, 'meta.json')
-    if os.path.isfile(location):
+    location = package_path / package / 'meta.json'
+    if location.is_file():
         with io.open(location, encoding='utf8') as f:
             meta = json.load(f)
             return meta

@@ -209,10 +208,9 @@ def print_markdown(data, **kwargs):
     which will be converted to a list of tuples."""
 
     def excl_value(value):
-        # don't print value if it contains absolute path of directory
-        # (i.e. personal info that shouldn't need to be shared)
-        # other conditions can be included here if necessary
-        if str(pathlib.Path(__file__).parent) in value:
+        # don't print value if it contains absolute path of directory (i.e.
+        # personal info). Other conditions can be included here if necessary.
+        if unicode_(Path(__file__).parent) in value:
             return True
 
     if type(data) == dict:
@@ -1,10 +1,6 @@
+# coding: utf8
 from __future__ import unicode_literals
 
-from libc.string cimport memset
-from libc.stdint cimport int32_t
-from libc.math cimport sqrt
-
 from pathlib import Path
 import bz2
 import ujson as json
-import re

@@ -14,28 +10,28 @@ try:
 except ImportError:
     import pickle
 
+from libc.string cimport memset
+from libc.stdint cimport int32_t
+from libc.math cimport sqrt
+from cymem.cymem cimport Address
 from .lexeme cimport EMPTY_LEXEME
 from .lexeme cimport Lexeme
 from .strings cimport hash_string
 from .typedefs cimport attr_t
 from .cfile cimport CFile, StringCFile
+from .lemmatizer import Lemmatizer
+from .attrs import intify_attrs
 from .tokens.token cimport Token
 
+from . import attrs
+from . import symbols
 
-from cymem.cymem cimport Address
 from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
+from . import util
 
 
 try:
     import copy_reg
 except ImportError:
     import copyreg as copy_reg
-from .lemmatizer import Lemmatizer
-from .attrs import intify_attrs
-from . import util
-from . import attrs
-from . import symbols
 
 
 DEF MAX_VEC_SIZE = 100000