Clean up imports, unused code, whitespace, docstrings

ines 2017-04-15 12:05:47 +02:00
parent 561f2a3eb4
commit d24589aa72
27 changed files with 77 additions and 127 deletions

View File

@@ -1,27 +1,13 @@
 # coding: utf8
-from __future__ import unicode_literals, print_function
-import json
+from __future__ import unicode_literals

 from pathlib import Path

 from .util import set_lang_class, get_lang_class, parse_package_meta
 from .deprecated import resolve_model_name
 from .cli import info
-from . import en
-from . import de
-from . import zh
-from . import es
-from . import it
-from . import hu
-from . import fr
-from . import pt
-from . import nl
-from . import sv
-from . import fi
-from . import bn
-from . import he
-from .about import *
+from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he

 set_lang_class(en.English.lang, en.English)
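The consolidated one-line import feeds the language registry defined in util.py (visible as the `LANGUAGES = {}` and `def set_lang_class(name, cls):` context lines further down this commit). A minimal sketch of how such a registry pair can work — the `get_lang_class` body and error message here are illustrative, not copied from spaCy:

```python
# Illustrative registry sketch; spaCy's real versions live in spacy/util.py.
LANGUAGES = {}

def set_lang_class(name, cls):
    # Register a Language subclass under its ISO code, e.g. 'en' or 'de'.
    LANGUAGES[name] = cls

def get_lang_class(name):
    # Resolve a code back to its class; unknown codes fail loudly.
    if name not in LANGUAGES:
        raise RuntimeError('Language not supported: %s' % name)
    return LANGUAGES[name]
```

Registering every class eagerly at import time is what lets later lookups by language code stay a plain dict access.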

View File

@@ -1,3 +1,7 @@
+# coding: utf8
+from __future__ import unicode_literals

 IDS = {
     "": NULL_ATTR,
     "IS_ALPHA": IS_ALPHA,

View File

@@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals

 from libc.stdio cimport fopen, fclose, fread, fwrite
 from libc.string cimport memcpy

View File

@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-import io
 from pathlib import Path

 from .converters import conllu2json

View File

@@ -2,7 +2,6 @@
 from __future__ import unicode_literals

 import json
-from ...gold import read_json_file, merge_sents
 from ... import util

View File

@@ -1,7 +1,6 @@
 # coding: utf8
 from __future__ import unicode_literals

-import pip
 import requests
 import os
 import subprocess

View File

@@ -5,8 +5,6 @@ import json
 from pathlib import Path

 from ..scorer import Scorer
-from ..tagger import Tagger
-from ..syntax.parser import Parser
 from ..gold import GoldParse, merge_sents
 from ..gold import read_json_file as read_gold_json
 from .. import util
@@ -60,7 +58,6 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
     print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")

     with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
-        loss = 0
         for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
             for doc, gold in epoch:
                 trainer.update(doc, gold)
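The removed `loss = 0` was dead code: nothing in the loop read or updated it. The loop that remains is the whole training protocol — `epochs()` yields one iterable of `(doc, gold)` pairs per pass, and `update()` consumes them. A toy, runnable sketch of that protocol (the class internals are invented for illustration, only the interface mirrors the diff):

```python
import random

class ToyTrainer(object):
    """Stand-in with the same interface as the trainer used above."""
    def __init__(self, train_data):
        self.train_data = list(train_data)

    def epochs(self, n_iter, augment_data=None):
        # One epoch = one freshly shuffled pass over the training data.
        for _ in range(n_iter):
            data = list(self.train_data)
            random.shuffle(data)
            yield data

    def update(self, doc, gold):
        # A real trainer would compute a loss here and adjust weights.
        pass

trainer = ToyTrainer([('doc1', 'gold1'), ('doc2', 'gold2')])
for itn, epoch in enumerate(trainer.epochs(5)):
    for doc, gold in epoch:
        trainer.update(doc, gold)
```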

View File

@@ -1,4 +1,5 @@
 from pathlib import Path
+
 from . import about
 from . import util
 from .cli import download

View File

@@ -11,12 +11,6 @@ from ..deprecated import fix_glove_vectors_loading
 from .language_data import *

-try:
-    basestring
-except NameError:
-    basestring = str

 class English(Language):
     lang = 'en'
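The deleted `try`/`except NameError` block was the stock Python 2/3 `basestring` shim, which only earns its keep if later code type-checks strings. A small self-contained example of the pattern it enabled (the `is_string` helper is hypothetical, not from this file):

```python
try:
    basestring            # Python 2: str and unicode both subclass this
except NameError:
    basestring = str      # Python 3: plain str covers every case

def is_string(value):
    # Hypothetical helper: works identically on Python 2 and 3.
    return isinstance(value, basestring)

assert is_string('hello') and not is_string(42)
```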

View File

@@ -1,13 +1,11 @@
 # cython: profile=True
+# coding: utf8
 from __future__ import unicode_literals, print_function

 import io
-import json
 import re
-import os
-from os import path
-import ujson as json
+import ujson
+from pathlib import Path

 from .syntax import nonproj
@@ -303,7 +301,8 @@ cdef class GoldParse:
         self.heads = proj_heads

     def __len__(self):
-        """Get the number of gold-standard tokens.
+        """
+        Get the number of gold-standard tokens.

         Returns (int): The number of gold-standard tokens.
         """

View File

@@ -1,9 +1,7 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
-import pathlib
+# coding: utf8
+from __future__ import absolute_import, unicode_literals

 from contextlib import contextmanager
 import shutil
 import ujson
@@ -21,19 +19,18 @@ from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .tagger import Tagger
 from .matcher import Matcher
-from . import attrs
-from . import orth
-from . import util
-from . import language_data
 from .lemmatizer import Lemmatizer
 from .train import Trainer
+from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
 from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
-from .attrs import IS_STOP
+from . import attrs
+from . import orth
+from . import util
+from . import language_data

 class BaseDefaults(object):
@@ -150,25 +147,15 @@ class BaseDefaults(object):
         return pipeline

     token_match = language_data.TOKEN_MATCH
-
     prefixes = tuple(language_data.TOKENIZER_PREFIXES)
-
     suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
-
     infixes = tuple(language_data.TOKENIZER_INFIXES)
-
     tag_map = dict(language_data.TAG_MAP)
-
     tokenizer_exceptions = {}
-
     parser_features = get_templates('parser')
-
     entity_features = get_templates('ner')
-
     tagger_features = Tagger.feature_templates  # TODO -- fix this
-
     stop_words = set()
-
     lemma_rules = {}
     lemma_exc = {}
     lemma_index = {}
@@ -313,7 +300,8 @@ class Language(object):
         self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]

     def __call__(self, text, tag=True, parse=True, entity=True):
-        """Apply the pipeline to some text. The text can span multiple sentences,
+        """
+        Apply the pipeline to some text. The text can span multiple sentences,
         and can contain arbtrary whitespace. Alignment into the original string
         is preserved.
@@ -397,4 +385,3 @@ class Language(object):
         # to taking nlp.path
         if path is not None:
             self.save_to_directory(path)
-
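The reformatted docstring sits on top of the pipeline pattern shown in the first context line of that hunk (`self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]`): the document is created once, then annotated in place by each component. A schematic sketch with invented helper names, not spaCy's actual control flow:

```python
def apply_pipeline(make_doc, pipeline, text):
    # make_doc tokenizes; alignment into the original string is preserved.
    doc = make_doc(text)
    for proc in pipeline:
        if proc is not None:
            proc(doc)   # e.g. tagger, parser, entity recognizer annotate in place
    return doc

# Toy usage: a whitespace "tokenizer" and an empty pipeline.
doc = apply_pipeline(lambda t: t.split(), [], 'a toy example')
```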

View File

@@ -1,13 +1,8 @@
-from __future__ import unicode_literals, print_function
-import codecs
-import pathlib
-import ujson as json
+# coding: utf8
+from __future__ import unicode_literals

 from .symbols import POS, NOUN, VERB, ADJ, PUNCT
-from .symbols import VerbForm_inf, VerbForm_none
-from .symbols import Number_sing
-from .symbols import Degree_pos
+from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos

 class Lemmatizer(object):

View File

@@ -1,4 +1,7 @@
 # cython: embedsignature=True
+# coding: utf8
+from __future__ import unicode_literals, print_function

 from libc.math cimport sqrt
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
@@ -9,14 +12,11 @@ from cython.view cimport array as cvarray
 cimport numpy as np
 np.import_array()

 from libc.string cimport memset
-import numpy

 from .orth cimport word_shape
 from .typedefs cimport attr_t, flags_t
+import numpy

 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from .attrs cimport IS_BRACKET

View File

@@ -1,5 +1,6 @@
 # cython: profile=True
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals

 from .typedefs cimport attr_t

View File

@@ -1,13 +1,9 @@
 # cython: infer_types
+# coding: utf8
 from __future__ import unicode_literals

 from libc.string cimport memset

-try:
-    import ujson as json
-except ImportError:
-    import json

 from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
 from .attrs cimport POS, IS_SPACE
 from .parts_of_speech import IDS as POS_IDS

View File

@ -1,8 +0,0 @@
class RegexMerger(object):
def __init__(self, regexes):
self.regexes = regexes
def __call__(self, tokens):
for tag, entity_type, regex in self.regexes:
for m in regex.finditer(tokens.string):
tokens.merge(m.start(), m.end(), tag, m.group(), entity_type)
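The file is deleted outright. For context, `RegexMerger` collapsed regex matches into single tokens; a hypothetical use, taking the class body shown above plus a minimal stand-in for the `tokens` object (only the `.string` attribute and `.merge()` method the class actually touched):

```python
import re

class FakeTokens(object):
    # Stand-in exposing just what RegexMerger.__call__ relied on.
    def __init__(self, string):
        self.string = string
    def merge(self, start, end, tag, lemma, ent_type):
        print('merge %d:%d -> %s/%s' % (start, end, tag, ent_type))

# Merge occurrences of "New York" into one NNP token with entity type GPE.
merger = RegexMerger([('NNP', 'GPE', re.compile('New York'))])
merger(FakeTokens('I moved to New York last year'))
```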

View File

@ -1,6 +1,7 @@
# coding: utf8
# cython: infer_types=True # cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
import unicodedata import unicodedata
import re import re

View File

@@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals

View File

@@ -1,3 +1,6 @@
+# coding: utf8
+from __future__ import unicode_literals

 from .syntax.parser cimport Parser
 from .syntax.beam_parser cimport BeamParser
 from .syntax.ner cimport BiluoPushDown

View File

@ -1,6 +1,5 @@
from __future__ import division # coding: utf8
from __future__ import print_function from __future__ import division, print_function, unicode_literals
from __future__ import unicode_literals
from .gold import tags_to_entities from .gold import tags_to_entities

View File

@@ -1,12 +1,11 @@
 # cython: infer_types=True
+# coding: utf8
 from __future__ import unicode_literals, absolute_import

 cimport cython
 from libc.string cimport memcpy
 from libc.stdint cimport uint64_t, uint32_t
 from murmurhash.mrmr cimport hash64, hash32
 from preshed.maps cimport map_iter, key_t

 from .typedefs cimport hash_t

View File

@@ -1,3 +1,4 @@
+# coding: utf8
 from __future__ import unicode_literals

 IDS = {

View File

@ -1,5 +1,7 @@
import json import json
import pathlib # coding: utf8
from __future__ import unicode_literals
from collections import defaultdict from collections import defaultdict
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
@ -12,7 +14,6 @@ from thinc.linalg cimport VecVec
from .tokens.doc cimport Doc from .tokens.doc cimport Doc
from .attrs cimport TAG from .attrs cimport TAG
from .gold cimport GoldParse from .gold cimport GoldParse
from .attrs cimport * from .attrs cimport *

View File

@@ -1,7 +1,7 @@
 # cython: embedsignature=True
+# coding: utf8
 from __future__ import unicode_literals
-import pathlib

 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc

View File

@@ -1,11 +1,10 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
+# coding: utf8
+from __future__ import absolute_import, unicode_literals

 import random
 import tqdm

-from .gold import GoldParse
+from .gold import GoldParse, merge_sents
 from .scorer import Scorer
-from .gold import merge_sents

 class Trainer(object):

View File

@@ -1,11 +1,10 @@
 # coding: utf8
 from __future__ import unicode_literals, print_function

-import os
 import io
 import json
 import re
-import os.path
-import pathlib
+from pathlib import Path
 import sys
 import textwrap
@@ -23,7 +22,7 @@ except NameError: # Python 3
 LANGUAGES = {}
-_data_path = pathlib.Path(__file__).parent / 'data'
+_data_path = Path(__file__).parent / 'data'

 def set_lang_class(name, cls):
@@ -163,8 +162,8 @@ def is_python2():
 def parse_package_meta(package_path, package, require=True):
-    location = os.path.join(str(package_path), package, 'meta.json')
-    if os.path.isfile(location):
+    location = package_path / package / 'meta.json'
+    if location.is_file():
         with io.open(location, encoding='utf8') as f:
             meta = json.load(f)
         return meta
@@ -209,10 +208,9 @@ def print_markdown(data, **kwargs):
     which will be converted to a list of tuples."""

     def excl_value(value):
-        # don't print value if it contains absolute path of directory
-        # (i.e. personal info that shouldn't need to be shared)
-        # other conditions can be included here if necessary
-        if str(pathlib.Path(__file__).parent) in value:
+        # don't print value if it contains absolute path of directory (i.e.
+        # personal info). Other conditions can be included here if necessary.
+        if unicode_(Path(__file__).parent) in value:
             return True

     if type(data) == dict:
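The `parse_package_meta` rewrite above is the standard os.path-to-pathlib translation: `os.path.join` becomes the `/` operator and `os.path.isfile` becomes `.is_file()`. The same pattern as a self-contained sketch (the function and file names here are illustrative, not spaCy's API):

```python
import io
import json
from pathlib import Path

def read_meta(package_path, package):
    # '/' joins path segments; .is_file() replaces os.path.isfile().
    location = Path(package_path) / package / 'meta.json'
    if location.is_file():
        # str() keeps io.open happy on older Pythons that reject Path objects.
        with io.open(str(location), encoding='utf8') as f:
            return json.load(f)
    return None
```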

View File

@ -1,10 +1,6 @@
# coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
from libc.string cimport memset
from libc.stdint cimport int32_t
from libc.math cimport sqrt
from pathlib import Path
import bz2 import bz2
import ujson as json import ujson as json
import re import re
@ -14,28 +10,28 @@ try:
except ImportError: except ImportError:
import pickle import pickle
from libc.string cimport memset
from libc.stdint cimport int32_t
from libc.math cimport sqrt
from cymem.cymem cimport Address
from .lexeme cimport EMPTY_LEXEME from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .strings cimport hash_string from .strings cimport hash_string
from .typedefs cimport attr_t from .typedefs cimport attr_t
from .cfile cimport CFile, StringCFile from .cfile cimport CFile, StringCFile
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .tokens.token cimport Token from .tokens.token cimport Token
from . import attrs
from . import symbols
from cymem.cymem cimport Address
from .serialize.packer cimport Packer from .serialize.packer cimport Packer
from .attrs cimport PROB, LANG from .attrs cimport PROB, LANG
from . import util
try: try:
import copy_reg import copy_reg
except ImportError: except ImportError:
import copyreg as copy_reg import copyreg as copy_reg
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from . import util
from . import attrs
from . import symbols
DEF MAX_VEC_SIZE = 100000 DEF MAX_VEC_SIZE = 100000
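The import shuffle leaves the two `try`/`except ImportError` blocks untouched — the standard shim for modules renamed between Python 2 and 3. Standalone, the pattern looks like this (the `cPickle` half is implied by the `except ImportError: import pickle` context above, but its exact wording in the file is an assumption):

```python
try:
    import cPickle as pickle    # Python 2: C implementation of pickle
except ImportError:
    import pickle               # Python 3: pickle is already the fast version

try:
    import copy_reg             # Python 2 module name
except ImportError:
    import copyreg as copy_reg  # Python 3 name, aliased so callers never care
```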