Clean up imports, unused code, whitespace, docstrings

ines 2017-04-15 12:05:47 +02:00
parent 561f2a3eb4
commit d24589aa72
27 changed files with 77 additions and 127 deletions

View File

@ -1,27 +1,13 @@
# coding: utf8
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals
import json
from pathlib import Path
from .util import set_lang_class, get_lang_class, parse_package_meta
from .deprecated import resolve_model_name
from .cli import info
from . import en
from . import de
from . import zh
from . import es
from . import it
from . import hu
from . import fr
from . import pt
from . import nl
from . import sv
from . import fi
from . import bn
from . import he
from .about import *
from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he
set_lang_class(en.English.lang, en.English)
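
(Not part of this commit: a minimal sketch of the registry pattern behind set_lang_class/get_lang_class, mirroring the LANGUAGES dict in spacy/util.py shown further down in this diff; everything except those two helper names is a stand-in.)

LANGUAGES = {}

def set_lang_class(name, cls):
    LANGUAGES[name] = cls

def get_lang_class(name):
    # the real helper in spacy/util.py also handles shortcuts and error cases
    return LANGUAGES[name]

class English(object):
    lang = 'en'  # stand-in for spacy.en.English

set_lang_class(English.lang, English)
assert get_lang_class('en') is English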

View File

@ -1,3 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
IDS = {
"": NULL_ATTR,
"IS_ALPHA": IS_ALPHA,

View File

@ -1,3 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from libc.stdio cimport fopen, fclose, fread, fwrite
from libc.string cimport memcpy

View File

@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import io
from pathlib import Path
from .converters import conllu2json

View File

@ -2,7 +2,6 @@
from __future__ import unicode_literals
import json
from ...gold import read_json_file, merge_sents
from ... import util

View File

@ -1,7 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
import pip
import requests
import os
import subprocess

View File

@ -5,8 +5,6 @@ import json
from pathlib import Path
from ..scorer import Scorer
from ..tagger import Tagger
from ..syntax.parser import Parser
from ..gold import GoldParse, merge_sents
from ..gold import read_json_file as read_gold_json
from .. import util
@ -60,7 +58,6 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_
print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %")
with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer:
loss = 0
for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)):
for doc, gold in epoch:
trainer.update(doc, gold)

View File

@ -1,4 +1,5 @@
from pathlib import Path
from . import about
from . import util
from .cli import download

View File

@ -11,12 +11,6 @@ from ..deprecated import fix_glove_vectors_loading
from .language_data import *
try:
basestring
except NameError:
basestring = str
class English(Language):
lang = 'en'

View File

@ -1,13 +1,11 @@
# cython: profile=True
# coding: utf8
from __future__ import unicode_literals, print_function
import io
import json
import re
import os
from os import path
import ujson as json
import ujson
from pathlib import Path
from .syntax import nonproj
@ -303,7 +301,8 @@ cdef class GoldParse:
self.heads = proj_heads
def __len__(self):
"""Get the number of gold-standard tokens.
"""
Get the number of gold-standard tokens.
Returns (int): The number of gold-standard tokens.
"""

View File

@ -1,9 +1,7 @@
from __future__ import absolute_import
from __future__ import unicode_literals
import pathlib
# coding: utf8
from __future__ import absolute_import, unicode_literals
from contextlib import contextmanager
import shutil
import ujson
@ -21,19 +19,18 @@ from .tokenizer import Tokenizer
from .vocab import Vocab
from .tagger import Tagger
from .matcher import Matcher
from . import attrs
from . import orth
from . import util
from . import language_data
from .lemmatizer import Lemmatizer
from .train import Trainer
from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP
from .syntax.parser import get_templates
from .syntax.nonproj import PseudoProjectivity
from .pipeline import DependencyParser, EntityRecognizer
from .syntax.arc_eager import ArcEager
from .syntax.ner import BiluoPushDown
from .attrs import IS_STOP
from . import attrs
from . import orth
from . import util
from . import language_data
class BaseDefaults(object):
@ -150,25 +147,15 @@ class BaseDefaults(object):
return pipeline
token_match = language_data.TOKEN_MATCH
prefixes = tuple(language_data.TOKENIZER_PREFIXES)
suffixes = tuple(language_data.TOKENIZER_SUFFIXES)
infixes = tuple(language_data.TOKENIZER_INFIXES)
tag_map = dict(language_data.TAG_MAP)
tokenizer_exceptions = {}
parser_features = get_templates('parser')
entity_features = get_templates('ner')
tagger_features = Tagger.feature_templates # TODO -- fix this
stop_words = set()
lemma_rules = {}
lemma_exc = {}
lemma_index = {}
@ -313,7 +300,8 @@ class Language(object):
self.pipeline = [self.tagger, self.parser, self.matcher, self.entity]
def __call__(self, text, tag=True, parse=True, entity=True):
"""Apply the pipeline to some text. The text can span multiple sentences,
"""
Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
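
(Not part of this commit: a hypothetical usage sketch for the __call__ signature above; assumes an English model is installed so spacy.load('en') succeeds.)

import spacy

nlp = spacy.load('en')
doc = nlp(u'London is a big city in the United Kingdom.')
# individual steps can be skipped via the keyword arguments documented above
doc_no_parse = nlp(u'Another sentence here.', parse=False)
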
@ -373,7 +361,7 @@ class Language(object):
}
self.setup_directory(path, **configs)
strings_loc = path / 'vocab' / 'strings.json'
with strings_loc.open('w', encoding='utf8') as file_:
self.vocab.strings.dump(file_)
@ -397,4 +385,3 @@ class Language(object):
# to taking nlp.path
if path is not None:
self.save_to_directory(path)

View File

@ -1,13 +1,8 @@
from __future__ import unicode_literals, print_function
import codecs
import pathlib
import ujson as json
# coding: utf8
from __future__ import unicode_literals
from .symbols import POS, NOUN, VERB, ADJ, PUNCT
from .symbols import VerbForm_inf, VerbForm_none
from .symbols import Number_sing
from .symbols import Degree_pos
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
class Lemmatizer(object):

View File

@ -1,4 +1,7 @@
# cython: embedsignature=True
# coding: utf8
from __future__ import unicode_literals, print_function
from libc.math cimport sqrt
from cpython.ref cimport Py_INCREF
from cymem.cymem cimport Pool
@ -9,14 +12,11 @@ from cython.view cimport array as cvarray
cimport numpy as np
np.import_array()
from libc.string cimport memset
import numpy
from .orth cimport word_shape
from .typedefs cimport attr_t, flags_t
import numpy
from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET

View File

@ -1,5 +1,6 @@
# cython: profile=True
# cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals
from .typedefs cimport attr_t
@ -164,7 +165,7 @@ def _convert_strings(token_specs, string_store):
def merge_phrase(matcher, doc, i, matches):
'''Callback to merge a phrase on match'''
ent_id, label, start, end = matches[i]
span = doc[start : end]
span = doc[start : end]
span.merge(ent_type=label, ent_id=ent_id)

View File

@ -1,13 +1,9 @@
# cython: infer_types
# coding: utf8
from __future__ import unicode_literals
from libc.string cimport memset
try:
import ujson as json
except ImportError:
import json
from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
from .attrs cimport POS, IS_SPACE
from .parts_of_speech import IDS as POS_IDS

View File

@ -1,8 +0,0 @@
class RegexMerger(object):
def __init__(self, regexes):
self.regexes = regexes
def __call__(self, tokens):
for tag, entity_type, regex in self.regexes:
for m in regex.finditer(tokens.string):
tokens.merge(m.start(), m.end(), tag, m.group(), entity_type)

View File

@ -1,6 +1,7 @@
# coding: utf8
# cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals
import unicodedata
import re

View File

@ -1,3 +1,4 @@
# coding: utf8
from __future__ import unicode_literals

View File

@ -1,3 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from .syntax.parser cimport Parser
from .syntax.beam_parser cimport BeamParser
from .syntax.ner cimport BiluoPushDown
@ -36,7 +39,7 @@ cdef class BeamEntityRecognizer(BeamParser):
TransitionSystem = BiluoPushDown
feature_templates = get_feature_templates('ner')
def add_label(self, label):
Parser.add_label(self, label)
if isinstance(label, basestring):

View File

@ -1,6 +1,5 @@
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
# coding: utf8
from __future__ import division, print_function, unicode_literals
from .gold import tags_to_entities

View File

@ -1,12 +1,11 @@
# cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals, absolute_import
cimport cython
from libc.string cimport memcpy
from libc.stdint cimport uint64_t, uint32_t
from murmurhash.mrmr cimport hash64, hash32
from preshed.maps cimport map_iter, key_t
from .typedefs cimport hash_t
@ -154,11 +153,11 @@ cdef class StringStore:
raise TypeError(type(string_or_id))
utf8str = self._intern_utf8(byte_string, len(byte_string))
if utf8str is NULL:
# TODO: We need to use 32 bit here, for compatibility with the
# TODO: We need to use 32 bit here, for compatibility with the
# vocabulary values. This makes birthday paradox probabilities
# pretty bad.
# We could also get unlucky here, and hash into a value that
# collides with the 'real' strings.
# collides with the 'real' strings.
return hash32_utf8(byte_string, len(byte_string))
else:
return utf8str - self.c
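
(Not part of this commit: a back-of-the-envelope check of the birthday-paradox concern in the comment above, for a 32-bit hash space.)

from math import sqrt, log, exp

N = 2 ** 32                       # size of the 32-bit hash space
print(int(sqrt(2 * N * log(2))))  # ~77163 strings give a ~50% chance of a collision

n = 1000000                       # with a million distinct strings...
print(1 - exp(-n * (n - 1) / (2.0 * N)))  # ...a collision is essentially certain (~1.0)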

View File

@ -1,3 +1,4 @@
# coding: utf8
from __future__ import unicode_literals
IDS = {

View File

@ -1,5 +1,7 @@
import json
import pathlib
# coding: utf8
from __future__ import unicode_literals
from collections import defaultdict
from cymem.cymem cimport Pool
@ -12,7 +14,6 @@ from thinc.linalg cimport VecVec
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .gold cimport GoldParse
from .attrs cimport *

View File

@ -1,7 +1,7 @@
# cython: embedsignature=True
# coding: utf8
from __future__ import unicode_literals
import pathlib
from cython.operator cimport dereference as deref
from cython.operator cimport preincrement as preinc
@ -111,7 +111,7 @@ cdef class Tokenizer:
self.token_match)
return (self.__class__, args, None, None)
cpdef Doc tokens_from_list(self, list strings):
return Doc(self.vocab, words=strings)
#raise NotImplementedError(
@ -276,7 +276,7 @@ cdef class Tokenizer:
cache_hit = self._try_cache(hash_string(string), tokens)
if cache_hit:
pass
elif self.token_match and self.token_match(string):
elif self.token_match and self.token_match(string):
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
@ -377,7 +377,7 @@ cdef class Tokenizer:
"""
for chunk, substrings in sorted(special_cases.items()):
self.add_special_case(chunk, substrings)
def add_special_case(self, unicode string, substrings):
"""
Add a special-case tokenization rule.
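
(Not part of this commit: a hypothetical illustration of a special-case rule. Only the method name and signature come from the code above; the attribute-dict-per-token format and the existing `nlp` object are assumptions.)

from spacy.attrs import ORTH, LEMMA

# split the string "don't" into two tokens with explicit attributes
special_case = [{ORTH: u'do'}, {ORTH: u"n't", LEMMA: u'not'}]
nlp.tokenizer.add_special_case(u"don't", special_case)  # assumes an existing nlp pipeline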

View File

@ -1,11 +1,10 @@
from __future__ import absolute_import
from __future__ import unicode_literals
# coding: utf8
from __future__ import absolute_import, unicode_literals
import random
import tqdm
from .gold import GoldParse
from .gold import GoldParse, merge_sents
from .scorer import Scorer
from .gold import merge_sents
class Trainer(object):

View File

@ -1,11 +1,10 @@
# coding: utf8
from __future__ import unicode_literals, print_function
import os
import io
import json
import re
import os.path
import pathlib
from pathlib import Path
import sys
import textwrap
@ -23,7 +22,7 @@ except NameError: # Python 3
LANGUAGES = {}
_data_path = pathlib.Path(__file__).parent / 'data'
_data_path = Path(__file__).parent / 'data'
def set_lang_class(name, cls):
@ -163,8 +162,8 @@ def is_python2():
def parse_package_meta(package_path, package, require=True):
location = os.path.join(str(package_path), package, 'meta.json')
if os.path.isfile(location):
location = package_path / package / 'meta.json'
if location.is_file():
with io.open(location, encoding='utf8') as f:
meta = json.load(f)
return meta
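
(Not part of this commit: a small standalone sketch of the pathlib idiom the change above adopts, with a hypothetical read_meta stand-in for parse_package_meta.)

import json
from pathlib import Path

def read_meta(package_path, package):
    location = Path(package_path) / package / 'meta.json'  # instead of os.path.join
    if location.is_file():                                 # instead of os.path.isfile
        with location.open(encoding='utf8') as f:
            return json.load(f)
    return None
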
@ -209,10 +208,9 @@ def print_markdown(data, **kwargs):
which will be converted to a list of tuples."""
def excl_value(value):
# don't print value if it contains absolute path of directory
# (i.e. personal info that shouldn't need to be shared)
# other conditions can be included here if necessary
if str(pathlib.Path(__file__).parent) in value:
# don't print value if it contains absolute path of directory (i.e.
# personal info). Other conditions can be included here if necessary.
if unicode_(Path(__file__).parent) in value:
return True
if type(data) == dict:

View File

@ -1,10 +1,6 @@
# coding: utf8
from __future__ import unicode_literals
from libc.string cimport memset
from libc.stdint cimport int32_t
from libc.math cimport sqrt
from pathlib import Path
import bz2
import ujson as json
import re
@ -14,28 +10,28 @@ try:
except ImportError:
import pickle
from libc.string cimport memset
from libc.stdint cimport int32_t
from libc.math cimport sqrt
from cymem.cymem cimport Address
from .lexeme cimport EMPTY_LEXEME
from .lexeme cimport Lexeme
from .strings cimport hash_string
from .typedefs cimport attr_t
from .cfile cimport CFile, StringCFile
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .tokens.token cimport Token
from . import attrs
from . import symbols
from cymem.cymem cimport Address
from .serialize.packer cimport Packer
from .attrs cimport PROB, LANG
from . import util
try:
import copy_reg
except ImportError:
import copyreg as copy_reg
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from . import util
from . import attrs
from . import symbols
DEF MAX_VEC_SIZE = 100000