Tidy up and auto-format

Ines Montani 2020-03-25 12:28:12 +01:00
parent b71dd44dbc
commit 828acffc12
32 changed files with 1828 additions and 1793 deletions
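
A note on the mechanics: the reflowed lines below match what black produces with its default 88-character line limit, so most hunks in this commit are presumably auto-formatter output rather than hand edits. A minimal sketch of reproducing one of them via black's Python API (assuming black is the formatter, which the commit message doesn't state explicitly):

# Hedged sketch: feed one of the long lines from this diff through black.
import black

src = 'words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)]\n'
print(black.format_str(src, mode=black.FileMode()))
# Output matches the hunk in the displaCy renderer below:
# words = [
#     self.render_word(w["text"], w["tag"], w.get("lemma", None), i)
#     for i, w in enumerate(words)
# ]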

View File

@@ -225,7 +225,9 @@ def train(
exits=1,
)
msg.text("Extending component from base model '{}'".format(pipe))
disabled_pipes = nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
disabled_pipes = nlp.disable_pipes(
[p for p in nlp.pipe_names if p not in pipeline]
)
else:
msg.text("Starting with blank model '{}'".format(lang))
lang_cls = util.get_lang_class(lang)
@@ -415,10 +417,10 @@ def train(
losses=losses,
)
except ValueError as e:
msg.warn("Error during training")
err = "Error during training"
if init_tok2vec:
msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?")
msg.fail("Original error message: {}".format(e), exits=1)
err += " Did you provide the same parameters during 'train' as during 'pretrain'?"
msg.fail(err, "Original error message: {}".format(e), exits=1)
if raw_text:
# If raw text is available, perform 'rehearsal' updates,
# which use unlabelled data to reduce overfitting.
@@ -546,7 +548,10 @@ def train(
)
break
except Exception as e:
msg.warn("Aborting and saving the final best model. Encountered exception: {}".format(e))
msg.warn(
"Aborting and saving the final best model. "
"Encountered exception: {}".format(e)
)
finally:
best_pipes = nlp.pipe_names
if disabled_pipes:
@@ -563,13 +568,20 @@ def train(
final_meta["speed"].setdefault("gpu", None)
# combine cpu and gpu speeds with the base model speeds
if final_meta["speed"]["cpu"] and meta["speed"]["cpu"]:
speed = _get_total_speed([final_meta["speed"]["cpu"], meta["speed"]["cpu"]])
speed = _get_total_speed(
[final_meta["speed"]["cpu"], meta["speed"]["cpu"]]
)
final_meta["speed"]["cpu"] = speed
if final_meta["speed"]["gpu"] and meta["speed"]["gpu"]:
speed = _get_total_speed([final_meta["speed"]["gpu"], meta["speed"]["gpu"]])
speed = _get_total_speed(
[final_meta["speed"]["gpu"], meta["speed"]["gpu"]]
)
final_meta["speed"]["gpu"] = speed
# if there were no speeds to update, overwrite with meta
if final_meta["speed"]["cpu"] is None and final_meta["speed"]["gpu"] is None:
if (
final_meta["speed"]["cpu"] is None
and final_meta["speed"]["gpu"] is None
):
final_meta["speed"].update(meta["speed"])
# note: beam speeds are not combined with the base model
if has_beam_widths:
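
Aside on the error-handling hunks above: msg appears to be a wasabi printer, so the first positional argument is the message title, the second the detail text, and exits=1 terminates the process. A hedged usage sketch (the placeholder text is illustrative only):

# Hedged sketch of the wasabi calls used above; exits=1 calls sys.exit(1).
from wasabi import Printer

msg = Printer()
msg.warn("Did you provide the same parameters during 'train' as during 'pretrain'?")
msg.fail("Error during training", "Original error message: <details>", exits=1)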

View File

@@ -146,9 +146,14 @@ def parse_deps(orig_doc, options={}):
retokenizer.merge(span, attrs=attrs)
fine_grained = options.get("fine_grained")
add_lemma = options.get("add_lemma")
words = [{"text": w.text,
words = [
{
"text": w.text,
"tag": w.tag_ if fine_grained else w.pos_,
"lemma": w.lemma_ if add_lemma else None} for w in doc]
"lemma": w.lemma_ if add_lemma else None,
}
for w in doc
]
arcs = []
for word in doc:

View File

@@ -3,7 +3,13 @@ from __future__ import unicode_literals
import uuid
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_WORDS_LEMMA, TPL_DEP_ARCS, TPL_ENTS
from .templates import (
TPL_DEP_SVG,
TPL_DEP_WORDS,
TPL_DEP_WORDS_LEMMA,
TPL_DEP_ARCS,
TPL_ENTS,
)
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
from ..util import minify_html, escape_html, registry
from ..errors import Errors
@@ -83,7 +89,10 @@ class DependencyRenderer(object):
self.width = self.offset_x + len(words) * self.distance
self.height = self.offset_y + 3 * self.word_spacing
self.id = render_id
words = [self.render_word(w["text"], w["tag"], w.get("lemma", None), i) for i, w in enumerate(words)]
words = [
self.render_word(w["text"], w["tag"], w.get("lemma", None), i)
for i, w in enumerate(words)
]
arcs = [
self.render_arrow(a["label"], a["start"], a["end"], a["dir"], i)
for i, a in enumerate(arcs)
@@ -101,7 +110,9 @@ class DependencyRenderer(object):
lang=self.lang,
)
def render_word(self, text, tag, lemma, i,):
def render_word(
self, text, tag, lemma, i,
):
"""Render individual word.
text (unicode): Word text.
@@ -115,7 +126,9 @@ class DependencyRenderer(object):
x = self.width - x
html_text = escape_html(text)
if lemma is not None:
return TPL_DEP_WORDS_LEMMA.format(text=html_text, tag=tag, lemma=lemma, x=x, y=y)
return TPL_DEP_WORDS_LEMMA.format(
text=html_text, tag=tag, lemma=lemma, x=x, y=y
)
return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y)
def render_arrow(self, label, start, end, direction, i):
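
For context, the fine_grained and add_lemma options handled in parse_deps and render_word above are the ones a caller passes to displacy. A hedged usage sketch (the model name is an assumption; any pipeline with a parser works):

# Hedged usage sketch for the dep-visualizer options touched above.
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # assumed to be installed
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
options = {"fine_grained": True, "add_lemma": True}
svg = displacy.render(doc, style="dep", options=options)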

View File

@@ -112,7 +112,6 @@ class Warnings(object):
"in problems with the vocab further on in the pipeline.")
@add_codes
class Errors(object):
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

View File

@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS, LIST_PUNCT, LIST_QUOTES
from ..char_classes import LIST_CURRENCY, CURRENCY, UNITS, PUNCT
from ..char_classes import CURRENCY, UNITS, PUNCT
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES

View File

@@ -10,5 +10,5 @@ Example sentences to test spaCy and its language models.
sentences = [
"bilbon ko castinga egin da eta nik jakin ez zuetako inork egin al du edota parte hartu duen ezagunik ba al du",
"gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira"
"gaur telebistan entzunda denok martetik gatoz hortaz martzianoak gara beno nire ustez batzuk beste batzuk baino martzianoagoak dira",
]

View File

@@ -59,7 +59,6 @@ behin
""".split()
def like_num(text):
if text.startswith(("+", "-", "±", "~")):
text = text[1:]

View File

@@ -16,7 +16,9 @@ _hyphen_suffixes += " " + _hyphen_suffixes.upper()
_prefixes = TOKENIZER_PREFIXES + [
r"(?:({pe})[{el}])(?=[{a}])".format(a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision))
r"(?:({pe})[{el}])(?=[{a}])".format(
a=ALPHA, el=ELISION, pe=merge_chars(_prefixes_elision)
)
]
_suffixes = (
@@ -33,7 +35,9 @@ _suffixes = (
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
),
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
r"(?<=[{a}])[{h}]({hs})".format(a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)),
r"(?<=[{a}])[{h}]({hs})".format(
a=ALPHA, h=HYPHENS, hs=merge_chars(_hyphen_suffixes)
),
]
)

View File

@@ -6,7 +6,7 @@ import re
from .punctuation import ELISION, HYPHENS
from ..tokenizer_exceptions import URL_PATTERN
from ..char_classes import ALPHA_LOWER, ALPHA
from ...symbols import ORTH, LEMMA, TAG
from ...symbols import ORTH, LEMMA
# not using the large _tokenizer_exceptions_list by default as it slows down the tokenizer
# from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS

View File

@@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import ALPHA, HYPHENS, CONCAT_QUOTES
from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
@@ -10,14 +10,7 @@ from ..char_classes import ALPHA_LOWER, ALPHA_UPPER
ELISION = "'"
_prefixes = (
[
r"'[0-9][0-9]",
r"[0-9]+°",
]
+ TOKENIZER_PREFIXES
)
_prefixes = [r"'[0-9][0-9]", r"[0-9]+°"] + BASE_TOKENIZER_PREFIXES
_infixes = (
@@ -31,7 +24,7 @@ _infixes = (
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])(?:{h})(?=[{al}])".format(a=ALPHA, h=HYPHENS, al=ALPHA_LOWER),
r"(?<=[{a}0-9])[:<>=\/](?=[{a}])".format(a=ALPHA),
r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION)
r"(?<=[{a}][{el}])(?=[{a}0-9\"])".format(a=ALPHA, el=ELISION),
]
)

View File

@@ -10,7 +10,7 @@ _exc = {
"l'art.": [{ORTH: "l'"}, {ORTH: "art."}],
"nell'art.": [{ORTH: "nell'"}, {ORTH: "art."}],
"po'": [{ORTH: "po'", LEMMA: "poco"}],
"sett..": [{ORTH: "sett."}, {ORTH: "."}]
"sett..": [{ORTH: "sett."}, {ORTH: "."}],
}
for orth in [
@@ -32,7 +32,7 @@ for orth in [
"col.",
"Cost.",
"d.C.",
'de"'
'de"',
"distr.",
"E'",
"ecc.",

View File

@@ -29,7 +29,9 @@ class LithuanianDefaults(Language.Defaults):
infixes = TOKENIZER_INFIXES
suffixes = TOKENIZER_SUFFIXES
mod_base_exceptions = {exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")}
mod_base_exceptions = {
exc: val for exc, val in BASE_EXCEPTIONS.items() if not exc.endswith(".")
}
del mod_base_exceptions["8)"]
tokenizer_exceptions = update_exc(mod_base_exceptions, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS

View File

@@ -24,7 +24,6 @@ _prefixes = (
)
_infixes = (
LIST_ELLIPSES
+ _list_icons

View File

@@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, NORM
from ...symbols import ORTH
_exc = {}

View File

@@ -1,8 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import POS, AUX, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
from ...symbols import POS, AUX, ADJ, CCONJ, NUM, ADV, ADP, X, VERB
from ...symbols import NOUN, PART, INTJ, PRON
# Source https://universaldependencies.org/tagset-conversion/sk-snk-uposf.html
# fmt: off

View File

@@ -37,7 +37,7 @@ URL_PATTERN = (
r"|"
# host & domain names
# mods: match is case-sensitive, so include [A-Z]
"(?:"
"(?:" # noqa
"(?:"
"[A-Za-z0-9\u00a1-\uffff]"
"[A-Za-z0-9\u00a1-\uffff_-]{0,62}"

View File

@@ -612,7 +612,7 @@ class Language(object):
link_vectors_to_models(self.vocab)
if self.vocab.vectors.data.shape[1]:
cfg["pretrained_vectors"] = self.vocab.vectors.name
cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
cfg["pretrained_dims"] = self.vocab.vectors.data.shape[1]
if sgd is None:
sgd = create_default_optimizer(Model.ops)
self._optimizer = sgd
@@ -857,7 +857,14 @@ class Language(object):
procs = [
mp.Process(
target=_apply_pipes,
args=(self.make_doc, pipes, rch, sch, Underscore.get_state(), load_nlp.VECTORS),
args=(
self.make_doc,
pipes,
rch,
sch,
Underscore.get_state(),
load_nlp.VECTORS,
),
)
for rch, sch in zip(texts_q, bytedocs_send_ch)
]
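
The argument tuple being wrapped above feeds the worker processes behind nlp.pipe(..., n_process=...). A minimal usage sketch of that path (the model name is an assumption):

# Minimal sketch of the multiprocessing pipe path; n_process=2 uses two workers.
import spacy

nlp = spacy.load("en_core_web_sm")  # assumed to be installed
texts = ["Kurt is in London."] * 10
for doc in nlp.pipe(texts, batch_size=2, n_process=2):
    print([(ent.text, ent.label_) for ent in doc.ents])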

View File

@@ -222,11 +222,9 @@ class EntityRuler(object):
for label, pattern, ent_id in zip(
phrase_pattern_labels,
self.nlp.pipe(phrase_pattern_texts),
phrase_pattern_ids
phrase_pattern_ids,
):
phrase_pattern = {
"label": label, "pattern": pattern, "id": ent_id
}
phrase_pattern = {"label": label, "pattern": pattern, "id": ent_id}
if ent_id:
phrase_pattern["id"] = ent_id
phrase_patterns.append(phrase_pattern)
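
The phrase_pattern dicts assembled above correspond to the public pattern format. A hedged sketch of how such patterns are added and how the optional "id" surfaces on matched entities (v2-style add_pipe):

# Hedged sketch of the EntityRuler pattern format built above.
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER")
ruler.add_patterns([{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"}])
nlp.add_pipe(ruler)
doc = nlp("Joe Biden visited the plant.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc.ents])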

View File

@@ -71,9 +71,7 @@ def test_doc_array_to_from_string_attrs(en_vocab, attrs):
def test_doc_array_idx(en_vocab):
"""Test that Doc.to_array can retrieve token start indices"""
words = ["An", "example", "sentence"]
doc = Doc(en_vocab, words=words)
offsets = Doc(en_vocab, words=words).to_array("IDX")
assert offsets[0] == 0
assert offsets[1] == 3
assert offsets[2] == 11

View File

@@ -10,7 +10,13 @@ def test_eu_tokenizer_handles_long_text(eu_tokenizer):
assert len(tokens) == 5
@pytest.mark.parametrize("text,length", [("milesker ederra joan zen hitzaldia plazer hutsa", 7), ("astelehen guztia sofan pasau biot", 5)])
@pytest.mark.parametrize(
"text,length",
[
("milesker ederra joan zen hitzaldia plazer hutsa", 7),
("astelehen guztia sofan pasau biot", 5),
],
)
def test_eu_tokenizer_handles_cnts(eu_tokenizer, text, length):
tokens = eu_tokenizer(text)
assert len(tokens) == length

View File

@@ -297,12 +297,7 @@ WIKI_TESTS = [
]
EXTRA_TESTS = (
DOT_TESTS
+ QUOTE_TESTS
+ NUMBER_TESTS
+ HYPHEN_TESTS
+ WIKI_TESTS
+ TYPO_TESTS
DOT_TESTS + QUOTE_TESTS + NUMBER_TESTS + HYPHEN_TESTS + WIKI_TESTS + TYPO_TESTS
)
# normal: default tests + 10% of extra tests
@@ -311,7 +306,14 @@ TESTS.extend([x for i, x in enumerate(EXTRA_TESTS) if i % 10 == 0])
# slow: remaining 90% of extra tests
SLOW_TESTS = [x for i, x in enumerate(EXTRA_TESTS) if i % 10 != 0]
TESTS.extend([pytest.param(x[0], x[1], marks=pytest.mark.slow()) if not isinstance(x[0], tuple) else x for x in SLOW_TESTS])
TESTS.extend(
[
pytest.param(x[0], x[1], marks=pytest.mark.slow())
if not isinstance(x[0], tuple)
else x
for x in SLOW_TESTS
]
)
@pytest.mark.parametrize("text,expected_tokens", TESTS)

View File

@@ -6,7 +6,8 @@ import re
from mock import Mock
from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Doc, Token
from ..doc.test_underscore import clean_underscore
from ..doc.test_underscore import clean_underscore # noqa: F401
@pytest.fixture

View File

@@ -152,10 +152,5 @@ def test_entity_ruler_validate(nlp):
def test_entity_ruler_properties(nlp, patterns):
ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True)
assert sorted(ruler.labels) == sorted([
"HELLO",
"BYE",
"COMPLEX",
"TECH_ORG"
])
assert sorted(ruler.labels) == sorted(["HELLO", "BYE", "COMPLEX", "TECH_ORG"])
assert sorted(ruler.ent_ids) == ["a1", "a2"]

View File

@@ -23,4 +23,3 @@ def test_issue4725():
docs = ["Kurt is in London."] * 10
for _ in nlp.pipe(docs, batch_size=2, n_process=2):
pass

View File

@@ -9,11 +9,12 @@ def test_issue4849():
nlp = English()
ruler = EntityRuler(
nlp, patterns=[
{"label": "PERSON", "pattern": 'joe biden', "id": 'joe-biden'},
{"label": "PERSON", "pattern": 'bernie sanders', "id": 'bernie-sanders'},
nlp,
patterns=[
{"label": "PERSON", "pattern": "joe biden", "id": "joe-biden"},
{"label": "PERSON", "pattern": "bernie sanders", "id": "bernie-sanders"},
],
phrase_matcher_attr="LOWER"
phrase_matcher_attr="LOWER",
)
nlp.add_pipe(ruler)
@@ -27,10 +28,10 @@ def test_issue4849():
count_ents = 0
for doc in nlp.pipe([text], n_process=1):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert(count_ents == 2)
assert count_ents == 2
# USING 2 PROCESSES
count_ents = 0
for doc in nlp.pipe([text], n_process=2):
count_ents += len([ent for ent in doc.ents if ent.ent_id > 0])
assert (count_ents == 2)
assert count_ents == 2

View File

@@ -22,7 +22,7 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer):
tokenizer_bytes = tokenizer.to_bytes()
Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC", "ORTH": "."}]})
tokenizer = Tokenizer(en_vocab, rules={"ABC.": [{"ORTH": "ABC"}, {"ORTH": "."}]})
tokenizer.rules = {}
tokenizer_bytes = tokenizer.to_bytes()
tokenizer_reloaded = Tokenizer(en_vocab).from_bytes(tokenizer_bytes)
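
Aside on the rules change above, since it is more than formatting: a Python dict literal with a repeated key silently keeps only the last value, so the old entry described a single "." token rather than two. A minimal illustration:

# The bug fixed above: repeated dict keys collapse to the last value.
old_rule = {"ORTH": "ABC", "ORTH": "."}
new_rule = [{"ORTH": "ABC"}, {"ORTH": "."}]
print(old_rule)   # {'ORTH': '.'}
print(new_rule)   # [{'ORTH': 'ABC'}, {'ORTH': '.'}]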

View File

@@ -28,7 +28,9 @@ def make_tempdir():
shutil.rmtree(path2str(d))
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None):
def get_doc(
vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None
):
"""Create Doc object from given vocab, words and annotations."""
if deps and not heads:
heads = [0] * len(deps)