Tidy up and auto-format

Ines Montani 2019-10-18 11:27:38 +02:00
parent fb11852750
commit 181c01f629
23 changed files with 101 additions and 107 deletions


@@ -1,15 +1,11 @@
 # coding: utf8
 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
-#from .lemmatizer import LOOKUP
-#from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -21,17 +17,18 @@ from ...util import update_exc, add_lookups
 class LuxembourgishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: 'lb'
-    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
+    lex_attr_getters[LANG] = lambda text: "lb"
+    lex_attr_getters[NORM] = add_lookups(
+        Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
+    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    #suffixes = TOKENIZER_SUFFIXES
-    #lemma_lookup = LOOKUP
+    tag_map = TAG_MAP


 class Luxembourgish(Language):
-    lang = 'lb'
+    lang = "lb"
     Defaults = LuxembourgishDefaults


-__all__ = ['Luxembourgish']
+__all__ = ["Luxembourgish"]
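Once this module is importable, the language can be used directly; a minimal usage sketch (assuming a spaCy 2.x install that ships the lb language, with the sentence taken from examples.py below):

    from spacy.lang.lb import Luxembourgish

    # Blank Luxembourgish pipeline built from LuxembourgishDefaults above
    nlp = Luxembourgish()
    doc = nlp("Um Enn huet den Nordwand säi Kampf opginn.")
    print([token.text for token in doc])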


@@ -9,10 +9,10 @@ Example sentences to test spaCy and its language models.
 """

 sentences = [
     "An der Zäit hunn sech den Nordwand an d'Sonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum.",
     "Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen.",
     "Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet.",
     "Um Enn huet den Nordwand säi Kampf opginn.",
     "Dunn huet d'Sonn d'Loft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen.",
-    "Do huet den Nordwand missen zouginn, dass d'Sonn vun hinnen zwee de Stäerkste wier."
+    "Do huet den Nordwand missen zouginn, dass d'Sonn vun hinnen zwee de Stäerkste wier.",
 ]


@@ -4,29 +4,34 @@ from __future__ import unicode_literals
 from ...attrs import LIKE_NUM

-_num_words = set("""
+_num_words = set(
+    """
 null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
 véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg
 honnert dausend millioun milliard billioun billiard trillioun triliard
-""".split())
+""".split()
+)

-_ordinal_words = set("""
+_ordinal_words = set(
+    """
 éischten zweeten drëtten véierten fënneften sechsten siwenten aachten néngten zéngten eeleften
 zwieleften dräizéngten véierzéngten foffzéngten siechzéngten uechtzéngen uechzéngten nonnzéngten nongzéngten zwanzegsten
 drëssegsten véierzegsten foffzegsten siechzegsten siwenzegsten uechzegsten nonnzegsten
 honnertsten dausendsten milliounsten
 milliardsten billiounsten billiardsten trilliounsten trilliardsten
-""".split())
+""".split()
+)


 def like_num(text):
     """
     check if text resembles a number
     """
-    text = text.replace(',', '').replace('.', '')
+    text = text.replace(",", "").replace(".", "")
     if text.isdigit():
         return True
-    if text.count('/') == 1:
-        num, denom = text.split('/')
+    if text.count("/") == 1:
+        num, denom = text.split("/")
         if num.isdigit() and denom.isdigit():
             return True
     if text in _num_words:
@@ -36,6 +41,4 @@ def like_num(text):
     return False


-LEX_ATTRS = {
-    LIKE_NUM: like_num
-}
+LEX_ATTRS = {LIKE_NUM: like_num}
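As a quick illustration of what like_num accepts, a sketch based on the code above (not part of the commit):

    from spacy.lang.lb.lex_attrs import like_num

    assert like_num("zwee")       # listed in _num_words
    assert like_num("10.000")     # separators are stripped, rest is digits
    assert like_num("3/4")        # simple fractions with digit parts
    assert not like_num("Mantel")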


@@ -2,15 +2,11 @@
 from __future__ import unicode_literals

 # TODO
-# norm execptions: find a possibility to deal with the zillions of spelling variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
+# norm execptions: find a possibility to deal with the zillions of spelling
+# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
 # here one could include the most common spelling mistakes

-_exc = {
-    "datt": "dass",
-    "wgl.": "weg.",
-    "wgl.": "wegl.",
-    "vläicht": "viläicht"}
+_exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"}


 NORM_EXCEPTIONS = {}
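To make the intent of _exc concrete, a hedged sketch of how such a variant-to-canonical mapping is typically consulted (illustrative only; only the dict mirrors the module above):

    _exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"}

    def normalize(word, exceptions=_exc):
        # Return the canonical spelling if the word is a known variant,
        # otherwise leave it unchanged.
        return exceptions.get(word, word)

    assert normalize("datt") == "dass"
    assert normalize("Mantel") == "Mantel"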


@@ -1,25 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-
-_quotes = CONCAT_QUOTES.replace("'", "")
-
-_infixes = (
-    LIST_ELLIPSES
-    + LIST_ICONS
-    + [
-        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
-        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:;<>=](?=[{a}])'.format(a=ALPHA),
-        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-        r"(?<=[0-9])-(?=[0-9])",
-    ]
-)
-
-TOKENIZER_INFIXES = _infixes


@@ -1,7 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-STOP_WORDS = set("""
+STOP_WORDS = set(
+    """
 a
 à
 äis
@@ -209,4 +210,5 @@ ze
 zu
 zum
 zwar
-""".split())
+""".split()
+)
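A quick way to check the list once it is importable (illustrative; "zum" is one of the entries visible in the hunk above):

    from spacy.lang.lb.stop_words import STOP_WORDS

    assert "zum" in STOP_WORDS
    print(len(STOP_WORDS))  # rough size of the Luxembourgish stop list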


@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
+from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PART, SPACE, AUX

 # TODO: tag map is still using POS tags from an internal training set.
 # These POS tags have to be modified to match those from Universal Dependencies


@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
+from ...symbols import ORTH, LEMMA, NORM
 from ..punctuation import TOKENIZER_PREFIXES

 # TODO
@@ -9,16 +9,20 @@ from ..punctuation import TOKENIZER_PREFIXES
 # treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
 # how to write the tokenisation exeption for the articles d' / D' ? This one is not working.

-_prefixes = [prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d", "D", r"\' "]]
+_prefixes = [
+    prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d", "D"]
+]

 _exc = {
     "d'mannst": [
         {ORTH: "d'", LEMMA: "d'"},
-        {ORTH: "mannst", LEMMA: "mann", NORM: "mann"}],
+        {ORTH: "mannst", LEMMA: "mann", NORM: "mann"},
+    ],
     "d'éischt": [
         {ORTH: "d'", LEMMA: "d'"},
-        {ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"}]
+        {ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"},
+    ],
 }

 # translate / delete what is not necessary
@@ -32,14 +36,32 @@ for exc_data in [
     {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
     {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
     {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
-    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}]:
+    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
+]:
     _exc[exc_data[ORTH]] = [exc_data]

 # to be extended
 for orth in [
-    "z.B.", "Dipl.", "Dr.", "etc.", "i.e.", "o.k.", "O.K.", "p.a.", "p.s.", "P.S.", "phil.",
-    "q.e.d.", "R.I.P.", "rer.", "sen.", "ë.a.", "U.S.", "U.S.A."]:
+    "z.B.",
+    "Dipl.",
+    "Dr.",
+    "etc.",
+    "i.e.",
+    "o.k.",
+    "O.K.",
+    "p.a.",
+    "p.s.",
+    "P.S.",
+    "phil.",
+    "q.e.d.",
+    "R.I.P.",
+    "rer.",
+    "sen.",
+    "ë.a.",
+    "U.S.",
+    "U.S.A.",
+]:
     _exc[orth] = [{ORTH: orth}]
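The practical effect of these exceptions is that abbreviations such as "z.B." survive tokenization as single tokens; a small sketch using the same tokenizer constructor as the test fixtures further down (not part of the commit):

    from spacy.util import get_lang_class

    lb_tokenizer = get_lang_class("lb").Defaults.create_tokenizer()
    tokens = lb_tokenizer("z.B.")
    assert len(tokens) == 1  # kept as one token via the exception table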


@@ -183,7 +183,9 @@ class EntityRuler(object):
         # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
         try:
             current_index = self.nlp.pipe_names.index(self.name)
-            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
+            subsequent_pipes = [
+                pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
+            ]
         except ValueError:
             subsequent_pipes = []
         with self.nlp.disable_pipes(*subsequent_pipes):
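For context, a typical spaCy 2.x EntityRuler usage sketch that exercises add_patterns and the pipeline it is added to (illustration only, not part of the commit):

    from spacy.lang.en import English
    from spacy.pipeline import EntityRuler

    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns([{"label": "ORG", "pattern": "spaCy"}])
    nlp.add_pipe(ruler)
    doc = nlp("spaCy is written in Python.")
    print([(ent.text, ent.label_) for ent in doc.ents])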


@@ -219,7 +219,9 @@ class Scorer(object):
         DOCS: https://spacy.io/api/scorer#score
         """
         if len(doc) != len(gold):
-            gold = GoldParse.from_annot_tuples(doc, tuple(zip(*gold.orig_annot)) + (gold.cats,))
+            gold = GoldParse.from_annot_tuples(
+                doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
+            )
         gold_deps = set()
         gold_tags = set()
         gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))


@@ -134,10 +134,12 @@ def ko_tokenizer():
     pytest.importorskip("natto")
     return get_lang_class("ko").Defaults.create_tokenizer()

+
 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb").Defaults.create_tokenizer()

+
 @pytest.fixture(scope="session")
 def lt_tokenizer():
     return get_lang_class("lt").Defaults.create_tokenizer()


@@ -1,5 +1,4 @@
 # coding: utf-8
-# from __future__ import unicolb_literals
 from __future__ import unicode_literals

 import pytest
@@ -9,4 +8,3 @@ import pytest
 def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
     tokens = lb_tokenizer(text)
     assert len(tokens) == 1
-


@@ -1,5 +1,4 @@
 # coding: utf-8
-#from __future__ import unicolb_literals
 from __future__ import unicode_literals

 import pytest
@@ -21,6 +20,3 @@ def test_lb_tokenizer_splits_suffix_interact(lb_tokenizer, text):
 def test_lb_tokenizer_splits_even_wrap_interact(lb_tokenizer, text):
     tokens = lb_tokenizer(text)
     assert len(tokens) == 3
-
-
-


@@ -1,6 +1,5 @@
 # coding: utf-8
 from __future__ import unicode_literals
-from __future__ import unicode_literals

 import pytest


@@ -159,14 +159,14 @@ def test_matcher_remove():
     # should give two matches
     results1 = matcher(nlp(text))
-    assert(len(results1) == 2)
+    assert len(results1) == 2
     # removing once should work
     matcher.remove("Rule")
     # should not return any maches anymore
     results2 = matcher(nlp(text))
-    assert (len(results2) == 0)
+    assert len(results2) == 0
     # removing again should throw an error
     with pytest.raises(ValueError):
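For reference, the pattern this test exercises, as a standalone sketch against the spaCy 2.x Matcher API (not part of the commit):

    from spacy.lang.en import English
    from spacy.matcher import Matcher

    nlp = English()
    matcher = Matcher(nlp.vocab)
    matcher.add("Rule", None, [{"LOWER": "hello"}])  # 2.x signature: key, on_match, *patterns
    matches = matcher(nlp("hello world"))
    matcher.remove("Rule")  # removing the same key again raises ValueError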


@@ -103,7 +103,7 @@ def test_oracle_moves_missing_B(en_vocab):
     moves.add_action(move_types.index("L"), label)
     moves.add_action(move_types.index("U"), label)
     moves.preprocess_gold(gold)
-    seq = moves.get_oracle_sequence(doc, gold)
+    moves.get_oracle_sequence(doc, gold)


 def test_oracle_moves_whitespace(en_vocab):


@@ -323,7 +323,7 @@ def test_issue3456():
     nlp = English()
     nlp.add_pipe(nlp.create_pipe("tagger"))
     nlp.begin_training()
-    list(nlp.pipe(['hi', '']))
+    list(nlp.pipe(["hi", ""]))


 def test_issue3468():


@@ -76,7 +76,6 @@ def test_issue4042_bug2():
         output_dir.mkdir()
         ner1.to_disk(output_dir)

-        nlp2 = English(vocab)
         ner2 = EntityRecognizer(vocab)
         ner2.from_disk(output_dir)
         assert len(ner2.labels) == 2


@@ -1,13 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-import pytest
-
-import spacy
 from spacy.lang.en import English
 from spacy.pipeline import EntityRuler
-from spacy.tokens import Span


 def test_issue4267():


@@ -6,6 +6,6 @@ from spacy.tokens import DocBin

 def test_issue4367():
     """Test that docbin init goes well"""
-    doc_bin_1 = DocBin()
-    doc_bin_2 = DocBin(attrs=["LEMMA"])
-    doc_bin_3 = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
+    DocBin()
+    DocBin(attrs=["LEMMA"])
+    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])


@@ -74,4 +74,4 @@ def test_serialize_doc_bin():
     # Deserialize later, e.g. in a new process
     nlp = spacy.blank("en")
     doc_bin = DocBin().from_bytes(bytes_data)
-    docs = list(doc_bin.get_docs(nlp.vocab))
+    list(doc_bin.get_docs(nlp.vocab))
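For context, the full DocBin round trip this test exercises looks roughly like the following sketch, based on the spaCy 2.2 API (not part of the commit):

    import spacy
    from spacy.tokens import DocBin

    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
    doc_bin.add(nlp("Hello world"))
    bytes_data = doc_bin.to_bytes()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    docs = list(DocBin().from_bytes(bytes_data).get_docs(nlp.vocab))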


@@ -48,8 +48,13 @@ URLS_SHOULD_MATCH = [
     "http://a.b--c.de/",  # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
     "ssh://login@server.com:12345/repository.git",
     "svn+ssh://user@ssh.yourdomain.com/path",
-    pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
-    pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
+    pytest.param(
+        "chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai",
+        marks=pytest.mark.xfail(),
+    ),
+    pytest.param(
+        "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
+    ),
     pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
     pytest.param(
         "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()


@@ -50,12 +50,13 @@ def ngrams_vocab(en_vocab, ngrams_vectors):
 def data():
     return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype="f")

+
 @pytest.fixture
 def most_similar_vectors_data():
-    return numpy.asarray([[0.0, 1.0, 2.0],
-                          [1.0, -2.0, 4.0],
-                          [1.0, 1.0, -1.0],
-                          [2.0, 3.0, 1.0]], dtype="f")
+    return numpy.asarray(
+        [[0.0, 1.0, 2.0], [1.0, -2.0, 4.0], [1.0, 1.0, -1.0], [2.0, 3.0, 1.0]],
+        dtype="f",
+    )


 @pytest.fixture