Tidy up and auto-format

Ines Montani 2019-10-18 11:27:38 +02:00
parent fb11852750
commit 181c01f629
23 changed files with 101 additions and 107 deletions

View File

@ -1,15 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
#from .lemmatizer import LOOKUP
#from .syntax_iterators import SYNTAX_ITERATORS
from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS
@ -21,17 +17,18 @@ from ...util import update_exc, add_lookups
class LuxembourgishDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: 'lb'
lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
lex_attr_getters[LANG] = lambda text: "lb"
lex_attr_getters[NORM] = add_lookups(
Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
)
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
stop_words = STOP_WORDS
#suffixes = TOKENIZER_SUFFIXES
#lemma_lookup = LOOKUP
tag_map = TAG_MAP
class Luxembourgish(Language):
lang = 'lb'
lang = "lb"
Defaults = LuxembourgishDefaults
__all__ = ['Luxembourgish']
__all__ = ["Luxembourgish"]

View File

@ -9,10 +9,10 @@ Example sentences to test spaCy and its language models.
"""
sentences = [
"An der Zäit hunn sech den Nordwand an dSonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum.",
"Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen.",
"Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet.",
"Um Enn huet den Nordwand säi Kampf opginn.",
"Dunn huet dSonn dLoft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen.",
"Do huet den Nordwand missen zouginn, dass dSonn vun hinnen zwee de Stäerkste wier."
"An der Zäit hunn sech den Nordwand an dSonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum.",
"Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen.",
"Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet.",
"Um Enn huet den Nordwand säi Kampf opginn.",
"Dunn huet dSonn dLoft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen.",
"Do huet den Nordwand missen zouginn, dass dSonn vun hinnen zwee de Stäerkste wier.",
]
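These sentences are meant to be importable for quick smoke tests; a brief sketch, assuming the lb package layout introduced in this commit:

from spacy.lang.lb import Luxembourgish
from spacy.lang.lb.examples import sentences

nlp = Luxembourgish()
for text in sentences:
    doc = nlp(text)
    print(len(doc), doc[:5])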

View File

@ -4,29 +4,34 @@ from __future__ import unicode_literals
from ...attrs import LIKE_NUM
_num_words = set("""
null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg
_num_words = set(
"""
null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg
honnert dausend millioun milliard billioun billiard trillioun triliard
""".split())
""".split()
)
_ordinal_words = set("""
_ordinal_words = set(
"""
éischten zweeten drëtten véierten fënneften sechsten siwenten aachten néngten zéngten eeleften
zwieleften dräizéngten véierzéngten foffzéngten siechzéngten uechtzéngen uechzéngten nonnzéngten nongzéngten zwanzegsten
drëssegsten véierzegsten foffzegsten siechzegsten siwenzegsten uechzegsten nonnzegsten
honnertsten dausendsten milliounsten
milliardsten billiounsten billiardsten trilliounsten trilliardsten
""".split())
""".split()
)
def like_num(text):
"""
check if text resembles a number
"""
text = text.replace(',', '').replace('.', '')
text = text.replace(",", "").replace(".", "")
if text.isdigit():
return True
if text.count('/') == 1:
num, denom = text.split('/')
if text.count("/") == 1:
num, denom = text.split("/")
if num.isdigit() and denom.isdigit():
return True
if text in _num_words:
@ -36,6 +41,4 @@ def like_num(text):
return False
LEX_ATTRS = {
LIKE_NUM: like_num
}
LEX_ATTRS = {LIKE_NUM: like_num}
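A short sketch of what like_num accepts, based on the branches visible above; the elided tail presumably also consults _ordinal_words, which is an assumption here:

from spacy.lang.lb.lex_attrs import like_num

like_num("2019")     # True: plain digits
like_num("1.000,5")  # True: "," and "." are stripped before isdigit()
like_num("3/4")      # True: numerator and denominator are digits
like_num("dräi")     # True: listed in _num_words
like_num("Mantel")   # False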

View File

@ -2,19 +2,15 @@
from __future__ import unicode_literals
# TODO
# norm execptions: find a possibility to deal with the zillions of spelling variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
# norm execptions: find a possibility to deal with the zillions of spelling
# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
# here one could include the most common spelling mistakes
_exc = {
"datt": "dass",
"wgl.": "weg.",
"wgl.": "wegl.",
"vläicht": "viläicht"}
_exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"}
NORM_EXCEPTIONS = {}
for string, norm in _exc.items():
NORM_EXCEPTIONS[string] = norm
NORM_EXCEPTIONS[string.title()] = norm
NORM_EXCEPTIONS[string.title()] = norm
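The loop above registers every entry in lower and title case; together with the add_lookups call in the language's __init__.py (first hunk), these become NORM values. A small sketch of the expected contents:

from spacy.lang.lb.norm_exceptions import NORM_EXCEPTIONS

assert NORM_EXCEPTIONS["datt"] == "dass"
assert NORM_EXCEPTIONS["Vläicht"] == "viläicht"  # added by the .title() pass
# Via lex_attr_getters[NORM], nlp.vocab["datt"].norm_ is then expected to be "dass".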

View File

@ -1,25 +0,0 @@
# coding: utf8
from __future__ import unicode_literals
from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
_quotes = CONCAT_QUOTES.replace("'", "")
_infixes = (
LIST_ELLIPSES
+ LIST_ICONS
+ [
r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
r'(?<=[{a}])[:;<>=](?=[{a}])'.format(a=ALPHA),
r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
r"(?<=[0-9])-(?=[0-9])",
]
)
TOKENIZER_INFIXES = _infixes
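Infix lists like the one above are compiled into a single regex that the tokenizer applies inside token candidates. A hedged sketch with a two-rule subset (the sample strings are made up):

from spacy.util import compile_infix_regex
from spacy.lang.char_classes import ALPHA

sample_infixes = [
    r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
    r"(?<=[0-9])-(?=[0-9])",
]
infix_re = compile_infix_regex(sample_infixes)
print([m.group() for m in infix_re.finditer("Wanderer--Mantel")])  # ['--']
print([m.group() for m in infix_re.finditer("10-12")])             # ['-']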

View File

@ -1,7 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
STOP_WORDS = set("""
STOP_WORDS = set(
"""
a
à
äis
@ -209,4 +210,5 @@ ze
zu
zum
zwar
""".split())
""".split()
)
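A quick sketch of how the stop list surfaces at runtime; the is_stop wiring through the language defaults is standard v2 behavior and assumed here rather than shown in this diff:

from spacy.lang.lb import Luxembourgish
from spacy.lang.lb.stop_words import STOP_WORDS

assert "zum" in STOP_WORDS
nlp = Luxembourgish()
assert nlp.vocab["zum"].is_stop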

View File

@ -1,11 +1,11 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PART, SPACE, AUX
# TODO: tag map is still using POS tags from an internal training set.
# These POS tags have to be modified to match those from Universal Dependencies
# These POS tags have to be modified to match those from Universal Dependencies
TAG_MAP = {
"$": {POS: PUNCT},

View File

@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
from ...symbols import ORTH, LEMMA, NORM
from ..punctuation import TOKENIZER_PREFIXES
# TODO
@ -9,16 +9,20 @@ from ..punctuation import TOKENIZER_PREFIXES
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
# how to write the tokenisation exeption for the articles d' / D' ? This one is not working.
_prefixes = [prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d", "D", r"\' "]]
_prefixes = [
prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d", "D"]
]
_exc = {
"d'mannst": [
{ORTH: "d'", LEMMA: "d'"},
{ORTH: "mannst", LEMMA: "mann", NORM: "mann"}],
{ORTH: "mannst", LEMMA: "mann", NORM: "mann"},
],
"d'éischt": [
{ORTH: "d'", LEMMA: "d'"},
{ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"}]
{ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"},
],
}
# translate / delete what is not necessary
@ -26,20 +30,38 @@ _exc = {
for exc_data in [
{ORTH: "wgl.", LEMMA: "wann ech gelift", NORM: "wann ech gelieft"},
{ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"},
{ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
{ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
{ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
{ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
{ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
{ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
{ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
{ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}]:
{ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
]:
_exc[exc_data[ORTH]] = [exc_data]
# to be extended
for orth in [
"z.B.", "Dipl.", "Dr.", "etc.", "i.e.", "o.k.", "O.K.", "p.a.", "p.s.", "P.S.", "phil.",
"q.e.d.", "R.I.P.", "rer.", "sen.", "ë.a.", "U.S.", "U.S.A."]:
"z.B.",
"Dipl.",
"Dr.",
"etc.",
"i.e.",
"o.k.",
"O.K.",
"p.a.",
"p.s.",
"P.S.",
"phil.",
"q.e.d.",
"R.I.P.",
"rer.",
"sen.",
"ë.a.",
"U.S.",
"U.S.A.",
]:
_exc[orth] = [{ORTH: orth}]
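With the prefix filtering and exceptions above, the contracted article should stay a separate token while the listed abbreviations stay whole. A hedged sketch of the expected splits, using phrases built from the entries shown:

from spacy.lang.lb import Luxembourgish

nlp = Luxembourgish()
print([t.text for t in nlp("fir d'éischt")])  # expected: ["fir", "d'", "éischt"]
print([t.text for t in nlp("asw. etc.")])     # expected: ["asw.", "etc."]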

View File

@ -183,7 +183,9 @@ class EntityRuler(object):
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
try:
current_index = self.nlp.pipe_names.index(self.name)
subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
subsequent_pipes = [
pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
]
except ValueError:
subsequent_pipes = []
with self.nlp.disable_pipes(*subsequent_pipes):
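For context, the reformatted snippet computes the pipes that come after this component and disables them while patterns are added. A typical EntityRuler setup under the v2 API, sketched:

from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(nlp)
ruler.add_patterns([{"label": "ORG", "pattern": "spaCy"}])
nlp.add_pipe(ruler)
doc = nlp("spaCy is written in Cython")
print([(ent.text, ent.label_) for ent in doc.ents])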

View File

@ -219,7 +219,9 @@ class Scorer(object):
DOCS: https://spacy.io/api/scorer#score
"""
if len(doc) != len(gold):
gold = GoldParse.from_annot_tuples(doc, tuple(zip(*gold.orig_annot)) + (gold.cats,))
gold = GoldParse.from_annot_tuples(
doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
)
gold_deps = set()
gold_tags = set()
gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))

View File

@ -134,10 +134,12 @@ def ko_tokenizer():
pytest.importorskip("natto")
return get_lang_class("ko").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def lb_tokenizer():
return get_lang_class("lb").Defaults.create_tokenizer()
@pytest.fixture(scope="session")
def lt_tokenizer():
return get_lang_class("lt").Defaults.create_tokenizer()

View File

@ -1,5 +1,4 @@
# coding: utf-8
# from __future__ import unicolb_literals
from __future__ import unicode_literals
import pytest
@ -9,4 +8,3 @@ import pytest
def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
tokens = lb_tokenizer(text)
assert len(tokens) == 1

View File

@ -1,5 +1,4 @@
# coding: utf-8
#from __future__ import unicolb_literals
from __future__ import unicode_literals
import pytest
@ -21,6 +20,3 @@ def test_lb_tokenizer_splits_suffix_interact(lb_tokenizer, text):
def test_lb_tokenizer_splits_even_wrap_interact(lb_tokenizer, text):
tokens = lb_tokenizer(text)
assert len(tokens) == 3

View File

@ -1,6 +1,5 @@
# coding: utf-8
from __future__ import unicode_literals
from __future__ import unicode_literals
import pytest

View File

@ -159,14 +159,14 @@ def test_matcher_remove():
# should give two matches
results1 = matcher(nlp(text))
assert(len(results1) == 2)
assert len(results1) == 2
# removing once should work
matcher.remove("Rule")
# should not return any maches anymore
results2 = matcher(nlp(text))
assert (len(results2) == 0)
assert len(results2) == 0
# removing again should throw an error
with pytest.raises(ValueError):

View File

@ -103,7 +103,7 @@ def test_oracle_moves_missing_B(en_vocab):
moves.add_action(move_types.index("L"), label)
moves.add_action(move_types.index("U"), label)
moves.preprocess_gold(gold)
seq = moves.get_oracle_sequence(doc, gold)
moves.get_oracle_sequence(doc, gold)
def test_oracle_moves_whitespace(en_vocab):

View File

@ -323,7 +323,7 @@ def test_issue3456():
nlp = English()
nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.begin_training()
list(nlp.pipe(['hi', '']))
list(nlp.pipe(["hi", ""]))
def test_issue3468():

View File

@ -76,7 +76,6 @@ def test_issue4042_bug2():
output_dir.mkdir()
ner1.to_disk(output_dir)
nlp2 = English(vocab)
ner2 = EntityRecognizer(vocab)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2

View File

@ -1,13 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
def test_issue4267():

View File

@ -6,6 +6,6 @@ from spacy.tokens import DocBin
def test_issue4367():
"""Test that docbin init goes well"""
doc_bin_1 = DocBin()
doc_bin_2 = DocBin(attrs=["LEMMA"])
doc_bin_3 = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
DocBin()
DocBin(attrs=["LEMMA"])
DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])

View File

@ -74,4 +74,4 @@ def test_serialize_doc_bin():
# Deserialize later, e.g. in a new process
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(bytes_data)
docs = list(doc_bin.get_docs(nlp.vocab))
list(doc_bin.get_docs(nlp.vocab))
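The test above covers the tail end of a round trip; sketched in full against the v2.2 DocBin API (the example doc is an assumption):

import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
doc_bin.add(nlp("Hello world"))
bytes_data = doc_bin.to_bytes()
# later, e.g. in a new process
docs = list(DocBin().from_bytes(bytes_data).get_docs(nlp.vocab))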

View File

@ -48,8 +48,13 @@ URLS_SHOULD_MATCH = [
"http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
"ssh://login@server.com:12345/repository.git",
"svn+ssh://user@ssh.yourdomain.com/path",
pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
pytest.param(
"chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai",
marks=pytest.mark.xfail(),
),
pytest.param(
"chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
),
pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
pytest.param(
"http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()

View File

@ -50,12 +50,13 @@ def ngrams_vocab(en_vocab, ngrams_vectors):
def data():
return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype="f")
@pytest.fixture
def most_similar_vectors_data():
return numpy.asarray([[0.0, 1.0, 2.0],
[1.0, -2.0, 4.0],
[1.0, 1.0, -1.0],
[2.0, 3.0, 1.0]], dtype="f")
return numpy.asarray(
[[0.0, 1.0, 2.0], [1.0, -2.0, 4.0], [1.0, 1.0, -1.0], [2.0, 3.0, 1.0]],
dtype="f",
)
@pytest.fixture