Mirror of https://github.com/explosion/spaCy.git
Tidy up and auto-format

commit 181c01f629
parent fb11852750
@@ -1,15 +1,11 @@
# coding: utf8
from __future__ import unicode_literals

from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
from .norm_exceptions import NORM_EXCEPTIONS
from .punctuation import TOKENIZER_INFIXES
from .lex_attrs import LEX_ATTRS
from .tag_map import TAG_MAP
from .stop_words import STOP_WORDS
#from .lemmatizer import LOOKUP
#from .syntax_iterators import SYNTAX_ITERATORS

from ..tokenizer_exceptions import BASE_EXCEPTIONS
from ..norm_exceptions import BASE_NORMS

@@ -21,17 +17,18 @@ from ...util import update_exc, add_lookups
class LuxembourgishDefaults(Language.Defaults):
    lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
    lex_attr_getters.update(LEX_ATTRS)
    lex_attr_getters[LANG] = lambda text: 'lb'
    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
    lex_attr_getters[LANG] = lambda text: "lb"
    lex_attr_getters[NORM] = add_lookups(
        Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
    )
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    #suffixes = TOKENIZER_SUFFIXES
    #lemma_lookup = LOOKUP
    tag_map = TAG_MAP


class Luxembourgish(Language):
    lang = 'lb'
    lang = "lb"
    Defaults = LuxembourgishDefaults


__all__ = ['Luxembourgish']
__all__ = ["Luxembourgish"]
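A minimal usage sketch of the class defined above (assumes spaCy 2.2+ where the "lb" code is registered; the sample sentence is taken from the examples file in this commit):

    import spacy

    # spacy.blank("lb") resolves to Luxembourgish() via the language registry
    nlp = spacy.blank("lb")
    doc = nlp("Um Enn huet den Nordwand säi Kampf opginn.")
    print([token.text for token in doc])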
@@ -9,10 +9,10 @@ Example sentences to test spaCy and its language models.
"""

sentences = [
    "An der Zäit hunn sech den Nordwand an d’Sonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum.",
    "Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen.",
    "Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet.",
    "Um Enn huet den Nordwand säi Kampf opginn.",
    "Dunn huet d’Sonn d’Loft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen.",
    "Do huet den Nordwand missen zouginn, dass d’Sonn vun hinnen zwee de Stäerkste wier."
    "An der Zäit hunn sech den Nordwand an d’Sonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum.",
    "Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen.",
    "Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet.",
    "Um Enn huet den Nordwand säi Kampf opginn.",
    "Dunn huet d’Sonn d’Loft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen.",
    "Do huet den Nordwand missen zouginn, dass d’Sonn vun hinnen zwee de Stäerkste wier.",
]
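The example sentences are meant to be importable for quick smoke tests; a short sketch (module path assumed to be spacy.lang.lb.examples, matching the other languages):

    import spacy
    from spacy.lang.lb.examples import sentences

    nlp = spacy.blank("lb")
    for doc in nlp.pipe(sentences):
        print([token.text for token in doc])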
@@ -4,29 +4,34 @@ from __future__ import unicode_literals
from ...attrs import LIKE_NUM

_num_words = set("""
null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg
_num_words = set(
    """
null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg
honnert dausend millioun milliard billioun billiard trillioun triliard
""".split())
""".split()
)

_ordinal_words = set("""
_ordinal_words = set(
    """
éischten zweeten drëtten véierten fënneften sechsten siwenten aachten néngten zéngten eeleften
zwieleften dräizéngten véierzéngten foffzéngten siechzéngten uechtzéngen uechzéngten nonnzéngten nongzéngten zwanzegsten
drëssegsten véierzegsten foffzegsten siechzegsten siwenzegsten uechzegsten nonnzegsten
honnertsten dausendsten milliounsten
milliardsten billiounsten billiardsten trilliounsten trilliardsten
""".split())
""".split()
)


def like_num(text):
    """
    check if text resembles a number
    """
    text = text.replace(',', '').replace('.', '')
    text = text.replace(",", "").replace(".", "")
    if text.isdigit():
        return True
    if text.count('/') == 1:
        num, denom = text.split('/')
    if text.count("/") == 1:
        num, denom = text.split("/")
        if num.isdigit() and denom.isdigit():
            return True
    if text in _num_words:

@@ -36,6 +41,4 @@ def like_num(text):
    return False


LEX_ATTRS = {
    LIKE_NUM: like_num
}
LEX_ATTRS = {LIKE_NUM: like_num}
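A quick sketch of how the like_num branches shown above behave, based only on the code visible in this hunk (module path assumed to be spacy.lang.lb.lex_attrs):

    from spacy.lang.lb.lex_attrs import like_num

    assert like_num("10")        # plain digits
    assert like_num("1.000,5")   # "." and "," are stripped before the digit check
    assert like_num("3/4")       # numerator/denominator fractions
    assert not like_num("Mantel")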
@@ -2,19 +2,15 @@
from __future__ import unicode_literals

# TODO
# norm execptions: find a possibility to deal with the zillions of spelling variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
# norm execptions: find a possibility to deal with the zillions of spelling
# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
# here one could include the most common spelling mistakes

_exc = {
    "datt": "dass",
    "wgl.": "weg.",
    "wgl.": "wegl.",
    "vläicht": "viläicht"}
_exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"}


NORM_EXCEPTIONS = {}

for string, norm in _exc.items():
    NORM_EXCEPTIONS[string] = norm
    NORM_EXCEPTIONS[string.title()] = norm
    NORM_EXCEPTIONS[string.title()] = norm
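These lookups feed the NORM attribute via add_lookups in the language defaults above. A hedged sketch of the expected effect at runtime (assumes the "lb" language is available):

    import spacy

    nlp = spacy.blank("lb")
    doc = nlp("datt")
    print(doc[0].norm_)  # expected to come out as "dass" via NORM_EXCEPTIONS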
@@ -1,25 +0,0 @@
# coding: utf8
from __future__ import unicode_literals

from ..char_classes import LIST_ELLIPSES, LIST_ICONS
from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER

_quotes = CONCAT_QUOTES.replace("'", "")

_infixes = (
    LIST_ELLIPSES
    + LIST_ICONS
    + [
        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
        r'(?<=[{a}])[:;<>=](?=[{a}])'.format(a=ALPHA),
        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
        r"(?<=[0-9])-(?=[0-9])",
    ]
)

TOKENIZER_INFIXES = _infixes
@@ -1,7 +1,8 @@
# coding: utf8
from __future__ import unicode_literals

STOP_WORDS = set("""
STOP_WORDS = set(
    """
a
à
äis

@@ -209,4 +210,5 @@ ze
zu
zum
zwar
""".split())
""".split()
)
@@ -1,11 +1,11 @@
# coding: utf8
from __future__ import unicode_literals

from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB
from ...symbols import NOUN, PART, SPACE, AUX

# TODO: tag map is still using POS tags from an internal training set.
# These POS tags have to be modified to match those from Universal Dependencies
# These POS tags have to be modified to match those from Universal Dependencies

TAG_MAP = {
    "$": {POS: PUNCT},
@@ -1,7 +1,7 @@
# coding: utf8
from __future__ import unicode_literals

from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
from ...symbols import ORTH, LEMMA, NORM
from ..punctuation import TOKENIZER_PREFIXES

# TODO

@@ -9,16 +9,20 @@ from ..punctuation import TOKENIZER_PREFIXES
# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)

# how to write the tokenisation exeption for the articles d' / D' ? This one is not working.
_prefixes = [prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d’", "D’", r"\' "]]
_prefixes = [
    prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d’", "D’"]
]


_exc = {
    "d'mannst": [
        {ORTH: "d'", LEMMA: "d'"},
        {ORTH: "mannst", LEMMA: "mann", NORM: "mann"}],
        {ORTH: "mannst", LEMMA: "mann", NORM: "mann"},
    ],
    "d'éischt": [
        {ORTH: "d'", LEMMA: "d'"},
        {ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"}]
        {ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"},
    ],
}

# translate / delete what is not necessary

@@ -26,20 +30,38 @@ _exc = {
for exc_data in [
    {ORTH: "wgl.", LEMMA: "wann ech gelift", NORM: "wann ech gelieft"},
    {ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"},
    {ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
    {ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
    {ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
    {ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
    {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
    {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
    {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
    {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}]:
    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
]:
    _exc[exc_data[ORTH]] = [exc_data]


# to be extended
for orth in [
    "z.B.", "Dipl.", "Dr.", "etc.", "i.e.", "o.k.", "O.K.", "p.a.", "p.s.", "P.S.", "phil.",
    "q.e.d.", "R.I.P.", "rer.", "sen.", "ë.a.", "U.S.", "U.S.A."]:
    "z.B.",
    "Dipl.",
    "Dr.",
    "etc.",
    "i.e.",
    "o.k.",
    "O.K.",
    "p.a.",
    "p.s.",
    "P.S.",
    "phil.",
    "q.e.d.",
    "R.I.P.",
    "rer.",
    "sen.",
    "ë.a.",
    "U.S.",
    "U.S.A.",
]:
    _exc[orth] = [{ORTH: orth}]
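A hedged sketch of what these exceptions should do to the tokenizer output (assumes the "lb" pipeline is available; the expected results follow the _exc entries and the abbreviation test in this commit):

    import spacy

    nlp = spacy.blank("lb")
    print([t.text for t in nlp("wgl.")])      # expected: ["wgl."] — kept as one token
    print([t.text for t in nlp("d'mannst")])  # expected: ["d'", "mannst"] per the _exc entry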
@@ -183,7 +183,9 @@ class EntityRuler(object):
        # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
        try:
            current_index = self.nlp.pipe_names.index(self.name)
            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
            subsequent_pipes = [
                pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
            ]
        except ValueError:
            subsequent_pipes = []
        with self.nlp.disable_pipes(*subsequent_pipes):
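The pattern above relies on disable_pipes acting as a restoring context manager. A small standalone sketch of that behaviour (spaCy 2.x API, throwaway pipeline):

    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    with nlp.disable_pipes("sentencizer"):
        assert "sentencizer" not in nlp.pipe_names  # temporarily removed
    assert "sentencizer" in nlp.pipe_names          # restored on exit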
@@ -219,7 +219,9 @@ class Scorer(object):
        DOCS: https://spacy.io/api/scorer#score
        """
        if len(doc) != len(gold):
            gold = GoldParse.from_annot_tuples(doc, tuple(zip(*gold.orig_annot)) + (gold.cats,))
            gold = GoldParse.from_annot_tuples(
                doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
            )
        gold_deps = set()
        gold_tags = set()
        gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
@@ -134,10 +134,12 @@ def ko_tokenizer():
    pytest.importorskip("natto")
    return get_lang_class("ko").Defaults.create_tokenizer()


@pytest.fixture(scope="session")
def lb_tokenizer():
    return get_lang_class("lb").Defaults.create_tokenizer()


@pytest.fixture(scope="session")
def lt_tokenizer():
    return get_lang_class("lt").Defaults.create_tokenizer()
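The session-scoped lb_tokenizer fixture is consumed by the Luxembourgish tests further down. A sketch of a typical parametrised test built on it (the test name and parametrize values here are illustrative, taken from the abbreviation list in this commit):

    import pytest

    @pytest.mark.parametrize("text", ["z.B.", "Dipl.", "etc."])
    def test_lb_tokenizer_keeps_abbreviations(lb_tokenizer, text):
        tokens = lb_tokenizer(text)
        assert len(tokens) == 1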
@@ -1,5 +1,4 @@
# coding: utf-8
# from __future__ import unicolb_literals
from __future__ import unicode_literals

import pytest

@@ -9,4 +8,3 @@ import pytest
def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
    tokens = lb_tokenizer(text)
    assert len(tokens) == 1
@@ -1,5 +1,4 @@
# coding: utf-8
#from __future__ import unicolb_literals
from __future__ import unicode_literals

import pytest

@@ -21,6 +20,3 @@ def test_lb_tokenizer_splits_suffix_interact(lb_tokenizer, text):
def test_lb_tokenizer_splits_even_wrap_interact(lb_tokenizer, text):
    tokens = lb_tokenizer(text)
    assert len(tokens) == 3
@@ -1,6 +1,5 @@
# coding: utf-8
from __future__ import unicode_literals
from __future__ import unicode_literals

import pytest
@@ -159,14 +159,14 @@ def test_matcher_remove():
    # should give two matches
    results1 = matcher(nlp(text))
    assert(len(results1) == 2)
    assert len(results1) == 2

    # removing once should work
    matcher.remove("Rule")

    # should not return any maches anymore
    results2 = matcher(nlp(text))
    assert (len(results2) == 0)
    assert len(results2) == 0

    # removing again should throw an error
    with pytest.raises(ValueError):
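A self-contained sketch of the add/remove behaviour exercised above (spaCy 2.x Matcher API; the rule name and pattern are illustrative):

    import pytest
    from spacy.lang.en import English
    from spacy.matcher import Matcher

    nlp = English()
    matcher = Matcher(nlp.vocab)
    matcher.add("Rule", None, [{"ORTH": "test"}])  # v2-style add(key, on_match, *patterns)
    matcher.remove("Rule")
    with pytest.raises(ValueError):
        matcher.remove("Rule")  # removing a rule that no longer exists raises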
@@ -103,7 +103,7 @@ def test_oracle_moves_missing_B(en_vocab):
    moves.add_action(move_types.index("L"), label)
    moves.add_action(move_types.index("U"), label)
    moves.preprocess_gold(gold)
    seq = moves.get_oracle_sequence(doc, gold)
    moves.get_oracle_sequence(doc, gold)


def test_oracle_moves_whitespace(en_vocab):
@@ -323,7 +323,7 @@ def test_issue3456():
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("tagger"))
    nlp.begin_training()
    list(nlp.pipe(['hi', '']))
    list(nlp.pipe(["hi", ""]))


def test_issue3468():
@@ -76,7 +76,6 @@ def test_issue4042_bug2():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        nlp2 = English(vocab)
        ner2 = EntityRecognizer(vocab)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
@@ -1,13 +1,8 @@
# coding: utf8
from __future__ import unicode_literals

import pytest

import spacy

from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.tokens import Span


def test_issue4267():
@@ -6,6 +6,6 @@ from spacy.tokens import DocBin
def test_issue4367():
    """Test that docbin init goes well"""
    doc_bin_1 = DocBin()
    doc_bin_2 = DocBin(attrs=["LEMMA"])
    doc_bin_3 = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
    DocBin()
    DocBin(attrs=["LEMMA"])
    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
@@ -74,4 +74,4 @@ def test_serialize_doc_bin():
    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    docs = list(doc_bin.get_docs(nlp.vocab))
    list(doc_bin.get_docs(nlp.vocab))
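For context, a minimal DocBin round trip along the lines of the test above (sketch; the attrs list mirrors the test, the text is illustrative):

    import spacy
    from spacy.tokens import DocBin

    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
    doc_bin.add(nlp("Hello world"))
    bytes_data = doc_bin.to_bytes()

    # deserialize later, e.g. in a new process
    docs = list(DocBin().from_bytes(bytes_data).get_docs(nlp.vocab))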
@@ -48,8 +48,13 @@ URLS_SHOULD_MATCH = [
    "http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
    "ssh://login@server.com:12345/repository.git",
    "svn+ssh://user@ssh.yourdomain.com/path",
    pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
    pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
    pytest.param(
        "chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai",
        marks=pytest.mark.xfail(),
    ),
    pytest.param(
        "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
    ),
    pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
    pytest.param(
        "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
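The idiom above marks individual parametrize cases as expected failures without skipping the rest. A generic sketch of the same pattern (test name, values and the placeholder check are illustrative):

    import pytest

    @pytest.mark.parametrize(
        "url",
        [
            "ssh://login@server.com:12345/repository.git",
            pytest.param("chrome-extension://some-id", marks=pytest.mark.xfail()),
        ],
    )
    def test_url_should_match(url):
        # placeholder standing in for the real URL_PATTERN match
        assert url.startswith(("http", "ssh", "svn"))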
@@ -50,12 +50,13 @@ def ngrams_vocab(en_vocab, ngrams_vectors):
def data():
    return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype="f")


@pytest.fixture
def most_similar_vectors_data():
    return numpy.asarray([[0.0, 1.0, 2.0],
                          [1.0, -2.0, 4.0],
                          [1.0, 1.0, -1.0],
                          [2.0, 3.0, 1.0]], dtype="f")
    return numpy.asarray(
        [[0.0, 1.0, 2.0], [1.0, -2.0, 4.0], [1.0, 1.0, -1.0], [2.0, 3.0, 1.0]],
        dtype="f",
    )


@pytest.fixture