Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 02:06:31 +03:00)

Commit 181c01f629: Tidy up and auto-format
Parent commit: fb11852750
@@ -1,15 +1,11 @@
 # coding: utf8

 from __future__ import unicode_literals

 from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .norm_exceptions import NORM_EXCEPTIONS
-from .punctuation import TOKENIZER_INFIXES
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
 from .stop_words import STOP_WORDS
-#from .lemmatizer import LOOKUP
-#from .syntax_iterators import SYNTAX_ITERATORS

 from ..tokenizer_exceptions import BASE_EXCEPTIONS
 from ..norm_exceptions import BASE_NORMS
@@ -21,17 +17,18 @@ from ...util import update_exc, add_lookups
 class LuxembourgishDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters.update(LEX_ATTRS)
-    lex_attr_getters[LANG] = lambda text: 'lb'
-    lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
+    lex_attr_getters[LANG] = lambda text: "lb"
+    lex_attr_getters[NORM] = add_lookups(
+        Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
+    )
     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
     stop_words = STOP_WORDS
-    #suffixes = TOKENIZER_SUFFIXES
-    #lemma_lookup = LOOKUP
+    tag_map = TAG_MAP


 class Luxembourgish(Language):
-    lang = 'lb'
+    lang = "lb"
     Defaults = LuxembourgishDefaults


-__all__ = ['Luxembourgish']
+__all__ = ["Luxembourgish"]
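For context on what the tidied Luxembourgish `__init__.py` above wires together, here is a minimal usage sketch. It assumes a spaCy build that already ships this `lb` language data; the sentence is one of the examples from the next hunk.

# Sketch: instantiate the Luxembourgish pipeline defined by the defaults above
# and tokenize one of the example sentences. No statistical model is needed,
# only the language data touched in this commit.
from spacy.lang.lb import Luxembourgish

nlp = Luxembourgish()
doc = nlp("Um Enn huet den Nordwand säi Kampf opginn.")
print([token.text for token in doc])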
@@ -9,10 +9,10 @@ Example sentences to test spaCy and its language models.
 """

 sentences = [
     "An der Zäit hunn sech den Nordwand an d’Sonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum.",
     "Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen.",
     "Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet.",
     "Um Enn huet den Nordwand säi Kampf opginn.",
     "Dunn huet d’Sonn d’Loft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen.",
-    "Do huet den Nordwand missen zouginn, dass d’Sonn vun hinnen zwee de Stäerkste wier."
+    "Do huet den Nordwand missen zouginn, dass d’Sonn vun hinnen zwee de Stäerkste wier.",
 ]
@@ -4,29 +4,34 @@ from __future__ import unicode_literals
 from ...attrs import LIKE_NUM


-_num_words = set("""
+_num_words = set(
+    """
 null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
 véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg
 honnert dausend millioun milliard billioun billiard trillioun triliard
-""".split())
+""".split()
+)

-_ordinal_words = set("""
+_ordinal_words = set(
+    """
 éischten zweeten drëtten véierten fënneften sechsten siwenten aachten néngten zéngten eeleften
 zwieleften dräizéngten véierzéngten foffzéngten siechzéngten uechtzéngen uechzéngten nonnzéngten nongzéngten zwanzegsten
 drëssegsten véierzegsten foffzegsten siechzegsten siwenzegsten uechzegsten nonnzegsten
 honnertsten dausendsten milliounsten
 milliardsten billiounsten billiardsten trilliounsten trilliardsten
-""".split())
+""".split()
+)


 def like_num(text):
     """
     check if text resembles a number
     """
-    text = text.replace(',', '').replace('.', '')
+    text = text.replace(",", "").replace(".", "")
     if text.isdigit():
         return True
-    if text.count('/') == 1:
-        num, denom = text.split('/')
+    if text.count("/") == 1:
+        num, denom = text.split("/")
         if num.isdigit() and denom.isdigit():
             return True
     if text in _num_words:
@@ -36,6 +41,4 @@ def like_num(text):
     return False


-LEX_ATTRS = {
-    LIKE_NUM: like_num
-}
+LEX_ATTRS = {LIKE_NUM: like_num}
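To illustrate what the `like_num` attribute above is meant to catch, a short sketch, assuming the Luxembourgish lexical attributes from this commit are importable as `spacy.lang.lb.lex_attrs`:

# Sketch: like_num strips separators, accepts plain digits and simple
# fractions, and falls back to the Luxembourgish number-word sets.
from spacy.lang.lb.lex_attrs import like_num

print(like_num("véier"))  # True: listed in _num_words
print(like_num("1.000"))  # True: "," and "." are stripped before isdigit()
print(like_num("3/4"))    # True: numerator and denominator are both digits
print(like_num("Haus"))   # False: no branch matches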
@@ -2,15 +2,11 @@
 from __future__ import unicode_literals

 # TODO
-# norm execptions: find a possibility to deal with the zillions of spelling variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
+# norm execptions: find a possibility to deal with the zillions of spelling
+# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.)
 # here one could include the most common spelling mistakes

-_exc = {
-    "datt": "dass",
-    "wgl.": "weg.",
-    "wgl.": "wegl.",
-    "vläicht": "viläicht"}
+_exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"}


 NORM_EXCEPTIONS = {}
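The exceptions in this module only take effect through the NORM lookup chain set up in `__init__.py` (`add_lookups(..., NORM_EXCEPTIONS, BASE_NORMS)`). A self-contained sketch of that layering, not spaCy's actual implementation:

# Hypothetical illustration of chained NORM lookups: language-specific
# exceptions are consulted before the shared base table, and the default
# getter is only used when no table matches.
def make_norm_getter(default_getter, *lookup_tables):
    def get_norm(string):
        for table in lookup_tables:
            if string in table:
                return table[string]
        return default_getter(string)
    return get_norm

norm_exceptions = {"datt": "dass", "wgl.": "weg."}
base_norms = {"’": "'"}
get_norm = make_norm_getter(lambda s: s.lower(), norm_exceptions, base_norms)

print(get_norm("datt"))  # "dass": language-specific exception wins
print(get_norm("Haus"))  # "haus": falls through to the default getter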
@@ -1,25 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from ..char_classes import LIST_ELLIPSES, LIST_ICONS
-from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER
-
-
-_quotes = CONCAT_QUOTES.replace("'", "")
-
-_infixes = (
-    LIST_ELLIPSES
-    + LIST_ICONS
-    + [
-        r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
-        r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:;<>=](?=[{a}])'.format(a=ALPHA),
-        r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
-        r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
-        r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
-        r"(?<=[0-9])-(?=[0-9])",
-    ]
-)
-
-
-TOKENIZER_INFIXES = _infixes
@@ -1,7 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-STOP_WORDS = set("""
+STOP_WORDS = set(
+    """
 a
 à
 äis
@@ -209,4 +210,5 @@ ze
 zu
 zum
 zwar
-""".split())
+""".split()
+)
@@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
+from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PART, SPACE, AUX

 # TODO: tag map is still using POS tags from an internal training set.
 # These POS tags have to be modified to match those from Universal Dependencies
@@ -1,7 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA
+from ...symbols import ORTH, LEMMA, NORM
 from ..punctuation import TOKENIZER_PREFIXES

 # TODO
@@ -9,16 +9,20 @@ from ..punctuation import TOKENIZER_PREFIXES
 # treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)

 # how to write the tokenisation exeption for the articles d' / D' ? This one is not working.
-_prefixes = [prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d’", "D’", r"\' "]]
+_prefixes = [
+    prefix for prefix in TOKENIZER_PREFIXES if prefix not in ["d'", "D'", "d’", "D’"]
+]


 _exc = {
     "d'mannst": [
         {ORTH: "d'", LEMMA: "d'"},
-        {ORTH: "mannst", LEMMA: "mann", NORM: "mann"}],
+        {ORTH: "mannst", LEMMA: "mann", NORM: "mann"},
+    ],
     "d'éischt": [
         {ORTH: "d'", LEMMA: "d'"},
-        {ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"}]
+        {ORTH: "éischt", LEMMA: "éischt", NORM: "éischt"},
+    ],
 }

 # translate / delete what is not necessary
@@ -32,14 +36,32 @@ for exc_data in [
     {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
     {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
     {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
-    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"}]:
+    {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
+]:
     _exc[exc_data[ORTH]] = [exc_data]


 # to be extended
 for orth in [
-    "z.B.", "Dipl.", "Dr.", "etc.", "i.e.", "o.k.", "O.K.", "p.a.", "p.s.", "P.S.", "phil.",
-    "q.e.d.", "R.I.P.", "rer.", "sen.", "ë.a.", "U.S.", "U.S.A."]:
+    "z.B.",
+    "Dipl.",
+    "Dr.",
+    "etc.",
+    "i.e.",
+    "o.k.",
+    "O.K.",
+    "p.a.",
+    "p.s.",
+    "P.S.",
+    "phil.",
+    "q.e.d.",
+    "R.I.P.",
+    "rer.",
+    "sen.",
+    "ë.a.",
+    "U.S.",
+    "U.S.A.",
+]:
     _exc[orth] = [{ORTH: orth}]
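A hedged sketch of what these exception entries do at runtime, using the fixture-style tokenizer construction that appears in the test changes further down; the expected outputs are illustrative, not verified against a specific spaCy build:

# Sketch: special-case entries keep abbreviations as single tokens and split
# the clitic article in fixed expressions such as "d'éischt".
from spacy.util import get_lang_class

tokenizer = get_lang_class("lb").Defaults.create_tokenizer()
print([t.text for t in tokenizer("asw.")])          # expected: ["asw."]
print([t.text for t in tokenizer("fir d'éischt")])  # expected: ["fir", "d'", "éischt"]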
@@ -183,7 +183,9 @@ class EntityRuler(object):
         # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
         try:
             current_index = self.nlp.pipe_names.index(self.name)
-            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
+            subsequent_pipes = [
+                pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
+            ]
         except ValueError:
             subsequent_pipes = []
         with self.nlp.disable_pipes(*subsequent_pipes):
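For orientation, a short usage sketch of the EntityRuler component whose pipe-disabling logic is reformatted above. This is spaCy v2-style API and not code from this diff; the pattern is illustrative:

# Sketch: pattern-based entity matching with the EntityRuler.
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(nlp)
ruler.add_patterns([{"label": "ORG", "pattern": "spaCy"}])
nlp.add_pipe(ruler)

doc = nlp("spaCy ships an EntityRuler component.")
print([(ent.text, ent.label_) for ent in doc.ents])  # expected: [("spaCy", "ORG")]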
@@ -219,7 +219,9 @@ class Scorer(object):
         DOCS: https://spacy.io/api/scorer#score
         """
         if len(doc) != len(gold):
-            gold = GoldParse.from_annot_tuples(doc, tuple(zip(*gold.orig_annot)) + (gold.cats,))
+            gold = GoldParse.from_annot_tuples(
+                doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
+            )
         gold_deps = set()
         gold_tags = set()
         gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
@@ -134,10 +134,12 @@ def ko_tokenizer():
     pytest.importorskip("natto")
     return get_lang_class("ko").Defaults.create_tokenizer()


 @pytest.fixture(scope="session")
 def lb_tokenizer():
     return get_lang_class("lb").Defaults.create_tokenizer()


 @pytest.fixture(scope="session")
 def lt_tokenizer():
     return get_lang_class("lt").Defaults.create_tokenizer()
@@ -1,5 +1,4 @@
 # coding: utf-8
-# from __future__ import unicolb_literals
 from __future__ import unicode_literals

 import pytest
@@ -9,4 +8,3 @@ import pytest
 def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
     tokens = lb_tokenizer(text)
     assert len(tokens) == 1
@@ -1,5 +1,4 @@
 # coding: utf-8
-#from __future__ import unicolb_literals
 from __future__ import unicode_literals

 import pytest
@@ -21,6 +20,3 @@ def test_lb_tokenizer_splits_suffix_interact(lb_tokenizer, text):
 def test_lb_tokenizer_splits_even_wrap_interact(lb_tokenizer, text):
     tokens = lb_tokenizer(text)
     assert len(tokens) == 3
@@ -1,6 +1,5 @@
 # coding: utf-8
 from __future__ import unicode_literals
-from __future__ import unicode_literals

 import pytest
@@ -159,14 +159,14 @@ def test_matcher_remove():

     # should give two matches
     results1 = matcher(nlp(text))
-    assert(len(results1) == 2)
+    assert len(results1) == 2

     # removing once should work
     matcher.remove("Rule")

     # should not return any maches anymore
     results2 = matcher(nlp(text))
-    assert (len(results2) == 0)
+    assert len(results2) == 0

     # removing again should throw an error
     with pytest.raises(ValueError):
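A brief sketch of the Matcher add/remove flow this test exercises, using the v2 `add(key, on_match, *patterns)` signature; the pattern itself is illustrative:

# Sketch: register a token pattern under a key, match, then remove the rule.
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
matcher.add("Rule", None, [{"LOWER": "hello"}])

doc = nlp("hello world, hello again")
print(len(matcher(doc)))  # expected: 2

matcher.remove("Rule")
print(len(matcher(doc)))  # expected: 0 once the rule is gone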
@@ -103,7 +103,7 @@ def test_oracle_moves_missing_B(en_vocab):
     moves.add_action(move_types.index("L"), label)
     moves.add_action(move_types.index("U"), label)
     moves.preprocess_gold(gold)
-    seq = moves.get_oracle_sequence(doc, gold)
+    moves.get_oracle_sequence(doc, gold)


 def test_oracle_moves_whitespace(en_vocab):
@@ -323,7 +323,7 @@ def test_issue3456():
     nlp = English()
     nlp.add_pipe(nlp.create_pipe("tagger"))
     nlp.begin_training()
-    list(nlp.pipe(['hi', '']))
+    list(nlp.pipe(["hi", ""]))


 def test_issue3468():
@@ -76,7 +76,6 @@ def test_issue4042_bug2():
     output_dir.mkdir()
     ner1.to_disk(output_dir)

-    nlp2 = English(vocab)
     ner2 = EntityRecognizer(vocab)
     ner2.from_disk(output_dir)
     assert len(ner2.labels) == 2
@@ -1,13 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-import pytest
-
-import spacy
-
 from spacy.lang.en import English
 from spacy.pipeline import EntityRuler
-from spacy.tokens import Span


 def test_issue4267():
@@ -6,6 +6,6 @@ from spacy.tokens import DocBin

 def test_issue4367():
     """Test that docbin init goes well"""
-    doc_bin_1 = DocBin()
-    doc_bin_2 = DocBin(attrs=["LEMMA"])
-    doc_bin_3 = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
+    DocBin()
+    DocBin(attrs=["LEMMA"])
+    DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
@@ -74,4 +74,4 @@ def test_serialize_doc_bin():
     # Deserialize later, e.g. in a new process
     nlp = spacy.blank("en")
     doc_bin = DocBin().from_bytes(bytes_data)
-    docs = list(doc_bin.get_docs(nlp.vocab))
+    list(doc_bin.get_docs(nlp.vocab))
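The DocBin changes above only drop unused assignments; for context, a hedged sketch of the serialize/deserialize roundtrip those tests cover, with the attribute names taken from the diff:

# Sketch: pack Docs into a DocBin, serialize to bytes, and restore them later.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
doc_bin.add(nlp("Serialize me."))
bytes_data = doc_bin.to_bytes()

# Deserialize later, e.g. in a new process
restored = list(DocBin().from_bytes(bytes_data).get_docs(nlp.vocab))
print(len(restored))  # expected: 1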
@@ -48,8 +48,13 @@ URLS_SHOULD_MATCH = [
     "http://a.b--c.de/",  # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
     "ssh://login@server.com:12345/repository.git",
     "svn+ssh://user@ssh.yourdomain.com/path",
-    pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
-    pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
+    pytest.param(
+        "chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai",
+        marks=pytest.mark.xfail(),
+    ),
+    pytest.param(
+        "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
+    ),
     pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
     pytest.param(
         "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
@@ -50,12 +50,13 @@ def ngrams_vocab(en_vocab, ngrams_vectors):
 def data():
     return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype="f")


 @pytest.fixture
 def most_similar_vectors_data():
-    return numpy.asarray([[0.0, 1.0, 2.0],
-                          [1.0, -2.0, 4.0],
-                          [1.0, 1.0, -1.0],
-                          [2.0, 3.0, 1.0]], dtype="f")
+    return numpy.asarray(
+        [[0.0, 1.0, 2.0], [1.0, -2.0, 4.0], [1.0, 1.0, -1.0], [2.0, 3.0, 1.0]],
+        dtype="f",
+    )


 @pytest.fixture