diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..cf1ecbc76 --- /dev/null +++ b/.flake8 @@ -0,0 +1,4 @@ +[flake8] +ignore = E203, E266, E501, W503 +max-line-length = 80 +select = B,C,E,F,W,T4,B9 diff --git a/requirements.txt b/requirements.txt index cabd28c3b..e2a1860ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,7 @@ ujson>=1.35 dill>=0.2,<0.3 regex==2018.01.10 requests>=2.13.0,<3.0.0 -pytest>=3.6.0,<4.0.0 +pytest>=4.0.0,<5.0.0 pytest-timeout>=1.3.0,<2.0.0 mock>=2.0.0,<3.0.0 pathlib==1.0.1; python_version < "3.4" diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index a78eb04c7..394fca1a5 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import pytest -from io import StringIO, BytesIO from spacy.util import get_lang_class @@ -11,126 +10,135 @@ def pytest_addoption(parser): def pytest_runtest_setup(item): - for opt in ['slow']: + for opt in ["slow"]: if opt in item.keywords and not item.config.getoption("--%s" % opt): pytest.skip("need --%s option to run" % opt) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def tokenizer(): - return get_lang_class('xx').Defaults.create_tokenizer() + return get_lang_class("xx").Defaults.create_tokenizer() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def en_tokenizer(): - return get_lang_class('en').Defaults.create_tokenizer() + return get_lang_class("en").Defaults.create_tokenizer() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def en_vocab(): - return get_lang_class('en').Defaults.create_vocab() + return get_lang_class("en").Defaults.create_vocab() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def en_parser(en_vocab): - nlp = get_lang_class('en')(en_vocab) - return nlp.create_pipe('parser') + nlp = get_lang_class("en")(en_vocab) + return nlp.create_pipe("parser") -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def es_tokenizer(): - return get_lang_class('es').Defaults.create_tokenizer() + return get_lang_class("es").Defaults.create_tokenizer() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def de_tokenizer(): - return get_lang_class('de').Defaults.create_tokenizer() + return get_lang_class("de").Defaults.create_tokenizer() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def fr_tokenizer(): - return get_lang_class('fr').Defaults.create_tokenizer() + return get_lang_class("fr").Defaults.create_tokenizer() @pytest.fixture def hu_tokenizer(): - return get_lang_class('hu').Defaults.create_tokenizer() + return get_lang_class("hu").Defaults.create_tokenizer() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def fi_tokenizer(): - return get_lang_class('fi').Defaults.create_tokenizer() + return get_lang_class("fi").Defaults.create_tokenizer() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def ro_tokenizer(): - return get_lang_class('ro').Defaults.create_tokenizer() + return get_lang_class("ro").Defaults.create_tokenizer() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def id_tokenizer(): - return get_lang_class('id').Defaults.create_tokenizer() + return get_lang_class("id").Defaults.create_tokenizer() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def sv_tokenizer(): - return get_lang_class('sv').Defaults.create_tokenizer() + return get_lang_class("sv").Defaults.create_tokenizer() -@pytest.fixture(scope='session') 
+@pytest.fixture(scope="session") def bn_tokenizer(): - return get_lang_class('bn').Defaults.create_tokenizer() + return get_lang_class("bn").Defaults.create_tokenizer() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def ga_tokenizer(): - return get_lang_class('ga').Defaults.create_tokenizer() + return get_lang_class("ga").Defaults.create_tokenizer() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def he_tokenizer(): - return get_lang_class('he').Defaults.create_tokenizer() + return get_lang_class("he").Defaults.create_tokenizer() -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def nb_tokenizer(): - return get_lang_class('nb').Defaults.create_tokenizer() + return get_lang_class("nb").Defaults.create_tokenizer() -@pytest.fixture(scope='session') + +@pytest.fixture(scope="session") def da_tokenizer(): - return get_lang_class('da').Defaults.create_tokenizer() + return get_lang_class("da").Defaults.create_tokenizer() -@pytest.fixture(scope='session') + +@pytest.fixture(scope="session") def ja_tokenizer(): mecab = pytest.importorskip("MeCab") - return get_lang_class('ja').Defaults.create_tokenizer() + return get_lang_class("ja").Defaults.create_tokenizer() -@pytest.fixture(scope='session') + +@pytest.fixture(scope="session") def th_tokenizer(): pythainlp = pytest.importorskip("pythainlp") - return get_lang_class('th').Defaults.create_tokenizer() + return get_lang_class("th").Defaults.create_tokenizer() -@pytest.fixture(scope='session') + +@pytest.fixture(scope="session") def tr_tokenizer(): - return get_lang_class('tr').Defaults.create_tokenizer() + return get_lang_class("tr").Defaults.create_tokenizer() -@pytest.fixture(scope='session') + +@pytest.fixture(scope="session") def tt_tokenizer(): - return get_lang_class('tt').Defaults.create_tokenizer() + return get_lang_class("tt").Defaults.create_tokenizer() -@pytest.fixture(scope='session') + +@pytest.fixture(scope="session") def el_tokenizer(): - return get_lang_class('el').Defaults.create_tokenizer() + return get_lang_class("el").Defaults.create_tokenizer() -@pytest.fixture(scope='session') + +@pytest.fixture(scope="session") def ar_tokenizer(): - return get_lang_class('ar').Defaults.create_tokenizer() + return get_lang_class("ar").Defaults.create_tokenizer() -@pytest.fixture(scope='session') + +@pytest.fixture(scope="session") def ur_tokenizer(): - return get_lang_class('ur').Defaults.create_tokenizer() + return get_lang_class("ur").Defaults.create_tokenizer() -@pytest.fixture(scope='session') + +@pytest.fixture(scope="session") def ru_tokenizer(): - pymorphy = pytest.importorskip('pymorphy2') - return get_lang_class('ru').Defaults.create_tokenizer() + pymorphy = pytest.importorskip("pymorphy2") + return get_lang_class("ru").Defaults.create_tokenizer() diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index 27541875b..c18fce966 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -38,7 +38,7 @@ def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab): def test_doc_array_tag(en_tokenizer): text = "A nice sentence." - pos = ['DET', 'ADJ', 'NOUN', 'PUNCT'] + pos = ["DET", "ADJ", "NOUN", "PUNCT"] tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], pos=pos) assert doc[0].pos != doc[1].pos != doc[2].pos != doc[3].pos @@ -51,7 +51,7 @@ def test_doc_array_tag(en_tokenizer): def test_doc_array_dep(en_tokenizer): text = "A nice sentence." 
- deps = ['det', 'amod', 'ROOT', 'punct'] + deps = ["det", "amod", "ROOT", "punct"] tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) feats_array = doc.to_array((ORTH, DEP)) diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 652ea00cf..ce42b39b9 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -9,7 +9,7 @@ from spacy.lemmatizer import Lemmatizer @pytest.fixture def lemmatizer(): - return Lemmatizer(lookup={'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'}) + return Lemmatizer(lookup={"dogs": "dog", "boxen": "box", "mice": "mouse"}) @pytest.fixture @@ -23,15 +23,15 @@ def test_empty_doc(vocab): def test_single_word(vocab): - doc = Doc(vocab, words=['a']) - assert doc.text == 'a ' - doc = Doc(vocab, words=['a'], spaces=[False]) - assert doc.text == 'a' + doc = Doc(vocab, words=["a"]) + assert doc.text == "a " + doc = Doc(vocab, words=["a"], spaces=[False]) + assert doc.text == "a" def test_lookup_lemmatization(vocab): - doc = Doc(vocab, words=['dogs', 'dogses']) - assert doc[0].text == 'dogs' - assert doc[0].lemma_ == 'dog' - assert doc[1].text == 'dogses' - assert doc[1].lemma_ == 'dogses' + doc = Doc(vocab, words=["dogs", "dogses"]) + assert doc[0].text == "dogs" + assert doc[0].lemma_ == "dog" + assert doc[1].text == "dogses" + assert doc[1].lemma_ == "dogses" diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 658fcb128..1896f4fd6 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -10,7 +10,7 @@ from spacy.attrs import LEMMA from ..util import get_doc -@pytest.mark.parametrize('text', [["one", "two", "three"]]) +@pytest.mark.parametrize("text", [["one", "two", "three"]]) def test_doc_api_compare_by_string_position(en_vocab, text): doc = Doc(en_vocab, words=text) # Get the tokens in this order, so their ID ordering doesn't match the idx @@ -28,80 +28,81 @@ def test_doc_api_compare_by_string_position(en_vocab, text): def test_doc_api_getitem(en_tokenizer): text = "Give it back! He pleaded." tokens = en_tokenizer(text) - assert tokens[0].text == 'Give' - assert tokens[-1].text == '.' + assert tokens[0].text == "Give" + assert tokens[-1].text == "." with pytest.raises(IndexError): tokens[len(tokens)] def to_str(span): - return '/'.join(token.text for token in span) + return "/".join(token.text for token in span) span = tokens[1:1] assert not to_str(span) span = tokens[1:4] - assert to_str(span) == 'it/back/!' + assert to_str(span) == "it/back/!" span = tokens[1:4:1] - assert to_str(span) == 'it/back/!' + assert to_str(span) == "it/back/!" with pytest.raises(ValueError): tokens[1:4:2] with pytest.raises(ValueError): tokens[1:4:-1] span = tokens[-3:6] - assert to_str(span) == 'He/pleaded' + assert to_str(span) == "He/pleaded" span = tokens[4:-1] - assert to_str(span) == 'He/pleaded' + assert to_str(span) == "He/pleaded" span = tokens[-5:-3] - assert to_str(span) == 'back/!' + assert to_str(span) == "back/!" span = tokens[5:4] assert span.start == span.end == 5 and not to_str(span) span = tokens[4:-3] assert span.start == span.end == 4 and not to_str(span) span = tokens[:] - assert to_str(span) == 'Give/it/back/!/He/pleaded/.' + assert to_str(span) == "Give/it/back/!/He/pleaded/." span = tokens[4:] - assert to_str(span) == 'He/pleaded/.' + assert to_str(span) == "He/pleaded/." span = tokens[:4] - assert to_str(span) == 'Give/it/back/!' + assert to_str(span) == "Give/it/back/!" 
span = tokens[:-3] - assert to_str(span) == 'Give/it/back/!' + assert to_str(span) == "Give/it/back/!" span = tokens[-3:] - assert to_str(span) == 'He/pleaded/.' + assert to_str(span) == "He/pleaded/." span = tokens[4:50] - assert to_str(span) == 'He/pleaded/.' + assert to_str(span) == "He/pleaded/." span = tokens[-50:4] - assert to_str(span) == 'Give/it/back/!' + assert to_str(span) == "Give/it/back/!" span = tokens[-50:-40] assert span.start == span.end == 0 and not to_str(span) span = tokens[40:50] assert span.start == span.end == 7 and not to_str(span) span = tokens[1:4] - assert span[0].orth_ == 'it' + assert span[0].orth_ == "it" subspan = span[:] - assert to_str(subspan) == 'it/back/!' + assert to_str(subspan) == "it/back/!" subspan = span[:2] - assert to_str(subspan) == 'it/back' + assert to_str(subspan) == "it/back" subspan = span[1:] - assert to_str(subspan) == 'back/!' + assert to_str(subspan) == "back/!" subspan = span[:-1] - assert to_str(subspan) == 'it/back' + assert to_str(subspan) == "it/back" subspan = span[-2:] - assert to_str(subspan) == 'back/!' + assert to_str(subspan) == "back/!" subspan = span[1:2] - assert to_str(subspan) == 'back' + assert to_str(subspan) == "back" subspan = span[-2:-1] - assert to_str(subspan) == 'back' + assert to_str(subspan) == "back" subspan = span[-50:50] - assert to_str(subspan) == 'it/back/!' + assert to_str(subspan) == "it/back/!" subspan = span[50:-50] assert subspan.start == subspan.end == 4 and not to_str(subspan) -@pytest.mark.parametrize('text', ["Give it back! He pleaded.", - " Give it back! He pleaded. "]) +@pytest.mark.parametrize( + "text", ["Give it back! He pleaded.", " Give it back! He pleaded. "] +) def test_doc_api_serialize(en_tokenizer, text): tokens = en_tokenizer(text) new_tokens = Doc(tokens.vocab).from_bytes(tokens.to_bytes()) @@ -110,13 +111,15 @@ def test_doc_api_serialize(en_tokenizer, text): assert [t.orth for t in tokens] == [t.orth for t in new_tokens] new_tokens = Doc(tokens.vocab).from_bytes( - tokens.to_bytes(tensor=False), tensor=False) + tokens.to_bytes(tensor=False), tensor=False + ) assert tokens.text == new_tokens.text assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] new_tokens = Doc(tokens.vocab).from_bytes( - tokens.to_bytes(sentiment=False), sentiment=False) + tokens.to_bytes(sentiment=False), sentiment=False + ) assert tokens.text == new_tokens.text assert [t.text for t in tokens] == [t.text for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] @@ -126,10 +129,10 @@ def test_doc_api_set_ents(en_tokenizer): text = "I use goggle chrone to surf the web" tokens = en_tokenizer(text) assert len(tokens.ents) == 0 - tokens.ents = [(tokens.vocab.strings['PRODUCT'], 2, 4)] + tokens.ents = [(tokens.vocab.strings["PRODUCT"], 2, 4)] assert len(list(tokens.ents)) == 1 assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0] - assert tokens.ents[0].label_ == 'PRODUCT' + assert tokens.ents[0].label_ == "PRODUCT" assert tokens.ents[0].start == 2 assert tokens.ents[0].end == 4 @@ -140,21 +143,31 @@ def test_doc_api_merge(en_tokenizer): # merge 'The Beach Boys' doc = en_tokenizer(text) assert len(doc) == 9 - doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag='NAMED', lemma='LEMMA', - ent_type='TYPE') + doc.merge( + doc[4].idx, + doc[6].idx + len(doc[6]), + tag="NAMED", + lemma="LEMMA", + ent_type="TYPE", + ) assert len(doc) == 7 - assert doc[4].text == 'the beach boys' - assert doc[4].text_with_ws == 'the 
beach boys ' - assert doc[4].tag_ == 'NAMED' + assert doc[4].text == "the beach boys" + assert doc[4].text_with_ws == "the beach boys " + assert doc[4].tag_ == "NAMED" # merge 'all night' doc = en_tokenizer(text) assert len(doc) == 9 - doc.merge(doc[7].idx, doc[8].idx + len(doc[8]), tag='NAMED', lemma='LEMMA', - ent_type='TYPE') + doc.merge( + doc[7].idx, + doc[8].idx + len(doc[8]), + tag="NAMED", + lemma="LEMMA", + ent_type="TYPE", + ) assert len(doc) == 8 - assert doc[7].text == 'all night' - assert doc[7].text_with_ws == 'all night' + assert doc[7].text == "all night" + assert doc[7].text_with_ws == "all night" def test_doc_api_merge_children(en_tokenizer): @@ -162,8 +175,13 @@ def test_doc_api_merge_children(en_tokenizer): text = "WKRO played songs by the beach boys all night" doc = en_tokenizer(text) assert len(doc) == 9 - doc.merge(doc[4].idx, doc[6].idx + len(doc[6]), tag='NAMED', lemma='LEMMA', - ent_type='TYPE') + doc.merge( + doc[4].idx, + doc[6].idx + len(doc[6]), + tag="NAMED", + lemma="LEMMA", + ent_type="TYPE", + ) for word in doc: if word.i < word.head.i: @@ -175,8 +193,8 @@ def test_doc_api_merge_children(en_tokenizer): def test_doc_api_merge_hang(en_tokenizer): text = "through North and South Carolina" doc = en_tokenizer(text) - doc.merge(18, 32, tag='', lemma='', ent_type='ORG') - doc.merge(8, 32, tag='', lemma='', ent_type='ORG') + doc.merge(18, 32, tag="", lemma="", ent_type="ORG") + doc.merge(8, 32, tag="", lemma="", ent_type="ORG") def test_doc_api_retokenizer(en_tokenizer): @@ -184,19 +202,19 @@ def test_doc_api_retokenizer(en_tokenizer): with doc.retokenize() as retokenizer: retokenizer.merge(doc[4:7]) assert len(doc) == 7 - assert doc[4].text == 'the beach boys' + assert doc[4].text == "the beach boys" def test_doc_api_retokenizer_attrs(en_tokenizer): doc = en_tokenizer("WKRO played songs by the beach boys all night") # test both string and integer attributes and values - attrs = {LEMMA: 'boys', 'ENT_TYPE': doc.vocab.strings['ORG']} + attrs = {LEMMA: "boys", "ENT_TYPE": doc.vocab.strings["ORG"]} with doc.retokenize() as retokenizer: retokenizer.merge(doc[4:7], attrs=attrs) assert len(doc) == 7 - assert doc[4].text == 'the beach boys' - assert doc[4].lemma_ == 'boys' - assert doc[4].ent_type_ == 'ORG' + assert doc[4].text == "the beach boys" + assert doc[4].lemma_ == "boys" + assert doc[4].ent_type_ == "ORG" @pytest.mark.xfail @@ -205,11 +223,11 @@ def test_doc_api_retokenizer_lex_attrs(en_tokenizer): doc = en_tokenizer("WKRO played beach boys songs") assert not any(token.is_stop for token in doc) with doc.retokenize() as retokenizer: - retokenizer.merge(doc[2:4], attrs={'LEMMA': 'boys', 'IS_STOP': True}) - assert doc[2].text == 'beach boys' - assert doc[2].lemma_ == 'boys' + retokenizer.merge(doc[2:4], attrs={"LEMMA": "boys", "IS_STOP": True}) + assert doc[2].text == "beach boys" + assert doc[2].lemma_ == "boys" assert doc[2].is_stop - new_doc = Doc(doc.vocab, words=['beach boys']) + new_doc = Doc(doc.vocab, words=["beach boys"]) assert new_doc[0].is_stop @@ -222,21 +240,25 @@ def test_doc_api_sents_empty_string(en_tokenizer): def test_doc_api_runtime_error(en_tokenizer): # Example that caused run-time error while parsing Reddit + # fmt: off text = "67% of black households are single parent \n\n72% of all black babies born out of wedlock \n\n50% of all black kids don\u2019t finish high school" - deps = ['nsubj', 'prep', 'amod', 'pobj', 'ROOT', 'amod', 'attr', '', - 'nummod', 'prep', 'det', 'amod', 'pobj', 'acl', 'prep', 'prep', - 'pobj', '', 'nummod', 'prep', 
'det', 'amod', 'pobj', 'aux', 'neg', - 'ROOT', 'amod', 'dobj'] + deps = ["nsubj", "prep", "amod", "pobj", "ROOT", "amod", "attr", "", + "nummod", "prep", "det", "amod", "pobj", "acl", "prep", "prep", + "pobj", "", "nummod", "prep", "det", "amod", "pobj", "aux", "neg", + "ROOT", "amod", "dobj"] + # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps) nps = [] for np in doc.noun_chunks: - while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'): + while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): np = np[1:] if len(np) > 1: - nps.append((np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_)) + nps.append( + (np.start_char, np.end_char, np.root.tag_, np.text, np.root.ent_type_) + ) for np in nps: start, end, tag, lemma, ent_type = np doc.merge(start, end, tag=tag, lemma=lemma, ent_type=ent_type) @@ -244,57 +266,76 @@ def test_doc_api_runtime_error(en_tokenizer): def test_doc_api_right_edge(en_tokenizer): """Test for bug occurring from Unshift action, causing incorrect right edge""" + # fmt: off text = "I have proposed to myself, for the sake of such as live under the government of the Romans, to translate those books into the Greek tongue." heads = [2, 1, 0, -1, -1, -3, 15, 1, -2, -1, 1, -3, -1, -1, 1, -2, -1, 1, -2, -7, 1, -19, 1, -2, -3, 2, 1, -3, -26] + # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) - assert doc[6].text == 'for' + assert doc[6].text == "for" subtree = [w.text for w in doc[6].subtree] - assert subtree == ['for', 'the', 'sake', 'of', 'such', 'as', - 'live', 'under', 'the', 'government', 'of', 'the', 'Romans', ','] - assert doc[6].right_edge.text == ',' + assert subtree == [ + "for", + "the", + "sake", + "of", + "such", + "as", + "live", + "under", + "the", + "government", + "of", + "the", + "Romans", + ",", + ] + assert doc[6].right_edge.text == "," def test_doc_api_has_vector(): vocab = Vocab() vocab.reset_vectors(width=2) - vocab.set_vector('kitten', vector=numpy.asarray([0., 2.], dtype='f')) - doc = Doc(vocab, words=['kitten']) + vocab.set_vector("kitten", vector=numpy.asarray([0.0, 2.0], dtype="f")) + doc = Doc(vocab, words=["kitten"]) assert doc.has_vector def test_doc_api_similarity_match(): - doc = Doc(Vocab(), words=['a']) + doc = Doc(Vocab(), words=["a"]) with pytest.warns(None): assert doc.similarity(doc[0]) == 1.0 - assert doc.similarity(doc.vocab['a']) == 1.0 - doc2 = Doc(doc.vocab, words=['a', 'b', 'c']) + assert doc.similarity(doc.vocab["a"]) == 1.0 + doc2 = Doc(doc.vocab, words=["a", "b", "c"]) with pytest.warns(None): assert doc.similarity(doc2[:1]) == 1.0 assert doc.similarity(doc2) == 0.0 def test_lowest_common_ancestor(en_tokenizer): - tokens = en_tokenizer('the lazy dog slept') + tokens = en_tokenizer("the lazy dog slept") doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0]) lca = doc.get_lca_matrix() - assert(lca[1, 1] == 1) - assert(lca[0, 1] == 2) - assert(lca[1, 2] == 2) + assert lca[1, 1] == 1 + assert lca[0, 1] == 2 + assert lca[1, 2] == 2 def test_parse_tree(en_tokenizer): """Tests doc.print_tree() method.""" - text = 'I like New York in Autumn.' + text = "I like New York in Autumn." 
heads = [1, 0, 1, -2, -3, -1, -5] - tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'] + tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."] tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags) # full method parse_tree(text) is a trivial composition trees = doc.print_tree() assert len(trees) > 0 tree = trees[0] - assert all(k in list(tree.keys()) for k in ['word', 'lemma', 'NE', 'POS_fine', 'POS_coarse', 'arc', 'modifiers']) - assert tree['word'] == 'like' # check root is correct + assert all( + k in list(tree.keys()) + for k in ["word", "lemma", "NE", "POS_fine", "POS_coarse", "arc", "modifiers"] + ) + assert tree["word"] == "like" # check root is correct diff --git a/spacy/tests/doc/test_pickle_doc.py b/spacy/tests/doc/test_pickle_doc.py index 85133461b..f4bfb2f1e 100644 --- a/spacy/tests/doc/test_pickle_doc.py +++ b/spacy/tests/doc/test_pickle_doc.py @@ -7,37 +7,38 @@ from spacy.compat import pickle, unicode_ def test_pickle_single_doc(): nlp = Language() - doc = nlp('pickle roundtrip') + doc = nlp("pickle roundtrip") data = pickle.dumps(doc, 1) doc2 = pickle.loads(data) - assert doc2.text == 'pickle roundtrip' + assert doc2.text == "pickle roundtrip" def test_list_of_docs_pickles_efficiently(): nlp = Language() for i in range(10000): _ = nlp.vocab[unicode_(i)] - one_pickled = pickle.dumps(nlp('0'), -1) + one_pickled = pickle.dumps(nlp("0"), -1) docs = list(nlp.pipe(unicode_(i) for i in range(100))) many_pickled = pickle.dumps(docs, -1) assert len(many_pickled) < (len(one_pickled) * 2) many_unpickled = pickle.loads(many_pickled) - assert many_unpickled[0].text == '0' - assert many_unpickled[-1].text == '99' + assert many_unpickled[0].text == "0" + assert many_unpickled[-1].text == "99" assert len(many_unpickled) == 100 def test_user_data_from_disk(): nlp = Language() - doc = nlp('Hello') + doc = nlp("Hello") doc.user_data[(0, 1)] = False b = doc.to_bytes() doc2 = doc.__class__(doc.vocab).from_bytes(b) assert doc2.user_data[(0, 1)] == False + def test_user_data_unpickles(): nlp = Language() - doc = nlp('Hello') + doc = nlp("Hello") doc.user_data[(0, 1)] = False b = pickle.dumps(doc) doc2 = pickle.loads(b) @@ -46,10 +47,11 @@ def test_user_data_unpickles(): def test_hooks_unpickle(): def inner_func(d1, d2): - return 'hello!' + return "hello!" + nlp = Language() - doc = nlp('Hello') - doc.user_hooks['similarity'] = inner_func + doc = nlp("Hello") + doc.user_hooks["similarity"] = inner_func b = pickle.dumps(doc) doc2 = pickle.loads(b) - assert doc2.similarity(None) == 'hello!' + assert doc2.similarity(None) == "hello!" diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 86f05dc39..6fd22b3ff 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -11,10 +11,12 @@ from ..util import get_doc @pytest.fixture def doc(en_tokenizer): + # fmt: off text = "This is a sentence. This is another sentence. And a third." 
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1] - deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det', - 'attr', 'punct', 'ROOT', 'det', 'npadvmod', 'punct'] + deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det", + "attr", "punct", "ROOT", "det", "npadvmod", "punct"] + # fmt: on tokens = en_tokenizer(text) return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) @@ -39,17 +41,17 @@ def test_spans_sent_spans(doc): def test_spans_root(doc): span = doc[2:4] assert len(span) == 2 - assert span.text == 'a sentence' - assert span.root.text == 'sentence' - assert span.root.head.text == 'is' + assert span.text == "a sentence" + assert span.root.text == "sentence" + assert span.root.head.text == "is" def test_spans_string_fn(doc): span = doc[0:4] assert len(span) == 4 - assert span.text == 'This is a sentence' - assert span.upper_ == 'THIS IS A SENTENCE' - assert span.lower_ == 'this is a sentence' + assert span.text == "This is a sentence" + assert span.upper_ == "THIS IS A SENTENCE" + assert span.lower_ == "this is a sentence" def test_spans_root2(en_tokenizer): @@ -57,15 +59,15 @@ def test_spans_root2(en_tokenizer): heads = [0, 3, -1, -2, -4] tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) - assert doc[-2:].root.text == 'Carolina' + assert doc[-2:].root.text == "Carolina" def test_spans_span_sent(doc, doc_not_parsed): """Test span.sent property""" assert len(list(doc.sents)) - assert doc[:2].sent.root.text == 'is' - assert doc[:2].sent.text == 'This is a sentence .' - assert doc[6:7].sent.root.left_edge.text == 'This' + assert doc[:2].sent.root.text == "is" + assert doc[:2].sent.text == "This is a sentence ." + assert doc[6:7].sent.root.left_edge.text == "This" # test on manual sbd doc_not_parsed[0].is_sent_start = True doc_not_parsed[5].is_sent_start = True @@ -75,23 +77,23 @@ def test_spans_span_sent(doc, doc_not_parsed): def test_spans_lca_matrix(en_tokenizer): """Test span's lca matrix generation""" - tokens = en_tokenizer('the lazy dog slept') + tokens = en_tokenizer("the lazy dog slept") doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[2, 1, 1, 0]) lca = doc[:2].get_lca_matrix() - assert(lca[0, 0] == 0) - assert(lca[0, 1] == -1) - assert(lca[1, 0] == -1) - assert(lca[1, 1] == 1) + assert lca[0, 0] == 0 + assert lca[0, 1] == -1 + assert lca[1, 0] == -1 + assert lca[1, 1] == 1 def test_span_similarity_match(): - doc = Doc(Vocab(), words=['a', 'b', 'a', 'b']) + doc = Doc(Vocab(), words=["a", "b", "a", "b"]) span1 = doc[:2] span2 = doc[2:] with pytest.warns(None): assert span1.similarity(span2) == 1.0 assert span1.similarity(doc) == 0.0 - assert span1[:1].similarity(doc.vocab['a']) == 1.0 + assert span1[:1].similarity(doc.vocab["a"]) == 1.0 def test_spans_default_sentiment(en_tokenizer): @@ -102,8 +104,8 @@ def test_spans_default_sentiment(en_tokenizer): tokens.vocab[tokens[2].text].sentiment = -2.0 doc = Doc(tokens.vocab, words=[t.text for t in tokens]) assert doc[:2].sentiment == 3.0 / 2 - assert doc[-2:].sentiment == -2. / 2 - assert doc[:-1].sentiment == (3.+-2) / 3. 
+ assert doc[-2:].sentiment == -2.0 / 2 + assert doc[:-1].sentiment == (3.0 + -2) / 3.0 def test_spans_override_sentiment(en_tokenizer): @@ -113,7 +115,7 @@ def test_spans_override_sentiment(en_tokenizer): tokens.vocab[tokens[0].text].sentiment = 3.0 tokens.vocab[tokens[2].text].sentiment = -2.0 doc = Doc(tokens.vocab, words=[t.text for t in tokens]) - doc.user_span_hooks['sentiment'] = lambda span: 10.0 + doc.user_span_hooks["sentiment"] = lambda span: 10.0 assert doc[:2].sentiment == 10.0 assert doc[-2:].sentiment == 10.0 assert doc[:-1].sentiment == 10.0 @@ -132,10 +134,10 @@ def test_spans_are_hashable(en_tokenizer): def test_spans_by_character(doc): span1 = doc[1:-2] - span2 = doc.char_span(span1.start_char, span1.end_char, label='GPE') + span2 = doc.char_span(span1.start_char, span1.end_char, label="GPE") assert span1.start_char == span2.start_char assert span1.end_char == span2.end_char - assert span2.label_ == 'GPE' + assert span2.label_ == "GPE" def test_span_to_array(doc): @@ -151,12 +153,13 @@ def test_span_as_doc(doc): span_doc = span.as_doc() assert span.text == span_doc.text.strip() + def test_span_ents_property(doc): """Test span.ents for the """ doc.ents = [ - (doc.vocab.strings['PRODUCT'], 0, 1), - (doc.vocab.strings['PRODUCT'], 7, 8), - (doc.vocab.strings['PRODUCT'], 11, 14) + (doc.vocab.strings["PRODUCT"], 0, 1), + (doc.vocab.strings["PRODUCT"], 7, 8), + (doc.vocab.strings["PRODUCT"], 11, 14), ] assert len(list(doc.ents)) == 3 sentences = list(doc.sents) diff --git a/spacy/tests/doc/test_span_merge.py b/spacy/tests/doc/test_span_merge.py index 24ab17b8e..458e6bbe3 100644 --- a/spacy/tests/doc/test_span_merge.py +++ b/spacy/tests/doc/test_span_merge.py @@ -13,22 +13,23 @@ def test_spans_merge_tokens(en_tokenizer): tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) assert len(doc) == 4 - assert doc[0].head.text == 'Angeles' - assert doc[1].head.text == 'start' - doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', ent_type='GPE') + assert doc[0].head.text == "Angeles" + assert doc[1].head.text == "start" + doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", ent_type="GPE") assert len(doc) == 3 - assert doc[0].text == 'Los Angeles' - assert doc[0].head.text == 'start' + assert doc[0].text == "Los Angeles" + assert doc[0].head.text == "start" doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) assert len(doc) == 4 - assert doc[0].head.text == 'Angeles' - assert doc[1].head.text == 'start' - doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', label='GPE') + assert doc[0].head.text == "Angeles" + assert doc[1].head.text == "start" + doc.merge(0, len("Los Angeles"), tag="NNP", lemma="Los Angeles", label="GPE") assert len(doc) == 3 - assert doc[0].text == 'Los Angeles' - assert doc[0].head.text == 'start' - assert doc[0].ent_type_ == 'GPE' + assert doc[0].text == "Los Angeles" + assert doc[0].head.text == "start" + assert doc[0].ent_type_ == "GPE" + def test_spans_merge_heads(en_tokenizer): text = "I found a pilates class near work." 
@@ -37,8 +38,13 @@ def test_spans_merge_heads(en_tokenizer): doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) assert len(doc) == 8 - doc.merge(doc[3].idx, doc[4].idx + len(doc[4]), tag=doc[4].tag_, - lemma='pilates class', ent_type='O') + doc.merge( + doc[3].idx, + doc[4].idx + len(doc[4]), + tag=doc[4].tag_, + lemma="pilates class", + ent_type="O", + ) assert len(doc) == 7 assert doc[0].head.i == 1 assert doc[1].head.i == 1 @@ -55,8 +61,9 @@ def test_span_np_merges(en_tokenizer): doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) assert doc[4].head.i == 1 - doc.merge(doc[2].idx, doc[4].idx + len(doc[4]), tag='NP', lemma='tool', - ent_type='O') + doc.merge( + doc[2].idx, doc[4].idx + len(doc[4]), tag="NP", lemma="tool", ent_type="O" + ) assert doc[2].head.i == 1 text = "displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript." @@ -69,7 +76,6 @@ def test_span_np_merges(en_tokenizer): merged = doc.merge(start, end, tag=label, lemma=lemma, ent_type=label) assert merged != None, (start, end, label, lemma) - text = "One test with entities like New York City so the ents list is not void" heads = [1, 11, -1, -1, -1, 1, 1, -3, 4, 2, 1, 1, 0, -1, -2] tokens = en_tokenizer(text) @@ -80,15 +86,23 @@ def test_span_np_merges(en_tokenizer): def test_spans_entity_merge(en_tokenizer): + # fmt: off text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale.\n" heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2, -13, -1] - tags = ['NNP', 'NNP', 'VBZ', 'DT', 'VB', 'RP', 'NN', 'WP', 'VBZ', 'IN', 'NNP', 'CC', 'VBZ', 'NNP', 'NNP', '.', 'SP'] - ents = [(0, 2, 'PERSON'), (10, 11, 'GPE'), (13, 15, 'PERSON')] + tags = ["NNP", "NNP", "VBZ", "DT", "VB", "RP", "NN", "WP", "VBZ", "IN", "NNP", "CC", "VBZ", "NNP", "NNP", ".", "SP"] + ents = [(0, 2, "PERSON"), (10, 11, "GPE"), (13, 15, "PERSON")] + # fmt: on tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents) + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], heads=heads, tags=tags, ents=ents + ) assert len(doc) == 17 for ent in doc.ents: - label, lemma, type_ = (ent.root.tag_, ent.root.lemma_, max(w.ent_type_ for w in ent)) + label, lemma, type_ = ( + ent.root.tag_, + ent.root.lemma_, + max(w.ent_type_ for w in ent), + ) ent.merge(label=label, lemma=lemma, ent_type=type_) # check looping is ok assert len(doc) == 15 @@ -98,8 +112,10 @@ def test_spans_entity_merge_iob(): # Test entity IOB stays consistent after merging words = ["a", "b", "c", "d", "e"] doc = Doc(Vocab(), words=words) - doc.ents = [(doc.vocab.strings.add('ent-abc'), 0, 3), - (doc.vocab.strings.add('ent-d'), 3, 4)] + doc.ents = [ + (doc.vocab.strings.add("ent-abc"), 0, 3), + (doc.vocab.strings.add("ent-d"), 3, 4), + ] assert doc[0].ent_iob_ == "B" assert doc[1].ent_iob_ == "I" assert doc[2].ent_iob_ == "I" @@ -110,33 +126,37 @@ def test_spans_entity_merge_iob(): def test_spans_sentence_update_after_merge(en_tokenizer): + # fmt: off text = "Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale." 
heads = [1, 1, 0, 1, 2, -1, -4, -5, 1, 0, -1, -1, -3, -4, 1, -2, -7] deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr', 'punct', 'nsubj', 'ROOT', 'prep', 'pobj', 'cc', 'conj', 'compound', 'dobj', 'punct'] + # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) sent1, sent2 = list(doc.sents) init_len = len(sent1) init_len2 = len(sent2) - doc[0:2].merge(label='none', lemma='none', ent_type='none') - doc[-2:].merge(label='none', lemma='none', ent_type='none') + doc[0:2].merge(label="none", lemma="none", ent_type="none") + doc[-2:].merge(label="none", lemma="none", ent_type="none") assert len(sent1) == init_len - 1 assert len(sent2) == init_len2 - 1 def test_spans_subtree_size_check(en_tokenizer): + # fmt: off text = "Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale" heads = [1, 1, 0, 1, 2, -1, -4, 1, -2, -1, -1, -3, -10, 1, -2] - deps = ['compound', 'nsubj', 'ROOT', 'det', 'amod', 'prt', 'attr', - 'nsubj', 'relcl', 'prep', 'pobj', 'cc', 'conj', 'compound', - 'dobj'] + deps = ["compound", "nsubj", "ROOT", "det", "amod", "prt", "attr", + "nsubj", "relcl", "prep", "pobj", "cc", "conj", "compound", + "dobj"] + # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) sent1 = list(doc.sents)[0] init_len = len(list(sent1.root.subtree)) - doc[0:2].merge(label='none', lemma='none', ent_type='none') + doc[0:2].merge(label="none", lemma="none", ent_type="none") assert len(list(sent1.root.subtree)) == init_len - 1 diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 511ebbad0..5713c5c07 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -13,31 +13,35 @@ from ..util import get_doc @pytest.fixture def doc(en_tokenizer): + # fmt: off text = "This is a sentence. This is another sentence. And a third." heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 0, 1, -2, -1] - deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det', - 'attr', 'punct', 'ROOT', 'det', 'npadvmod', 'punct'] + deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det", + "attr", "punct", "ROOT", "det", "npadvmod", "punct"] + # fmt: on tokens = en_tokenizer(text) return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) def test_doc_token_api_strings(en_tokenizer): text = "Give it back! He pleaded." 
- pos = ['VERB', 'PRON', 'PART', 'PUNCT', 'PRON', 'VERB', 'PUNCT'] + pos = ["VERB", "PRON", "PART", "PUNCT", "PRON", "VERB", "PUNCT"] heads = [0, -1, -2, -3, 1, 0, -1] - deps = ['ROOT', 'dobj', 'prt', 'punct', 'nsubj', 'ROOT', 'punct'] + deps = ["ROOT", "dobj", "prt", "punct", "nsubj", "ROOT", "punct"] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps) - assert doc[0].orth_ == 'Give' - assert doc[0].text == 'Give' - assert doc[0].text_with_ws == 'Give ' - assert doc[0].lower_ == 'give' - assert doc[0].shape_ == 'Xxxx' - assert doc[0].prefix_ == 'G' - assert doc[0].suffix_ == 'ive' - assert doc[0].pos_ == 'VERB' - assert doc[0].dep_ == 'ROOT' + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], pos=pos, heads=heads, deps=deps + ) + assert doc[0].orth_ == "Give" + assert doc[0].text == "Give" + assert doc[0].text_with_ws == "Give " + assert doc[0].lower_ == "give" + assert doc[0].shape_ == "Xxxx" + assert doc[0].prefix_ == "G" + assert doc[0].suffix_ == "ive" + assert doc[0].pos_ == "VERB" + assert doc[0].dep_ == "ROOT" def test_doc_token_api_flags(en_tokenizer): @@ -53,7 +57,7 @@ def test_doc_token_api_flags(en_tokenizer): # TODO: Test more of these, esp. if a bug is found -@pytest.mark.parametrize('text', ["Give it back! He pleaded."]) +@pytest.mark.parametrize("text", ["Give it back! He pleaded."]) def test_doc_token_api_prob_inherited_from_vocab(en_tokenizer, text): word = text.split()[0] en_tokenizer.vocab[word].prob = -1 @@ -61,11 +65,11 @@ def test_doc_token_api_prob_inherited_from_vocab(en_tokenizer, text): assert tokens[0].prob != 0 -@pytest.mark.parametrize('text', ["one two"]) +@pytest.mark.parametrize("text", ["one two"]) def test_doc_token_api_str_builtin(en_tokenizer, text): tokens = en_tokenizer(text) - assert str(tokens[0]) == text.split(' ')[0] - assert str(tokens[1]) == text.split(' ')[1] + assert str(tokens[0]) == text.split(" ")[0] + assert str(tokens[1]) == text.split(" ")[1] def test_doc_token_api_is_properties(en_vocab): @@ -83,16 +87,16 @@ def test_doc_token_api_is_properties(en_vocab): def test_doc_token_api_vectors(): vocab = Vocab() vocab.reset_vectors(width=2) - vocab.set_vector('apples', vector=numpy.asarray([0., 2.], dtype='f')) - vocab.set_vector('oranges', vector=numpy.asarray([0., 1.], dtype='f')) - doc = Doc(vocab, words=['apples', 'oranges', 'oov']) + vocab.set_vector("apples", vector=numpy.asarray([0.0, 2.0], dtype="f")) + vocab.set_vector("oranges", vector=numpy.asarray([0.0, 1.0], dtype="f")) + doc = Doc(vocab, words=["apples", "oranges", "oov"]) assert doc.has_vector assert doc[0].has_vector assert doc[1].has_vector assert not doc[2].has_vector - apples_norm = (0*0 + 2*2) ** 0.5 - oranges_norm = (0*0 + 1*1) ** 0.5 - cosine = ((0*0) + (2*1)) / (apples_norm * oranges_norm) + apples_norm = (0 * 0 + 2 * 2) ** 0.5 + oranges_norm = (0 * 0 + 1 * 1) ** 0.5 + cosine = ((0 * 0) + (2 * 1)) / (apples_norm * oranges_norm) assert doc[0].similarity(doc[1]) == cosine @@ -165,7 +169,7 @@ def test_doc_token_api_head_setter(en_tokenizer): def test_is_sent_start(en_tokenizer): - doc = en_tokenizer('This is a sentence. This is another.') + doc = en_tokenizer("This is a sentence. 
This is another.") assert doc[5].is_sent_start is None doc[5].is_sent_start = True assert doc[5].is_sent_start is True @@ -174,17 +178,17 @@ def test_is_sent_start(en_tokenizer): def test_set_pos(): - doc = Doc(Vocab(), words=['hello', 'world']) - doc[0].pos_ = 'NOUN' - assert doc[0].pos_ == 'NOUN' + doc = Doc(Vocab(), words=["hello", "world"]) + doc[0].pos_ = "NOUN" + assert doc[0].pos_ == "NOUN" doc[1].pos = VERB - assert doc[1].pos_ == 'VERB' + assert doc[1].pos_ == "VERB" def test_tokens_sent(doc): """Test token.sent property""" assert len(list(doc.sents)) == 3 - assert doc[1].sent.text == 'This is a sentence .' - assert doc[7].sent.text == 'This is another sentence .' - assert doc[1].sent.root.left_edge.text == 'This' - assert doc[7].sent.root.left_edge.text == 'This' + assert doc[1].sent.text == "This is a sentence ." + assert doc[7].sent.text == "This is another sentence ." + assert doc[1].sent.root.left_edge.text == "This" + assert doc[7].sent.root.left_edge.text == "This" diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py index 0c3143245..8b077525a 100644 --- a/spacy/tests/doc/test_underscore.py +++ b/spacy/tests/doc/test_underscore.py @@ -20,7 +20,7 @@ def test_doc_underscore_getattr_setattr(): doc = Mock() doc.doc = doc doc.user_data = {} - Underscore.doc_extensions['hello'] = (False, None, None, None) + Underscore.doc_extensions["hello"] = (False, None, None, None) doc._ = Underscore(Underscore.doc_extensions, doc) assert doc._.hello == False doc._.hello = True @@ -29,8 +29,9 @@ def test_doc_underscore_getattr_setattr(): def test_create_span_underscore(): span = Mock(doc=Mock(), start=0, end=2) - uscore = Underscore(Underscore.span_extensions, span, - start=span.start, end=span.end) + uscore = Underscore( + Underscore.span_extensions, span, start=span.start, end=span.end + ) assert uscore._doc is span.doc assert uscore._start is span.start assert uscore._end is span.end @@ -38,60 +39,70 @@ def test_create_span_underscore(): def test_span_underscore_getter_setter(): span = Mock(doc=Mock(), start=0, end=2) - Underscore.span_extensions['hello'] = (None, None, - lambda s: (s.start, 'hi'), - lambda s, value: setattr(s, 'start', - value)) - span._ = Underscore(Underscore.span_extensions, span, - start=span.start, end=span.end) + Underscore.span_extensions["hello"] = ( + None, + None, + lambda s: (s.start, "hi"), + lambda s, value: setattr(s, "start", value), + ) + span._ = Underscore( + Underscore.span_extensions, span, start=span.start, end=span.end + ) - assert span._.hello == (0, 'hi') + assert span._.hello == (0, "hi") span._.hello = 1 - assert span._.hello == (1, 'hi') + assert span._.hello == (1, "hi") def test_token_underscore_method(): - token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: 'cheese') - Underscore.token_extensions['hello'] = (None, token.say_cheese, - None, None) + token = Mock(doc=Mock(), idx=7, say_cheese=lambda token: "cheese") + Underscore.token_extensions["hello"] = (None, token.say_cheese, None, None) token._ = Underscore(Underscore.token_extensions, token, start=token.idx) - assert token._.hello() == 'cheese' + assert token._.hello() == "cheese" -@pytest.mark.parametrize('obj', [Doc, Span, Token]) +@pytest.mark.parametrize("obj", [Doc, Span, Token]) def test_doc_underscore_remove_extension(obj): - ext_name = 'to_be_removed' + ext_name = "to_be_removed" obj.set_extension(ext_name, default=False) assert obj.has_extension(ext_name) obj.remove_extension(ext_name) assert not obj.has_extension(ext_name) 
-@pytest.mark.parametrize('obj', [Doc, Span, Token]) +@pytest.mark.parametrize("obj", [Doc, Span, Token]) def test_underscore_raises_for_dup(obj): - obj.set_extension('test', default=None) + obj.set_extension("test", default=None) with pytest.raises(ValueError): - obj.set_extension('test', default=None) + obj.set_extension("test", default=None) -@pytest.mark.parametrize('invalid_kwargs', [ - {'getter': None, 'setter': lambda: None}, - {'default': None, 'method': lambda: None, 'getter': lambda: None}, - {'setter': lambda: None}, - {'default': None, 'method': lambda: None}, - {'getter': True}]) +@pytest.mark.parametrize( + "invalid_kwargs", + [ + {"getter": None, "setter": lambda: None}, + {"default": None, "method": lambda: None, "getter": lambda: None}, + {"setter": lambda: None}, + {"default": None, "method": lambda: None}, + {"getter": True}, + ], +) def test_underscore_raises_for_invalid(invalid_kwargs): - invalid_kwargs['force'] = True + invalid_kwargs["force"] = True with pytest.raises(ValueError): - Doc.set_extension('test', **invalid_kwargs) + Doc.set_extension("test", **invalid_kwargs) -@pytest.mark.parametrize('valid_kwargs', [ - {'getter': lambda: None}, - {'getter': lambda: None, 'setter': lambda: None}, - {'default': 'hello'}, - {'default': None}, - {'method': lambda: None}]) +@pytest.mark.parametrize( + "valid_kwargs", + [ + {"getter": lambda: None}, + {"getter": lambda: None, "setter": lambda: None}, + {"default": "hello"}, + {"default": None}, + {"method": lambda: None}, + ], +) def test_underscore_accepts_valid(valid_kwargs): - valid_kwargs['force'] = True - Doc.set_extension('test', **valid_kwargs) + valid_kwargs["force"] = True + Doc.set_extension("test", **valid_kwargs) diff --git a/spacy/tests/lang/ar/test_exceptions.py b/spacy/tests/lang/ar/test_exceptions.py index 323118002..3cfc380d2 100644 --- a/spacy/tests/lang/ar/test_exceptions.py +++ b/spacy/tests/lang/ar/test_exceptions.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text', ["ق.م", "إلخ", "ص.ب", "ت."]) +@pytest.mark.parametrize("text", ["ق.م", "إلخ", "ص.ب", "ت."]) def test_ar_tokenizer_handles_abbr(ar_tokenizer, text): tokens = ar_tokenizer(text) assert len(tokens) == 1 @@ -18,7 +18,7 @@ def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer): assert tokens[6].lemma_ == "قبل الميلاد" -def test_ar_tokenizer_handles_exc_in_text(ar_tokenizer): +def test_ar_tokenizer_handles_exc_in_text_2(ar_tokenizer): text = "يبلغ طول مضيق طارق 14كم " tokens = ar_tokenizer(text) assert len(tokens) == 6 diff --git a/spacy/tests/lang/bn/test_tokenizer.py b/spacy/tests/lang/bn/test_tokenizer.py index 772fc07fa..5c7cf8198 100644 --- a/spacy/tests/lang/bn/test_tokenizer.py +++ b/spacy/tests/lang/bn/test_tokenizer.py @@ -6,16 +6,22 @@ import pytest TESTCASES = [ # punctuation tests - ('আমি বাংলায় গান গাই!', ['আমি', 'বাংলায়', 'গান', 'গাই', '!']), - ('আমি বাংলায় কথা কই।', ['আমি', 'বাংলায়', 'কথা', 'কই', '।']), - ('বসুন্ধরা জনসম্মুখে দোষ স্বীকার করলো না?', ['বসুন্ধরা', 'জনসম্মুখে', 'দোষ', 'স্বীকার', 'করলো', 'না', '?']), - ('টাকা থাকলে কি না হয়!', ['টাকা', 'থাকলে', 'কি', 'না', 'হয়', '!']), + ("আমি বাংলায় গান গাই!", ["আমি", "বাংলায়", "গান", "গাই", "!"]), + ("আমি বাংলায় কথা কই।", ["আমি", "বাংলায়", "কথা", "কই", "।"]), + ( + "বসুন্ধরা জনসম্মুখে দোষ স্বীকার করলো না?", + ["বসুন্ধরা", "জনসম্মুখে", "দোষ", "স্বীকার", "করলো", "না", "?"], + ), + ("টাকা থাকলে কি না হয়!", ["টাকা", "থাকলে", "কি", "না", "হয়", "!"]), # abbreviations - ('ডঃ খালেদ বললেন ঢাকায় ৩৫ ডিগ্রি সে.।', ['ডঃ', 'খালেদ', 
'বললেন', 'ঢাকায়', '৩৫', 'ডিগ্রি', 'সে.', '।']) + ( + "ডঃ খালেদ বললেন ঢাকায় ৩৫ ডিগ্রি সে.।", + ["ডঃ", "খালেদ", "বললেন", "ঢাকায়", "৩৫", "ডিগ্রি", "সে.", "।"], + ), ] -@pytest.mark.parametrize('text,expected_tokens', TESTCASES) +@pytest.mark.parametrize("text,expected_tokens", TESTCASES) def test_bn_tokenizer_handles_testcases(bn_tokenizer, text, expected_tokens): tokens = bn_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py index 2e6493210..87ea5acd6 100644 --- a/spacy/tests/lang/da/test_exceptions.py +++ b/spacy/tests/lang/da/test_exceptions.py @@ -4,19 +4,19 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text', ["ca.", "m.a.o.", "Jan.", "Dec.", "kr.", "jf."]) +@pytest.mark.parametrize("text", ["ca.", "m.a.o.", "Jan.", "Dec.", "kr.", "jf."]) def test_da_tokenizer_handles_abbr(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 1 -@pytest.mark.parametrize('text', ["Jul.", "jul.", "Tor.", "Tors."]) +@pytest.mark.parametrize("text", ["Jul.", "jul.", "Tor.", "Tors."]) def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 2 -@pytest.mark.parametrize('text', ["1.", "10.", "31."]) +@pytest.mark.parametrize("text", ["1.", "10.", "31."]) def test_da_tokenizer_handles_dates(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 1 @@ -37,8 +37,9 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer): assert tokens[7].text == "." -@pytest.mark.parametrize('text,norm', [ - ("akvarium", "akvarie"), ("bedstemoder", "bedstemor")]) +@pytest.mark.parametrize( + "text,norm", [("akvarium", "akvarie"), ("bedstemoder", "bedstemor")] +) def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm): tokens = da_tokenizer(text) assert tokens[0].norm_ == norm diff --git a/spacy/tests/lang/da/test_lemma.py b/spacy/tests/lang/da/test_lemma.py index 3cfd7f329..5f9b371c0 100644 --- a/spacy/tests/lang/da/test_lemma.py +++ b/spacy/tests/lang/da/test_lemma.py @@ -4,11 +4,15 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('string,lemma', [ - ('affaldsgruppernes', 'affaldsgruppe'), - ('detailhandelsstrukturernes', 'detailhandelsstruktur'), - ('kolesterols', 'kolesterol'), - ('åsyns', 'åsyn')]) +@pytest.mark.parametrize( + "string,lemma", + [ + ("affaldsgruppernes", "affaldsgruppe"), + ("detailhandelsstrukturernes", "detailhandelsstruktur"), + ("kolesterols", "kolesterol"), + ("åsyns", "åsyn"), + ], +) def test_da_lemmatizer_lookup_assigns(da_tokenizer, string, lemma): tokens = da_tokenizer(string) assert tokens[0].lemma_ == lemma diff --git a/spacy/tests/lang/da/test_prefix_suffix_infix.py b/spacy/tests/lang/da/test_prefix_suffix_infix.py index d313aebe5..8b43bf360 100644 --- a/spacy/tests/lang/da/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/da/test_prefix_suffix_infix.py @@ -4,19 +4,19 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text', ["(under)"]) +@pytest.mark.parametrize("text", ["(under)"]) def test_da_tokenizer_splits_no_special(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["ta'r", "Søren's", "Lars'"]) +@pytest.mark.parametrize("text", ["ta'r", "Søren's", "Lars'"]) def test_da_tokenizer_handles_no_punct(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 1 -@pytest.mark.parametrize('text', 
["(ta'r"]) +@pytest.mark.parametrize("text", ["(ta'r"]) def test_da_tokenizer_splits_prefix_punct(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 2 @@ -24,7 +24,7 @@ def test_da_tokenizer_splits_prefix_punct(da_tokenizer, text): assert tokens[1].text == "ta'r" -@pytest.mark.parametrize('text', ["ta'r)"]) +@pytest.mark.parametrize("text", ["ta'r)"]) def test_da_tokenizer_splits_suffix_punct(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 2 @@ -32,15 +32,16 @@ def test_da_tokenizer_splits_suffix_punct(da_tokenizer, text): assert tokens[1].text == ")" -@pytest.mark.parametrize('text,expected', [ - ("(ta'r)", ["(", "ta'r", ")"]), ("'ta'r'", ["'", "ta'r", "'"])]) +@pytest.mark.parametrize( + "text,expected", [("(ta'r)", ["(", "ta'r", ")"]), ("'ta'r'", ["'", "ta'r", "'"])] +) def test_da_tokenizer_splits_even_wrap(da_tokenizer, text, expected): tokens = da_tokenizer(text) assert len(tokens) == len(expected) assert [t.text for t in tokens] == expected -@pytest.mark.parametrize('text', ["(ta'r?)"]) +@pytest.mark.parametrize("text", ["(ta'r?)"]) def test_da_tokenizer_splits_uneven_wrap(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 4 @@ -50,15 +51,17 @@ def test_da_tokenizer_splits_uneven_wrap(da_tokenizer, text): assert tokens[3].text == ")" -@pytest.mark.parametrize('text,expected', [ - ("f.eks.", ["f.eks."]), ("fe.", ["fe", "."]), ("(f.eks.", ["(", "f.eks."])]) +@pytest.mark.parametrize( + "text,expected", + [("f.eks.", ["f.eks."]), ("fe.", ["fe", "."]), ("(f.eks.", ["(", "f.eks."])], +) def test_da_tokenizer_splits_prefix_interact(da_tokenizer, text, expected): tokens = da_tokenizer(text) assert len(tokens) == len(expected) assert [t.text for t in tokens] == expected -@pytest.mark.parametrize('text', ["f.eks.)"]) +@pytest.mark.parametrize("text", ["f.eks.)"]) def test_da_tokenizer_splits_suffix_interact(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 2 @@ -66,7 +69,7 @@ def test_da_tokenizer_splits_suffix_interact(da_tokenizer, text): assert tokens[1].text == ")" -@pytest.mark.parametrize('text', ["(f.eks.)"]) +@pytest.mark.parametrize("text", ["(f.eks.)"]) def test_da_tokenizer_splits_even_wrap_interact(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 3 @@ -75,7 +78,7 @@ def test_da_tokenizer_splits_even_wrap_interact(da_tokenizer, text): assert tokens[2].text == ")" -@pytest.mark.parametrize('text', ["(f.eks.?)"]) +@pytest.mark.parametrize("text", ["(f.eks.?)"]) def test_da_tokenizer_splits_uneven_wrap_interact(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 4 @@ -85,19 +88,19 @@ def test_da_tokenizer_splits_uneven_wrap_interact(da_tokenizer, text): assert tokens[3].text == ")" -@pytest.mark.parametrize('text', ["0,1-13,5", "0,0-0,1", "103,27-300", "1/2-3/4"]) +@pytest.mark.parametrize("text", ["0,1-13,5", "0,0-0,1", "103,27-300", "1/2-3/4"]) def test_da_tokenizer_handles_numeric_range(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 1 -@pytest.mark.parametrize('text', ["sort.Gul", "Hej.Verden"]) +@pytest.mark.parametrize("text", ["sort.Gul", "Hej.Verden"]) def test_da_tokenizer_splits_period_infix(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["Hej,Verden", "en,to"]) +@pytest.mark.parametrize("text", ["Hej,Verden", "en,to"]) def test_da_tokenizer_splits_comma_infix(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 3 @@ -106,20 +109,25 @@ def 
test_da_tokenizer_splits_comma_infix(da_tokenizer, text): assert tokens[2].text == text.split(",")[1] -@pytest.mark.parametrize('text', ["sort...Gul", "sort...gul"]) +@pytest.mark.parametrize("text", ["sort...Gul", "sort...gul"]) def test_da_tokenizer_splits_ellipsis_infix(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ['gå-på-mod', '4-hjulstræk', '100-Pfennig-frimærke', 'TV-2-spots', 'trofæ-vaeggen']) +@pytest.mark.parametrize( + "text", + ["gå-på-mod", "4-hjulstræk", "100-Pfennig-frimærke", "TV-2-spots", "trofæ-vaeggen"], +) def test_da_tokenizer_keeps_hyphens(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 1 def test_da_tokenizer_splits_double_hyphen_infix(da_tokenizer): - tokens = da_tokenizer("Mange regler--eksempelvis bindestregs-reglerne--er komplicerede.") + tokens = da_tokenizer( + "Mange regler--eksempelvis bindestregs-reglerne--er komplicerede." + ) assert len(tokens) == 9 assert tokens[0].text == "Mange" assert tokens[1].text == "regler" @@ -132,7 +140,9 @@ def test_da_tokenizer_splits_double_hyphen_infix(da_tokenizer): def test_da_tokenizer_handles_posessives_and_contractions(da_tokenizer): - tokens = da_tokenizer("'DBA's, Lars' og Liz' bil sku' sgu' ik' ha' en bule, det ka' han ik' li' mere', sagde hun.") + tokens = da_tokenizer( + "'DBA's, Lars' og Liz' bil sku' sgu' ik' ha' en bule, det ka' han ik' li' mere', sagde hun." + ) assert len(tokens) == 25 assert tokens[0].text == "'" assert tokens[1].text == "DBA's" diff --git a/spacy/tests/lang/da/test_text.py b/spacy/tests/lang/da/test_text.py index d2f0bd0c2..07b134e2d 100644 --- a/spacy/tests/lang/da/test_text.py +++ b/spacy/tests/lang/da/test_text.py @@ -15,17 +15,29 @@ Rundt om ager og eng var der store skove, og midt i skovene dybe søer; jo, der assert len(tokens) == 84 -@pytest.mark.parametrize('text,match', [ - ('10', True), ('1', True), ('10.000', True), ('10.00', True), - ('999,0', True), ('en', True), ('treoghalvfemsindstyvende', True), ('hundrede', True), - ('hund', False), (',', False), ('1/2', True)]) +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10.000", True), + ("10.00", True), + ("999,0", True), + ("en", True), + ("treoghalvfemsindstyvende", True), + ("hundrede", True), + ("hund", False), + (",", False), + ("1/2", True), + ], +) def test_lex_attrs_like_number(da_tokenizer, text, match): tokens = da_tokenizer(text) assert len(tokens) == 1 assert tokens[0].like_num == match -@pytest.mark.parametrize('word', ['elleve', 'første']) +@pytest.mark.parametrize("word", ["elleve", "første"]) def test_da_lex_attrs_capitals(word): assert like_num(word) assert like_num(word.upper()) diff --git a/spacy/tests/lang/de/test_exceptions.py b/spacy/tests/lang/de/test_exceptions.py index e66eeb781..ccdc31829 100644 --- a/spacy/tests/lang/de/test_exceptions.py +++ b/spacy/tests/lang/de/test_exceptions.py @@ -4,13 +4,13 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text', ["auf'm", "du's", "über'm", "wir's"]) +@pytest.mark.parametrize("text", ["auf'm", "du's", "über'm", "wir's"]) def test_de_tokenizer_splits_contractions(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 2 -@pytest.mark.parametrize('text', ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."]) +@pytest.mark.parametrize("text", ["z.B.", "d.h.", "Jan.", "Dez.", "Chr."]) def test_de_tokenizer_handles_abbr(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 1 @@ -24,14 +24,16 @@ def 
test_de_tokenizer_handles_exc_in_text(de_tokenizer): assert tokens[2].lemma_ == "zur Zeit" -@pytest.mark.parametrize('text,norms', [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])]) +@pytest.mark.parametrize( + "text,norms", [("vor'm", ["vor", "dem"]), ("du's", ["du", "es"])] +) def test_de_tokenizer_norm_exceptions(de_tokenizer, text, norms): tokens = de_tokenizer(text) assert [token.norm_ for token in tokens] == norms @pytest.mark.xfail -@pytest.mark.parametrize('text,norm', [("daß", "dass")]) +@pytest.mark.parametrize("text,norm", [("daß", "dass")]) def test_de_lex_attrs_norm_exceptions(de_tokenizer, text, norm): tokens = de_tokenizer(text) assert tokens[0].norm_ == norm diff --git a/spacy/tests/lang/de/test_lemma.py b/spacy/tests/lang/de/test_lemma.py index 6c55ed76d..605887085 100644 --- a/spacy/tests/lang/de/test_lemma.py +++ b/spacy/tests/lang/de/test_lemma.py @@ -4,13 +4,17 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('string,lemma', [ - ('Abgehängten', 'Abgehängte'), - ('engagierte', 'engagieren'), - ('schließt', 'schließen'), - ('vorgebenden', 'vorgebend'), - ('die', 'der'), - ('Die', 'der')]) +@pytest.mark.parametrize( + "string,lemma", + [ + ("Abgehängten", "Abgehängte"), + ("engagierte", "engagieren"), + ("schließt", "schließen"), + ("vorgebenden", "vorgebend"), + ("die", "der"), + ("Die", "der"), + ], +) def test_de_lemmatizer_lookup_assigns(de_tokenizer, string, lemma): tokens = de_tokenizer(string) assert tokens[0].lemma_ == lemma diff --git a/spacy/tests/lang/de/test_parser.py b/spacy/tests/lang/de/test_parser.py index 1dc79c28f..5c8694da3 100644 --- a/spacy/tests/lang/de/test_parser.py +++ b/spacy/tests/lang/de/test_parser.py @@ -7,10 +7,12 @@ from ...util import get_doc def test_de_parser_noun_chunks_standard_de(de_tokenizer): text = "Eine Tasse steht auf dem Tisch." heads = [1, 1, 0, -1, 1, -2, -4] - tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', '$.'] - deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'punct'] + tags = ["ART", "NN", "VVFIN", "APPR", "ART", "NN", "$."] + deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "punct"] tokens = de_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads) + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads + ) chunks = list(doc.noun_chunks) assert len(chunks) == 2 assert chunks[0].text_with_ws == "Eine Tasse " @@ -20,10 +22,12 @@ def test_de_parser_noun_chunks_standard_de(de_tokenizer): def test_de_extended_chunk(de_tokenizer): text = "Die Sängerin singt mit einer Tasse Kaffee Arien." 
heads = [1, 1, 0, -1, 1, -2, -1, -5, -6] - tags = ['ART', 'NN', 'VVFIN', 'APPR', 'ART', 'NN', 'NN', 'NN', '$.'] - deps = ['nk', 'sb', 'ROOT', 'mo', 'nk', 'nk', 'nk', 'oa', 'punct'] + tags = ["ART", "NN", "VVFIN", "APPR", "ART", "NN", "NN", "NN", "$."] + deps = ["nk", "sb", "ROOT", "mo", "nk", "nk", "nk", "oa", "punct"] tokens = de_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads) + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads + ) chunks = list(doc.noun_chunks) assert len(chunks) == 3 assert chunks[0].text_with_ws == "Die Sängerin " diff --git a/spacy/tests/lang/de/test_prefix_suffix_infix.py b/spacy/tests/lang/de/test_prefix_suffix_infix.py index b76fb6e3d..13e109395 100644 --- a/spacy/tests/lang/de/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/de/test_prefix_suffix_infix.py @@ -4,79 +4,79 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text', ["(unter)"]) +@pytest.mark.parametrize("text", ["(unter)"]) def test_de_tokenizer_splits_no_special(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["unter'm"]) +@pytest.mark.parametrize("text", ["unter'm"]) def test_de_tokenizer_splits_no_punct(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 2 -@pytest.mark.parametrize('text', ["(unter'm"]) +@pytest.mark.parametrize("text", ["(unter'm"]) def test_de_tokenizer_splits_prefix_punct(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["unter'm)"]) +@pytest.mark.parametrize("text", ["unter'm)"]) def test_de_tokenizer_splits_suffix_punct(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["(unter'm)"]) +@pytest.mark.parametrize("text", ["(unter'm)"]) def test_de_tokenizer_splits_even_wrap(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 4 -@pytest.mark.parametrize('text', ["(unter'm?)"]) +@pytest.mark.parametrize("text", ["(unter'm?)"]) def test_de_tokenizer_splits_uneven_wrap(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 5 -@pytest.mark.parametrize('text,length', [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)]) +@pytest.mark.parametrize("text,length", [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)]) def test_de_tokenizer_splits_prefix_interact(de_tokenizer, text, length): tokens = de_tokenizer(text) assert len(tokens) == length -@pytest.mark.parametrize('text', ["z.B.)"]) +@pytest.mark.parametrize("text", ["z.B.)"]) def test_de_tokenizer_splits_suffix_interact(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 2 -@pytest.mark.parametrize('text', ["(z.B.)"]) +@pytest.mark.parametrize("text", ["(z.B.)"]) def test_de_tokenizer_splits_even_wrap_interact(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["(z.B.?)"]) +@pytest.mark.parametrize("text", ["(z.B.?)"]) def test_de_tokenizer_splits_uneven_wrap_interact(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 4 -@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) +@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"]) def test_de_tokenizer_splits_numeric_range(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["blau.Rot", "Hallo.Welt"]) +@pytest.mark.parametrize("text", ["blau.Rot", "Hallo.Welt"]) 
def test_de_tokenizer_splits_period_infix(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["Hallo,Welt", "eins,zwei"]) +@pytest.mark.parametrize("text", ["Hallo,Welt", "eins,zwei"]) def test_de_tokenizer_splits_comma_infix(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 3 @@ -85,13 +85,13 @@ def test_de_tokenizer_splits_comma_infix(de_tokenizer, text): assert tokens[2].text == text.split(",")[1] -@pytest.mark.parametrize('text', ["blau...Rot", "blau...rot"]) +@pytest.mark.parametrize("text", ["blau...Rot", "blau...rot"]) def test_de_tokenizer_splits_ellipsis_infix(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ['Islam-Konferenz', 'Ost-West-Konflikt']) +@pytest.mark.parametrize("text", ["Islam-Konferenz", "Ost-West-Konflikt"]) def test_de_tokenizer_keeps_hyphens(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 1 diff --git a/spacy/tests/lang/de/test_text.py b/spacy/tests/lang/de/test_text.py index 7f4097939..b3fb1eaa5 100644 --- a/spacy/tests/lang/de/test_text.py +++ b/spacy/tests/lang/de/test_text.py @@ -22,19 +22,27 @@ Umfang kläglich dünnen Beine flimmerten ihm hilflos vor den Augen. assert len(tokens) == 109 -@pytest.mark.parametrize('text', [ - "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", - "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", - "Kraftfahrzeug-Haftpflichtversicherung", - "Vakuum-Mittelfrequenz-Induktionsofen"]) +@pytest.mark.parametrize( + "text", + [ + "Donaudampfschifffahrtsgesellschaftskapitänsanwärterposten", + "Rindfleischetikettierungsüberwachungsaufgabenübertragungsgesetz", + "Kraftfahrzeug-Haftpflichtversicherung", + "Vakuum-Mittelfrequenz-Induktionsofen", + ], +) def test_de_tokenizer_handles_long_words(de_tokenizer, text): tokens = de_tokenizer(text) assert len(tokens) == 1 -@pytest.mark.parametrize('text,length', [ - ("»Was ist mit mir geschehen?«, dachte er.", 12), - ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. ", 15)]) +@pytest.mark.parametrize( + "text,length", + [ + ("»Was ist mit mir geschehen?«, dachte er.", 12), + ("“Dies frühzeitige Aufstehen”, dachte er, “macht einen ganz blödsinnig. 
", 15), + ], +) def test_de_tokenizer_handles_examples(de_tokenizer, text, length): tokens = de_tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/lang/el/test_exception.py b/spacy/tests/lang/el/test_exception.py index ef265afee..b8d10fb69 100644 --- a/spacy/tests/lang/el/test_exception.py +++ b/spacy/tests/lang/el/test_exception.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text', ["αριθ.", "τρισ.", "δισ.", "σελ."]) +@pytest.mark.parametrize("text", ["αριθ.", "τρισ.", "δισ.", "σελ."]) def test_el_tokenizer_handles_abbr(el_tokenizer, text): tokens = el_tokenizer(text) assert len(tokens) == 1 diff --git a/spacy/tests/lang/el/test_text.py b/spacy/tests/lang/el/test_text.py index 79b0b23ac..a6395ab4a 100644 --- a/spacy/tests/lang/el/test_text.py +++ b/spacy/tests/lang/el/test_text.py @@ -13,12 +13,22 @@ def test_el_tokenizer_handles_long_text(el_tokenizer): assert len(tokens) == 54 -@pytest.mark.parametrize('text,length',[ - ("Διοικητικά η Ελλάδα διαιρείται σε 13 Περιφέρειες.", 8), - ("Η εκπαίδευση στην Ελλάδα χωρίζεται κυρίως σε τρία επίπεδα.", 10), - ("Η Ελλάδα είναι μία από τις χώρες της Ευρωπαϊκής Ένωσης (ΕΕ) που διαθέτει σηµαντικό ορυκτό πλούτο.", 19), - ("Η ναυτιλία αποτέλεσε ένα σημαντικό στοιχείο της Ελληνικής οικονομικής δραστηριότητας από τα αρχαία χρόνια.", 15), - ("Η Ελλάδα είναι μέλος σε αρκετούς διεθνείς οργανισμούς.", 9)]) -def test_el_tokenizer_handles_cnts(el_tokenizer,text, length): +@pytest.mark.parametrize( + "text,length", + [ + ("Διοικητικά η Ελλάδα διαιρείται σε 13 Περιφέρειες.", 8), + ("Η εκπαίδευση στην Ελλάδα χωρίζεται κυρίως σε τρία επίπεδα.", 10), + ( + "Η Ελλάδα είναι μία από τις χώρες της Ευρωπαϊκής Ένωσης (ΕΕ) που διαθέτει σηµαντικό ορυκτό πλούτο.", + 19, + ), + ( + "Η ναυτιλία αποτέλεσε ένα σημαντικό στοιχείο της Ελληνικής οικονομικής δραστηριότητας από τα αρχαία χρόνια.", + 15, + ), + ("Η Ελλάδα είναι μέλος σε αρκετούς διεθνείς οργανισμούς.", 9), + ], +) +def test_el_tokenizer_handles_cnts(el_tokenizer, text, length): tokens = el_tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/lang/en/test_customized_tokenizer.py b/spacy/tests/lang/en/test_customized_tokenizer.py index c7efcb4ee..9003292ce 100644 --- a/spacy/tests/lang/en/test_customized_tokenizer.py +++ b/spacy/tests/lang/en/test_customized_tokenizer.py @@ -12,29 +12,66 @@ from spacy.util import compile_infix_regex def custom_en_tokenizer(en_vocab): prefix_re = compile_prefix_regex(English.Defaults.prefixes) suffix_re = compile_suffix_regex(English.Defaults.suffixes) - custom_infixes = ['\.\.\.+', - '(?<=[0-9])-(?=[0-9])', - # '(?<=[0-9]+),(?=[0-9]+)', - '[0-9]+(,[0-9]+)+', - '[\[\]!&:,()\*—–\/-]'] + custom_infixes = [ + "\.\.\.+", + "(?<=[0-9])-(?=[0-9])", + # '(?<=[0-9]+),(?=[0-9]+)', + "[0-9]+(,[0-9]+)+", + "[\[\]!&:,()\*—–\/-]", + ] infix_re = compile_infix_regex(custom_infixes) - return Tokenizer(en_vocab, - English.Defaults.tokenizer_exceptions, - prefix_re.search, - suffix_re.search, - infix_re.finditer, - token_match=None) + return Tokenizer( + en_vocab, + English.Defaults.tokenizer_exceptions, + prefix_re.search, + suffix_re.search, + infix_re.finditer, + token_match=None, + ) def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer): sentence = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion." 
context = [word.text for word in custom_en_tokenizer(sentence)] - assert context == ['The', '8', 'and', '10', '-', 'county', 'definitions', - 'are', 'not', 'used', 'for', 'the', 'greater', - 'Southern', 'California', 'Megaregion', '.'] + assert context == [ + "The", + "8", + "and", + "10", + "-", + "county", + "definitions", + "are", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ] # the trailing '-' may cause Assertion Error sentence = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion." context = [word.text for word in custom_en_tokenizer(sentence)] - assert context == ['The', '8', '-', 'and', '10', '-', 'county', - 'definitions', 'are', 'not', 'used', 'for', 'the', - 'greater', 'Southern', 'California', 'Megaregion', '.'] + assert context == [ + "The", + "8", + "-", + "and", + "10", + "-", + "county", + "definitions", + "are", + "not", + "used", + "for", + "the", + "greater", + "Southern", + "California", + "Megaregion", + ".", + ] diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index 3fc14c59d..51b9ca6b2 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -15,13 +15,15 @@ def test_en_tokenizer_handles_basic_contraction(en_tokenizer): assert tokens[4].text == "!" -@pytest.mark.parametrize('text', ["`ain't", '''"isn't''', "can't!"]) +@pytest.mark.parametrize("text", ["`ain't", """"isn't""", "can't!"]) def test_en_tokenizer_handles_basic_contraction_punct(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text_poss,text', [("Robin's", "Robin"), ("Alexis's", "Alexis")]) +@pytest.mark.parametrize( + "text_poss,text", [("Robin's", "Robin"), ("Alexis's", "Alexis")] +) def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text): tokens = en_tokenizer(text_poss) assert len(tokens) == 2 @@ -29,7 +31,7 @@ def test_en_tokenizer_handles_poss_contraction(en_tokenizer, text_poss, text): assert tokens[1].text == "'s" -@pytest.mark.parametrize('text', ["schools'", "Alexis'"]) +@pytest.mark.parametrize("text", ["schools'", "Alexis'"]) def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 @@ -37,14 +39,14 @@ def test_en_tokenizer_splits_trailing_apos(en_tokenizer, text): assert tokens[1].text == "'" -@pytest.mark.parametrize('text', ["'em", "nothin'", "ol'"]) +@pytest.mark.parametrize("text", ["'em", "nothin'", "ol'"]) def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 1 assert tokens[0].text == text -@pytest.mark.parametrize('text', ["we'll", "You'll", "there'll"]) +@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll"]) def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 @@ -53,7 +55,9 @@ def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text): assert tokens[1].lemma_ == "will" -@pytest.mark.parametrize('text_lower,text_title', [("can't", "Can't"), ("ain't", "Ain't")]) +@pytest.mark.parametrize( + "text_lower,text_title", [("can't", "Can't"), ("ain't", "Ain't")] +) def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_title): tokens_lower = en_tokenizer(text_lower) tokens_title = en_tokenizer(text_title) @@ -62,21 +66,23 @@ def test_en_tokenizer_handles_capitalization(en_tokenizer, text_lower, text_titl assert 
tokens_lower[1].text == tokens_title[1].text -@pytest.mark.parametrize('pron', ["I", "You", "He", "She", "It", "We", "They"]) -@pytest.mark.parametrize('contraction', ["'ll", "'d"]) +@pytest.mark.parametrize("pron", ["I", "You", "He", "She", "It", "We", "They"]) +@pytest.mark.parametrize("contraction", ["'ll", "'d"]) def test_en_tokenizer_keeps_title_case(en_tokenizer, pron, contraction): tokens = en_tokenizer(pron + contraction) assert tokens[0].text == pron assert tokens[1].text == contraction -@pytest.mark.parametrize('exc', ["Ill", "ill", "Hell", "hell", "Well", "well"]) +@pytest.mark.parametrize("exc", ["Ill", "ill", "Hell", "hell", "Well", "well"]) def test_en_tokenizer_excludes_ambiguous(en_tokenizer, exc): tokens = en_tokenizer(exc) assert len(tokens) == 1 -@pytest.mark.parametrize('wo_punct,w_punct', [("We've", "``We've"), ("couldn't", "couldn't)")]) +@pytest.mark.parametrize( + "wo_punct,w_punct", [("We've", "``We've"), ("couldn't", "couldn't)")] +) def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct): tokens = en_tokenizer(wo_punct) assert len(tokens) == 2 @@ -84,7 +90,7 @@ def test_en_tokenizer_splits_defined_punct(en_tokenizer, wo_punct, w_punct): assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."]) +@pytest.mark.parametrize("text", ["e.g.", "p.m.", "Jan.", "Dec.", "Inc."]) def test_en_tokenizer_handles_abbr(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 1 @@ -97,20 +103,24 @@ def test_en_tokenizer_handles_exc_in_text(en_tokenizer): assert tokens[3].text == "i.e." -@pytest.mark.parametrize('text', ["1am", "12a.m.", "11p.m.", "4pm"]) +@pytest.mark.parametrize("text", ["1am", "12a.m.", "11p.m.", "4pm"]) def test_en_tokenizer_handles_times(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 assert tokens[1].lemma_ in ["a.m.", "p.m."] -@pytest.mark.parametrize('text,norms', [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])]) +@pytest.mark.parametrize( + "text,norms", [("I'm", ["i", "am"]), ("shan't", ["shall", "not"])] +) def test_en_tokenizer_norm_exceptions(en_tokenizer, text, norms): tokens = en_tokenizer(text) assert [token.norm_ for token in tokens] == norms -@pytest.mark.parametrize('text,norm', [("radicalised", "radicalized"), ("cuz", "because")]) +@pytest.mark.parametrize( + "text,norm", [("radicalised", "radicalized"), ("cuz", "because")] +) def test_en_lex_attrs_norm_exceptions(en_tokenizer, text, norm): tokens = en_tokenizer(text) assert tokens[0].norm_ == norm diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index 5ef3721fe..b3a6696d3 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -12,14 +12,25 @@ from ...util import get_doc def test_en_noun_chunks_not_nested(en_tokenizer): text = "Peter has chronic command and control issues" heads = [1, 0, 4, 3, -1, -2, -5] - deps = ['nsubj', 'ROOT', 'amod', 'nmod', 'cc', 'conj', 'dobj'] + deps = ["nsubj", "ROOT", "amod", "nmod", "cc", "conj", "dobj"] tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) tokens.from_array( [HEAD, DEP], - numpy.asarray([[1, nsubj], [0, root], [4, amod], [3, nmod], [-1, cc], - [-2, conj], [-5, dobj]], dtype='uint64')) - tokens.noun_chunks_iterator = SYNTAX_ITERATORS['noun_chunks'] + numpy.asarray( + [ + [1, nsubj], + [0, root], + [4, amod], + [3, nmod], + [-1, cc], + [-2, conj], + [-5, dobj], + ], + dtype="uint64", + ), + ) + 
tokens.noun_chunks_iterator = SYNTAX_ITERATORS["noun_chunks"] word_occurred = {} for chunk in tokens.noun_chunks: for word in chunk: diff --git a/spacy/tests/lang/en/test_parser.py b/spacy/tests/lang/en/test_parser.py index 566ea4295..ce696bc25 100644 --- a/spacy/tests/lang/en/test_parser.py +++ b/spacy/tests/lang/en/test_parser.py @@ -7,22 +7,28 @@ from ...util import get_doc def test_en_parser_noun_chunks_standard(en_tokenizer): text = "A base phrase should be recognized." heads = [2, 1, 3, 2, 1, 0, -1] - tags = ['DT', 'JJ', 'NN', 'MD', 'VB', 'VBN', '.'] - deps = ['det', 'amod', 'nsubjpass', 'aux', 'auxpass', 'ROOT', 'punct'] + tags = ["DT", "JJ", "NN", "MD", "VB", "VBN", "."] + deps = ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads) + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads + ) chunks = list(doc.noun_chunks) assert len(chunks) == 1 assert chunks[0].text_with_ws == "A base phrase " def test_en_parser_noun_chunks_coordinated(en_tokenizer): + # fmt: off text = "A base phrase and a good phrase are often the same." heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4] - tags = ['DT', 'NN', 'NN', 'CC', 'DT', 'JJ', 'NN', 'VBP', 'RB', 'DT', 'JJ', '.'] - deps = ['det', 'compound', 'nsubj', 'cc', 'det', 'amod', 'conj', 'ROOT', 'advmod', 'det', 'attr', 'punct'] + tags = ["DT", "NN", "NN", "CC", "DT", "JJ", "NN", "VBP", "RB", "DT", "JJ", "."] + deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"] + # fmt: on tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads) + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads + ) chunks = list(doc.noun_chunks) assert len(chunks) == 2 assert chunks[0].text_with_ws == "A base phrase " @@ -32,10 +38,12 @@ def test_en_parser_noun_chunks_coordinated(en_tokenizer): def test_en_parser_noun_chunks_pp_chunks(en_tokenizer): text = "A phrase with another phrase occurs." heads = [1, 4, -1, 1, -2, 0, -1] - tags = ['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ', '.'] - deps = ['det', 'nsubj', 'prep', 'det', 'pobj', 'ROOT', 'punct'] + tags = ["DT", "NN", "IN", "DT", "NN", "VBZ", "."] + deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads) + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads + ) chunks = list(doc.noun_chunks) assert len(chunks) == 2 assert chunks[0].text_with_ws == "A phrase " @@ -43,12 +51,16 @@ def test_en_parser_noun_chunks_pp_chunks(en_tokenizer): def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer): + # fmt: off text = "Sam, my brother, arrived to the house." 
heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4] - tags = ['NNP', ',', 'PRP$', 'NN', ',', 'VBD', 'IN', 'DT', 'NN', '.'] - deps = ['nsubj', 'punct', 'poss', 'appos', 'punct', 'ROOT', 'prep', 'det', 'pobj', 'punct'] + tags = ["NNP", ",", "PRP$", "NN", ",", "VBD", "IN", "DT", "NN", "."] + deps = ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"] + # fmt: on tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads) + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads + ) chunks = list(doc.noun_chunks) assert len(chunks) == 3 assert chunks[0].text_with_ws == "Sam " @@ -59,10 +71,12 @@ def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer): def test_en_parser_noun_chunks_dative(en_tokenizer): text = "She gave Bob a raise." heads = [1, 0, -1, 1, -3, -4] - tags = ['PRP', 'VBD', 'NNP', 'DT', 'NN', '.'] - deps = ['nsubj', 'ROOT', 'dative', 'det', 'dobj', 'punct'] + tags = ["PRP", "VBD", "NNP", "DT", "NN", "."] + deps = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads) + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], tags=tags, deps=deps, heads=heads + ) chunks = list(doc.noun_chunks) assert len(chunks) == 3 assert chunks[0].text_with_ws == "She " diff --git a/spacy/tests/lang/en/test_prefix_suffix_infix.py b/spacy/tests/lang/en/test_prefix_suffix_infix.py index 987f7b7bc..e9d75111d 100644 --- a/spacy/tests/lang/en/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/en/test_prefix_suffix_infix.py @@ -4,85 +4,85 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text', ["(can)"]) +@pytest.mark.parametrize("text", ["(can)"]) def test_en_tokenizer_splits_no_special(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["can't"]) +@pytest.mark.parametrize("text", ["can't"]) def test_en_tokenizer_splits_no_punct(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 -@pytest.mark.parametrize('text', ["(can't"]) +@pytest.mark.parametrize("text", ["(can't"]) def test_en_tokenizer_splits_prefix_punct(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["can't)"]) +@pytest.mark.parametrize("text", ["can't)"]) def test_en_tokenizer_splits_suffix_punct(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["(can't)"]) +@pytest.mark.parametrize("text", ["(can't)"]) def test_en_tokenizer_splits_even_wrap(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 4 -@pytest.mark.parametrize('text', ["(can't?)"]) +@pytest.mark.parametrize("text", ["(can't?)"]) def test_en_tokenizer_splits_uneven_wrap(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 5 -@pytest.mark.parametrize('text,length', [("U.S.", 1), ("us.", 2), ("(U.S.", 2)]) +@pytest.mark.parametrize("text,length", [("U.S.", 1), ("us.", 2), ("(U.S.", 2)]) def test_en_tokenizer_splits_prefix_interact(en_tokenizer, text, length): tokens = en_tokenizer(text) assert len(tokens) == length -@pytest.mark.parametrize('text', ["U.S.)"]) +@pytest.mark.parametrize("text", ["U.S.)"]) def test_en_tokenizer_splits_suffix_interact(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 -@pytest.mark.parametrize('text', 
["(U.S.)"]) +@pytest.mark.parametrize("text", ["(U.S.)"]) def test_en_tokenizer_splits_even_wrap_interact(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["(U.S.?)"]) +@pytest.mark.parametrize("text", ["(U.S.?)"]) def test_en_tokenizer_splits_uneven_wrap_interact(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 4 -@pytest.mark.parametrize('text', ["best-known"]) +@pytest.mark.parametrize("text", ["best-known"]) def test_en_tokenizer_splits_hyphens(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["0.1-13.5", "0.0-0.1", "103.27-300"]) +@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"]) def test_en_tokenizer_splits_numeric_range(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["best.Known", "Hello.World"]) +@pytest.mark.parametrize("text", ["best.Known", "Hello.World"]) def test_en_tokenizer_splits_period_infix(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["Hello,world", "one,two"]) +@pytest.mark.parametrize("text", ["Hello,world", "one,two"]) def test_en_tokenizer_splits_comma_infix(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 3 @@ -91,7 +91,7 @@ def test_en_tokenizer_splits_comma_infix(en_tokenizer, text): assert tokens[2].text == text.split(",")[1] -@pytest.mark.parametrize('text', ["best...Known", "best...known"]) +@pytest.mark.parametrize("text", ["best...Known", "best...known"]) def test_en_tokenizer_splits_ellipsis_infix(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 3 @@ -126,8 +126,10 @@ def test_en_tokenizer_splits_period_abbr(en_tokenizer): @pytest.mark.xfail def test_en_tokenizer_splits_em_dash_infix(en_tokenizer): # Re Issue #225 - tokens = en_tokenizer("""Will this road take me to Puddleton?\u2014No, """ - """you'll have to walk there.\u2014Ariel.""") + tokens = en_tokenizer( + """Will this road take me to Puddleton?\u2014No, """ + """you'll have to walk there.\u2014Ariel.""" + ) assert tokens[6].text == "Puddleton" assert tokens[7].text == "?" 
assert tokens[8].text == "\u2014" diff --git a/spacy/tests/lang/en/test_punct.py b/spacy/tests/lang/en/test_punct.py index dd344ae34..3061327ad 100644 --- a/spacy/tests/lang/en/test_punct.py +++ b/spacy/tests/lang/en/test_punct.py @@ -6,19 +6,19 @@ from spacy.util import compile_prefix_regex from spacy.lang.punctuation import TOKENIZER_PREFIXES -PUNCT_OPEN = ['(', '[', '{', '*'] -PUNCT_CLOSE = [')', ']', '}', '*'] -PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] +PUNCT_OPEN = ["(", "[", "{", "*"] +PUNCT_CLOSE = [")", "]", "}", "*"] +PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")] -@pytest.mark.parametrize('text', ["(", "((", "<"]) +@pytest.mark.parametrize("text", ["(", "((", "<"]) def test_en_tokenizer_handles_only_punct(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == len(text) -@pytest.mark.parametrize('punct', PUNCT_OPEN) -@pytest.mark.parametrize('text', ["Hello"]) +@pytest.mark.parametrize("punct", PUNCT_OPEN) +@pytest.mark.parametrize("text", ["Hello"]) def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text): tokens = en_tokenizer(punct + text) assert len(tokens) == 2 @@ -26,8 +26,8 @@ def test_en_tokenizer_splits_open_punct(en_tokenizer, punct, text): assert tokens[1].text == text -@pytest.mark.parametrize('punct', PUNCT_CLOSE) -@pytest.mark.parametrize('text', ["Hello"]) +@pytest.mark.parametrize("punct", PUNCT_CLOSE) +@pytest.mark.parametrize("text", ["Hello"]) def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text): tokens = en_tokenizer(text + punct) assert len(tokens) == 2 @@ -35,9 +35,9 @@ def test_en_tokenizer_splits_close_punct(en_tokenizer, punct, text): assert tokens[1].text == punct -@pytest.mark.parametrize('punct', PUNCT_OPEN) -@pytest.mark.parametrize('punct_add', ["`"]) -@pytest.mark.parametrize('text', ["Hello"]) +@pytest.mark.parametrize("punct", PUNCT_OPEN) +@pytest.mark.parametrize("punct_add", ["`"]) +@pytest.mark.parametrize("text", ["Hello"]) def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, text): tokens = en_tokenizer(punct + punct_add + text) assert len(tokens) == 3 @@ -46,9 +46,9 @@ def test_en_tokenizer_splits_two_diff_open_punct(en_tokenizer, punct, punct_add, assert tokens[2].text == text -@pytest.mark.parametrize('punct', PUNCT_CLOSE) -@pytest.mark.parametrize('punct_add', ["'"]) -@pytest.mark.parametrize('text', ["Hello"]) +@pytest.mark.parametrize("punct", PUNCT_CLOSE) +@pytest.mark.parametrize("punct_add", ["'"]) +@pytest.mark.parametrize("text", ["Hello"]) def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add, text): tokens = en_tokenizer(text + punct + punct_add) assert len(tokens) == 3 @@ -57,8 +57,8 @@ def test_en_tokenizer_splits_two_diff_close_punct(en_tokenizer, punct, punct_add assert tokens[2].text == punct_add -@pytest.mark.parametrize('punct', PUNCT_OPEN) -@pytest.mark.parametrize('text', ["Hello"]) +@pytest.mark.parametrize("punct", PUNCT_OPEN) +@pytest.mark.parametrize("text", ["Hello"]) def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text): tokens = en_tokenizer(punct + punct + punct + text) assert len(tokens) == 4 @@ -66,8 +66,8 @@ def test_en_tokenizer_splits_same_open_punct(en_tokenizer, punct, text): assert tokens[3].text == text -@pytest.mark.parametrize('punct', PUNCT_CLOSE) -@pytest.mark.parametrize('text', ["Hello"]) +@pytest.mark.parametrize("punct", PUNCT_CLOSE) +@pytest.mark.parametrize("text", ["Hello"]) def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, 
text): tokens = en_tokenizer(text + punct + punct + punct) assert len(tokens) == 4 @@ -75,14 +75,14 @@ def test_en_tokenizer_splits_same_close_punct(en_tokenizer, punct, text): assert tokens[1].text == punct -@pytest.mark.parametrize('text', ["'The"]) +@pytest.mark.parametrize("text", ["'The"]) def test_en_tokenizer_splits_open_appostrophe(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 assert tokens[0].text == "'" -@pytest.mark.parametrize('text', ["Hello''"]) +@pytest.mark.parametrize("text", ["Hello''"]) def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text): tokens = en_tokenizer(text) assert len(tokens) == 2 @@ -90,10 +90,11 @@ def test_en_tokenizer_splits_double_end_quote(en_tokenizer, text): assert len(tokens_punct) == 1 -@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) -@pytest.mark.parametrize('text', ["Hello"]) -def test_en_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, - punct_close, text): +@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED) +@pytest.mark.parametrize("text", ["Hello"]) +def test_en_tokenizer_splits_open_close_punct( + en_tokenizer, punct_open, punct_close, text +): tokens = en_tokenizer(punct_open + text + punct_close) assert len(tokens) == 3 assert tokens[0].text == punct_open @@ -101,11 +102,12 @@ def test_en_tokenizer_splits_open_close_punct(en_tokenizer, punct_open, assert tokens[2].text == punct_close -@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) -@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")]) -@pytest.mark.parametrize('text', ["Hello"]) -def test_en_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close, - punct_open2, punct_close2, text): +@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED) +@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")]) +@pytest.mark.parametrize("text", ["Hello"]) +def test_en_tokenizer_two_diff_punct( + en_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text +): tokens = en_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2) assert len(tokens) == 5 assert tokens[0].text == punct_open2 @@ -115,7 +117,7 @@ def test_en_tokenizer_two_diff_punct(en_tokenizer, punct_open, punct_close, assert tokens[4].text == punct_close2 -@pytest.mark.parametrize('text,punct', [("(can't", "(")]) +@pytest.mark.parametrize("text,punct", [("(can't", "(")]) def test_en_tokenizer_splits_pre_punct_regex(text, punct): en_search_prefixes = compile_prefix_regex(TOKENIZER_PREFIXES).search match = en_search_prefixes(text) diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py index 6bd1ee249..40bd110e8 100644 --- a/spacy/tests/lang/en/test_sbd.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -6,8 +6,8 @@ import pytest from ...util import get_doc, apply_transition_sequence -@pytest.mark.parametrize('text', ["A test sentence"]) -@pytest.mark.parametrize('punct', ['.', '!', '?', '']) +@pytest.mark.parametrize("text", ["A test sentence"]) +@pytest.mark.parametrize("punct", [".", "!", "?", ""]) def test_en_sbd_single_punct(en_tokenizer, text, punct): heads = [2, 1, 0, -1] if punct else [2, 1, 0] tokens = en_tokenizer(text + punct) @@ -19,16 +19,18 @@ def test_en_sbd_single_punct(en_tokenizer, text, punct): @pytest.mark.xfail def test_en_sentence_breaks(en_tokenizer, en_parser): + # fmt: off text = "This is a sentence . This is another one ." 
heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3] - deps = ['nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det', - 'attr', 'punct'] - transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT', - 'L-nsubj', 'S', 'L-attr', 'R-attr', 'D', 'R-punct'] + deps = ["nsubj", "ROOT", "det", "attr", "punct", "nsubj", "ROOT", "det", + "attr", "punct"] + transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct", "B-ROOT", + "L-nsubj", "S", "L-attr", "R-attr", "D", "R-punct"] + # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) apply_transition_sequence(en_parser, doc, transition) assert len(list(doc.sents)) == 2 for token in doc: assert token.dep != 0 or token.is_space - assert [token.head.i for token in doc ] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6] + assert [token.head.i for token in doc] == [1, 1, 3, 1, 1, 6, 6, 8, 6, 6] diff --git a/spacy/tests/lang/en/test_tagger.py b/spacy/tests/lang/en/test_tagger.py index a59f6f806..567fd5a44 100644 --- a/spacy/tests/lang/en/test_tagger.py +++ b/spacy/tests/lang/en/test_tagger.py @@ -6,10 +6,10 @@ from ...util import get_doc def test_en_tagger_load_morph_exc(en_tokenizer): text = "I like his style." - tags = ['PRP', 'VBP', 'PRP$', 'NN', '.'] - morph_exc = {'VBP': {'like': {'lemma': 'luck'}}} + tags = ["PRP", "VBP", "PRP$", "NN", "."] + morph_exc = {"VBP": {"like": {"lemma": "luck"}}} en_tokenizer.vocab.morphology.load_morph_exceptions(morph_exc) tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], tags=tags) - assert doc[1].tag_ == 'VBP' - assert doc[1].lemma_ == 'luck' + assert doc[1].tag_ == "VBP" + assert doc[1].lemma_ == "luck" diff --git a/spacy/tests/lang/en/test_text.py b/spacy/tests/lang/en/test_text.py index 91a7d6e4d..a7ebde989 100644 --- a/spacy/tests/lang/en/test_text.py +++ b/spacy/tests/lang/en/test_text.py @@ -20,30 +20,48 @@ untimely death" of the rapier-tongued Scottish barrister and parliamentarian. assert len(tokens) == 76 -@pytest.mark.parametrize('text,length', [ - ("The U.S. Army likes Shock and Awe.", 8), - ("U.N. regulations are not a part of their concern.", 10), - ("“Isn't it?”", 6), - ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15), - ("""'Me too!', Mr. P. Delaware cried. """, 11), - ("They ran about 10km.", 6), - pytest.mark.xfail(("But then the 6,000-year ice age came...", 10))]) +@pytest.mark.parametrize( + "text,length", + [ + ("The U.S. Army likes Shock and Awe.", 8), + ("U.N. regulations are not a part of their concern.", 10), + ("“Isn't it?”", 6), + ("""Yes! "I'd rather have a walk", Ms. Comble sighed. """, 15), + ("""'Me too!', Mr. P. Delaware cried. 
""", 11), + ("They ran about 10km.", 6), + pytest.param( + "But then the 6,000-year ice age came...", 10, marks=pytest.mark.xfail() + ), + ], +) def test_en_tokenizer_handles_cnts(en_tokenizer, text, length): tokens = en_tokenizer(text) assert len(tokens) == length -@pytest.mark.parametrize('text,match', [ - ('10', True), ('1', True), ('10,000', True), ('10,00', True), - ('999.0', True), ('one', True), ('two', True), ('billion', True), - ('dog', False), (',', False), ('1/2', True)]) +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("999.0", True), + ("one", True), + ("two", True), + ("billion", True), + ("dog", False), + (",", False), + ("1/2", True), + ], +) def test_lex_attrs_like_number(en_tokenizer, text, match): tokens = en_tokenizer(text) assert len(tokens) == 1 assert tokens[0].like_num == match -@pytest.mark.parametrize('word', ['eleven']) +@pytest.mark.parametrize("word", ["eleven"]) def test_en_lex_attrs_capitals(word): assert like_num(word) assert like_num(word.upper()) diff --git a/spacy/tests/lang/es/test_exception.py b/spacy/tests/lang/es/test_exception.py index f3e424e09..8d6164058 100644 --- a/spacy/tests/lang/es/test_exception.py +++ b/spacy/tests/lang/es/test_exception.py @@ -4,11 +4,15 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text,lemma', [ - ("aprox.", "aproximadamente"), - ("esq.", "esquina"), - ("pág.", "página"), - ("p.ej.", "por ejemplo")]) +@pytest.mark.parametrize( + "text,lemma", + [ + ("aprox.", "aproximadamente"), + ("esq.", "esquina"), + ("pág.", "página"), + ("p.ej.", "por ejemplo"), + ], +) def test_es_tokenizer_handles_abbr(es_tokenizer, text, lemma): tokens = es_tokenizer(text) assert len(tokens) == 1 diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py index b03a9ee4a..acd572b48 100644 --- a/spacy/tests/lang/es/test_text.py +++ b/spacy/tests/lang/es/test_text.py @@ -20,12 +20,16 @@ en Montevideo y que pregona las bondades de la vida austera.""" assert len(tokens) == 90 -@pytest.mark.parametrize('text,length', [ - ("¿Por qué José Mujica?", 6), - ("“¿Oh no?”", 6), - ("""¡Sí! "Vámonos", contestó José Arcadio Buendía""", 11), - ("Corrieron aprox. 10km.", 5), - ("Y entonces por qué...", 5)]) +@pytest.mark.parametrize( + "text,length", + [ + ("¿Por qué José Mujica?", 6), + ("“¿Oh no?”", 6), + ("""¡Sí! "Vámonos", contestó José Arcadio Buendía""", 11), + ("Corrieron aprox. 10km.", 5), + ("Y entonces por qué...", 5), + ], +) def test_es_tokenizer_handles_cnts(es_tokenizer, text, length): tokens = es_tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py index ff18b9eac..66be7bd46 100644 --- a/spacy/tests/lang/fi/test_tokenizer.py +++ b/spacy/tests/lang/fi/test_tokenizer.py @@ -5,12 +5,15 @@ import pytest ABBREVIATION_TESTS = [ - ('Hyvää uutta vuotta t. siht. Niemelä!', ['Hyvää', 'uutta', 'vuotta', 't.', 'siht.', 'Niemelä', '!']), - ('Paino on n. 2.2 kg', ['Paino', 'on', 'n.', '2.2', 'kg']) + ( + "Hyvää uutta vuotta t. siht. Niemelä!", + ["Hyvää", "uutta", "vuotta", "t.", "siht.", "Niemelä", "!"], + ), + ("Paino on n. 
2.2 kg", ["Paino", "on", "n.", "2.2", "kg"]), ] -@pytest.mark.parametrize('text,expected_tokens', ABBREVIATION_TESTS) +@pytest.mark.parametrize("text,expected_tokens", ABBREVIATION_TESTS) def test_fi_tokenizer_handles_testcases(fi_tokenizer, text, expected_tokens): tokens = fi_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] diff --git a/spacy/tests/lang/fr/test_exceptions.py b/spacy/tests/lang/fr/test_exceptions.py index 2c4945720..087e48cca 100644 --- a/spacy/tests/lang/fr/test_exceptions.py +++ b/spacy/tests/lang/fr/test_exceptions.py @@ -2,26 +2,26 @@ from __future__ import unicode_literals import pytest -from .... import util - -@pytest.fixture(scope='module') -def fr_tokenizer(): - return util.get_lang_class('fr').Defaults.create_tokenizer() -@pytest.mark.parametrize('text', [ - "aujourd'hui", "Aujourd'hui", "prud'hommes", "prud’hommal"]) +@pytest.mark.parametrize( + "text", ["aujourd'hui", "Aujourd'hui", "prud'hommes", "prud’hommal"] +) def test_fr_tokenizer_infix_exceptions(fr_tokenizer, text): tokens = fr_tokenizer(text) assert len(tokens) == 1 -@pytest.mark.parametrize('text,lemma', [ - ("janv.", "janvier"), - ("juill.", "juillet"), - ("Dr.", "docteur"), - ("av.", "avant"), - ("sept.", "septembre")]) +@pytest.mark.parametrize( + "text,lemma", + [ + ("janv.", "janvier"), + ("juill.", "juillet"), + ("Dr.", "docteur"), + ("av.", "avant"), + ("sept.", "septembre"), + ], +) def test_fr_tokenizer_handles_abbr(fr_tokenizer, text, lemma): tokens = fr_tokenizer(text) assert len(tokens) == 1 @@ -57,6 +57,7 @@ def test_fr_tokenizer_handles_title(fr_tokenizer): assert tokens[2].lemma_ == "ce" +@pytest.mark.xfail def test_fr_tokenizer_handles_title_2(fr_tokenizer): text = "Est-ce pas génial?" tokens = fr_tokenizer(text) @@ -65,7 +66,7 @@ def test_fr_tokenizer_handles_title_2(fr_tokenizer): assert tokens[0].lemma_ == "être" -def test_fr_tokenizer_handles_title_2(fr_tokenizer): +def test_fr_tokenizer_handles_title_3(fr_tokenizer): text = "Qu'est-ce que tu fais?" 
tokens = fr_tokenizer(text) assert len(tokens) == 7 diff --git a/spacy/tests/lang/fr/test_lemmatization.py b/spacy/tests/lang/fr/test_lemmatization.py index a61ca001e..9a13e4689 100644 --- a/spacy/tests/lang/fr/test_lemmatization.py +++ b/spacy/tests/lang/fr/test_lemmatization.py @@ -16,7 +16,9 @@ def test_fr_lemmatizer_noun_verb_2(fr_tokenizer): assert tokens[4].lemma_ == "être" -@pytest.mark.xfail(reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN") +@pytest.mark.xfail( + reason="Costaricienne TAG is PROPN instead of NOUN and spacy don't lemmatize PROPN" +) def test_fr_lemmatizer_noun(fr_tokenizer): tokens = fr_tokenizer("il y a des Costaricienne.") assert tokens[4].lemma_ == "Costaricain" diff --git a/spacy/tests/lang/fr/test_prefix_suffix_infix.py b/spacy/tests/lang/fr/test_prefix_suffix_infix.py index b9fb7bbb1..ca6bdbd87 100644 --- a/spacy/tests/lang/fr/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/fr/test_prefix_suffix_infix.py @@ -7,11 +7,12 @@ from spacy.lang.punctuation import TOKENIZER_INFIXES from spacy.lang.char_classes import ALPHA -@pytest.mark.parametrize('text,expected_tokens', [ - ("l'avion", ["l'", "avion"]), ("j'ai", ["j'", "ai"])]) +@pytest.mark.parametrize( + "text,expected_tokens", [("l'avion", ["l'", "avion"]), ("j'ai", ["j'", "ai"])] +) def test_issue768(text, expected_tokens): """Allow zero-width 'infix' token during the tokenization process.""" - SPLIT_INFIX = r'(?<=[{a}]\')(?=[{a}])'.format(a=ALPHA) + SPLIT_INFIX = r"(?<=[{a}]\')(?=[{a}])".format(a=ALPHA) class FrenchTest(Language): class Defaults(Language.Defaults): diff --git a/spacy/tests/lang/fr/test_text.py b/spacy/tests/lang/fr/test_text.py index a10633882..24b4c4532 100644 --- a/spacy/tests/lang/fr/test_text.py +++ b/spacy/tests/lang/fr/test_text.py @@ -1,13 +1,5 @@ # coding: utf8 from __future__ import unicode_literals -import pytest - -from .... import util - -@pytest.fixture(scope='module') -def fr_tokenizer(): - return util.get_lang_class('fr').Defaults.create_tokenizer() - import pytest from spacy.lang.fr.lex_attrs import like_num @@ -27,7 +19,7 @@ ou avec un autre vrai humain.""" assert len(tokens) == 113 -@pytest.mark.parametrize('word', ['onze', 'onzième']) +@pytest.mark.parametrize("word", ["onze", "onzième"]) def test_fr_lex_attrs_capitals(word): assert like_num(word) assert like_num(word.upper()) diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py index db490315a..29bc1c759 100644 --- a/spacy/tests/lang/ga/test_tokenizer.py +++ b/spacy/tests/lang/ga/test_tokenizer.py @@ -4,13 +4,15 @@ from __future__ import unicode_literals import pytest +# fmt: off GA_TOKEN_EXCEPTION_TESTS = [ - ('Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).', ['Niall', 'Ó', 'Domhnaill', ',', 'Rialtas', 'na', 'hÉireann', '1977', '(', 'lch.', '600', ')', '.']), - ('Daoine a bhfuil Gaeilge acu, m.sh. tusa agus mise', ['Daoine', 'a', 'bhfuil', 'Gaeilge', 'acu', ',', 'm.sh.', 'tusa', 'agus', 'mise']) + ("Niall Ó Domhnaill, Rialtas na hÉireann 1977 (lch. 600).", ["Niall", "Ó", "Domhnaill", ",", "Rialtas", "na", "hÉireann", "1977", "(", "lch.", "600", ")", "."]), + ("Daoine a bhfuil Gaeilge acu, m.sh. 
tusa agus mise", ["Daoine", "a", "bhfuil", "Gaeilge", "acu", ",", "m.sh.", "tusa", "agus", "mise"]) ] +# fmt: on -@pytest.mark.parametrize('text,expected_tokens', GA_TOKEN_EXCEPTION_TESTS) +@pytest.mark.parametrize("text,expected_tokens", GA_TOKEN_EXCEPTION_TESTS) def test_ga_tokenizer_handles_exception_cases(ga_tokenizer, text, expected_tokens): tokens = ga_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] diff --git a/spacy/tests/lang/he/test_tokenizer.py b/spacy/tests/lang/he/test_tokenizer.py index b3672c652..f138ec6e7 100644 --- a/spacy/tests/lang/he/test_tokenizer.py +++ b/spacy/tests/lang/he/test_tokenizer.py @@ -4,20 +4,41 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text,expected_tokens', - [('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])]) +@pytest.mark.parametrize( + "text,expected_tokens", + [("פייתון היא שפת תכנות דינמית", ["פייתון", "היא", "שפת", "תכנות", "דינמית"])], +) def test_he_tokenizer_handles_abbreviation(he_tokenizer, text, expected_tokens): tokens = he_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list -@pytest.mark.parametrize('text,expected_tokens', [ - ('עקבת אחריו בכל רחבי המדינה.', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '.']), - ('עקבת אחריו בכל רחבי המדינה?', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '?']), - ('עקבת אחריו בכל רחבי המדינה!', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '!']), - ('עקבת אחריו בכל רחבי המדינה..', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '..']), - ('עקבת אחריו בכל רחבי המדינה...', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '...'])]) +@pytest.mark.parametrize( + "text,expected_tokens", + [ + ( + "עקבת אחריו בכל רחבי המדינה.", + ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "."], + ), + ( + "עקבת אחריו בכל רחבי המדינה?", + ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "?"], + ), + ( + "עקבת אחריו בכל רחבי המדינה!", + ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "!"], + ), + ( + "עקבת אחריו בכל רחבי המדינה..", + ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", ".."], + ), + ( + "עקבת אחריו בכל רחבי המדינה...", + ["עקבת", "אחריו", "בכל", "רחבי", "המדינה", "..."], + ), + ], +) def test_he_tokenizer_handles_punct(he_tokenizer, text, expected_tokens): tokens = he_tokenizer(text) assert expected_tokens == [token.text for token in tokens] diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py index ad725b2f9..c29366fc8 100644 --- a/spacy/tests/lang/hu/test_tokenizer.py +++ b/spacy/tests/lang/hu/test_tokenizer.py @@ -6,11 +6,11 @@ import pytest DEFAULT_TESTS = [ ('N. kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), - pytest.mark.xfail(('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'])), + pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail()), ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), ('A pl. rovidites.', ['A', 'pl.', 'rovidites', '.']), ('A S.M.A.R.T. szo.', ['A', 'S.M.A.R.T.', 'szo', '.']), - pytest.mark.xfail(('A .hu.', ['A', '.hu', '.'])), + pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail()), ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), ('A pl.', ['A', 'pl.']), ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), @@ -228,11 +228,11 @@ QUOTE_TESTS = [ DOT_TESTS = [ ('N. 
kormányzósági\nszékhely.', ['N.', 'kormányzósági', 'székhely', '.']), - pytest.mark.xfail(('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'])), + pytest.param('A .hu egy tld.', ['A', '.hu', 'egy', 'tld', '.'], marks=pytest.mark.xfail()), ('Az egy.ketto pelda.', ['Az', 'egy.ketto', 'pelda', '.']), ('A pl. rövidítés.', ['A', 'pl.', 'rövidítés', '.']), ('A S.M.A.R.T. szó.', ['A', 'S.M.A.R.T.', 'szó', '.']), - pytest.mark.xfail(('A .hu.', ['A', '.hu', '.'])), + pytest.param('A .hu.', ['A', '.hu', '.'], marks=pytest.mark.xfail()), ('Az egy.ketto.', ['Az', 'egy.ketto', '.']), ('A pl.', ['A', 'pl.']), ('A S.M.A.R.T.', ['A', 'S.M.A.R.T.']), diff --git a/spacy/tests/lang/id/test_prefix_suffix_infix.py b/spacy/tests/lang/id/test_prefix_suffix_infix.py index 125213fb0..e86a98ee3 100644 --- a/spacy/tests/lang/id/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/id/test_prefix_suffix_infix.py @@ -4,85 +4,87 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text', ["(Ma'arif)"]) +@pytest.mark.parametrize("text", ["(Ma'arif)"]) def test_id_tokenizer_splits_no_special(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["Ma'arif"]) +@pytest.mark.parametrize("text", ["Ma'arif"]) def test_id_tokenizer_splits_no_punct(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 1 -@pytest.mark.parametrize('text', ["(Ma'arif"]) +@pytest.mark.parametrize("text", ["(Ma'arif"]) def test_id_tokenizer_splits_prefix_punct(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 2 -@pytest.mark.parametrize('text', ["Ma'arif)"]) +@pytest.mark.parametrize("text", ["Ma'arif)"]) def test_id_tokenizer_splits_suffix_punct(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 2 -@pytest.mark.parametrize('text', ["(Ma'arif)"]) +@pytest.mark.parametrize("text", ["(Ma'arif)"]) def test_id_tokenizer_splits_even_wrap(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["(Ma'arif?)"]) +@pytest.mark.parametrize("text", ["(Ma'arif?)"]) def test_tokenizer_splits_uneven_wrap(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 4 -@pytest.mark.parametrize('text,length', [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)]) +@pytest.mark.parametrize("text,length", [("S.Kom.", 1), ("SKom.", 2), ("(S.Kom.", 2)]) def test_id_tokenizer_splits_prefix_interact(id_tokenizer, text, length): tokens = id_tokenizer(text) assert len(tokens) == length -@pytest.mark.parametrize('text', ["S.Kom.)"]) +@pytest.mark.parametrize("text", ["S.Kom.)"]) def test_id_tokenizer_splits_suffix_interact(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 2 -@pytest.mark.parametrize('text', ["(S.Kom.)"]) +@pytest.mark.parametrize("text", ["(S.Kom.)"]) def test_id_tokenizer_splits_even_wrap_interact(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["(S.Kom.?)"]) +@pytest.mark.parametrize("text", ["(S.Kom.?)"]) def test_id_tokenizer_splits_uneven_wrap_interact(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 4 -@pytest.mark.parametrize('text,length', [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)]) +@pytest.mark.parametrize( + "text,length", [("gara-gara", 1), ("Jokowi-Ahok", 3), ("Sukarno-Hatta", 3)] +) def test_id_tokenizer_splits_hyphens(id_tokenizer, text, length): tokens = id_tokenizer(text) assert len(tokens) == length -@pytest.mark.parametrize('text', 
["0.1-13.5", "0.0-0.1", "103.27-300"]) +@pytest.mark.parametrize("text", ["0.1-13.5", "0.0-0.1", "103.27-300"]) def test_id_tokenizer_splits_numeric_range(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["ini.Budi", "Halo.Bandung"]) +@pytest.mark.parametrize("text", ["ini.Budi", "Halo.Bandung"]) def test_id_tokenizer_splits_period_infix(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["Halo,Bandung", "satu,dua"]) +@pytest.mark.parametrize("text", ["Halo,Bandung", "satu,dua"]) def test_id_tokenizer_splits_comma_infix(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 3 @@ -91,7 +93,7 @@ def test_id_tokenizer_splits_comma_infix(id_tokenizer, text): assert tokens[2].text == text.split(",")[1] -@pytest.mark.parametrize('text', ["halo...Bandung", "dia...pergi"]) +@pytest.mark.parametrize("text", ["halo...Bandung", "dia...pergi"]) def test_id_tokenizer_splits_ellipsis_infix(id_tokenizer, text): tokens = id_tokenizer(text) assert len(tokens) == 3 diff --git a/spacy/tests/lang/id/test_text.py b/spacy/tests/lang/id/test_text.py index 947804162..915d268ae 100644 --- a/spacy/tests/lang/id/test_text.py +++ b/spacy/tests/lang/id/test_text.py @@ -5,7 +5,7 @@ import pytest from spacy.lang.id.lex_attrs import like_num -@pytest.mark.parametrize('word', ['sebelas']) +@pytest.mark.parametrize("word", ["sebelas"]) def test_id_lex_attrs_capitals(word): assert like_num(word) assert like_num(word.upper()) diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py index 52f3535c0..cfff0fcfe 100644 --- a/spacy/tests/lang/ja/test_lemmatization.py +++ b/spacy/tests/lang/ja/test_lemmatization.py @@ -4,12 +4,10 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('word,lemma', [ - ('新しく', '新しい'), - ('赤く', '赤い'), - ('すごく', '凄い'), - ('いただきました', '頂く'), - ('なった', '成る')]) +@pytest.mark.parametrize( + "word,lemma", + [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "凄い"), ("いただきました", "頂く"), ("なった", "成る")], +) def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma): test_lemma = ja_tokenizer(word)[0].lemma_ assert test_lemma == lemma diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index 345429c9e..87a343185 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import pytest +# fmt: off TOKENIZER_TESTS = [ ("日本語だよ", ['日本', '語', 'だ', 'よ']), ("東京タワーの近くに住んでいます。", ['東京', 'タワー', 'の', '近く', 'に', '住ん', 'で', 'い', 'ます', '。']), @@ -27,21 +28,22 @@ POS_TESTS = [ ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']), ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN']) ] +# fmt: on -@pytest.mark.parametrize('text,expected_tokens', TOKENIZER_TESTS) +@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) def test_ja_tokenizer(ja_tokenizer, text, expected_tokens): tokens = [token.text for token in ja_tokenizer(text)] assert tokens == expected_tokens -@pytest.mark.parametrize('text,expected_tags', TAG_TESTS) -def test_ja_tokenizer(ja_tokenizer, text, expected_tags): +@pytest.mark.parametrize("text,expected_tags", TAG_TESTS) +def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags): tags = [token.tag_ for token in ja_tokenizer(text)] assert tags == expected_tags -@pytest.mark.parametrize('text,expected_pos', POS_TESTS) -def 
test_ja_tokenizer(ja_tokenizer, text, expected_pos): +@pytest.mark.parametrize("text,expected_pos", POS_TESTS) +def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): pos = [token.pos_ for token in ja_tokenizer(text)] assert pos == expected_pos diff --git a/spacy/tests/lang/nb/test_tokenizer.py b/spacy/tests/lang/nb/test_tokenizer.py index 806bae136..f72d310e8 100644 --- a/spacy/tests/lang/nb/test_tokenizer.py +++ b/spacy/tests/lang/nb/test_tokenizer.py @@ -5,12 +5,18 @@ import pytest NB_TOKEN_EXCEPTION_TESTS = [ - ('Smørsausen brukes bl.a. til fisk', ['Smørsausen', 'brukes', 'bl.a.', 'til', 'fisk']), - ('Jeg kommer først kl. 13 pga. diverse forsinkelser', ['Jeg', 'kommer', 'først', 'kl.', '13', 'pga.', 'diverse', 'forsinkelser']) + ( + "Smørsausen brukes bl.a. til fisk", + ["Smørsausen", "brukes", "bl.a.", "til", "fisk"], + ), + ( + "Jeg kommer først kl. 13 pga. diverse forsinkelser", + ["Jeg", "kommer", "først", "kl.", "13", "pga.", "diverse", "forsinkelser"], + ), ] -@pytest.mark.parametrize('text,expected_tokens', NB_TOKEN_EXCEPTION_TESTS) +@pytest.mark.parametrize("text,expected_tokens", NB_TOKEN_EXCEPTION_TESTS) def test_nb_tokenizer_handles_exception_cases(nb_tokenizer, text, expected_tokens): tokens = nb_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] diff --git a/spacy/tests/lang/nl/test_text.py b/spacy/tests/lang/nl/test_text.py index f98d1d105..bfa35ff5a 100644 --- a/spacy/tests/lang/nl/test_text.py +++ b/spacy/tests/lang/nl/test_text.py @@ -5,7 +5,7 @@ import pytest from spacy.lang.nl.lex_attrs import like_num -@pytest.mark.parametrize('word', ['elf', 'elfde']) +@pytest.mark.parametrize("word", ["elf", "elfde"]) def test_nl_lex_attrs_capitals(word): assert like_num(word) assert like_num(word.upper()) diff --git a/spacy/tests/lang/pt/test_text.py b/spacy/tests/lang/pt/test_text.py index 8e6fecc45..39dfff2c1 100644 --- a/spacy/tests/lang/pt/test_text.py +++ b/spacy/tests/lang/pt/test_text.py @@ -5,7 +5,7 @@ import pytest from spacy.lang.pt.lex_attrs import like_num -@pytest.mark.parametrize('word', ['onze', 'quadragésimo']) +@pytest.mark.parametrize("word", ["onze", "quadragésimo"]) def test_pt_lex_attrs_capitals(word): assert like_num(word) assert like_num(word.upper()) diff --git a/spacy/tests/lang/ro/test_lemmatizer.py b/spacy/tests/lang/ro/test_lemmatizer.py index 8d238bcd5..0344a90d1 100644 --- a/spacy/tests/lang/ro/test_lemmatizer.py +++ b/spacy/tests/lang/ro/test_lemmatizer.py @@ -4,11 +4,15 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('string,lemma', [ - ('câini', 'câine'), - ('expedițiilor', 'expediție'), - ('pensete', 'pensetă'), - ('erau', 'fi')]) +@pytest.mark.parametrize( + "string,lemma", + [ + ("câini", "câine"), + ("expedițiilor", "expediție"), + ("pensete", "pensetă"), + ("erau", "fi"), + ], +) def test_ro_lemmatizer_lookup_assigns(ro_tokenizer, string, lemma): tokens = ro_tokenizer(string) assert tokens[0].lemma_ == lemma diff --git a/spacy/tests/lang/ro/test_tokenizer.py b/spacy/tests/lang/ro/test_tokenizer.py index 6ed3f2c90..a327174e5 100644 --- a/spacy/tests/lang/ro/test_tokenizer.py +++ b/spacy/tests/lang/ro/test_tokenizer.py @@ -5,17 +5,20 @@ import pytest TEST_CASES = [ - ('Adresa este str. Principală nr. 5.', ['Adresa', 'este', 'str.', 'Principală', 'nr.', '5', '.']), - ('Teste, etc.', ['Teste', ',', 'etc.']), - ('Lista, ș.a.m.d.', ['Lista', ',', 'ș.a.m.d.']), - ('Și d.p.d.v. al...', ['Și', 'd.p.d.v.', 'al', '...']), + ( + "Adresa este str. Principală nr. 
5.", + ["Adresa", "este", "str.", "Principală", "nr.", "5", "."], + ), + ("Teste, etc.", ["Teste", ",", "etc."]), + ("Lista, ș.a.m.d.", ["Lista", ",", "ș.a.m.d."]), + ("Și d.p.d.v. al...", ["Și", "d.p.d.v.", "al", "..."]), # number tests - ('Clasa a 4-a.', ['Clasa', 'a', '4-a', '.']), - ('Al 12-lea ceas.', ['Al', '12-lea', 'ceas', '.']) + ("Clasa a 4-a.", ["Clasa", "a", "4-a", "."]), + ("Al 12-lea ceas.", ["Al", "12-lea", "ceas", "."]), ] -@pytest.mark.parametrize('text,expected_tokens', TEST_CASES) +@pytest.mark.parametrize("text,expected_tokens", TEST_CASES) def test_ro_tokenizer_handles_testcases(ro_tokenizer, text, expected_tokens): tokens = ro_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] diff --git a/spacy/tests/lang/ru/test_exceptions.py b/spacy/tests/lang/ru/test_exceptions.py index ea731df44..a8f0c3429 100644 --- a/spacy/tests/lang/ru/test_exceptions.py +++ b/spacy/tests/lang/ru/test_exceptions.py @@ -4,10 +4,10 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text,norms', [ - ("пн.", ["понедельник"]), - ("пт.", ["пятница"]), - ("дек.", ["декабрь"])]) +@pytest.mark.parametrize( + "text,norms", + [("пн.", ["понедельник"]), ("пт.", ["пятница"]), ("дек.", ["декабрь"])], +) def test_ru_tokenizer_abbrev_exceptions(ru_tokenizer, text, norms): tokens = ru_tokenizer(text) assert len(tokens) == 1 diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index 21c0923d7..690cadf5d 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -9,55 +9,71 @@ from ...util import get_doc @pytest.fixture def ru_lemmatizer(): - pymorphy = pytest.importorskip('pymorphy2') + pymorphy = pytest.importorskip("pymorphy2") return Russian.Defaults.create_lemmatizer() def test_ru_doc_lemmatization(ru_tokenizer): - words = ['мама', 'мыла', 'раму'] - tags = ['NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing', - 'VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act', - 'NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing'] + words = ["мама", "мыла", "раму"] + tags = [ + "NOUN__Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing", + "VERB__Aspect=Imp|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act", + "NOUN__Animacy=Anim|Case=Acc|Gender=Fem|Number=Sing", + ] doc = get_doc(ru_tokenizer.vocab, words=words, tags=tags) lemmas = [token.lemma_ for token in doc] - assert lemmas == ['мама', 'мыть', 'рама'] + assert lemmas == ["мама", "мыть", "рама"] -@pytest.mark.parametrize('text,lemmas', [ - ('гвоздики', ['гвоздик', 'гвоздика']), - ('люди', ['человек']), - ('реки', ['река']), - ('кольцо', ['кольцо']), - ('пепперони', ['пепперони'])]) +@pytest.mark.parametrize( + "text,lemmas", + [ + ("гвоздики", ["гвоздик", "гвоздика"]), + ("люди", ["человек"]), + ("реки", ["река"]), + ("кольцо", ["кольцо"]), + ("пепперони", ["пепперони"]), + ], +) def test_ru_lemmatizer_noun_lemmas(ru_lemmatizer, text, lemmas): assert sorted(ru_lemmatizer.noun(text)) == lemmas -@pytest.mark.models('ru') -@pytest.mark.parametrize('text,pos,morphology,lemma', [ - ('рой', 'NOUN', None, 'рой'), - ('рой', 'VERB', None, 'рыть'), - ('клей', 'NOUN', None, 'клей'), - ('клей', 'VERB', None, 'клеить'), - ('три', 'NUM', None, 'три'), - ('кос', 'NOUN', {'Number': 'Sing'}, 'кос'), - ('кос', 'NOUN', {'Number': 'Plur'}, 'коса'), - ('кос', 'ADJ', None, 'косой'), - ('потом', 'NOUN', None, 'пот'), - ('потом', 'ADV', None, 'потом')]) -def 
test_ru_lemmatizer_works_with_different_pos_homonyms(ru_lemmatizer, text, pos, morphology, lemma): +@pytest.mark.models("ru") +@pytest.mark.parametrize( + "text,pos,morphology,lemma", + [ + ("рой", "NOUN", None, "рой"), + ("рой", "VERB", None, "рыть"), + ("клей", "NOUN", None, "клей"), + ("клей", "VERB", None, "клеить"), + ("три", "NUM", None, "три"), + ("кос", "NOUN", {"Number": "Sing"}, "кос"), + ("кос", "NOUN", {"Number": "Plur"}, "коса"), + ("кос", "ADJ", None, "косой"), + ("потом", "NOUN", None, "пот"), + ("потом", "ADV", None, "потом"), + ], +) +def test_ru_lemmatizer_works_with_different_pos_homonyms( + ru_lemmatizer, text, pos, morphology, lemma +): assert ru_lemmatizer(text, pos, morphology) == [lemma] -@pytest.mark.parametrize('text,morphology,lemma', [ - ('гвоздики', {'Gender': 'Fem'}, 'гвоздика'), - ('гвоздики', {'Gender': 'Masc'}, 'гвоздик'), - ('вина', {'Gender': 'Fem'}, 'вина'), - ('вина', {'Gender': 'Neut'}, 'вино')]) +@pytest.mark.parametrize( + "text,morphology,lemma", + [ + ("гвоздики", {"Gender": "Fem"}, "гвоздика"), + ("гвоздики", {"Gender": "Masc"}, "гвоздик"), + ("вина", {"Gender": "Fem"}, "вина"), + ("вина", {"Gender": "Neut"}, "вино"), + ], +) def test_ru_lemmatizer_works_with_noun_homonyms(ru_lemmatizer, text, morphology, lemma): assert ru_lemmatizer.noun(text, morphology) == [lemma] def test_ru_lemmatizer_punct(ru_lemmatizer): - assert ru_lemmatizer.punct('«') == ['"'] - assert ru_lemmatizer.punct('»') == ['"'] + assert ru_lemmatizer.punct("«") == ['"'] + assert ru_lemmatizer.punct("»") == ['"'] diff --git a/spacy/tests/lang/ru/test_text.py b/spacy/tests/lang/ru/test_text.py index 6d26988a9..c5bff6973 100644 --- a/spacy/tests/lang/ru/test_text.py +++ b/spacy/tests/lang/ru/test_text.py @@ -5,7 +5,7 @@ import pytest from spacy.lang.ru.lex_attrs import like_num -@pytest.mark.parametrize('word', ['одиннадцать']) +@pytest.mark.parametrize("word", ["одиннадцать"]) def test_ru_lex_attrs_capitals(word): assert like_num(word) assert like_num(word.upper()) diff --git a/spacy/tests/lang/ru/test_tokenizer.py b/spacy/tests/lang/ru/test_tokenizer.py index 350b8a6c2..03e1d4c04 100644 --- a/spacy/tests/lang/ru/test_tokenizer.py +++ b/spacy/tests/lang/ru/test_tokenizer.py @@ -4,19 +4,19 @@ from __future__ import unicode_literals import pytest -PUNCT_OPEN = ['(', '[', '{', '*'] -PUNCT_CLOSE = [')', ']', '}', '*'] -PUNCT_PAIRED = [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')] +PUNCT_OPEN = ["(", "[", "{", "*"] +PUNCT_CLOSE = [")", "]", "}", "*"] +PUNCT_PAIRED = [("(", ")"), ("[", "]"), ("{", "}"), ("*", "*")] -@pytest.mark.parametrize('text', ["(", "((", "<"]) +@pytest.mark.parametrize("text", ["(", "((", "<"]) def test_ru_tokenizer_handles_only_punct(ru_tokenizer, text): tokens = ru_tokenizer(text) assert len(tokens) == len(text) -@pytest.mark.parametrize('punct', PUNCT_OPEN) -@pytest.mark.parametrize('text', ["Привет"]) +@pytest.mark.parametrize("punct", PUNCT_OPEN) +@pytest.mark.parametrize("text", ["Привет"]) def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text): tokens = ru_tokenizer(punct + text) assert len(tokens) == 2 @@ -24,8 +24,8 @@ def test_ru_tokenizer_splits_open_punct(ru_tokenizer, punct, text): assert tokens[1].text == text -@pytest.mark.parametrize('punct', PUNCT_CLOSE) -@pytest.mark.parametrize('text', ["Привет"]) +@pytest.mark.parametrize("punct", PUNCT_CLOSE) +@pytest.mark.parametrize("text", ["Привет"]) def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text): tokens = ru_tokenizer(text + punct) assert len(tokens) == 2 @@ -33,9 +33,9 
@@ def test_ru_tokenizer_splits_close_punct(ru_tokenizer, punct, text): assert tokens[1].text == punct -@pytest.mark.parametrize('punct', PUNCT_OPEN) -@pytest.mark.parametrize('punct_add', ["`"]) -@pytest.mark.parametrize('text', ["Привет"]) +@pytest.mark.parametrize("punct", PUNCT_OPEN) +@pytest.mark.parametrize("punct_add", ["`"]) +@pytest.mark.parametrize("text", ["Привет"]) def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add, text): tokens = ru_tokenizer(punct + punct_add + text) assert len(tokens) == 3 @@ -44,9 +44,9 @@ def test_ru_tokenizer_splits_two_diff_open_punct(ru_tokenizer, punct, punct_add, assert tokens[2].text == text -@pytest.mark.parametrize('punct', PUNCT_CLOSE) -@pytest.mark.parametrize('punct_add', ["'"]) -@pytest.mark.parametrize('text', ["Привет"]) +@pytest.mark.parametrize("punct", PUNCT_CLOSE) +@pytest.mark.parametrize("punct_add", ["'"]) +@pytest.mark.parametrize("text", ["Привет"]) def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add, text): tokens = ru_tokenizer(text + punct + punct_add) assert len(tokens) == 3 @@ -55,8 +55,8 @@ def test_ru_tokenizer_splits_two_diff_close_punct(ru_tokenizer, punct, punct_add assert tokens[2].text == punct_add -@pytest.mark.parametrize('punct', PUNCT_OPEN) -@pytest.mark.parametrize('text', ["Привет"]) +@pytest.mark.parametrize("punct", PUNCT_OPEN) +@pytest.mark.parametrize("text", ["Привет"]) def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text): tokens = ru_tokenizer(punct + punct + punct + text) assert len(tokens) == 4 @@ -64,8 +64,8 @@ def test_ru_tokenizer_splits_same_open_punct(ru_tokenizer, punct, text): assert tokens[3].text == text -@pytest.mark.parametrize('punct', PUNCT_CLOSE) -@pytest.mark.parametrize('text', ["Привет"]) +@pytest.mark.parametrize("punct", PUNCT_CLOSE) +@pytest.mark.parametrize("text", ["Привет"]) def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text): tokens = ru_tokenizer(text + punct + punct + punct) assert len(tokens) == 4 @@ -73,14 +73,14 @@ def test_ru_tokenizer_splits_same_close_punct(ru_tokenizer, punct, text): assert tokens[1].text == punct -@pytest.mark.parametrize('text', ["'Тест"]) +@pytest.mark.parametrize("text", ["'Тест"]) def test_ru_tokenizer_splits_open_appostrophe(ru_tokenizer, text): tokens = ru_tokenizer(text) assert len(tokens) == 2 assert tokens[0].text == "'" -@pytest.mark.parametrize('text', ["Тест''"]) +@pytest.mark.parametrize("text", ["Тест''"]) def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text): tokens = ru_tokenizer(text) assert len(tokens) == 2 @@ -88,10 +88,11 @@ def test_ru_tokenizer_splits_double_end_quote(ru_tokenizer, text): assert len(tokens_punct) == 1 -@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) -@pytest.mark.parametrize('text', ["Тест"]) -def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open, - punct_close, text): +@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED) +@pytest.mark.parametrize("text", ["Тест"]) +def test_ru_tokenizer_splits_open_close_punct( + ru_tokenizer, punct_open, punct_close, text +): tokens = ru_tokenizer(punct_open + text + punct_close) assert len(tokens) == 3 assert tokens[0].text == punct_open @@ -99,11 +100,12 @@ def test_ru_tokenizer_splits_open_close_punct(ru_tokenizer, punct_open, assert tokens[2].text == punct_close -@pytest.mark.parametrize('punct_open,punct_close', PUNCT_PAIRED) -@pytest.mark.parametrize('punct_open2,punct_close2', [("`", "'")]) 
-@pytest.mark.parametrize('text', ["Тест"]) -def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close, - punct_open2, punct_close2, text): +@pytest.mark.parametrize("punct_open,punct_close", PUNCT_PAIRED) +@pytest.mark.parametrize("punct_open2,punct_close2", [("`", "'")]) +@pytest.mark.parametrize("text", ["Тест"]) +def test_ru_tokenizer_two_diff_punct( + ru_tokenizer, punct_open, punct_close, punct_open2, punct_close2, text +): tokens = ru_tokenizer(punct_open2 + punct_open + text + punct_close + punct_close2) assert len(tokens) == 5 assert tokens[0].text == punct_open2 @@ -113,7 +115,7 @@ def test_ru_tokenizer_two_diff_punct(ru_tokenizer, punct_open, punct_close, assert tokens[4].text == punct_close2 -@pytest.mark.parametrize('text', ["Тест."]) +@pytest.mark.parametrize("text", ["Тест."]) def test_ru_tokenizer_splits_trailing_dot(ru_tokenizer, text): tokens = ru_tokenizer(text) assert tokens[1].text == "." diff --git a/spacy/tests/lang/sv/test_tokenizer.py b/spacy/tests/lang/sv/test_tokenizer.py index 85389a63c..894b5aa6a 100644 --- a/spacy/tests/lang/sv/test_tokenizer.py +++ b/spacy/tests/lang/sv/test_tokenizer.py @@ -5,20 +5,29 @@ import pytest SV_TOKEN_EXCEPTION_TESTS = [ - ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']), - ('Jag kommer först kl. 13 p.g.a. diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']), - ('Anders I. tycker om ord med i i.', ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."]) + ( + "Smörsåsen används bl.a. till fisk", + ["Smörsåsen", "används", "bl.a.", "till", "fisk"], + ), + ( + "Jag kommer först kl. 13 p.g.a. diverse förseningar", + ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"], + ), + ( + "Anders I. 
tycker om ord med i i.", + ["Anders", "I.", "tycker", "om", "ord", "med", "i", "i", "."], + ), ] -@pytest.mark.parametrize('text,expected_tokens', SV_TOKEN_EXCEPTION_TESTS) +@pytest.mark.parametrize("text,expected_tokens", SV_TOKEN_EXCEPTION_TESTS) def test_sv_tokenizer_handles_exception_cases(sv_tokenizer, text, expected_tokens): tokens = sv_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] assert expected_tokens == token_list -@pytest.mark.parametrize('text', ["driveru", "hajaru", "Serru", "Fixaru"]) +@pytest.mark.parametrize("text", ["driveru", "hajaru", "Serru", "Fixaru"]) def test_sv_tokenizer_handles_verb_exceptions(sv_tokenizer, text): tokens = sv_tokenizer(text) assert len(tokens) == 2 diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 05bbe9534..4bb5aac70 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -6,53 +6,85 @@ from spacy.attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape -@pytest.mark.parametrize('text', ["dog"]) +@pytest.mark.parametrize("text", ["dog"]) def test_attrs_key(text): assert intify_attrs({"ORTH": text}) == {ORTH: text} assert intify_attrs({"NORM": text}) == {NORM: text} assert intify_attrs({"lemma": text}, strings_map={text: 10}) == {LEMMA: 10} -@pytest.mark.parametrize('text', ["dog"]) +@pytest.mark.parametrize("text", ["dog"]) def test_attrs_idempotence(text): - int_attrs = intify_attrs({"lemma": text, 'is_alpha': True}, strings_map={text: 10}) + int_attrs = intify_attrs({"lemma": text, "is_alpha": True}, strings_map={text: 10}) assert intify_attrs(int_attrs) == {LEMMA: 10, IS_ALPHA: True} -@pytest.mark.parametrize('text', ["dog"]) +@pytest.mark.parametrize("text", ["dog"]) def test_attrs_do_deprecated(text): - int_attrs = intify_attrs({"F": text, 'is_alpha': True}, strings_map={text: 10}, - _do_deprecated=True) + int_attrs = intify_attrs( + {"F": text, "is_alpha": True}, strings_map={text: 10}, _do_deprecated=True + ) assert int_attrs == {ORTH: 10, IS_ALPHA: True} -@pytest.mark.parametrize('text,match', [(',', True), (' ', False), ('a', False)]) +@pytest.mark.parametrize("text,match", [(",", True), (" ", False), ("a", False)]) def test_lex_attrs_is_punct(text, match): assert is_punct(text) == match -@pytest.mark.parametrize('text,match', [(',', True), ('£', False), ('♥', False)]) +@pytest.mark.parametrize("text,match", [(",", True), ("£", False), ("♥", False)]) def test_lex_attrs_is_ascii(text, match): assert is_ascii(text) == match -@pytest.mark.parametrize('text,match', [('$', True), ('£', True), ('♥', False), - ('€', True), ('¥', True), ('¢', True), - ('a', False), ('www.google.com', False), ('dog', False)]) +@pytest.mark.parametrize( + "text,match", + [ + ("$", True), + ("£", True), + ("♥", False), + ("€", True), + ("¥", True), + ("¢", True), + ("a", False), + ("www.google.com", False), + ("dog", False), + ], +) def test_lex_attrs_is_currency(text, match): assert is_currency(text) == match -@pytest.mark.parametrize('text,match', [ - ('www.google.com', True), ('google.com', True), ('sydney.com', True), - ('2girls1cup.org', True), ('http://stupid', True), ('www.hi', True), - ('dog', False), ('1.2', False), ('1.a', False), ('hello.There', False)]) +@pytest.mark.parametrize( + "text,match", + [ + ("www.google.com", True), + ("google.com", True), + ("sydney.com", True), + ("2girls1cup.org", True), + ("http://stupid", True), + ("www.hi", True), + ("dog", False), + 
("1.2", False), + ("1.a", False), + ("hello.There", False), + ], +) def test_lex_attrs_like_url(text, match): assert like_url(text) == match -@pytest.mark.parametrize('text,shape', [ - ('Nasa', 'Xxxx'), ('capitalized', 'xxxx'), ('999999999', 'dddd'), - ('C3P0', 'XdXd'), (',', ','), ('\n', '\n'), ('``,-', '``,-')]) +@pytest.mark.parametrize( + "text,shape", + [ + ("Nasa", "Xxxx"), + ("capitalized", "xxxx"), + ("999999999", "dddd"), + ("C3P0", "XdXd"), + (",", ","), + ("\n", "\n"), + ("``,-", "``,-"), + ], +) def test_lex_attrs_word_shape(text, shape): assert word_shape(text) == shape diff --git a/spacy/tests/lang/th/test_tokenizer.py b/spacy/tests/lang/th/test_tokenizer.py index 8f40fb040..265c7753d 100644 --- a/spacy/tests/lang/th/test_tokenizer.py +++ b/spacy/tests/lang/th/test_tokenizer.py @@ -4,8 +4,9 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text,expected_tokens', [ - ("คุณรักผมไหม", ['คุณ', 'รัก', 'ผม', 'ไหม'])]) +@pytest.mark.parametrize( + "text,expected_tokens", [("คุณรักผมไหม", ["คุณ", "รัก", "ผม", "ไหม"])] +) def test_th_tokenizer(th_tokenizer, text, expected_tokens): tokens = [token.text for token in th_tokenizer(text)] assert tokens == expected_tokens diff --git a/spacy/tests/lang/tr/test_lemmatization.py b/spacy/tests/lang/tr/test_lemmatization.py index 52141109f..f8a0636f7 100644 --- a/spacy/tests/lang/tr/test_lemmatization.py +++ b/spacy/tests/lang/tr/test_lemmatization.py @@ -4,14 +4,18 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('string,lemma', [ - ('evlerimizdeki', 'ev'), - ('işlerimizi', 'iş'), - ('biran', 'biran'), - ('bitirmeliyiz', 'bitir'), - ('isteklerimizi', 'istek'), - ('karşılaştırmamızın', 'karşılaştır'), - ('çoğulculuktan', 'çoğulcu')]) +@pytest.mark.parametrize( + "string,lemma", + [ + ("evlerimizdeki", "ev"), + ("işlerimizi", "iş"), + ("biran", "biran"), + ("bitirmeliyiz", "bitir"), + ("isteklerimizi", "istek"), + ("karşılaştırmamızın", "karşılaştır"), + ("çoğulculuktan", "çoğulcu"), + ], +) def test_tr_lemmatizer_lookup_assigns(tr_tokenizer, string, lemma): tokens = tr_tokenizer(string) assert tokens[0].lemma_ == lemma diff --git a/spacy/tests/lang/tt/test_tokenizer.py b/spacy/tests/lang/tt/test_tokenizer.py index e95a7acd5..66ef9c181 100644 --- a/spacy/tests/lang/tt/test_tokenizer.py +++ b/spacy/tests/lang/tt/test_tokenizer.py @@ -6,14 +6,16 @@ import pytest INFIX_HYPHEN_TESTS = [ ("Явым-төшем күләме.", "Явым-төшем күләме .".split()), - ("Хатын-кыз киеме.", "Хатын-кыз киеме .".split()) + ("Хатын-кыз киеме.", "Хатын-кыз киеме .".split()), ] PUNC_INSIDE_WORDS_TESTS = [ - ("Пассаҗир саны - 2,13 млн — кеше/көндә (2010), 783,9 млн. кеше/елда.", - "Пассаҗир саны - 2,13 млн — кеше / көндә ( 2010 ) ," - " 783,9 млн. кеше / елда .".split()), - ("Ту\"кай", "Ту \" кай".split()) + ( + "Пассаҗир саны - 2,13 млн — кеше/көндә (2010), 783,9 млн. кеше/елда.", + "Пассаҗир саны - 2,13 млн — кеше / көндә ( 2010 ) ," + " 783,9 млн. кеше / елда .".split(), + ), + ('Ту"кай', 'Ту " кай'.split()), ] MIXED_ORDINAL_NUMS_TESTS = [ @@ -22,14 +24,14 @@ MIXED_ORDINAL_NUMS_TESTS = [ ABBREV_TESTS = [ ("«3 елда (б.э.к.) туган", "« 3 елда ( б.э.к. ) туган".split()), - ("тукымадан һ.б.ш. тегелгән.", "тукымадан һ.б.ш. тегелгән .".split()) + ("тукымадан һ.б.ш. тегелгән.", "тукымадан һ.б.ш. тегелгән .".split()), ] NAME_ABBREV_TESTS = [ ("Ә.Тукай", "Ә.Тукай".split()), ("Ә.тукай", "Ә.тукай".split()), ("ә.Тукай", "ә . 
Тукай".split()), - ("Миләүшә.", "Миләүшә .".split()) + ("Миләүшә.", "Миләүшә .".split()), ] TYPOS_IN_PUNC_TESTS = [ @@ -37,30 +39,39 @@ TYPOS_IN_PUNC_TESTS = [ ("«3 елда,туган", "« 3 елда , туган".split()), ("«3 елда,туган.", "« 3 елда , туган .".split()), ("Ул эшли(кайчан?)", "Ул эшли ( кайчан ? )".split()), - ("Ул (кайчан?)эшли", "Ул ( кайчан ?) эшли".split()) # "?)" => "?)" or "? )" + ("Ул (кайчан?)эшли", "Ул ( кайчан ?) эшли".split()), # "?)" => "?)" or "? )" ] LONG_TEXTS_TESTS = [ - ("Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы" - "якларда яшәгәннәр, шуңа күрә аларга кием кирәк булмаган.Йөз" - "меңнәрчә еллар үткән, борынгы кешеләр акрынлап Европа һәм Азиянең" - "салкын илләрендә дә яши башлаганнар. Алар кырыс һәм салкын" - "кышлардан саклану өчен кием-салым уйлап тапканнар - итәк.", - "Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы" - "якларда яшәгәннәр , шуңа күрә аларга кием кирәк булмаган . Йөз" - "меңнәрчә еллар үткән , борынгы кешеләр акрынлап Европа һәм Азиянең" - "салкын илләрендә дә яши башлаганнар . Алар кырыс һәм салкын" - "кышлардан саклану өчен кием-салым уйлап тапканнар - итәк .".split() - ) + ( + "Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы" + "якларда яшәгәннәр, шуңа күрә аларга кием кирәк булмаган.Йөз" + "меңнәрчә еллар үткән, борынгы кешеләр акрынлап Европа һәм Азиянең" + "салкын илләрендә дә яши башлаганнар. Алар кырыс һәм салкын" + "кышлардан саклану өчен кием-салым уйлап тапканнар - итәк.", + "Иң борынгы кешеләр суыклар һәм салкын кышлар булмый торган җылы" + "якларда яшәгәннәр , шуңа күрә аларга кием кирәк булмаган . Йөз" + "меңнәрчә еллар үткән , борынгы кешеләр акрынлап Европа һәм Азиянең" + "салкын илләрендә дә яши башлаганнар . Алар кырыс һәм салкын" + "кышлардан саклану өчен кием-салым уйлап тапканнар - итәк .".split(), + ) ] -TESTCASES = (INFIX_HYPHEN_TESTS + PUNC_INSIDE_WORDS_TESTS + - MIXED_ORDINAL_NUMS_TESTS + ABBREV_TESTS + NAME_ABBREV_TESTS + - LONG_TEXTS_TESTS + TYPOS_IN_PUNC_TESTS) +TESTCASES = ( + INFIX_HYPHEN_TESTS + + PUNC_INSIDE_WORDS_TESTS + + MIXED_ORDINAL_NUMS_TESTS + + ABBREV_TESTS + + NAME_ABBREV_TESTS + + LONG_TEXTS_TESTS + + TYPOS_IN_PUNC_TESTS +) NORM_TESTCASES = [ - ("тукымадан һ.б.ш. тегелгән.", - ["тукымадан", "һәм башка шундыйлар", "тегелгән", "."]) + ( + "тукымадан һ.б.ш. 
тегелгән.", + ["тукымадан", "һәм башка шундыйлар", "тегелгән", "."], + ) ] @@ -70,7 +81,7 @@ def test_tt_tokenizer_handles_testcases(tt_tokenizer, text, expected_tokens): assert expected_tokens == tokens -@pytest.mark.parametrize('text,norms', NORM_TESTCASES) +@pytest.mark.parametrize("text,norms", NORM_TESTCASES) def test_tt_tokenizer_handles_norm_exceptions(tt_tokenizer, text, norms): tokens = tt_tokenizer(text) assert [token.norm_ for token in tokens] == norms diff --git a/spacy/tests/lang/ur/test_text.py b/spacy/tests/lang/ur/test_text.py index d872799b8..45d80e027 100644 --- a/spacy/tests/lang/ur/test_text.py +++ b/spacy/tests/lang/ur/test_text.py @@ -13,9 +13,7 @@ def test_ur_tokenizer_handles_long_text(ur_tokenizer): assert len(tokens) == 77 -@pytest.mark.parametrize('text,length', [ - ("تحریر باسط حبیب", 3), - ("میرا پاکستان", 2)]) +@pytest.mark.parametrize("text,length", [("تحریر باسط حبیب", 3), ("میرا پاکستان", 2)]) def test_ur_tokenizer_handles_cnts(ur_tokenizer, text, length): tokens = ur_tokenizer(text) assert len(tokens) == length diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 6f4919ac5..4d75eb870 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -10,9 +10,11 @@ from ..util import get_doc @pytest.fixture def matcher(en_vocab): - rules = {'JS': [[{'ORTH': 'JavaScript'}]], - 'GoogleNow': [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]], - 'Java': [[{'LOWER': 'java'}]]} + rules = { + "JS": [[{"ORTH": "JavaScript"}]], + "GoogleNow": [[{"ORTH": "Google"}, {"ORTH": "Now"}]], + "Java": [[{"LOWER": "java"}]], + } matcher = Matcher(en_vocab) for key, patterns in rules.items(): matcher.add(key, None, *patterns) @@ -21,44 +23,44 @@ def matcher(en_vocab): def test_matcher_from_api_docs(en_vocab): matcher = Matcher(en_vocab) - pattern = [{'ORTH': 'test'}] + pattern = [{"ORTH": "test"}] assert len(matcher) == 0 - matcher.add('Rule', None, pattern) + matcher.add("Rule", None, pattern) assert len(matcher) == 1 - matcher.remove('Rule') - assert 'Rule' not in matcher - matcher.add('Rule', None, pattern) - assert 'Rule' in matcher - on_match, patterns = matcher.get('Rule') + matcher.remove("Rule") + assert "Rule" not in matcher + matcher.add("Rule", None, pattern) + assert "Rule" in matcher + on_match, patterns = matcher.get("Rule") assert len(patterns[0]) def test_matcher_from_usage_docs(en_vocab): text = "Wow 😀 This is really cool! 
😂 😂" - doc = Doc(en_vocab, words=text.split(' ')) - pos_emoji = ['😀', '😃', '😂', '🤣', '😊', '😍'] - pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji] + doc = Doc(en_vocab, words=text.split(" ")) + pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"] + pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji] def label_sentiment(matcher, doc, i, matches): match_id, start, end = matches[i] - if doc.vocab.strings[match_id] == 'HAPPY': + if doc.vocab.strings[match_id] == "HAPPY": doc.sentiment += 0.1 - span = doc[start : end] + span = doc[start:end] token = span.merge() - token.vocab[token.text].norm_ = 'happy emoji' + token.vocab[token.text].norm_ = "happy emoji" matcher = Matcher(en_vocab) - matcher.add('HAPPY', label_sentiment, *pos_patterns) + matcher.add("HAPPY", label_sentiment, *pos_patterns) matches = matcher(doc) assert doc.sentiment != 0 - assert doc[1].norm_ == 'happy emoji' + assert doc[1].norm_ == "happy emoji" def test_matcher_len_contains(matcher): assert len(matcher) == 3 - matcher.add('TEST', None, [{'ORTH': 'test'}]) - assert 'TEST' in matcher - assert 'TEST2' not in matcher + matcher.add("TEST", None, [{"ORTH": "test"}]) + assert "TEST" in matcher + assert "TEST2" not in matcher def test_matcher_no_match(matcher): @@ -68,38 +70,40 @@ def test_matcher_no_match(matcher): def test_matcher_match_start(matcher): doc = Doc(matcher.vocab, words=["JavaScript", "is", "good"]) - assert matcher(doc) == [(matcher.vocab.strings['JS'], 0, 1)] + assert matcher(doc) == [(matcher.vocab.strings["JS"], 0, 1)] def test_matcher_match_end(matcher): words = ["I", "like", "java"] doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [(doc.vocab.strings['Java'], 2, 3)] + assert matcher(doc) == [(doc.vocab.strings["Java"], 2, 3)] def test_matcher_match_middle(matcher): words = ["I", "like", "Google", "Now", "best"] doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4)] + assert matcher(doc) == [(doc.vocab.strings["GoogleNow"], 2, 4)] def test_matcher_match_multi(matcher): words = ["I", "like", "Google", "Now", "and", "java", "best"] doc = Doc(matcher.vocab, words=words) - assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], 2, 4), - (doc.vocab.strings['Java'], 5, 6)] + assert matcher(doc) == [ + (doc.vocab.strings["GoogleNow"], 2, 4), + (doc.vocab.strings["Java"], 5, 6), + ] def test_matcher_empty_dict(en_vocab): """Test matcher allows empty token specs, meaning match on any token.""" matcher = Matcher(en_vocab) doc = Doc(matcher.vocab, words=["a", "b", "c"]) - matcher.add('A.C', None, [{'ORTH': 'a'}, {}, {'ORTH': 'c'}]) + matcher.add("A.C", None, [{"ORTH": "a"}, {}, {"ORTH": "c"}]) matches = matcher(doc) assert len(matches) == 1 assert matches[0][1:] == (0, 3) matcher = Matcher(en_vocab) - matcher.add('A.', None, [{'ORTH': 'a'}, {}]) + matcher.add("A.", None, [{"ORTH": "a"}, {}]) matches = matcher(doc) assert matches[0][1:] == (0, 2) @@ -107,8 +111,8 @@ def test_matcher_empty_dict(en_vocab): def test_matcher_operator_shadow(en_vocab): matcher = Matcher(en_vocab) doc = Doc(matcher.vocab, words=["a", "b", "c"]) - pattern = [{'ORTH': 'a'}, {"IS_ALPHA": True, "OP": "+"}, {'ORTH': 'c'}] - matcher.add('A.C', None, pattern) + pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}] + matcher.add("A.C", None, pattern) matches = matcher(doc) assert len(matches) == 1 assert matches[0][1:] == (0, 3) @@ -117,43 +121,48 @@ def test_matcher_operator_shadow(en_vocab): def test_matcher_match_zero(matcher): words1 = 'He said , " some words " 
...'.split() words2 = 'He said , " some three words " ...'.split() - pattern1 = [{'ORTH': '"'}, - {'OP': '!', 'IS_PUNCT': True}, - {'OP': '!', 'IS_PUNCT': True}, - {'ORTH': '"'}] - pattern2 = [{'ORTH': '"'}, - {'IS_PUNCT': True}, - {'IS_PUNCT': True}, - {'IS_PUNCT': True}, - {'ORTH': '"'}] - matcher.add('Quote', None, pattern1) + pattern1 = [ + {"ORTH": '"'}, + {"OP": "!", "IS_PUNCT": True}, + {"OP": "!", "IS_PUNCT": True}, + {"ORTH": '"'}, + ] + pattern2 = [ + {"ORTH": '"'}, + {"IS_PUNCT": True}, + {"IS_PUNCT": True}, + {"IS_PUNCT": True}, + {"ORTH": '"'}, + ] + matcher.add("Quote", None, pattern1) doc = Doc(matcher.vocab, words=words1) assert len(matcher(doc)) == 1 doc = Doc(matcher.vocab, words=words2) assert len(matcher(doc)) == 0 - matcher.add('Quote', None, pattern2) + matcher.add("Quote", None, pattern2) assert len(matcher(doc)) == 0 def test_matcher_match_zero_plus(matcher): words = 'He said , " some words " ...'.split() - pattern = [{'ORTH': '"'}, - {'OP': '*', 'IS_PUNCT': False}, - {'ORTH': '"'}] + pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}] matcher = Matcher(matcher.vocab) - matcher.add('Quote', None, pattern) + matcher.add("Quote", None, pattern) doc = Doc(matcher.vocab, words=words) assert len(matcher(doc)) == 1 def test_matcher_match_one_plus(matcher): control = Matcher(matcher.vocab) - control.add('BasicPhilippe', None, [{'ORTH': 'Philippe'}]) - doc = Doc(control.vocab, words=['Philippe', 'Philippe']) + control.add("BasicPhilippe", None, [{"ORTH": "Philippe"}]) + doc = Doc(control.vocab, words=["Philippe", "Philippe"]) m = control(doc) assert len(m) == 2 - matcher.add('KleenePhilippe', None, [{'ORTH': 'Philippe', 'OP': '1'}, - {'ORTH': 'Philippe', 'OP': '+'}]) + matcher.add( + "KleenePhilippe", + None, + [{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}], + ) m = matcher(doc) assert len(m) == 1 @@ -161,54 +170,70 @@ def test_matcher_match_one_plus(matcher): def test_matcher_any_token_operator(en_vocab): """Test that patterns with "any token" {} work with operators.""" matcher = Matcher(en_vocab) - matcher.add('TEST', None, [{'ORTH': 'test'}, {'OP': '*'}]) - doc = Doc(en_vocab, words=['test', 'hello', 'world']) + matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}]) + doc = Doc(en_vocab, words=["test", "hello", "world"]) matches = [doc[start:end].text for _, start, end in matcher(doc)] assert len(matches) == 3 - assert matches[0] == 'test' - assert matches[1] == 'test hello' - assert matches[2] == 'test hello world' + assert matches[0] == "test" + assert matches[1] == "test hello" + assert matches[2] == "test hello world" @pytest.fixture def text(): - return u"The quick brown fox jumped over the lazy fox" + return "The quick brown fox jumped over the lazy fox" + @pytest.fixture def heads(): - return [3,2,1,1,0,-1,2,1,-3] + return [3, 2, 1, 1, 0, -1, 2, 1, -3] + @pytest.fixture def deps(): - return ['det', 'amod', 'amod', 'nsubj', 'prep', 'pobj', 'det', 'amod'] + return ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"] + @pytest.fixture def dependency_tree_matcher(en_vocab): - is_brown_yellow = lambda text: bool(re.compile(r'brown|yellow|over').match(text)) + def is_brown_yellow(text): + return bool(re.compile(r"brown|yellow|over").match(text)) + IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow) pattern1 = [ - {'SPEC': {'NODE_NAME': 'fox'}, 'PATTERN': {'ORTH': 'fox'}}, - {'SPEC': {'NODE_NAME': 'q', 'NBOR_RELOP': '>', 'NBOR_NAME': 'fox'},'PATTERN': {'LOWER': u'quick'}}, - {'SPEC': {'NODE_NAME': 'r', 
'NBOR_RELOP': '>', 'NBOR_NAME': 'fox'}, 'PATTERN': {IS_BROWN_YELLOW: True}} + {"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}}, + { + "SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, + "PATTERN": {"LOWER": "quick"}, + }, + { + "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, + "PATTERN": {IS_BROWN_YELLOW: True}, + }, ] pattern2 = [ - {'SPEC': {'NODE_NAME': 'jumped'}, 'PATTERN': {'ORTH': 'jumped'}}, - {'SPEC': {'NODE_NAME': 'fox', 'NBOR_RELOP': '>', 'NBOR_NAME': 'jumped'},'PATTERN': {'LOWER': u'fox'}}, - {'SPEC': {'NODE_NAME': 'over', 'NBOR_RELOP': '>', 'NBOR_NAME': 'fox'}, 'PATTERN': {IS_BROWN_YELLOW: True}} + {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, + { + "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, + "PATTERN": {"LOWER": "fox"}, + }, + { + "SPEC": {"NODE_NAME": "over", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, + "PATTERN": {IS_BROWN_YELLOW: True}, + }, ] matcher = DependencyTreeMatcher(en_vocab) - matcher.add('pattern1', None, pattern1) - matcher.add('pattern2', None, pattern2) + matcher.add("pattern1", None, pattern1) + matcher.add("pattern2", None, pattern2) return matcher - def test_dependency_tree_matcher_compile(dependency_tree_matcher): assert len(dependency_tree_matcher) == 2 -def test_dependency_tree_matcher(dependency_tree_matcher,text,heads,deps): - doc = get_doc(dependency_tree_matcher.vocab,text.split(),heads=heads,deps=deps) + +def test_dependency_tree_matcher(dependency_tree_matcher, text, heads, deps): + doc = get_doc(dependency_tree_matcher.vocab, text.split(), heads=heads, deps=deps) matches = dependency_tree_matcher(doc) assert len(matches) == 2 - diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index 825f23cb3..56a03d200 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -7,17 +7,25 @@ from spacy.matcher import Matcher from spacy.tokens import Doc -pattern1 = [{'ORTH':'A', 'OP':'1'}, {'ORTH':'A', 'OP':'*'}] -pattern2 = [{'ORTH':'A', 'OP':'*'}, {'ORTH':'A', 'OP':'1'}] -pattern3 = [{'ORTH':'A', 'OP':'1'}, {'ORTH':'A', 'OP':'1'}] -pattern4 = [{'ORTH':'B', 'OP':'1'}, {'ORTH':'A', 'OP':'*'}, {'ORTH':'B', 'OP':'1'}] -pattern5 = [{'ORTH':'B', 'OP':'*'}, {'ORTH':'A', 'OP':'*'}, {'ORTH':'B', 'OP':'1'}] +pattern1 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "*"}] +pattern2 = [{"ORTH": "A", "OP": "*"}, {"ORTH": "A", "OP": "1"}] +pattern3 = [{"ORTH": "A", "OP": "1"}, {"ORTH": "A", "OP": "1"}] +pattern4 = [ + {"ORTH": "B", "OP": "1"}, + {"ORTH": "A", "OP": "*"}, + {"ORTH": "B", "OP": "1"}, +] +pattern5 = [ + {"ORTH": "B", "OP": "*"}, + {"ORTH": "A", "OP": "*"}, + {"ORTH": "B", "OP": "1"}, +] -re_pattern1 = 'AA*' -re_pattern2 = 'A*A' -re_pattern3 = 'AA' -re_pattern4 = 'BA*B' -re_pattern5 = 'B*A*B' +re_pattern1 = "AA*" +re_pattern2 = "A*A" +re_pattern3 = "AA" +re_pattern4 = "BA*B" +re_pattern5 = "B*A*B" @pytest.fixture @@ -27,17 +35,20 @@ def text(): @pytest.fixture def doc(en_tokenizer, text): - doc = en_tokenizer(' '.join(text)) + doc = en_tokenizer(" ".join(text)) return doc -@pytest.mark.xfail -@pytest.mark.parametrize('pattern,re_pattern', [ - (pattern1, re_pattern1), - (pattern2, re_pattern2), - (pattern3, re_pattern3), - (pattern4, re_pattern4), - (pattern5, re_pattern5)]) +@pytest.mark.parametrize( + "pattern,re_pattern", + [ + pytest.param(pattern1, re_pattern1, marks=pytest.mark.xfail()), + pytest.param(pattern2, re_pattern2, marks=pytest.mark.xfail()), + 
pytest.param(pattern3, re_pattern3, marks=pytest.mark.xfail()), + (pattern4, re_pattern4), + pytest.param(pattern5, re_pattern5, marks=pytest.mark.xfail()), + ], +) def test_greedy_matching(doc, text, pattern, re_pattern): """Test that the greedy matching behavior of the * op is consistant with other re implementations.""" @@ -50,12 +61,16 @@ def test_greedy_matching(doc, text, pattern, re_pattern): @pytest.mark.xfail -@pytest.mark.parametrize('pattern,re_pattern', [ - (pattern1, re_pattern1), - (pattern2, re_pattern2), - (pattern3, re_pattern3), - (pattern4, re_pattern4), - (pattern5, re_pattern5)]) +@pytest.mark.parametrize( + "pattern,re_pattern", + [ + (pattern1, re_pattern1), + (pattern2, re_pattern2), + (pattern3, re_pattern3), + (pattern4, re_pattern4), + (pattern5, re_pattern5), + ], +) def test_match_consuming(doc, text, pattern, re_pattern): """Test that matcher.__call__ consumes tokens on a match similar to re.findall.""" @@ -68,33 +83,33 @@ def test_match_consuming(doc, text, pattern, re_pattern): def test_operator_combos(en_vocab): cases = [ - ('aaab', 'a a a b', True), - ('aaab', 'a+ b', True), - ('aaab', 'a+ a+ b', True), - ('aaab', 'a+ a+ a b', True), - ('aaab', 'a+ a+ a+ b', True), - ('aaab', 'a+ a a b', True), - ('aaab', 'a+ a a', True), - ('aaab', 'a+', True), - ('aaa', 'a+ b', False), - ('aaa', 'a+ a+ b', False), - ('aaa', 'a+ a+ a+ b', False), - ('aaa', 'a+ a b', False), - ('aaa', 'a+ a a b', False), - ('aaab', 'a+ a a', True), - ('aaab', 'a+', True), - ('aaab', 'a+ a b', True) + ("aaab", "a a a b", True), + ("aaab", "a+ b", True), + ("aaab", "a+ a+ b", True), + ("aaab", "a+ a+ a b", True), + ("aaab", "a+ a+ a+ b", True), + ("aaab", "a+ a a b", True), + ("aaab", "a+ a a", True), + ("aaab", "a+", True), + ("aaa", "a+ b", False), + ("aaa", "a+ a+ b", False), + ("aaa", "a+ a+ a+ b", False), + ("aaa", "a+ a b", False), + ("aaa", "a+ a a b", False), + ("aaab", "a+ a a", True), + ("aaab", "a+", True), + ("aaab", "a+ a b", True), ] for string, pattern_str, result in cases: matcher = Matcher(en_vocab) doc = Doc(matcher.vocab, words=list(string)) pattern = [] for part in pattern_str.split(): - if part.endswith('+'): - pattern.append({'ORTH': part[0], 'OP': '+'}) + if part.endswith("+"): + pattern.append({"ORTH": part[0], "OP": "+"}) else: - pattern.append({'ORTH': part}) - matcher.add('PATTERN', None, pattern) + pattern.append({"ORTH": part}) + matcher.add("PATTERN", None, pattern) matches = matcher(doc) if result: assert matches, (string, pattern_str) @@ -105,12 +120,12 @@ def test_operator_combos(en_vocab): def test_matcher_end_zero_plus(en_vocab): """Test matcher works when patterns end with * operator. 
(issue 1450)""" matcher = Matcher(en_vocab) - pattern = [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}] - matcher.add('TSTEND', None, pattern) + pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] + matcher.add("TSTEND", None, pattern) nlp = lambda string: Doc(matcher.vocab, words=string.split()) - assert len(matcher(nlp('a'))) == 1 - assert len(matcher(nlp('a b'))) == 2 - assert len(matcher(nlp('a c'))) == 1 - assert len(matcher(nlp('a b c'))) == 2 - assert len(matcher(nlp('a b b c'))) == 3 - assert len(matcher(nlp('a b b'))) == 3 + assert len(matcher(nlp("a"))) == 1 + assert len(matcher(nlp("a b"))) == 2 + assert len(matcher(nlp("a c"))) == 1 + assert len(matcher(nlp("a b c"))) == 2 + assert len(matcher(nlp("a b b c"))) == 3 + assert len(matcher(nlp("a b b"))) == 3 diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 125d7be74..1d01990bd 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import pytest from spacy.matcher import PhraseMatcher from spacy.tokens import Doc @@ -11,7 +10,7 @@ from ..util import get_doc def test_matcher_phrase_matcher(en_vocab): doc = Doc(en_vocab, words=["Google", "Now"]) matcher = PhraseMatcher(en_vocab) - matcher.add('COMPANY', None, doc) + matcher.add("COMPANY", None, doc) doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"]) assert len(matcher(doc)) == 1 @@ -19,63 +18,63 @@ def test_matcher_phrase_matcher(en_vocab): def test_phrase_matcher_length(en_vocab): matcher = PhraseMatcher(en_vocab) assert len(matcher) == 0 - matcher.add('TEST', None, Doc(en_vocab, words=['test'])) + matcher.add("TEST", None, Doc(en_vocab, words=["test"])) assert len(matcher) == 1 - matcher.add('TEST2', None, Doc(en_vocab, words=['test2'])) + matcher.add("TEST2", None, Doc(en_vocab, words=["test2"])) assert len(matcher) == 2 def test_phrase_matcher_contains(en_vocab): matcher = PhraseMatcher(en_vocab) - matcher.add('TEST', None, Doc(en_vocab, words=['test'])) - assert 'TEST' in matcher - assert 'TEST2' not in matcher + matcher.add("TEST", None, Doc(en_vocab, words=["test"])) + assert "TEST" in matcher + assert "TEST2" not in matcher def test_phrase_matcher_string_attrs(en_vocab): - words1 = ['I', 'like', 'cats'] - pos1 = ['PRON', 'VERB', 'NOUN'] - words2 = ['Yes', ',', 'you', 'hate', 'dogs', 'very', 'much'] - pos2 = ['INTJ', 'PUNCT', 'PRON', 'VERB', 'NOUN', 'ADV', 'ADV'] + words1 = ["I", "like", "cats"] + pos1 = ["PRON", "VERB", "NOUN"] + words2 = ["Yes", ",", "you", "hate", "dogs", "very", "much"] + pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"] pattern = get_doc(en_vocab, words=words1, pos=pos1) - matcher = PhraseMatcher(en_vocab, attr='POS') - matcher.add('TEST', None, pattern) + matcher = PhraseMatcher(en_vocab, attr="POS") + matcher.add("TEST", None, pattern) doc = get_doc(en_vocab, words=words2, pos=pos2) matches = matcher(doc) assert len(matches) == 1 match_id, start, end = matches[0] - assert match_id == en_vocab.strings['TEST'] + assert match_id == en_vocab.strings["TEST"] assert start == 2 assert end == 5 def test_phrase_matcher_string_attrs_negative(en_vocab): """Test that token with the control codes as ORTH are *not* matched.""" - words1 = ['I', 'like', 'cats'] - pos1 = ['PRON', 'VERB', 'NOUN'] - words2 = ['matcher:POS-PRON', 'matcher:POS-VERB', 'matcher:POS-NOUN'] - pos2 = ['X', 'X', 'X'] + words1 = ["I", "like", "cats"] + pos1 = ["PRON", "VERB", "NOUN"] + words2 = 
["matcher:POS-PRON", "matcher:POS-VERB", "matcher:POS-NOUN"] + pos2 = ["X", "X", "X"] pattern = get_doc(en_vocab, words=words1, pos=pos1) - matcher = PhraseMatcher(en_vocab, attr='POS') - matcher.add('TEST', None, pattern) + matcher = PhraseMatcher(en_vocab, attr="POS") + matcher.add("TEST", None, pattern) doc = get_doc(en_vocab, words=words2, pos=pos2) matches = matcher(doc) assert len(matches) == 0 def test_phrase_matcher_bool_attrs(en_vocab): - words1 = ['Hello', 'world', '!'] - words2 = ['No', 'problem', ',', 'he', 'said', '.'] + words1 = ["Hello", "world", "!"] + words2 = ["No", "problem", ",", "he", "said", "."] pattern = Doc(en_vocab, words=words1) - matcher = PhraseMatcher(en_vocab, attr='IS_PUNCT') - matcher.add('TEST', None, pattern) + matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT") + matcher.add("TEST", None, pattern) doc = Doc(en_vocab, words=words2) matches = matcher(doc) assert len(matches) == 2 match_id1, start1, end1 = matches[0] match_id2, start2, end2 = matches[1] - assert match_id1 == en_vocab.strings['TEST'] - assert match_id2 == en_vocab.strings['TEST'] + assert match_id1 == en_vocab.strings["TEST"] + assert match_id2 == en_vocab.strings["TEST"] assert start1 == 0 assert end1 == 3 assert start2 == 3 diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index 9295946a5..31bfbe56d 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import pytest -import numpy.random from thinc.neural.optimizers import Adam from thinc.neural.ops import NumpyOps from spacy.attrs import NORM @@ -20,18 +19,17 @@ def vocab(): @pytest.fixture def parser(vocab): parser = DependencyParser(vocab) - parser.cfg['token_vector_width'] = 8 - parser.cfg['hidden_width'] = 30 - parser.cfg['hist_size'] = 0 - parser.add_label('left') + parser.cfg["token_vector_width"] = 8 + parser.cfg["hidden_width"] = 30 + parser.cfg["hist_size"] = 0 + parser.add_label("left") parser.begin_training([], **parser.cfg) sgd = Adam(NumpyOps(), 0.001) for i in range(10): losses = {} - doc = Doc(vocab, words=['a', 'b', 'c', 'd']) - gold = GoldParse(doc, heads=[1, 1, 3, 3], - deps=['left', 'ROOT', 'left', 'ROOT']) + doc = Doc(vocab, words=["a", "b", "c", "d"]) + gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) parser.update([doc], [gold], sgd=sgd, losses=losses) return parser @@ -44,29 +42,30 @@ def test_init_parser(parser): # TODO: This now seems to be implicated in segfaults. Not sure what's up! 
@pytest.mark.skip def test_add_label(parser): - doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) assert doc[0].head.i == 1 - assert doc[0].dep_ == 'left' + assert doc[0].dep_ == "left" assert doc[1].head.i == 1 assert doc[2].head.i == 3 assert doc[2].head.i == 3 - parser.add_label('right') - doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + parser.add_label("right") + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) assert doc[0].head.i == 1 - assert doc[0].dep_ == 'left' + assert doc[0].dep_ == "left" assert doc[1].head.i == 1 assert doc[2].head.i == 3 assert doc[2].head.i == 3 sgd = Adam(NumpyOps(), 0.001) for i in range(10): losses = {} - doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) - gold = GoldParse(doc, heads=[1, 1, 3, 3], - deps=['right', 'ROOT', 'left', 'ROOT']) + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) + gold = GoldParse( + doc, heads=[1, 1, 3, 3], deps=["right", "ROOT", "left", "ROOT"] + ) parser.update([doc], [gold], sgd=sgd, losses=losses) - doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) - assert doc[0].dep_ == 'right' - assert doc[2].dep_ == 'left' + assert doc[0].dep_ == "right" + assert doc[2].dep_ == "left" diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index f126fc961..80b81bdf3 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -31,16 +31,19 @@ def get_sequence_costs(M, words, heads, deps, transitions): def vocab(): return Vocab() + @pytest.fixture def arc_eager(vocab): moves = ArcEager(vocab.strings, ArcEager.get_actions()) - moves.add_action(2, 'left') - moves.add_action(3, 'right') + moves.add_action(2, "left") + moves.add_action(3, "right") return moves + @pytest.fixture def words(): - return ['a', 'b'] + return ["a", "b"] + @pytest.fixture def doc(words, vocab): @@ -48,19 +51,21 @@ def doc(words, vocab): vocab = Vocab() return Doc(vocab, words=list(words)) + @pytest.fixture def gold(doc, words): if len(words) == 2: - return GoldParse(doc, words=['a', 'b'], heads=[0, 0], deps=['ROOT', 'right']) + return GoldParse(doc, words=["a", "b"], heads=[0, 0], deps=["ROOT", "right"]) else: raise NotImplementedError + @pytest.mark.xfail def test_oracle_four_words(arc_eager, vocab): - words = ['a', 'b', 'c', 'd'] + words = ["a", "b", "c", "d"] heads = [1, 1, 3, 3] - deps = ['left', 'ROOT', 'left', 'ROOT'] - actions = ['L-left', 'B-ROOT', 'L-left'] + deps = ["left", "ROOT", "left", "ROOT"] + actions = ["L-left", "B-ROOT", "L-left"] state, cost_history = get_sequence_costs(arc_eager, words, heads, deps, actions) assert state.is_final() for i, state_costs in enumerate(cost_history): @@ -72,63 +77,65 @@ def test_oracle_four_words(arc_eager, vocab): annot_tuples = [ - (0, 'When', 'WRB', 11, 'advmod', 'O'), - (1, 'Walter', 'NNP', 2, 'compound', 'B-PERSON'), - (2, 'Rodgers', 'NNP', 11, 'nsubj', 'L-PERSON'), - (3, ',', ',', 2, 'punct', 'O'), - (4, 'our', 'PRP$', 6, 'poss', 'O'), - (5, 'embedded', 'VBN', 6, 'amod', 'O'), - (6, 'reporter', 'NN', 2, 'appos', 'O'), - (7, 'with', 'IN', 6, 'prep', 'O'), - (8, 'the', 'DT', 10, 'det', 'B-ORG'), - (9, '3rd', 'NNP', 10, 'compound', 'I-ORG'), - (10, 'Cavalry', 'NNP', 7, 'pobj', 'L-ORG'), - (11, 'says', 'VBZ', 44, 'advcl', 'O'), - (12, 'three', 'CD', 13, 'nummod', 'U-CARDINAL'), - (13, 'battalions', 'NNS', 16, 'nsubj', 'O'), - (14, 'of', 'IN', 
13, 'prep', 'O'), - (15, 'troops', 'NNS', 14, 'pobj', 'O'), - (16, 'are', 'VBP', 11, 'ccomp', 'O'), - (17, 'on', 'IN', 16, 'prep', 'O'), - (18, 'the', 'DT', 19, 'det', 'O'), - (19, 'ground', 'NN', 17, 'pobj', 'O'), - (20, ',', ',', 17, 'punct', 'O'), - (21, 'inside', 'IN', 17, 'prep', 'O'), - (22, 'Baghdad', 'NNP', 21, 'pobj', 'U-GPE'), - (23, 'itself', 'PRP', 22, 'appos', 'O'), - (24, ',', ',', 16, 'punct', 'O'), - (25, 'have', 'VBP', 26, 'aux', 'O'), - (26, 'taken', 'VBN', 16, 'dep', 'O'), - (27, 'up', 'RP', 26, 'prt', 'O'), - (28, 'positions', 'NNS', 26, 'dobj', 'O'), - (29, 'they', 'PRP', 31, 'nsubj', 'O'), - (30, "'re", 'VBP', 31, 'aux', 'O'), - (31, 'going', 'VBG', 26, 'parataxis', 'O'), - (32, 'to', 'TO', 33, 'aux', 'O'), - (33, 'spend', 'VB', 31, 'xcomp', 'O'), - (34, 'the', 'DT', 35, 'det', 'B-TIME'), - (35, 'night', 'NN', 33, 'dobj', 'L-TIME'), - (36, 'there', 'RB', 33, 'advmod', 'O'), - (37, 'presumably', 'RB', 33, 'advmod', 'O'), - (38, ',', ',', 44, 'punct', 'O'), - (39, 'how', 'WRB', 40, 'advmod', 'O'), - (40, 'many', 'JJ', 41, 'amod', 'O'), - (41, 'soldiers', 'NNS', 44, 'pobj', 'O'), - (42, 'are', 'VBP', 44, 'aux', 'O'), - (43, 'we', 'PRP', 44, 'nsubj', 'O'), - (44, 'talking', 'VBG', 44, 'ROOT', 'O'), - (45, 'about', 'IN', 44, 'prep', 'O'), - (46, 'right', 'RB', 47, 'advmod', 'O'), - (47, 'now', 'RB', 44, 'advmod', 'O'), - (48, '?', '.', 44, 'punct', 'O')] + (0, "When", "WRB", 11, "advmod", "O"), + (1, "Walter", "NNP", 2, "compound", "B-PERSON"), + (2, "Rodgers", "NNP", 11, "nsubj", "L-PERSON"), + (3, ",", ",", 2, "punct", "O"), + (4, "our", "PRP$", 6, "poss", "O"), + (5, "embedded", "VBN", 6, "amod", "O"), + (6, "reporter", "NN", 2, "appos", "O"), + (7, "with", "IN", 6, "prep", "O"), + (8, "the", "DT", 10, "det", "B-ORG"), + (9, "3rd", "NNP", 10, "compound", "I-ORG"), + (10, "Cavalry", "NNP", 7, "pobj", "L-ORG"), + (11, "says", "VBZ", 44, "advcl", "O"), + (12, "three", "CD", 13, "nummod", "U-CARDINAL"), + (13, "battalions", "NNS", 16, "nsubj", "O"), + (14, "of", "IN", 13, "prep", "O"), + (15, "troops", "NNS", 14, "pobj", "O"), + (16, "are", "VBP", 11, "ccomp", "O"), + (17, "on", "IN", 16, "prep", "O"), + (18, "the", "DT", 19, "det", "O"), + (19, "ground", "NN", 17, "pobj", "O"), + (20, ",", ",", 17, "punct", "O"), + (21, "inside", "IN", 17, "prep", "O"), + (22, "Baghdad", "NNP", 21, "pobj", "U-GPE"), + (23, "itself", "PRP", 22, "appos", "O"), + (24, ",", ",", 16, "punct", "O"), + (25, "have", "VBP", 26, "aux", "O"), + (26, "taken", "VBN", 16, "dep", "O"), + (27, "up", "RP", 26, "prt", "O"), + (28, "positions", "NNS", 26, "dobj", "O"), + (29, "they", "PRP", 31, "nsubj", "O"), + (30, "'re", "VBP", 31, "aux", "O"), + (31, "going", "VBG", 26, "parataxis", "O"), + (32, "to", "TO", 33, "aux", "O"), + (33, "spend", "VB", 31, "xcomp", "O"), + (34, "the", "DT", 35, "det", "B-TIME"), + (35, "night", "NN", 33, "dobj", "L-TIME"), + (36, "there", "RB", 33, "advmod", "O"), + (37, "presumably", "RB", 33, "advmod", "O"), + (38, ",", ",", 44, "punct", "O"), + (39, "how", "WRB", 40, "advmod", "O"), + (40, "many", "JJ", 41, "amod", "O"), + (41, "soldiers", "NNS", 44, "pobj", "O"), + (42, "are", "VBP", 44, "aux", "O"), + (43, "we", "PRP", 44, "nsubj", "O"), + (44, "talking", "VBG", 44, "ROOT", "O"), + (45, "about", "IN", 44, "prep", "O"), + (46, "right", "RB", 47, "advmod", "O"), + (47, "now", "RB", 44, "advmod", "O"), + (48, "?", ".", 44, "punct", "O"), +] + def test_get_oracle_actions(): doc = Doc(Vocab(), words=[t[1] for t in annot_tuples]) parser = DependencyParser(doc.vocab) - 
parser.moves.add_action(0, '') - parser.moves.add_action(1, '') - parser.moves.add_action(1, '') - parser.moves.add_action(4, 'ROOT') + parser.moves.add_action(0, "") + parser.moves.add_action(1, "") + parser.moves.add_action(1, "") + parser.moves.add_action(4, "ROOT") for i, (id_, word, tag, head, dep, ent) in enumerate(annot_tuples): if head > i: parser.moves.add_action(2, dep) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 4de0d25c2..534460ccd 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -16,15 +16,17 @@ def vocab(): @pytest.fixture def doc(vocab): - return Doc(vocab, words=['Casey', 'went', 'to', 'New', 'York', '.']) + return Doc(vocab, words=["Casey", "went", "to", "New", "York", "."]) @pytest.fixture def entity_annots(doc): casey = doc[0:1] ny = doc[3:5] - return [(casey.start_char, casey.end_char, 'PERSON'), - (ny.start_char, ny.end_char, 'GPE')] + return [ + (casey.start_char, casey.end_char, "PERSON"), + (ny.start_char, ny.end_char, "GPE"), + ] @pytest.fixture @@ -43,32 +45,33 @@ def test_get_oracle_moves(tsys, doc, entity_annots): tsys.preprocess_gold(gold) act_classes = tsys.get_oracle_sequence(doc, gold) names = [tsys.get_class_name(act) for act in act_classes] - assert names == ['U-PERSON', 'O', 'O', 'B-GPE', 'L-GPE', 'O'] + assert names == ["U-PERSON", "O", "O", "B-GPE", "L-GPE", "O"] + def test_get_oracle_moves_negative_entities(tsys, doc, entity_annots): - entity_annots = [(s, e, '!' + label) for s, e, label in entity_annots] + entity_annots = [(s, e, "!" + label) for s, e, label in entity_annots] gold = GoldParse(doc, entities=entity_annots) for i, tag in enumerate(gold.ner): - if tag == 'L-!GPE': - gold.ner[i] = '-' + if tag == "L-!GPE": + gold.ner[i] = "-" tsys.preprocess_gold(gold) act_classes = tsys.get_oracle_sequence(doc, gold) names = [tsys.get_class_name(act) for act in act_classes] def test_get_oracle_moves_negative_entities2(tsys, vocab): - doc = Doc(vocab, words=['A', 'B', 'C', 'D']) + doc = Doc(vocab, words=["A", "B", "C", "D"]) gold = GoldParse(doc, entities=[]) - gold.ner = ['B-!PERSON', 'L-!PERSON', 'B-!PERSON', 'L-!PERSON'] + gold.ner = ["B-!PERSON", "L-!PERSON", "B-!PERSON", "L-!PERSON"] tsys.preprocess_gold(gold) act_classes = tsys.get_oracle_sequence(doc, gold) names = [tsys.get_class_name(act) for act in act_classes] def test_get_oracle_moves_negative_O(tsys, vocab): - doc = Doc(vocab, words=['A', 'B', 'C', 'D']) + doc = Doc(vocab, words=["A", "B", "C", "D"]) gold = GoldParse(doc, entities=[]) - gold.ner = ['O', '!O', 'O', '!O'] + gold.ner = ["O", "!O", "O", "!O"] tsys.preprocess_gold(gold) act_classes = tsys.get_oracle_sequence(doc, gold) names = [tsys.get_class_name(act) for act in act_classes] @@ -80,8 +83,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab): ner.begin_training([]) ner(doc) assert len(list(doc.ents)) == 0 - assert [w.ent_iob_ for w in doc] == (['O'] * len(doc)) - doc.ents = [(doc.vocab.strings['ANIMAL'], 3, 4)] - assert [w.ent_iob_ for w in doc] == ['', '', '', 'B'] - doc.ents = [(doc.vocab.strings['WORD'], 0, 2)] - assert [w.ent_iob_ for w in doc] == ['B', 'I', '', ''] + assert [w.ent_iob_ for w in doc] == (["O"] * len(doc)) + doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)] + assert [w.ent_iob_ for w in doc] == ["", "", "", "B"] + doc.ents = [(doc.vocab.strings["WORD"], 0, 2)] + assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""] diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 4ab49cb4e..062c76ae3 
100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -17,7 +17,7 @@ def vocab(): @pytest.fixture def arc_eager(vocab): - actions = ArcEager.get_actions(left_labels=['L'], right_labels=['R']) + actions = ArcEager.get_actions(left_labels=["L"], right_labels=["R"]) return ArcEager(vocab.strings, actions) @@ -30,6 +30,7 @@ def tok2vec(): def parser(vocab, arc_eager): return Parser(vocab, moves=arc_eager, model=None) + @pytest.fixture def model(arc_eager, tok2vec): return Parser.Model(arc_eager.n_moves, token_vector_width=tok2vec.nO)[0] @@ -37,12 +38,12 @@ def model(arc_eager, tok2vec): @pytest.fixture def doc(vocab): - return Doc(vocab, words=['a', 'b', 'c']) + return Doc(vocab, words=["a", "b", "c"]) @pytest.fixture def gold(doc): - return GoldParse(doc, heads=[1, 1, 1], deps=['L', 'ROOT', 'R']) + return GoldParse(doc, heads=[1, 1, 1], deps=["L", "ROOT", "R"]) def test_can_init_nn_parser(parser): @@ -62,8 +63,10 @@ def test_predict_doc(parser, tok2vec, model, doc): def test_update_doc(parser, model, doc, gold): parser.model = model + def optimize(weights, gradient, key=None): weights -= 0.001 * gradient + parser.update([doc], [gold], sgd=optimize) @@ -76,6 +79,8 @@ def test_predict_doc_beam(parser, model, doc): @pytest.mark.xfail def test_update_doc_beam(parser, model, doc, gold): parser.model = model + def optimize(weights, gradient, key=None): weights -= 0.001 * gradient + parser.update_beam([doc], [gold], sgd=optimize) diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index a6943d49e..9dca99255 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -21,20 +21,22 @@ def vocab(): @pytest.fixture def moves(vocab): aeager = ArcEager(vocab.strings, {}) - aeager.add_action(2, 'nsubj') - aeager.add_action(3, 'dobj') - aeager.add_action(2, 'aux') + aeager.add_action(2, "nsubj") + aeager.add_action(3, "dobj") + aeager.add_action(2, "aux") return aeager @pytest.fixture def docs(vocab): - return [Doc(vocab, words=['Rats', 'bite', 'things'])] + return [Doc(vocab, words=["Rats", "bite", "things"])] + @pytest.fixture def states(docs): return [StateClass(doc) for doc in docs] + @pytest.fixture def tokvecs(docs, vector_size): output = [] @@ -73,9 +75,10 @@ def beam(moves, states, golds, beam_width): def scores(moves, batch_size, beam_width): return [ numpy.asarray( - numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), - dtype='f') - for _ in range(batch_size)] + numpy.random.uniform(-0.1, 0.1, (batch_size, moves.n_moves)), dtype="f" + ) + for _ in range(batch_size) + ] def test_create_beam(beam): @@ -93,8 +96,8 @@ def test_beam_advance_too_few_scores(beam, scores): def test_beam_parse(): nlp = Language() - nlp.add_pipe(DependencyParser(nlp.vocab), name='parser') - nlp.parser.add_label('nsubj') + nlp.add_pipe(DependencyParser(nlp.vocab), name="parser") + nlp.parser.add_label("nsubj") nlp.parser.begin_training([], token_vector_width=8, hidden_width=8) - doc = nlp.make_doc('Australia is a country') + doc = nlp.make_doc("Australia is a country") nlp.parser(doc, beam_width=2) diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index fad04f340..442d01010 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -40,106 +40,116 @@ def multirooted_tree(): def test_parser_ancestors(tree, cyclic_tree, partial_tree, multirooted_tree): - assert([a for a in ancestors(3, tree)] == [4, 5, 2]) - assert([a for a in 
ancestors(3, cyclic_tree)] == [4, 5, 3, 4, 5, 3, 4]) - assert([a for a in ancestors(3, partial_tree)] == [4, 5, None]) - assert([a for a in ancestors(17, multirooted_tree)] == []) + assert [a for a in ancestors(3, tree)] == [4, 5, 2] + assert [a for a in ancestors(3, cyclic_tree)] == [4, 5, 3, 4, 5, 3, 4] + assert [a for a in ancestors(3, partial_tree)] == [4, 5, None] + assert [a for a in ancestors(17, multirooted_tree)] == [] def test_parser_contains_cycle(tree, cyclic_tree, partial_tree, multirooted_tree): - assert(contains_cycle(tree) == None) - assert(contains_cycle(cyclic_tree) == set([3, 4, 5])) - assert(contains_cycle(partial_tree) == None) - assert(contains_cycle(multirooted_tree) == None) + assert contains_cycle(tree) == None + assert contains_cycle(cyclic_tree) == set([3, 4, 5]) + assert contains_cycle(partial_tree) == None + assert contains_cycle(multirooted_tree) == None def test_parser_is_nonproj_arc(nonproj_tree, partial_tree, multirooted_tree): - assert(is_nonproj_arc(0, nonproj_tree) == False) - assert(is_nonproj_arc(1, nonproj_tree) == False) - assert(is_nonproj_arc(2, nonproj_tree) == False) - assert(is_nonproj_arc(3, nonproj_tree) == False) - assert(is_nonproj_arc(4, nonproj_tree) == False) - assert(is_nonproj_arc(5, nonproj_tree) == False) - assert(is_nonproj_arc(6, nonproj_tree) == False) - assert(is_nonproj_arc(7, nonproj_tree) == True) - assert(is_nonproj_arc(8, nonproj_tree) == False) - assert(is_nonproj_arc(7, partial_tree) == False) - assert(is_nonproj_arc(17, multirooted_tree) == False) - assert(is_nonproj_arc(16, multirooted_tree) == True) + assert is_nonproj_arc(0, nonproj_tree) == False + assert is_nonproj_arc(1, nonproj_tree) == False + assert is_nonproj_arc(2, nonproj_tree) == False + assert is_nonproj_arc(3, nonproj_tree) == False + assert is_nonproj_arc(4, nonproj_tree) == False + assert is_nonproj_arc(5, nonproj_tree) == False + assert is_nonproj_arc(6, nonproj_tree) == False + assert is_nonproj_arc(7, nonproj_tree) == True + assert is_nonproj_arc(8, nonproj_tree) == False + assert is_nonproj_arc(7, partial_tree) == False + assert is_nonproj_arc(17, multirooted_tree) == False + assert is_nonproj_arc(16, multirooted_tree) == True -def test_parser_is_nonproj_tree(proj_tree, nonproj_tree, partial_tree, multirooted_tree): - assert(is_nonproj_tree(proj_tree) == False) - assert(is_nonproj_tree(nonproj_tree) == True) - assert(is_nonproj_tree(partial_tree) == False) - assert(is_nonproj_tree(multirooted_tree) == True) +def test_parser_is_nonproj_tree( + proj_tree, nonproj_tree, partial_tree, multirooted_tree +): + assert is_nonproj_tree(proj_tree) == False + assert is_nonproj_tree(nonproj_tree) == True + assert is_nonproj_tree(partial_tree) == False + assert is_nonproj_tree(multirooted_tree) == True def test_parser_pseudoprojectivity(en_tokenizer): def deprojectivize(proj_heads, deco_labels): - tokens = en_tokenizer('whatever ' * len(proj_heads)) - rel_proj_heads = [head-i for i, head in enumerate(proj_heads)] - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], - deps=deco_labels, heads=rel_proj_heads) + tokens = en_tokenizer("whatever " * len(proj_heads)) + rel_proj_heads = [head - i for i, head in enumerate(proj_heads)] + doc = get_doc( + tokens.vocab, + words=[t.text for t in tokens], + deps=deco_labels, + heads=rel_proj_heads, + ) nonproj.deprojectivize(doc) return [t.head.i for t in doc], [token.dep_ for token in doc] + # fmt: off tree = [1, 2, 2] nonproj_tree = [1, 2, 2, 4, 5, 2, 7, 4, 2] nonproj_tree2 = [9, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 
10, 1] - labels = ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', 'nsubj', 'acl', 'punct'] - labels2 = ['advmod', 'root', 'det', 'nsubj', 'advmod', 'det', 'dobj', 'det', 'nmod', 'aux', 'nmod', 'advmod', 'det', 'amod', 'punct'] + labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", "acl", "punct"] + labels2 = ["advmod", "root", "det", "nsubj", "advmod", "det", "dobj", "det", "nmod", "aux", "nmod", "advmod", "det", "amod", "punct"] + # fmt: on - assert(nonproj.decompose('X||Y') == ('X','Y')) - assert(nonproj.decompose('X') == ('X','')) - assert(nonproj.is_decorated('X||Y') == True) - assert(nonproj.is_decorated('X') == False) + assert nonproj.decompose("X||Y") == ("X", "Y") + assert nonproj.decompose("X") == ("X", "") + assert nonproj.is_decorated("X||Y") == True + assert nonproj.is_decorated("X") == False nonproj._lift(0, tree) - assert(tree == [2, 2, 2]) + assert tree == [2, 2, 2] - assert(nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7) - assert(nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10) + assert nonproj._get_smallest_nonproj_arc(nonproj_tree) == 7 + assert nonproj._get_smallest_nonproj_arc(nonproj_tree2) == 10 + # fmt: off proj_heads, deco_labels = nonproj.projectivize(nonproj_tree, labels) - assert(proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2]) - assert(deco_labels == ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', - 'nsubj', 'acl||dobj', 'punct']) + assert proj_heads == [1, 2, 2, 4, 5, 2, 7, 5, 2] + assert deco_labels == ["det", "nsubj", "root", "det", "dobj", "aux", + "nsubj", "acl||dobj", "punct"] deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels) - assert(deproj_heads == nonproj_tree) - assert(undeco_labels == labels) + assert deproj_heads == nonproj_tree + assert undeco_labels == labels proj_heads, deco_labels = nonproj.projectivize(nonproj_tree2, labels2) - assert(proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1]) - assert(deco_labels == ['advmod||aux', 'root', 'det', 'nsubj', 'advmod', - 'det', 'dobj', 'det', 'nmod', 'aux', 'nmod||dobj', - 'advmod', 'det', 'amod', 'punct']) + assert proj_heads == [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1] + assert deco_labels == ["advmod||aux", "root", "det", "nsubj", "advmod", + "det", "dobj", "det", "nmod", "aux", "nmod||dobj", + "advmod", "det", "amod", "punct"] deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels) - assert(deproj_heads == nonproj_tree2) - assert(undeco_labels == labels2) + assert deproj_heads == nonproj_tree2 + assert undeco_labels == labels2 # if decoration is wrong such that there is no head with the desired label # the structure is kept and the label is undecorated proj_heads = [1, 2, 2, 4, 5, 2, 7, 5, 2] - deco_labels = ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', 'nsubj', - 'acl||iobj', 'punct'] + deco_labels = ["det", "nsubj", "root", "det", "dobj", "aux", "nsubj", + "acl||iobj", "punct"] deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels) - assert(deproj_heads == proj_heads) - assert(undeco_labels == ['det', 'nsubj', 'root', 'det', 'dobj', 'aux', - 'nsubj', 'acl', 'punct']) + assert deproj_heads == proj_heads + assert undeco_labels == ["det", "nsubj", "root", "det", "dobj", "aux", + "nsubj", "acl", "punct"] # if there are two potential new heads, the first one is chosen even if - # it's wrong + # it"s wrong proj_heads = [1, 1, 3, 1, 5, 6, 9, 8, 6, 1, 9, 12, 13, 10, 1] - deco_labels = ['advmod||aux', 'root', 'det', 'aux', 'advmod', 'det', - 'dobj', 'det', 'nmod', 'aux', 'nmod||dobj', 'advmod', - 'det', 'amod', 'punct'] + 
deco_labels = ["advmod||aux", "root", "det", "aux", "advmod", "det", + "dobj", "det", "nmod", "aux", "nmod||dobj", "advmod", + "det", "amod", "punct"] deproj_heads, undeco_labels = deprojectivize(proj_heads, deco_labels) - assert(deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1]) - assert(undeco_labels == ['advmod', 'root', 'det', 'aux', 'advmod', 'det', - 'dobj', 'det', 'nmod', 'aux', 'nmod', 'advmod', - 'det', 'amod', 'punct']) + assert deproj_heads == [3, 1, 3, 1, 5, 6, 9, 8, 6, 1, 6, 12, 13, 10, 1] + assert undeco_labels == ["advmod", "root", "det", "aux", "advmod", "det", + "dobj", "det", "nmod", "aux", "nmod", "advmod", + "det", "amod", "punct"] + # fmt: on diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index 9706d9b9b..2f7d8484e 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -9,7 +9,7 @@ from ..util import get_doc, apply_transition_sequence def test_parser_root(en_tokenizer): text = "i don't have other assistance" heads = [3, 2, 1, 0, 1, -2] - deps = ['nsubj', 'aux', 'neg', 'ROOT', 'amod', 'dobj'] + deps = ["nsubj", "aux", "neg", "ROOT", "amod", "dobj"] tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) for t in doc: @@ -17,10 +17,12 @@ def test_parser_root(en_tokenizer): @pytest.mark.xfail -@pytest.mark.parametrize('text', ["Hello"]) +@pytest.mark.parametrize("text", ["Hello"]) def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=['ROOT']) + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"] + ) assert len(doc) == 1 with en_parser.step_through(doc) as _: @@ -32,7 +34,7 @@ def test_parser_parse_one_word_sentence(en_tokenizer, en_parser, text): def test_parser_initial(en_tokenizer, en_parser): text = "I ate the pizza with anchovies." 
heads = [1, 0, 1, -2, -3, -1, -5] - transition = ['L-nsubj', 'S', 'L-det'] + transition = ["L-nsubj", "S", "L-det"] tokens = en_tokenizer(text) apply_transition_sequence(en_parser, tokens, transition) assert tokens[0].head.i == 1 @@ -58,17 +60,19 @@ def test_parser_parse_subtrees(en_tokenizer, en_parser): def test_parser_merge_pp(en_tokenizer): text = "A phrase with another phrase occurs" heads = [1, 4, -1, 1, -2, 0] - deps = ['det', 'nsubj', 'prep', 'det', 'pobj', 'ROOT'] - tags = ['DT', 'NN', 'IN', 'DT', 'NN', 'VBZ'] + deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT"] + tags = ["DT", "NN", "IN", "DT", "NN", "VBZ"] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags) + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, tags=tags + ) nps = [(np[0].idx, np[-1].idx + len(np[-1]), np.lemma_) for np in doc.noun_chunks] for start, end, lemma in nps: - doc.merge(start, end, label='NP', lemma=lemma) - assert doc[0].text == 'A phrase' - assert doc[1].text == 'with' - assert doc[2].text == 'another phrase' - assert doc[3].text == 'occurs' + doc.merge(start, end, label="NP", lemma=lemma) + assert doc[0].text == "A phrase" + assert doc[1].text == "with" + assert doc[2].text == "another phrase" + assert doc[3].text == "occurs" @pytest.mark.xfail @@ -76,7 +80,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): text = "a b c d e" # right branching - transition = ['R-nsubj', 'D', 'R-nsubj', 'R-nsubj', 'D', 'R-ROOT'] + transition = ["R-nsubj", "D", "R-nsubj", "R-nsubj", "D", "R-ROOT"] tokens = en_tokenizer(text) apply_transition_sequence(en_parser, tokens, transition) @@ -111,7 +115,7 @@ def test_parser_arc_eager_finalize_state(en_tokenizer, en_parser): assert tokens[4].head.i == 2 # left branching - transition = ['S', 'S', 'S', 'L-nsubj','L-nsubj','L-nsubj', 'L-nsubj'] + transition = ["S", "S", "S", "L-nsubj", "L-nsubj", "L-nsubj", "L-nsubj"] tokens = en_tokenizer(text) apply_transition_sequence(en_parser, tokens, transition) diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index 8cfe3c280..eb206458e 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -33,6 +33,7 @@ BIG BROTHER IS WATCHING YOU, the caption beneath it ran. 
@pytest.fixture def heads(): + # fmt: off return [1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, -10, 2, 1, -3, -1, -15, -1, 1, 4, -1, 1, -3, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1, -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, 3, 1, 1, -14, @@ -50,6 +51,7 @@ def heads(): 0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1, 1, -4, -1, -2, 2, 1, -5, -19, -1, 1, 1, 0, 1, 6, -1, 1, -3, -1, -1, -8, -9, -1] + # fmt: on def test_parser_parse_navigate_consistency(en_tokenizer, text, heads): @@ -100,7 +102,14 @@ def test_parser_parse_navigate_edges(en_tokenizer, text, heads): doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads) for token in doc: subtree = list(token.subtree) - debug = '\t'.join((token.text, token.left_edge.text, subtree[0].text)) + debug = "\t".join((token.text, token.left_edge.text, subtree[0].text)) assert token.left_edge == subtree[0], debug - debug = '\t'.join((token.text, token.right_edge.text, subtree[-1].text, token.right_edge.head.text)) + debug = "\t".join( + ( + token.text, + token.right_edge.text, + subtree[-1].text, + token.right_edge.head.text, + ) + ) assert token.right_edge == subtree[-1], debug diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index c54e01a6d..70beb2f60 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -19,34 +19,33 @@ def vocab(): @pytest.fixture def parser(vocab): parser = DependencyParser(vocab) - parser.cfg['token_vector_width'] = 4 - parser.cfg['hidden_width'] = 32 - #parser.add_label('right') - parser.add_label('left') + parser.cfg["token_vector_width"] = 4 + parser.cfg["hidden_width"] = 32 + # parser.add_label('right') + parser.add_label("left") parser.begin_training([], **parser.cfg) sgd = Adam(NumpyOps(), 0.001) for i in range(10): losses = {} - doc = Doc(vocab, words=['a', 'b', 'c', 'd']) - gold = GoldParse(doc, heads=[1, 1, 3, 3], - deps=['left', 'ROOT', 'left', 'ROOT']) + doc = Doc(vocab, words=["a", "b", "c", "d"]) + gold = GoldParse(doc, heads=[1, 1, 3, 3], deps=["left", "ROOT", "left", "ROOT"]) parser.update([doc], [gold], sgd=sgd, losses=losses) return parser def test_no_sentences(parser): - doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = parser(doc) assert len(list(doc.sents)) >= 1 def test_sents_1(parser): - doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[2].sent_start = True doc = parser(doc) assert len(list(doc.sents)) >= 2 - doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[1].sent_start = False doc[2].sent_start = True doc[3].sent_start = False @@ -55,7 +54,7 @@ def test_sents_1(parser): def test_sents_1_2(parser): - doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[1].sent_start = True doc[2].sent_start = True doc = parser(doc) @@ -63,12 +62,12 @@ def test_sents_1_2(parser): def test_sents_1_3(parser): - doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[1].sent_start = True doc[3].sent_start = True doc = parser(doc) assert len(list(doc.sents)) >= 3 - doc = Doc(parser.vocab, words=['a', 'b', 'c', 'd']) + doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc[1].sent_start = True doc[2].sent_start = False doc[3].sent_start = True diff --git a/spacy/tests/parser/test_space_attachment.py 
b/spacy/tests/parser/test_space_attachment.py index 216915882..3c5279bec 100644 --- a/spacy/tests/parser/test_space_attachment.py +++ b/spacy/tests/parser/test_space_attachment.py @@ -19,11 +19,13 @@ def test_parser_space_attachment(en_tokenizer): def test_parser_sentence_space(en_tokenizer): + # fmt: off text = "I look forward to using Thingamajig. I've been told it will make my life easier..." heads = [1, 0, -1, -2, -1, -1, -5, -1, 3, 2, 1, 0, 2, 1, -3, 1, 1, -3, -7] - deps = ['nsubj', 'ROOT', 'advmod', 'prep', 'pcomp', 'dobj', 'punct', '', - 'nsubjpass', 'aux', 'auxpass', 'ROOT', 'nsubj', 'aux', 'ccomp', - 'poss', 'nsubj', 'ccomp', 'punct'] + deps = ["nsubj", "ROOT", "advmod", "prep", "pcomp", "dobj", "punct", "", + "nsubjpass", "aux", "auxpass", "ROOT", "nsubj", "aux", "ccomp", + "poss", "nsubj", "ccomp", "punct"] + # fmt: on tokens = en_tokenizer(text) doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps) assert len(list(doc.sents)) == 2 @@ -34,10 +36,10 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser): text = "\t \n This is a sentence ." heads = [1, 1, 0, 1, -2, -3] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=text.split(' '), heads=heads) + doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads) assert doc[0].is_space assert doc[1].is_space - assert doc[2].text == 'This' + assert doc[2].text == "This" with en_parser.step_through(doc) as stepwise: pass assert doc[0].head.i == 2 @@ -49,9 +51,9 @@ def test_parser_space_attachment_leading(en_tokenizer, en_parser): def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser): text = "This is \t a \t\n \n sentence . \n\n \n" heads = [1, 0, -1, 2, -1, -4, -5, -1] - transition = ['L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct'] + transition = ["L-nsubj", "S", "L-det", "R-attr", "D", "R-punct"] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=text.split(' '), heads=heads) + doc = get_doc(tokens.vocab, words=text.split(" "), heads=heads) assert doc[2].is_space assert doc[4].is_space assert doc[5].is_space @@ -64,8 +66,7 @@ def test_parser_space_attachment_intermediate_trailing(en_tokenizer, en_parser): assert [token.head.i for token in doc] == [1, 1, 1, 6, 3, 3, 1, 1, 7, 7] -@pytest.mark.parametrize('text,length', [(['\n'], 1), - (['\n', '\t', '\n\n', '\t'], 4)]) +@pytest.mark.parametrize("text,length", [(["\n"], 1), (["\n", "\t", "\n\n", "\t"], 4)]) @pytest.mark.xfail def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length): doc = Doc(en_parser.vocab, words=text) @@ -74,4 +75,4 @@ def test_parser_space_attachment_space(en_tokenizer, en_parser, text, length): pass assert doc[0].is_space for token in doc: - assert token.head.i == length-1 + assert token.head.i == length - 1 diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 67fb4e003..8757c9af5 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -18,14 +18,16 @@ def patterns(): {"label": "HELLO", "pattern": "hello world"}, {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, - {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]} + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, ] + @pytest.fixture def add_ent(): def add_ent_component(doc): - doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings['ORG'])] + doc.ents = [Span(doc, 0, 3, label=doc.vocab.strings["ORG"])] 
return doc + return add_ent_component @@ -33,13 +35,13 @@ def test_entity_ruler_init(nlp, patterns): ruler = EntityRuler(nlp, patterns=patterns) assert len(ruler) == len(patterns) assert len(ruler.labels) == 3 - assert 'HELLO' in ruler - assert 'BYE' in ruler + assert "HELLO" in ruler + assert "BYE" in ruler nlp.add_pipe(ruler) doc = nlp("hello world bye bye") assert len(doc.ents) == 2 - assert doc.ents[0].label_ == 'HELLO' - assert doc.ents[1].label_ == 'BYE' + assert doc.ents[0].label_ == "HELLO" + assert doc.ents[1].label_ == "BYE" def test_entity_ruler_existing(nlp, patterns, add_ent): @@ -48,8 +50,8 @@ def test_entity_ruler_existing(nlp, patterns, add_ent): nlp.add_pipe(ruler) doc = nlp("OH HELLO WORLD bye bye") assert len(doc.ents) == 2 - assert doc.ents[0].label_ == 'ORG' - assert doc.ents[1].label_ == 'BYE' + assert doc.ents[0].label_ == "ORG" + assert doc.ents[1].label_ == "BYE" def test_entity_ruler_existing_overwrite(nlp, patterns, add_ent): @@ -58,9 +60,9 @@ def test_entity_ruler_existing_overwrite(nlp, patterns, add_ent): nlp.add_pipe(ruler) doc = nlp("OH HELLO WORLD bye bye") assert len(doc.ents) == 2 - assert doc.ents[0].label_ == 'HELLO' - assert doc.ents[0].text == 'HELLO' - assert doc.ents[1].label_ == 'BYE' + assert doc.ents[0].label_ == "HELLO" + assert doc.ents[0].text == "HELLO" + assert doc.ents[1].label_ == "BYE" def test_entity_ruler_existing_complex(nlp, patterns, add_ent): @@ -69,8 +71,8 @@ def test_entity_ruler_existing_complex(nlp, patterns, add_ent): nlp.add_pipe(ruler) doc = nlp("foo foo bye bye") assert len(doc.ents) == 2 - assert doc.ents[0].label_ == 'COMPLEX' - assert doc.ents[1].label_ == 'BYE' + assert doc.ents[0].label_ == "COMPLEX" + assert doc.ents[1].label_ == "BYE" assert len(doc.ents[0]) == 2 assert len(doc.ents[1]) == 2 diff --git a/spacy/tests/pipeline/test_factories.py b/spacy/tests/pipeline/test_factories.py index d6ed68c4a..5efcc319a 100644 --- a/spacy/tests/pipeline/test_factories.py +++ b/spacy/tests/pipeline/test_factories.py @@ -10,15 +10,21 @@ from ..util import get_doc @pytest.fixture def doc(en_tokenizer): - text = 'I like New York in Autumn.' + text = "I like New York in Autumn." 
heads = [1, 0, 1, -2, -3, -1, -5] - tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'] - pos = ['PRON', 'VERB', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT'] - deps = ['ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct'] + tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."] + pos = ["PRON", "VERB", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"] + deps = ["ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"] tokens = en_tokenizer(text) - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, - tags=tags, pos=pos, deps=deps) - doc.ents = [Span(doc, 2, 4, doc.vocab.strings['GPE'])] + doc = get_doc( + tokens.vocab, + words=[t.text for t in tokens], + heads=heads, + tags=tags, + pos=pos, + deps=deps, + ) + doc.ents = [Span(doc, 2, 4, doc.vocab.strings["GPE"])] doc.is_parsed = True doc.is_tagged = True return doc @@ -27,18 +33,18 @@ def doc(en_tokenizer): def test_factories_merge_noun_chunks(doc): assert len(doc) == 7 nlp = Language() - merge_noun_chunks = nlp.create_pipe('merge_noun_chunks') + merge_noun_chunks = nlp.create_pipe("merge_noun_chunks") merge_noun_chunks(doc) assert len(doc) == 6 - assert doc[2].text == 'New York' + assert doc[2].text == "New York" def test_factories_merge_ents(doc): assert len(doc) == 7 assert len(list(doc.ents)) == 1 nlp = Language() - merge_entities = nlp.create_pipe('merge_entities') + merge_entities = nlp.create_pipe("merge_entities") merge_entities(doc) assert len(doc) == 6 assert len(list(doc.ents)) == 1 - assert doc[2].text == 'New York' + assert doc[2].text == "New York" diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 225c9acf8..bd779d5c2 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -16,22 +16,22 @@ def new_pipe(doc): def test_add_pipe_no_name(nlp): nlp.add_pipe(new_pipe) - assert 'new_pipe' in nlp.pipe_names + assert "new_pipe" in nlp.pipe_names def test_add_pipe_duplicate_name(nlp): - nlp.add_pipe(new_pipe, name='duplicate_name') + nlp.add_pipe(new_pipe, name="duplicate_name") with pytest.raises(ValueError): - nlp.add_pipe(new_pipe, name='duplicate_name') + nlp.add_pipe(new_pipe, name="duplicate_name") -@pytest.mark.parametrize('name', ['parser']) +@pytest.mark.parametrize("name", ["parser"]) def test_add_pipe_first(nlp, name): nlp.add_pipe(new_pipe, name=name, first=True) assert nlp.pipeline[0][0] == name -@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')]) +@pytest.mark.parametrize("name1,name2", [("parser", "lambda_pipe")]) def test_add_pipe_last(nlp, name1, name2): nlp.add_pipe(lambda doc: doc, name=name2) nlp.add_pipe(new_pipe, name=name1, last=True) @@ -44,7 +44,7 @@ def test_cant_add_pipe_first_and_last(nlp): nlp.add_pipe(new_pipe, first=True, last=True) -@pytest.mark.parametrize('name', ['my_component']) +@pytest.mark.parametrize("name", ["my_component"]) def test_get_pipe(nlp, name): with pytest.raises(KeyError): nlp.get_pipe(name) @@ -52,7 +52,7 @@ def test_get_pipe(nlp, name): assert nlp.get_pipe(name) == new_pipe -@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)]) +@pytest.mark.parametrize("name,replacement", [("my_component", lambda doc: doc)]) def test_replace_pipe(nlp, name, replacement): with pytest.raises(ValueError): nlp.replace_pipe(name, new_pipe) @@ -62,7 +62,7 @@ def test_replace_pipe(nlp, name, replacement): assert nlp.get_pipe(name) == replacement -@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')]) 
+@pytest.mark.parametrize("old_name,new_name", [("old_pipe", "new_pipe")]) def test_rename_pipe(nlp, old_name, new_name): with pytest.raises(ValueError): nlp.rename_pipe(old_name, new_name) @@ -71,7 +71,7 @@ def test_rename_pipe(nlp, old_name, new_name): assert nlp.pipeline[0][0] == new_name -@pytest.mark.parametrize('name', ['my_component']) +@pytest.mark.parametrize("name", ["my_component"]) def test_remove_pipe(nlp, name): with pytest.raises(ValueError): nlp.remove_pipe(name) @@ -83,7 +83,7 @@ def test_remove_pipe(nlp, name): assert removed_component == new_pipe -@pytest.mark.parametrize('name', ['my_component']) +@pytest.mark.parametrize("name", ["my_component"]) def test_disable_pipes_method(nlp, name): nlp.add_pipe(new_pipe, name=name) assert nlp.has_pipe(name) @@ -92,7 +92,7 @@ def test_disable_pipes_method(nlp, name): disabled.restore() -@pytest.mark.parametrize('name', ['my_component']) +@pytest.mark.parametrize("name", ["my_component"]) def test_disable_pipes_context(nlp, name): nlp.add_pipe(new_pipe, name=name) assert nlp.has_pipe(name) @@ -101,14 +101,14 @@ def test_disable_pipes_context(nlp, name): assert nlp.has_pipe(name) -@pytest.mark.parametrize('n_pipes', [100]) +@pytest.mark.parametrize("n_pipes", [100]) def test_add_lots_of_pipes(nlp, n_pipes): for i in range(n_pipes): - nlp.add_pipe(lambda doc: doc, name='pipe_%d' % i) + nlp.add_pipe(lambda doc: doc, name="pipe_%d" % i) assert len(nlp.pipe_names) == n_pipes -@pytest.mark.parametrize('component', ['ner', {'hello': 'world'}]) +@pytest.mark.parametrize("component", ["ner", {"hello": "world"}]) def test_raise_for_invalid_components(nlp, component): with pytest.raises(ValueError): nlp.add_pipe(component) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 323395cf4..ef70dc013 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -13,16 +13,21 @@ from spacy.gold import GoldParse @pytest.mark.skip(reason="Test is flakey when run with others") def test_simple_train(): nlp = Language() - nlp.add_pipe(nlp.create_pipe('textcat')) - nlp.get_pipe('textcat').add_label('answer') + nlp.add_pipe(nlp.create_pipe("textcat")) + nlp.get_pipe("textcat").add_label("answer") nlp.begin_training() for i in range(5): - for text, answer in [('aaaa', 1.), ('bbbb', 0), ('aa', 1.), - ('bbbbbbbbb', 0.), ('aaaaaa', 1)]: - nlp.update([text], [{'cats': {'answer': answer}}]) - doc = nlp('aaa') - assert 'answer' in doc.cats - assert doc.cats['answer'] >= 0.5 + for text, answer in [ + ("aaaa", 1.0), + ("bbbb", 0), + ("aa", 1.0), + ("bbbbbbbbb", 0.0), + ("aaaaaa", 1), + ]: + nlp.update([text], [{"cats": {"answer": answer}}]) + doc = nlp("aaa") + assert "answer" in doc.cats + assert doc.cats["answer"] >= 0.5 @pytest.mark.skip(reason="Test is flakey when run with others") @@ -31,11 +36,11 @@ def test_textcat_learns_multilabel(): numpy.random.seed(5) docs = [] nlp = Language() - letters = ['a', 'b', 'c'] + letters = ["a", "b", "c"] for w1 in letters: for w2 in letters: - cats = {letter: float(w2==letter) for letter in letters} - docs.append((Doc(nlp.vocab, words=['d']*3 + [w1, w2] + ['d']*3), cats)) + cats = {letter: float(w2 == letter) for letter in letters} + docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats)) random.shuffle(docs) model = TextCategorizer(nlp.vocab, width=8) for letter in letters: @@ -49,8 +54,8 @@ def test_textcat_learns_multilabel(): random.shuffle(docs) for w1 in letters: for w2 in letters: - doc = Doc(nlp.vocab, words=['d']*3 + 
[w1, w2] + ['d']*3) - truth = {letter: w2==letter for letter in letters} + doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3) + truth = {letter: w2 == letter for letter in letters} model(doc) for cat, score in doc.cats.items(): if not truth[cat]: diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index d3fe240d8..5819f490d 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -14,14 +14,20 @@ from spacy.tokens import Doc from ..util import get_doc, make_tempdir -@pytest.mark.parametrize('patterns', [ - [[{'LOWER': 'celtics'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]], - [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'celtics'}]]]) +@pytest.mark.parametrize( + "patterns", + [ + [[{"LOWER": "celtics"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], + [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "celtics"}]], + ], +) def test_issue118(en_tokenizer, patterns): """Test a bug that arose from having overlapping matches""" - text = "how many points did lebron james score against the boston celtics last night" + text = ( + "how many points did lebron james score against the boston celtics last night" + ) doc = en_tokenizer(text) - ORG = doc.vocab.strings['ORG'] + ORG = doc.vocab.strings["ORG"] matcher = Matcher(doc.vocab) matcher.add("BostonCeltics", None, *patterns) assert len(list(doc.ents)) == 0 @@ -35,16 +41,22 @@ def test_issue118(en_tokenizer, patterns): assert ents[0].end == 11 -@pytest.mark.parametrize('patterns', [ - [[{'LOWER': 'boston'}], [{'LOWER': 'boston'}, {'LOWER': 'celtics'}]], - [[{'LOWER': 'boston'}, {'LOWER': 'celtics'}], [{'LOWER': 'boston'}]]]) +@pytest.mark.parametrize( + "patterns", + [ + [[{"LOWER": "boston"}], [{"LOWER": "boston"}, {"LOWER": "celtics"}]], + [[{"LOWER": "boston"}, {"LOWER": "celtics"}], [{"LOWER": "boston"}]], + ], +) def test_issue118_prefix_reorder(en_tokenizer, patterns): """Test a bug that arose from having overlapping matches""" - text = "how many points did lebron james score against the boston celtics last night" + text = ( + "how many points did lebron james score against the boston celtics last night" + ) doc = en_tokenizer(text) - ORG = doc.vocab.strings['ORG'] + ORG = doc.vocab.strings["ORG"] matcher = Matcher(doc.vocab) - matcher.add('BostonCeltics', None, *patterns) + matcher.add("BostonCeltics", None, *patterns) assert len(list(doc.ents)) == 0 matches = [(ORG, start, end) for _, start, end in matcher(doc)] doc.ents += tuple(matches)[1:] @@ -59,11 +71,13 @@ def test_issue118_prefix_reorder(en_tokenizer, patterns): def test_issue242(en_tokenizer): """Test overlapping multi-word phrases.""" text = "There are different food safety standards in different countries." 
- patterns = [[{'LOWER': 'food'}, {'LOWER': 'safety'}], - [{'LOWER': 'safety'}, {'LOWER': 'standards'}]] + patterns = [ + [{"LOWER": "food"}, {"LOWER": "safety"}], + [{"LOWER": "safety"}, {"LOWER": "standards"}], + ] doc = en_tokenizer(text) matcher = Matcher(doc.vocab) - matcher.add('FOOD', None, *patterns) + matcher.add("FOOD", None, *patterns) matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)] doc.ents += tuple(matches) @@ -77,7 +91,9 @@ def test_issue242(en_tokenizer): def test_issue309(en_tokenizer): """Test Issue #309: SBD fails on empty string""" tokens = en_tokenizer(" ") - doc = get_doc(tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=['ROOT']) + doc = get_doc( + tokens.vocab, words=[t.text for t in tokens], heads=[0], deps=["ROOT"] + ) doc.is_parsed = True assert len(doc) == 1 sents = list(doc.sents) @@ -93,11 +109,11 @@ def test_issue351(en_tokenizer): def test_issue360(en_tokenizer): """Test tokenization of big ellipsis""" - tokens = en_tokenizer('$45...............Asking') + tokens = en_tokenizer("$45...............Asking") assert len(tokens) > 2 -@pytest.mark.parametrize('text1,text2', [("cat", "dog")]) +@pytest.mark.parametrize("text1,text2", [("cat", "dog")]) def test_issue361(en_vocab, text1, text2): """Test Issue #361: Equality of lexemes""" assert en_vocab[text1] == en_vocab[text1] @@ -106,15 +122,19 @@ def test_issue361(en_vocab, text1, text2): def test_issue587(en_tokenizer): """Test that Matcher doesn't segfault on particular input""" - doc = en_tokenizer('a b; c') + doc = en_tokenizer("a b; c") matcher = Matcher(doc.vocab) - matcher.add('TEST1', None, [{ORTH: 'a'}, {ORTH: 'b'}]) + matcher.add("TEST1", None, [{ORTH: "a"}, {ORTH: "b"}]) matches = matcher(doc) assert len(matches) == 1 - matcher.add('TEST2', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}]) + matcher.add( + "TEST2", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}] + ) matches = matcher(doc) assert len(matches) == 2 - matcher.add('TEST3', None, [{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'd'}]) + matcher.add( + "TEST3", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}] + ) matches = matcher(doc) assert len(matches) == 2 @@ -122,22 +142,26 @@ def test_issue587(en_tokenizer): def test_issue588(en_vocab): matcher = Matcher(en_vocab) with pytest.raises(ValueError): - matcher.add('TEST', None, []) + matcher.add("TEST", None, []) @pytest.mark.xfail def test_issue589(): vocab = Vocab() vocab.strings.set_frozen(True) - doc = Doc(vocab, words=['whata']) + doc = Doc(vocab, words=["whata"]) def test_issue590(en_vocab): """Test overlapping matches""" - doc = Doc(en_vocab, words=['n', '=', '1', ';', 'a', ':', '5', '%']) + doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"]) matcher = Matcher(en_vocab) - matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': ':'}, {'LIKE_NUM': True}, {'ORTH': '%'}]) - matcher.add('ab', None, [{'IS_ALPHA': True}, {'ORTH': '='}, {'LIKE_NUM': True}]) + matcher.add( + "ab", + None, + [{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}], + ) + matcher.add("ab", None, [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]) matches = matcher(doc) assert len(matches) == 2 @@ -145,14 +169,14 @@ def test_issue590(en_vocab): def test_issue595(): """Test lemmatization of base forms""" words = ["Do", "n't", "feed", "the", "dog"] - tag_map = {'VB': {POS: VERB, VerbForm_inf: True}} + tag_map = {"VB": {POS: VERB, VerbForm_inf: True}} rules = {"verb": [["ed", "e"]]} - lemmatizer = 
Lemmatizer({'verb': {}}, {'verb': {}}, rules) + lemmatizer = Lemmatizer({"verb": {}}, {"verb": {}}, rules) vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) doc = Doc(vocab, words=words) - doc[2].tag_ = 'VB' - assert doc[2].text == 'feed' - assert doc[2].lemma_ == 'feed' + doc[2].tag_ = "VB" + assert doc[2].text == "feed" + assert doc[2].lemma_ == "feed" def test_issue599(en_vocab): @@ -165,9 +189,9 @@ def test_issue599(en_vocab): def test_issue600(): - vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}}) + vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) doc = Doc(vocab, words=["hello"]) - doc[0].tag_ = 'NN' + doc[0].tag_ = "NN" def test_issue615(en_tokenizer): @@ -175,16 +199,17 @@ def test_issue615(en_tokenizer): """Merge a phrase. We have to be careful here because we'll change the token indices. To avoid problems, merge all the phrases once we're called on the last match.""" - if i != len(matches)-1: + if i != len(matches) - 1: return None - spans = [(ent_id, ent_id, doc[start : end]) for ent_id, start, end in matches] + spans = [(ent_id, ent_id, doc[start:end]) for ent_id, start, end in matches] for ent_id, label, span in spans: - span.merge(tag='NNP' if label else span.root.tag_, lemma=span.text, - label=label) + span.merge( + tag="NNP" if label else span.root.tag_, lemma=span.text, label=label + ) doc.ents = doc.ents + ((label, span.start, span.end),) text = "The golf club is broken" - pattern = [{'ORTH': "golf"}, {'ORTH': "club"}] + pattern = [{"ORTH": "golf"}, {"ORTH": "club"}] label = "Sport_Equipment" doc = en_tokenizer(text) matcher = Matcher(doc.vocab) @@ -195,7 +220,7 @@ def test_issue615(en_tokenizer): assert entities[0].label != 0 -@pytest.mark.parametrize('text,number', [("7am", "7"), ("11p.m.", "11")]) +@pytest.mark.parametrize("text,number", [("7am", "7"), ("11p.m.", "11")]) def test_issue736(en_tokenizer, text, number): """Test that times like "7am" are tokenized correctly and that numbers are converted to string.""" @@ -204,7 +229,7 @@ def test_issue736(en_tokenizer, text, number): assert tokens[0].text == number -@pytest.mark.parametrize('text', ["3/4/2012", "01/12/1900"]) +@pytest.mark.parametrize("text", ["3/4/2012", "01/12/1900"]) def test_issue740(en_tokenizer, text): """Test that dates are not split and kept as one token. This behaviour is currently inconsistent, since dates separated by hyphens are still split. 
@@ -214,14 +239,14 @@ def test_issue740(en_tokenizer, text): def test_issue743(): - doc = Doc(Vocab(), ['hello', 'world']) + doc = Doc(Vocab(), ["hello", "world"]) token = doc[0] s = set([token]) items = list(s) assert items[0] is token -@pytest.mark.parametrize('text', ["We were scared", "We Were Scared"]) +@pytest.mark.parametrize("text", ["We were scared", "We Were Scared"]) def test_issue744(en_tokenizer, text): """Test that 'were' and 'Were' are excluded from the contractions generated by the English tokenizer exceptions.""" @@ -230,14 +255,15 @@ def test_issue744(en_tokenizer, text): assert tokens[1].text.lower() == "were" -@pytest.mark.parametrize('text,is_num', [("one", True), ("ten", True), - ("teneleven", False)]) +@pytest.mark.parametrize( + "text,is_num", [("one", True), ("ten", True), ("teneleven", False)] +) def test_issue759(en_tokenizer, text, is_num): tokens = en_tokenizer(text) assert tokens[0].like_num == is_num -@pytest.mark.parametrize('text', ["Shell", "shell", "Shed", "shed"]) +@pytest.mark.parametrize("text", ["Shell", "shell", "Shed", "shed"]) def test_issue775(en_tokenizer, text): """Test that 'Shell' and 'shell' are excluded from the contractions generated by the English tokenizer exceptions.""" @@ -246,28 +272,32 @@ def test_issue775(en_tokenizer, text): assert tokens[0].text == text -@pytest.mark.parametrize('text', ["This is a string ", "This is a string\u0020"]) +@pytest.mark.parametrize("text", ["This is a string ", "This is a string\u0020"]) def test_issue792(en_tokenizer, text): """Test for Issue #792: Trailing whitespace is removed after tokenization.""" doc = en_tokenizer(text) - assert ''.join([token.text_with_ws for token in doc]) == text + assert "".join([token.text_with_ws for token in doc]) == text -@pytest.mark.parametrize('text', ["This is a string", "This is a string\n"]) +@pytest.mark.parametrize("text", ["This is a string", "This is a string\n"]) def test_control_issue792(en_tokenizer, text): """Test base case for Issue #792: Non-trailing whitespace""" doc = en_tokenizer(text) - assert ''.join([token.text_with_ws for token in doc]) == text + assert "".join([token.text_with_ws for token in doc]) == text -@pytest.mark.parametrize('text,tokens', [ - ('"deserve,"--and', ['"', "deserve", ',"--', "and"]), - ("exception;--exclusive", ["exception", ";--", "exclusive"]), - ("day.--Is", ["day", ".--", "Is"]), - ("refinement:--just", ["refinement", ":--", "just"]), - ("memories?--To", ["memories", "?--", "To"]), - ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]), - ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"])]) +@pytest.mark.parametrize( + "text,tokens", + [ + ('"deserve,"--and', ['"', "deserve", ',"--', "and"]), + ("exception;--exclusive", ["exception", ";--", "exclusive"]), + ("day.--Is", ["day", ".--", "Is"]), + ("refinement:--just", ["refinement", ":--", "just"]), + ("memories?--To", ["memories", "?--", "To"]), + ("Useful.=--Therefore", ["Useful", ".=--", "Therefore"]), + ("=Hope.=--Pandora", ["=", "Hope", ".=--", "Pandora"]), + ], +) def test_issue801(en_tokenizer, text, tokens): """Test that special characters + hyphens are split correctly.""" doc = en_tokenizer(text) @@ -275,10 +305,19 @@ def test_issue801(en_tokenizer, text, tokens): assert [t.text for t in doc] == tokens -@pytest.mark.parametrize('text,expected_tokens', [ - ('Smörsåsen används bl.a. till fisk', ['Smörsåsen', 'används', 'bl.a.', 'till', 'fisk']), - ('Jag kommer först kl. 13 p.g.a. 
diverse förseningar', ['Jag', 'kommer', 'först', 'kl.', '13', 'p.g.a.', 'diverse', 'förseningar']) -]) +@pytest.mark.parametrize( + "text,expected_tokens", + [ + ( + "Smörsåsen används bl.a. till fisk", + ["Smörsåsen", "används", "bl.a.", "till", "fisk"], + ), + ( + "Jag kommer först kl. 13 p.g.a. diverse förseningar", + ["Jag", "kommer", "först", "kl.", "13", "p.g.a.", "diverse", "förseningar"], + ), + ], +) def test_issue805(sv_tokenizer, text, expected_tokens): tokens = sv_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] @@ -291,9 +330,9 @@ def test_issue850(): vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) matcher = Matcher(vocab) IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True) - pattern = [{'LOWER': "bob"}, {'OP': '*', 'IS_ANY_TOKEN': True}, {'LOWER': 'frank'}] - matcher.add('FarAway', None, pattern) - doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank']) + pattern = [{"LOWER": "bob"}, {"OP": "*", "IS_ANY_TOKEN": True}, {"LOWER": "frank"}] + matcher.add("FarAway", None, pattern) + doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) match = matcher(doc) assert len(match) == 1 ent_id, start, end = match[0] @@ -306,9 +345,9 @@ def test_issue850_basic(): vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()}) matcher = Matcher(vocab) IS_ANY_TOKEN = matcher.vocab.add_flag(lambda x: True) - pattern = [{'LOWER': "bob"}, {'OP': '*', 'LOWER': 'and'}, {'LOWER': 'frank'}] - matcher.add('FarAway', None, pattern) - doc = Doc(matcher.vocab, words=['bob', 'and', 'and', 'frank']) + pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}] + matcher.add("FarAway", None, pattern) + doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"]) match = matcher(doc) assert len(match) == 1 ent_id, start, end = match[0] @@ -316,23 +355,25 @@ def test_issue850_basic(): assert end == 4 -@pytest.mark.parametrize('text', ["au-delàs", "pair-programmâmes", - "terra-formées", "σ-compacts"]) +@pytest.mark.parametrize( + "text", ["au-delàs", "pair-programmâmes", "terra-formées", "σ-compacts"] +) def test_issue852(fr_tokenizer, text): """Test that French tokenizer exceptions are imported correctly.""" tokens = fr_tokenizer(text) assert len(tokens) == 1 -@pytest.mark.parametrize('text', ["aaabbb@ccc.com\nThank you!", - "aaabbb@ccc.com \nThank you!"]) +@pytest.mark.parametrize( + "text", ["aaabbb@ccc.com\nThank you!", "aaabbb@ccc.com \nThank you!"] +) def test_issue859(en_tokenizer, text): """Test that no extra space is added in doc.text method.""" doc = en_tokenizer(text) assert doc.text == text -@pytest.mark.parametrize('text', ["Datum:2014-06-02\nDokument:76467"]) +@pytest.mark.parametrize("text", ["Datum:2014-06-02\nDokument:76467"]) def test_issue886(en_tokenizer, text): """Test that token.idx matches the original text index for texts with newlines.""" doc = en_tokenizer(text) @@ -341,7 +382,7 @@ def test_issue886(en_tokenizer, text): assert text[token.idx] == token.text[0] -@pytest.mark.parametrize('text', ["want/need"]) +@pytest.mark.parametrize("text", ["want/need"]) def test_issue891(en_tokenizer, text): """Test that / infixes are split correctly.""" tokens = en_tokenizer(text) @@ -349,11 +390,10 @@ def test_issue891(en_tokenizer, text): assert tokens[1].text == "/" -@pytest.mark.parametrize('text,tag,lemma', [ - ("anus", "NN", "anus"), - ("princess", "NN", "princess"), - ("inner", "JJ", "inner") -]) +@pytest.mark.parametrize( + "text,tag,lemma", + [("anus", "NN", "anus"), ("princess", "NN", 
"princess"), ("inner", "JJ", "inner")], +) def test_issue912(en_vocab, text, tag, lemma): """Test base-forms are preserved.""" doc = Doc(en_vocab, words=[text]) @@ -364,10 +404,10 @@ def test_issue912(en_vocab, text, tag, lemma): def test_issue957(en_tokenizer): """Test that spaCy doesn't hang on many periods.""" # skip test if pytest-timeout is not installed - timeout = pytest.importorskip('pytest-timeout') - string = '0' + timeout = pytest.importorskip("pytest-timeout") + string = "0" for i in range(1, 100): - string += '.%d' % i + string += ".%d" % i doc = en_tokenizer(string) @@ -386,13 +426,13 @@ def test_issue999(train_data): ["hello", []], ["hi", []], ["i'm looking for a place to eat", []], - ["i'm looking for a place in the north of town", [[31,36,"LOCATION"]]], - ["show me chinese restaurants", [[8,15,"CUISINE"]]], - ["show me chines restaurants", [[8,14,"CUISINE"]]], + ["i'm looking for a place in the north of town", [[31, 36, "LOCATION"]]], + ["show me chinese restaurants", [[8, 15, "CUISINE"]]], + ["show me chines restaurants", [[8, 14, "CUISINE"]]], ] nlp = Language() - ner = nlp.create_pipe('ner') + ner = nlp.create_pipe("ner") nlp.add_pipe(ner) for _, offsets in TRAIN_DATA: for start, end, label in offsets: @@ -402,7 +442,7 @@ def test_issue999(train_data): for itn in range(100): random.shuffle(TRAIN_DATA) for raw_text, entity_offsets in TRAIN_DATA: - nlp.update([raw_text], [{'entities': entity_offsets}]) + nlp.update([raw_text], [{"entities": entity_offsets}]) with make_tempdir() as model_dir: nlp.to_disk(model_dir) diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index e85d19ccd..0471502de 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -15,76 +15,84 @@ from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part def test_issue1242(): nlp = English() - doc = nlp('') + doc = nlp("") assert len(doc) == 0 - docs = list(nlp.pipe(['', 'hello'])) + docs = list(nlp.pipe(["", "hello"])) assert len(docs[0]) == 0 assert len(docs[1]) == 1 def test_issue1250(): """Test cached special cases.""" - special_case = [{ORTH: 'reimbur', LEMMA: 'reimburse', POS: 'VERB'}] + special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}] nlp = English() - nlp.tokenizer.add_special_case('reimbur', special_case) - lemmas = [w.lemma_ for w in nlp('reimbur, reimbur...')] - assert lemmas == ['reimburse', ',', 'reimburse', '...'] - lemmas = [w.lemma_ for w in nlp('reimbur, reimbur...')] - assert lemmas == ['reimburse', ',', 'reimburse', '...'] + nlp.tokenizer.add_special_case("reimbur", special_case) + lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")] + assert lemmas == ["reimburse", ",", "reimburse", "..."] + lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")] + assert lemmas == ["reimburse", ",", "reimburse", "..."] def test_issue1257(): """Test that tokens compare correctly.""" - doc1 = Doc(Vocab(), words=['a', 'b', 'c']) - doc2 = Doc(Vocab(), words=['a', 'c', 'e']) + doc1 = Doc(Vocab(), words=["a", "b", "c"]) + doc2 = Doc(Vocab(), words=["a", "c", "e"]) assert doc1[0] != doc2[0] assert not doc1[0] == doc2[0] def test_issue1375(): """Test that token.nbor() raises IndexError for out-of-bounds access.""" - doc = Doc(Vocab(), words=['0', '1', '2']) + doc = Doc(Vocab(), words=["0", "1", "2"]) with pytest.raises(IndexError): assert doc[0].nbor(-1) - assert doc[1].nbor(-1).text == '0' + assert doc[1].nbor(-1).text == "0" with pytest.raises(IndexError): assert 
doc[2].nbor(1) - assert doc[1].nbor(1).text == '2' + assert doc[1].nbor(1).text == "2" def test_issue1387(): - tag_map = {'VBG': {POS: VERB, VerbForm_part: True}} - index = {"verb": ("cope","cop")} + tag_map = {"VBG": {POS: VERB, VerbForm_part: True}} + index = {"verb": ("cope", "cop")} exc = {"verb": {"coping": ("cope",)}} rules = {"verb": [["ing", ""]]} lemmatizer = Lemmatizer(index, exc, rules) vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map) doc = Doc(vocab, words=["coping"]) - doc[0].tag_ = 'VBG' + doc[0].tag_ = "VBG" assert doc[0].text == "coping" assert doc[0].lemma_ == "cope" def test_issue1434(): """Test matches occur when optional element at end of short doc.""" - pattern = [{'ORTH': 'Hello' }, {'IS_ALPHA': True, 'OP': '?'}] + pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}] vocab = Vocab(lex_attr_getters=LEX_ATTRS) - hello_world = Doc(vocab, words=['Hello', 'World']) - hello = Doc(vocab, words=['Hello']) + hello_world = Doc(vocab, words=["Hello", "World"]) + hello = Doc(vocab, words=["Hello"]) matcher = Matcher(vocab) - matcher.add('MyMatcher', None, pattern) + matcher.add("MyMatcher", None, pattern) matches = matcher(hello_world) assert matches matches = matcher(hello) assert matches -@pytest.mark.parametrize('string,start,end', [ - ('a', 0, 1), ('a b', 0, 2), ('a c', 0, 1), ('a b c', 0, 2), - ('a b b c', 0, 3), ('a b b', 0, 3),]) +@pytest.mark.parametrize( + "string,start,end", + [ + ("a", 0, 1), + ("a b", 0, 2), + ("a c", 0, 1), + ("a b c", 0, 2), + ("a b b c", 0, 3), + ("a b b", 0, 3), + ], +) def test_issue1450(string, start, end): """Test matcher works when patterns end with * operator.""" - pattern = [{'ORTH': "a"}, {'ORTH': "b", 'OP': "*"}] + pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}] matcher = Matcher(Vocab()) matcher.add("TSTEND", None, pattern) doc = Doc(Vocab(), words=string.split()) @@ -96,17 +104,20 @@ def test_issue1450(string, start, end): def test_issue1488(): - prefix_re = re.compile(r'''[\[\("']''') - suffix_re = re.compile(r'''[\]\)"']''') - infix_re = re.compile(r'''[-~\.]''') - simple_url_re = re.compile(r'''^https?://''') + prefix_re = re.compile(r"""[\[\("']""") + suffix_re = re.compile(r"""[\]\)"']""") + infix_re = re.compile(r"""[-~\.]""") + simple_url_re = re.compile(r"""^https?://""") def my_tokenizer(nlp): - return Tokenizer(nlp.vocab, {}, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer, - token_match=simple_url_re.match) + return Tokenizer( + nlp.vocab, + {}, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=simple_url_re.match, + ) nlp = English() nlp.tokenizer = my_tokenizer(nlp) @@ -116,11 +127,16 @@ def test_issue1488(): def test_issue1494(): - infix_re = re.compile(r'''[^a-z]''') - test_cases = [('token 123test', ['token', '1', '2', '3', 'test']), - ('token 1test', ['token', '1test']), - ('hello...test', ['hello', '.', '.', '.', 'test'])] - new_tokenizer = lambda nlp: Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer) + infix_re = re.compile(r"""[^a-z]""") + test_cases = [ + ("token 123test", ["token", "1", "2", "3", "test"]), + ("token 1test", ["token", "1test"]), + ("hello...test", ["hello", ".", ".", ".", "test"]), + ] + + def new_tokenizer(nlp): + return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer) + nlp = English() nlp.tokenizer = new_tokenizer(nlp) for text, expected in test_cases: diff --git a/spacy/tests/regression/test_issue1501-2000.py 
b/spacy/tests/regression/test_issue1501-2000.py index ea329f55b..71c563c41 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -45,17 +45,17 @@ def test_issue1506(): def test_issue1518(): """Test vectors.resize() works.""" vectors = Vectors(shape=(10, 10)) - vectors.add('hello', row=2) + vectors.add("hello", row=2) vectors.resize((5, 9)) def test_issue1537(): """Test that Span.as_doc() doesn't segfault.""" - string = 'The sky is blue . The man is pink . The dog is purple .' + string = "The sky is blue . The man is pink . The dog is purple ." doc = Doc(Vocab(), words=string.split()) doc[0].sent_start = True for word in doc[1:]: - if word.nbor(-1).text == '.': + if word.nbor(-1).text == ".": word.sent_start = True else: word.sent_start = False @@ -67,7 +67,7 @@ def test_issue1537(): # TODO: Currently segfaulting, due to l_edge and r_edge misalignment -#def test_issue1537_model(): +# def test_issue1537_model(): # nlp = load_spacy('en') # doc = nlp('The sky is blue. The man is pink. The dog is purple.') # sents = [s.as_doc() for s in doc.sents] @@ -77,41 +77,41 @@ def test_issue1537(): def test_issue1539(): """Ensure vectors.resize() doesn't try to modify dictionary during iteration.""" - v = Vectors(shape=(10, 10), keys=[5,3,98,100]) - v.resize((100,100)) + v = Vectors(shape=(10, 10), keys=[5, 3, 98, 100]) + v.resize((100, 100)) def test_issue1547(): """Test that entity labels still match after merging tokens.""" - words = ['\n', 'worda', '.', '\n', 'wordb', '-', 'Biosphere', '2', '-', ' \n'] + words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"] doc = Doc(Vocab(), words=words) - doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings['PRODUCT'])] + doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])] doc[5:7].merge() assert [ent.text for ent in doc.ents] def test_issue1612(en_tokenizer): - doc = en_tokenizer('The black cat purrs.') - span = doc[1: 3] + doc = en_tokenizer("The black cat purrs.") + span = doc[1:3] assert span.orth_ == span.text def test_issue1654(): nlp = Language(Vocab()) assert not nlp.pipeline - nlp.add_pipe(lambda doc: doc, name='1') - nlp.add_pipe(lambda doc: doc, name='2', after='1') - nlp.add_pipe(lambda doc: doc, name='3', after='2') - assert nlp.pipe_names == ['1', '2', '3'] + nlp.add_pipe(lambda doc: doc, name="1") + nlp.add_pipe(lambda doc: doc, name="2", after="1") + nlp.add_pipe(lambda doc: doc, name="3", after="2") + assert nlp.pipe_names == ["1", "2", "3"] nlp2 = Language(Vocab()) assert not nlp2.pipeline - nlp2.add_pipe(lambda doc: doc, name='3') - nlp2.add_pipe(lambda doc: doc, name='2', before='3') - nlp2.add_pipe(lambda doc: doc, name='1', before='2') - assert nlp2.pipe_names == ['1', '2', '3'] + nlp2.add_pipe(lambda doc: doc, name="3") + nlp2.add_pipe(lambda doc: doc, name="2", before="3") + nlp2.add_pipe(lambda doc: doc, name="1", before="2") + assert nlp2.pipe_names == ["1", "2", "3"] -@pytest.mark.parametrize('text', ['test@example.com', 'john.doe@example.co.uk']) +@pytest.mark.parametrize("text", ["test@example.com", "john.doe@example.co.uk"]) def test_issue1698(en_tokenizer, text): doc = en_tokenizer(text) assert len(doc) == 1 @@ -121,30 +121,30 @@ def test_issue1698(en_tokenizer, text): def test_issue1727(): """Test that models with no pretrained vectors can be deserialized correctly after vectors are added.""" - data = numpy.ones((3, 300), dtype='f') - vectors = Vectors(data=data, keys=['I', 'am', 'Matt']) + data = numpy.ones((3, 300), dtype="f") + vectors 
= Vectors(data=data, keys=["I", "am", "Matt"]) tagger = Tagger(Vocab()) - tagger.add_label('PRP') + tagger.add_label("PRP") tagger.begin_training() - assert tagger.cfg.get('pretrained_dims', 0) == 0 + assert tagger.cfg.get("pretrained_dims", 0) == 0 tagger.vocab.vectors = vectors with make_tempdir() as path: tagger.to_disk(path) tagger = Tagger(Vocab()).from_disk(path) - assert tagger.cfg.get('pretrained_dims', 0) == 0 + assert tagger.cfg.get("pretrained_dims", 0) == 0 def test_issue1757(): """Test comparison against None doesn't cause segfault.""" - doc = Doc(Vocab(), words=['a', 'b', 'c']) + doc = Doc(Vocab(), words=["a", "b", "c"]) assert not doc[0] < None assert not doc[0] == None assert doc[0] >= None assert not doc[:2] < None assert not doc[:2] == None assert doc[:2] >= None - assert not doc.vocab['a'] == None - assert not doc.vocab['a'] < None + assert not doc.vocab["a"] == None + assert not doc.vocab["a"] < None def test_issue1758(en_tokenizer): @@ -158,11 +158,20 @@ def test_issue1758(en_tokenizer): def test_issue1799(): """Test sentence boundaries are deserialized correctly, even for non-projective sentences.""" - heads_deps = numpy.asarray([[1, 397], [4, 436], [2, 426], [1, 402], - [0, 8206900633647566924], [18446744073709551615, 440], - [18446744073709551614, 442]], dtype='uint64') - doc = Doc(Vocab(), words='Just what I was looking for .'.split()) - doc.vocab.strings.add('ROOT') + heads_deps = numpy.asarray( + [ + [1, 397], + [4, 436], + [2, 426], + [1, 402], + [0, 8206900633647566924], + [18446744073709551615, 440], + [18446744073709551614, 442], + ], + dtype="uint64", + ) + doc = Doc(Vocab(), words="Just what I was looking for .".split()) + doc.vocab.strings.add("ROOT") doc = doc.from_array([HEAD, DEP], heads_deps) assert len(list(doc.sents)) == 1 @@ -170,9 +179,9 @@ def test_issue1799(): def test_issue1807(): """Test vocab.set_vector also adds the word to the vocab.""" vocab = Vocab() - assert 'hello' not in vocab - vocab.set_vector('hello', numpy.ones((50,), dtype='f')) - assert 'hello' in vocab + assert "hello" not in vocab + vocab.set_vector("hello", numpy.ones((50,), dtype="f")) + assert "hello" in vocab def test_issue1834(): @@ -195,34 +204,34 @@ def test_issue1834(): def test_issue1868(): """Test Vocab.__contains__ works with int keys.""" vocab = Vocab() - lex = vocab['hello'] + lex = vocab["hello"] assert lex.orth in vocab assert lex.orth_ in vocab - assert 'some string' not in vocab - int_id = vocab.strings.add('some string') + assert "some string" not in vocab + int_id = vocab.strings.add("some string") assert int_id not in vocab def test_issue1883(): matcher = Matcher(Vocab()) - matcher.add('pat1', None, [{'orth': 'hello'}]) - doc = Doc(matcher.vocab, words=['hello']) + matcher.add("pat1", None, [{"orth": "hello"}]) + doc = Doc(matcher.vocab, words=["hello"]) assert len(matcher(doc)) == 1 new_matcher = copy.deepcopy(matcher) - new_doc = Doc(new_matcher.vocab, words=['hello']) + new_doc = Doc(new_matcher.vocab, words=["hello"]) assert len(new_matcher(new_doc)) == 1 -@pytest.mark.parametrize('word', ['the']) +@pytest.mark.parametrize("word", ["the"]) def test_issue1889(word): assert is_stop(word, STOP_WORDS) == is_stop(word.upper(), STOP_WORDS) def test_issue1915(): - cfg = {'hidden_depth': 2} # should error out + cfg = {"hidden_depth": 2} # should error out nlp = Language() - nlp.add_pipe(nlp.create_pipe('ner')) - nlp.get_pipe('ner').add_label('answer') + nlp.add_pipe(nlp.create_pipe("ner")) + nlp.get_pipe("ner").add_label("answer") with pytest.raises(ValueError): 
nlp.begin_training(**cfg) @@ -230,17 +239,17 @@ def test_issue1915(): def test_issue1945(): """Test regression in Matcher introduced in v2.0.6.""" matcher = Matcher(Vocab()) - matcher.add('MWE', None, [{'orth': 'a'}, {'orth': 'a'}]) - doc = Doc(matcher.vocab, words=['a', 'a', 'a']) + matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}]) + doc = Doc(matcher.vocab, words=["a", "a", "a"]) matches = matcher(doc) # we should see two overlapping matches here assert len(matches) == 2 assert matches[0][1:] == (0, 2) assert matches[1][1:] == (1, 3) -@pytest.mark.parametrize('label', ['U-JOB-NAME']) +@pytest.mark.parametrize("label", ["U-JOB-NAME"]) def test_issue1967(label): ner = EntityRecognizer(Vocab()) - entry = ([0], ['word'], ['tag'], [0], ['dep'], [label]) + entry = ([0], ["word"], ["tag"], [0], ["dep"], [label]) gold_parses = [(None, [(entry, None)])] ner.moves.get_actions(gold_parses=gold_parses) diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index b33e8e1d9..32839d050 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -14,15 +14,15 @@ from ..util import add_vecs_to_vocab def test_issue2179(): """Test that spurious 'extra_labels' aren't created when initializing NER.""" nlp = Italian() - ner = nlp.create_pipe('ner') - ner.add_label('CITIZENSHIP') + ner = nlp.create_pipe("ner") + ner.add_label("CITIZENSHIP") nlp.add_pipe(ner) nlp.begin_training() nlp2 = Italian() - nlp2.add_pipe(nlp2.create_pipe('ner')) + nlp2.add_pipe(nlp2.create_pipe("ner")) nlp2.from_bytes(nlp.to_bytes()) - assert 'extra_labels' not in nlp2.get_pipe('ner').cfg - assert nlp2.get_pipe('ner').labels == ['CITIZENSHIP'] + assert "extra_labels" not in nlp2.get_pipe("ner").cfg + assert nlp2.get_pipe("ner").labels == ["CITIZENSHIP"] def test_issue2219(en_vocab): @@ -34,7 +34,7 @@ def test_issue2219(en_vocab): def test_issue2361(de_tokenizer): - chars = ('&lt;', '&gt;', '&amp;', '&quot;') + chars = ("&lt;", "&gt;", "&amp;", "&quot;") doc = de_tokenizer('< > & " ') doc.is_parsed = True doc.is_tagged = True @@ -46,25 +46,32 @@ def test_issue2361(de_tokenizer): def test_issue2385(): """Test that IOB tags are correctly converted to BILUO tags.""" # fix bug in labels with a 'b' character - tags1 = ('B-BRAWLER', 'I-BRAWLER', 'I-BRAWLER') - assert iob_to_biluo(tags1) == ['B-BRAWLER', 'I-BRAWLER', 'L-BRAWLER'] + tags1 = ("B-BRAWLER", "I-BRAWLER", "I-BRAWLER") + assert iob_to_biluo(tags1) == ["B-BRAWLER", "I-BRAWLER", "L-BRAWLER"] # maintain support for iob1 format - tags2 = ('I-ORG', 'I-ORG', 'B-ORG') - assert iob_to_biluo(tags2) == ['B-ORG', 'L-ORG', 'U-ORG'] + tags2 = ("I-ORG", "I-ORG", "B-ORG") + assert iob_to_biluo(tags2) == ["B-ORG", "L-ORG", "U-ORG"] # maintain support for iob2 format - tags3 = ('B-PERSON', 'I-PERSON', 'B-PERSON') - assert iob_to_biluo(tags3) ==['B-PERSON', 'L-PERSON', 'U-PERSON'] + tags3 = ("B-PERSON", "I-PERSON", "B-PERSON") + assert iob_to_biluo(tags3) == ["B-PERSON", "L-PERSON", "U-PERSON"] -@pytest.mark.parametrize('tags', [ - ('B-ORG', 'L-ORG'), ('B-PERSON', 'I-PERSON', 'L-PERSON'), ('U-BRAWLER', 'U-BRAWLER')]) +@pytest.mark.parametrize( + "tags", + [ + ("B-ORG", "L-ORG"), + ("B-PERSON", "I-PERSON", "L-PERSON"), + ("U-BRAWLER", "U-BRAWLER"), + ], +) def test_issue2385_biluo(tags): """Test that BILUO-compatible tags aren't modified.""" assert iob_to_biluo(tags) == list(tags) + def test_issue2482(): - '''Test we can serialize and deserialize a blank NER or parser model.''' + """Test we can serialize and deserialize a
blank NER or parser model.""" nlp = Italian() - nlp.add_pipe(nlp.create_pipe('ner')) + nlp.add_pipe(nlp.create_pipe("ner")) b = nlp.to_bytes() nlp2 = Italian().from_bytes(b) diff --git a/spacy/tests/regression/test_issue2564.py b/spacy/tests/regression/test_issue2564.py index ef629efc1..12b376d1a 100644 --- a/spacy/tests/regression/test_issue2564.py +++ b/spacy/tests/regression/test_issue2564.py @@ -7,11 +7,11 @@ from spacy.language import Language def test_issue2564(): """Test the tagger sets is_tagged correctly when used via Language.pipe.""" nlp = Language() - tagger = nlp.create_pipe('tagger') + tagger = nlp.create_pipe("tagger") tagger.begin_training() # initialise weights nlp.add_pipe(tagger) - doc = nlp('hello world') + doc = nlp("hello world") assert doc.is_tagged - docs = nlp.pipe(['hello', 'world']) + docs = nlp.pipe(["hello", "world"]) piped_doc = next(docs) assert piped_doc.is_tagged diff --git a/spacy/tests/regression/test_issue2569.py b/spacy/tests/regression/test_issue2569.py index b1db67508..6f30948c5 100644 --- a/spacy/tests/regression/test_issue2569.py +++ b/spacy/tests/regression/test_issue2569.py @@ -7,11 +7,11 @@ from spacy.tokens import Span def test_issue2569(en_tokenizer): doc = en_tokenizer("It is May 15, 1993.") - doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings['DATE'])] + doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])] matcher = Matcher(doc.vocab) - matcher.add("RULE", None, [{'ENT_TYPE':'DATE', 'OP':'+'}]) + matcher.add("RULE", None, [{"ENT_TYPE": "DATE", "OP": "+"}]) matched = [doc[start:end] for _, start, end in matcher(doc)] matched = sorted(matched, key=len, reverse=True) assert len(matched) == 10 assert len(matched[0]) == 4 - assert matched[0].text == 'May 15, 1993' + assert matched[0].text == "May 15, 1993" diff --git a/spacy/tests/regression/test_issue2671.py b/spacy/tests/regression/test_issue2671.py index 561cb2a9e..59bb8791f 100644 --- a/spacy/tests/regression/test_issue2671.py +++ b/spacy/tests/regression/test_issue2671.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import pytest from spacy.lang.en import English from spacy.matcher import Matcher @@ -10,6 +9,7 @@ def test_issue2671(): """Ensure the correct entity ID is returned for matches with quantifiers. See also #2675 """ + def get_rule_id(nlp, matcher, doc): matches = matcher(doc) for match_id, start, end in matches: @@ -19,10 +19,12 @@ def test_issue2671(): nlp = English() matcher = Matcher(nlp.vocab) - pattern_id = 'test_pattern' - pattern = [{'LOWER': 'high'}, - {'IS_PUNCT': True, 'OP': '?'}, - {'LOWER': 'adrenaline'}] + pattern_id = "test_pattern" + pattern = [ + {"LOWER": "high"}, + {"IS_PUNCT": True, "OP": "?"}, + {"LOWER": "adrenaline"}, + ] matcher.add(pattern_id, None, pattern) doc1 = nlp("This is a high-adrenaline situation.") doc2 = nlp("This is a high adrenaline situation.") diff --git a/spacy/tests/regression/test_issue2772.py b/spacy/tests/regression/test_issue2772.py index d8188c71c..3ae2a7860 100644 --- a/spacy/tests/regression/test_issue2772.py +++ b/spacy/tests/regression/test_issue2772.py @@ -1,17 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import pytest - from ..util import get_doc def test_issue2772(en_vocab): """Test that deprojectivization doesn't mess up sentence boundaries.""" - words = 'When we write or communicate virtually , we can hide our true feelings .'.split() + words = "When we write or communicate virtually , we can hide our true feelings .".split() # A tree with a non-projective (i.e. 
crossing) arc # The arcs (0, 4) and (2, 9) cross. heads = [4, 1, 7, -1, -2, -1, 3, 2, 1, 0, -1, -2, -1] - deps = ['dep'] * len(heads) + deps = ["dep"] * len(heads) doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) assert doc[1].is_sent_start is None diff --git a/spacy/tests/regression/test_issue2782.py b/spacy/tests/regression/test_issue2782.py index 3e4ba86d0..86591ab12 100644 --- a/spacy/tests/regression/test_issue2782.py +++ b/spacy/tests/regression/test_issue2782.py @@ -5,8 +5,8 @@ from spacy.util import get_lang_class import pytest -@pytest.mark.parametrize('text', ['-0.23', '+123,456', '±1']) -@pytest.mark.parametrize('lang', ['en', 'xx']) +@pytest.mark.parametrize("text", ["-0.23", "+123,456", "±1"]) +@pytest.mark.parametrize("lang", ["en", "xx"]) def test_issue2782(text, lang): """Check that like_num handles + and - before number.""" cls = get_lang_class(lang) diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index f9e092050..77d6e6833 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -18,25 +18,25 @@ def test_serialize_empty_doc(en_vocab): def test_serialize_doc_roundtrip_bytes(en_vocab): - doc = Doc(en_vocab, words=['hello', 'world']) + doc = Doc(en_vocab, words=["hello", "world"]) doc_b = doc.to_bytes() new_doc = Doc(en_vocab).from_bytes(doc_b) assert new_doc.to_bytes() == doc_b def test_serialize_doc_roundtrip_disk(en_vocab): - doc = Doc(en_vocab, words=['hello', 'world']) + doc = Doc(en_vocab, words=["hello", "world"]) with make_tempdir() as d: - file_path = d / 'doc' + file_path = d / "doc" doc.to_disk(file_path) doc_d = Doc(en_vocab).from_disk(file_path) assert doc.to_bytes() == doc_d.to_bytes() def test_serialize_doc_roundtrip_disk_str_path(en_vocab): - doc = Doc(en_vocab, words=['hello', 'world']) + doc = Doc(en_vocab, words=["hello", "world"]) with make_tempdir() as d: - file_path = d / 'doc' + file_path = d / "doc" file_path = path2str(file_path) doc.to_disk(file_path) doc_d = Doc(en_vocab).from_disk(file_path) diff --git a/spacy/tests/serialize/test_serialize_extension_attrs.py b/spacy/tests/serialize/test_serialize_extension_attrs.py index 251aaf4f0..1881b7d0c 100644 --- a/spacy/tests/serialize/test_serialize_extension_attrs.py +++ b/spacy/tests/serialize/test_serialize_extension_attrs.py @@ -8,19 +8,20 @@ from spacy.vocab import Vocab @pytest.fixture def doc_w_attrs(en_tokenizer): - Doc.set_extension('_test_attr', default=False) - Doc.set_extension('_test_prop', getter=lambda doc: len(doc.text)) - Doc.set_extension('_test_method', method=lambda doc, arg: "{}{}".format(len(doc.text), arg)) + Doc.set_extension("_test_attr", default=False) + Doc.set_extension("_test_prop", getter=lambda doc: len(doc.text)) + Doc.set_extension( + "_test_method", method=lambda doc, arg: "{}{}".format(len(doc.text), arg) + ) doc = en_tokenizer("This is a test.") - doc._._test_attr = 'test' + doc._._test_attr = "test" return doc - def test_serialize_ext_attrs_from_bytes(doc_w_attrs): doc_b = doc_w_attrs.to_bytes() doc = Doc(Vocab()).from_bytes(doc_b) - assert doc._.has('_test_attr') - assert doc._._test_attr == 'test' + assert doc._.has("_test_attr") + assert doc._._test_attr == "test" assert doc._._test_prop == len(doc.text) - assert doc._._test_method('test') == '{}{}'.format(len(doc.text), 'test') + assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test") diff --git a/spacy/tests/serialize/test_serialize_language.py 
b/spacy/tests/serialize/test_serialize_language.py index 210729340..edc5d125d 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -12,14 +12,14 @@ from ..util import make_tempdir @pytest.fixture def meta_data(): return { - 'name': 'name-in-fixture', - 'version': 'version-in-fixture', - 'description': 'description-in-fixture', - 'author': 'author-in-fixture', - 'email': 'email-in-fixture', - 'url': 'url-in-fixture', - 'license': 'license-in-fixture', - 'vectors': {'width': 0, 'vectors': 0, 'keys': 0, 'name': None} + "name": "name-in-fixture", + "version": "version-in-fixture", + "description": "description-in-fixture", + "author": "author-in-fixture", + "email": "email-in-fixture", + "url": "url-in-fixture", + "license": "license-in-fixture", + "vectors": {"width": 0, "vectors": 0, "keys": 0, "name": None}, } @@ -35,16 +35,18 @@ def test_serialize_with_custom_tokenizer(): """Test that serialization with custom tokenizer works without token_match. See: https://support.prodi.gy/t/how-to-save-a-custom-tokenizer/661/2 """ - prefix_re = re.compile(r'''1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:''') - suffix_re = re.compile(r'''''') - infix_re = re.compile(r'''[~]''') + prefix_re = re.compile(r"""1/|2/|:[0-9][0-9][A-K]:|:[0-9][0-9]:""") + suffix_re = re.compile(r"""""") + infix_re = re.compile(r"""[~]""") def custom_tokenizer(nlp): - return Tokenizer(nlp.vocab, - {}, - prefix_search=prefix_re.search, - suffix_search=suffix_re.search, - infix_finditer=infix_re.finditer) + return Tokenizer( + nlp.vocab, + {}, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + ) nlp = Language() nlp.tokenizer = custom_tokenizer(nlp) diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index b177f9bd8..c39d3a325 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -2,7 +2,8 @@ from __future__ import unicode_literals import pytest -from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer, Tensorizer, TextCategorizer +from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer +from spacy.pipeline import Tensorizer, TextCategorizer from ..util import make_tempdir @@ -13,7 +14,7 @@ test_parsers = [DependencyParser, EntityRecognizer] @pytest.fixture def parser(en_vocab): parser = DependencyParser(en_vocab) - parser.add_label('nsubj') + parser.add_label("nsubj") parser.model, cfg = parser.Model(parser.moves.n_moves) parser.cfg.update(cfg) return parser @@ -34,7 +35,7 @@ def taggers(en_vocab): return (tagger1, tagger2) -@pytest.mark.parametrize('Parser', test_parsers) +@pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): parser = Parser(en_vocab) parser.model, _ = parser.Model(10) @@ -44,12 +45,12 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser): assert new_parser.to_bytes() == parser.to_bytes() -@pytest.mark.parametrize('Parser', test_parsers) +@pytest.mark.parametrize("Parser", test_parsers) def test_serialize_parser_roundtrip_disk(en_vocab, Parser): parser = Parser(en_vocab) parser.model, _ = parser.Model(0) with make_tempdir() as d: - file_path = d / 'parser' + file_path = d / "parser" parser.to_disk(file_path) parser_d = Parser(en_vocab) parser_d.model, _ = parser_d.Model(0) @@ -67,7 +68,9 @@ def test_to_from_bytes(parser, blank_parser): assert blank_parser.moves.n_moves == 
parser.moves.n_moves -@pytest.mark.skip(reason="This seems to be a dict ordering bug somewhere. Only failing on some platforms.") +@pytest.mark.skip( + reason="This seems to be a dict ordering bug somewhere. Only failing on some platforms." +) def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers): tagger1, tagger2 = taggers tagger1_b = tagger1.to_bytes() @@ -81,8 +84,8 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers): def test_serialize_tagger_roundtrip_disk(en_vocab, taggers): tagger1, tagger2 = taggers with make_tempdir() as d: - file_path1 = d / 'tagger1' - file_path2 = d / 'tagger2' + file_path1 = d / "tagger1" + file_path2 = d / "tagger2" tagger1.to_disk(file_path1) tagger2.to_disk(file_path2) tagger1_d = Tagger(en_vocab).from_disk(file_path1) @@ -102,7 +105,7 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab): tensorizer = Tensorizer(en_vocab) tensorizer.model = tensorizer.Model() with make_tempdir() as d: - file_path = d / 'tensorizer' + file_path = d / "tensorizer" tensorizer.to_disk(file_path) tensorizer_d = Tensorizer(en_vocab).from_disk(file_path) assert tensorizer.to_bytes() == tensorizer_d.to_bytes() @@ -110,5 +113,5 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab): def test_serialize_textcat_empty(en_vocab): # See issue #1105 - textcat = TextCategorizer(en_vocab, labels=['ENTITY', 'ACTION', 'MODIFIER']) + textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"]) textcat_bytes = textcat.to_bytes() diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index 2e1256f2d..4e3dafa30 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -9,7 +9,7 @@ from ..util import make_tempdir, assert_packed_msg_equal def load_tokenizer(b): - tok = get_lang_class('en').Defaults.create_tokenizer() + tok = get_lang_class("en").Defaults.create_tokenizer() tok.from_bytes(b) return tok @@ -23,7 +23,7 @@ def test_serialize_custom_tokenizer(en_vocab, en_tokenizer): @pytest.mark.skip(reason="Currently unreliable across platforms") -@pytest.mark.parametrize('text', ["I💜you", "they’re", "“hello”"]) +@pytest.mark.parametrize("text", ["I💜you", "they’re", "“hello”"]) def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text): tokenizer = en_tokenizer new_tokenizer = load_tokenizer(tokenizer.to_bytes()) @@ -38,7 +38,7 @@ def test_serialize_tokenizer_roundtrip_bytes(en_tokenizer, text): def test_serialize_tokenizer_roundtrip_disk(en_tokenizer): tokenizer = en_tokenizer with make_tempdir() as d: - file_path = d / 'tokenizer' + file_path = d / "tokenizer" tokenizer.to_disk(file_path) tokenizer_d = en_tokenizer.from_disk(file_path) assert tokenizer.to_bytes() == tokenizer_d.to_bytes() diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 352620e92..fc51ea930 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -8,12 +8,12 @@ from spacy.strings import StringStore from ..util import make_tempdir -test_strings = [([], []), (['rats', 'are', 'cute'], ['i', 'like', 'rats'])] -test_strings_attrs = [(['rats', 'are', 'cute'], 'Hello')] +test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])] +test_strings_attrs = [(["rats", "are", "cute"], "Hello")] @pytest.mark.xfail -@pytest.mark.parametrize('text', ['rat']) +@pytest.mark.parametrize("text", ["rat"]) def 
test_serialize_vocab(en_vocab, text): text_hash = en_vocab.strings.add(text) vocab_bytes = en_vocab.to_bytes() @@ -21,7 +21,7 @@ def test_serialize_vocab(en_vocab, text): assert new_vocab.strings(text_hash) == text -@pytest.mark.parametrize('strings1,strings2', test_strings) +@pytest.mark.parametrize("strings1,strings2", test_strings) def test_serialize_vocab_roundtrip_bytes(strings1, strings2): vocab1 = Vocab(strings=strings1) vocab2 = Vocab(strings=strings2) @@ -39,13 +39,13 @@ def test_serialize_vocab_roundtrip_bytes(strings1, strings2): assert sorted([lex.text for lex in new_vocab1]) == sorted(strings1) -@pytest.mark.parametrize('strings1,strings2', test_strings) -def test_serialize_vocab_roundtrip_disk(strings1,strings2): +@pytest.mark.parametrize("strings1,strings2", test_strings) +def test_serialize_vocab_roundtrip_disk(strings1, strings2): vocab1 = Vocab(strings=strings1) vocab2 = Vocab(strings=strings2) with make_tempdir() as d: - file_path1 = d / 'vocab1' - file_path2 = d / 'vocab2' + file_path1 = d / "vocab1" + file_path2 = d / "vocab2" vocab1.to_disk(file_path1) vocab2.to_disk(file_path2) vocab1_d = Vocab().from_disk(file_path1) @@ -58,7 +58,7 @@ def test_serialize_vocab_roundtrip_disk(strings1,strings2): assert list(vocab1_d) != list(vocab2_d) -@pytest.mark.parametrize('strings,lex_attr', test_strings_attrs) +@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr): vocab1 = Vocab(strings=strings) vocab2 = Vocab() @@ -69,7 +69,7 @@ def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr): assert vocab2[strings[0]].norm_ == lex_attr -@pytest.mark.parametrize('strings,lex_attr', test_strings_attrs) +@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs) def test_serialize_vocab_lex_attrs_disk(strings, lex_attr): vocab1 = Vocab(strings=strings) vocab2 = Vocab() @@ -77,13 +77,13 @@ def test_serialize_vocab_lex_attrs_disk(strings, lex_attr): assert vocab1[strings[0]].norm_ == lex_attr assert vocab2[strings[0]].norm_ != lex_attr with make_tempdir() as d: - file_path = d / 'vocab' + file_path = d / "vocab" vocab1.to_disk(file_path) vocab2 = vocab2.from_disk(file_path) assert vocab2[strings[0]].norm_ == lex_attr -@pytest.mark.parametrize('strings1,strings2', test_strings) +@pytest.mark.parametrize("strings1,strings2", test_strings) def test_serialize_stringstore_roundtrip_bytes(strings1, strings2): sstore1 = StringStore(strings=strings1) sstore2 = StringStore(strings=strings2) @@ -100,13 +100,13 @@ def test_serialize_stringstore_roundtrip_bytes(strings1, strings2): assert list(new_sstore1) == strings1 -@pytest.mark.parametrize('strings1,strings2', test_strings) +@pytest.mark.parametrize("strings1,strings2", test_strings) def test_serialize_stringstore_roundtrip_disk(strings1, strings2): sstore1 = StringStore(strings=strings1) sstore2 = StringStore(strings=strings2) with make_tempdir() as d: - file_path1 = d / 'strings1' - file_path2 = d / 'strings2' + file_path1 = d / "strings1" + file_path2 = d / "strings2" sstore1.to_disk(file_path1) sstore2.to_disk(file_path2) sstore1_d = StringStore().from_disk(file_path1) diff --git a/spacy/tests/test_align.py b/spacy/tests/test_align.py index 2b5af0d2c..72cfa0638 100644 --- a/spacy/tests/test_align.py +++ b/spacy/tests/test_align.py @@ -5,52 +5,63 @@ import pytest from spacy._align import align, multi_align -@pytest.mark.parametrize('string1,string2,cost', [ - ('hello', 'hell', 1), - ('rat', 'cat', 1), - ('rat', 'rat', 0), - ('rat', 'catsie', 4), - ('t', 'catsie', 
5), -]) +@pytest.mark.parametrize( + "string1,string2,cost", + [ + ("hello", "hell", 1), + ("rat", "cat", 1), + ("rat", "rat", 0), + ("rat", "catsie", 4), + ("t", "catsie", 5), + ], +) def test_align_costs(string1, string2, cost): output_cost, i2j, j2i, matrix = align(string1, string2) assert output_cost == cost -@pytest.mark.parametrize('string1,string2,i2j', [ - ('hello', 'hell', [0,1,2,3,-1]), - ('rat', 'cat', [0,1,2]), - ('rat', 'rat', [0,1,2]), - ('rat', 'catsie', [0,1,2]), - ('t', 'catsie', [2]), -]) +@pytest.mark.parametrize( + "string1,string2,i2j", + [ + ("hello", "hell", [0, 1, 2, 3, -1]), + ("rat", "cat", [0, 1, 2]), + ("rat", "rat", [0, 1, 2]), + ("rat", "catsie", [0, 1, 2]), + ("t", "catsie", [2]), + ], +) def test_align_i2j(string1, string2, i2j): output_cost, output_i2j, j2i, matrix = align(string1, string2) assert list(output_i2j) == i2j -@pytest.mark.parametrize('string1,string2,j2i', [ - ('hello', 'hell', [0,1,2,3]), - ('rat', 'cat', [0,1,2]), - ('rat', 'rat', [0,1,2]), - ('rat', 'catsie', [0,1,2, -1, -1, -1]), - ('t', 'catsie', [-1, -1, 0, -1, -1, -1]), -]) +@pytest.mark.parametrize( + "string1,string2,j2i", + [ + ("hello", "hell", [0, 1, 2, 3]), + ("rat", "cat", [0, 1, 2]), + ("rat", "rat", [0, 1, 2]), + ("rat", "catsie", [0, 1, 2, -1, -1, -1]), + ("t", "catsie", [-1, -1, 0, -1, -1, -1]), + ], +) def test_align_i2j(string1, string2, j2i): output_cost, output_i2j, output_j2i, matrix = align(string1, string2) assert list(output_j2i) == j2i + def test_align_strings(): - words1 = ['hello', 'this', 'is', 'test!'] - words2 = ['hellothis', 'is', 'test', '!'] + words1 = ["hello", "this", "is", "test!"] + words2 = ["hellothis", "is", "test", "!"] cost, i2j, j2i, matrix = align(words1, words2) assert cost == 4 assert list(i2j) == [-1, -1, 1, -1] assert list(j2i) == [-1, 2, -1, -1] + def test_align_many_to_one(): - words1 = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'] - words2 = ['ab', 'bc', 'e', 'fg', 'h'] + words1 = ["a", "b", "c", "d", "e", "f", "g", "h"] + words2 = ["ab", "bc", "e", "fg", "h"] cost, i2j, j2i, matrix = align(words1, words2) assert list(i2j) == [-1, -1, -1, -1, 2, -1, -1, 4] lengths1 = [len(w) for w in words1] diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 7a9198642..d2fa7682e 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -8,75 +8,78 @@ from .util import get_doc def test_gold_biluo_U(en_vocab): - orths_and_spaces = [('I', True), ('flew', True), ('to', True), - ('London', False), ('.', True)] - doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces) - entities = [(len("I flew to "), len("I flew to London"), 'LOC')] + words = ["I", "flew", "to", "London", "."] + spaces = [True, True, True, False, True] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to London"), "LOC")] tags = biluo_tags_from_offsets(doc, entities) - assert tags == ['O', 'O', 'O', 'U-LOC', 'O'] + assert tags == ["O", "O", "O", "U-LOC", "O"] def test_gold_biluo_BL(en_vocab): - orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True), - ('Francisco', False), ('.', True)] - doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces) - entities = [(len("I flew to "), len("I flew to San Francisco"), 'LOC')] + words = ["I", "flew", "to", "San", "Francisco", "."] + spaces = [True, True, True, True, False, True] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")] tags = biluo_tags_from_offsets(doc, entities) - assert tags == 
['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O'] + assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"] def test_gold_biluo_BIL(en_vocab): - orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True), - ('Francisco', True), ('Valley', False), ('.', True)] - doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')] + words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + spaces = [True, True, True, True, True, False, True] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] tags = biluo_tags_from_offsets(doc, entities) - assert tags == ['O', 'O', 'O', 'B-LOC', 'I-LOC', 'L-LOC', 'O'] + assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] def test_gold_biluo_misalign(en_vocab): - orths_and_spaces = [('I', True), ('flew', True), ('to', True), ('San', True), - ('Francisco', True), ('Valley.', False)] - doc = Doc(en_vocab, orths_and_spaces=orths_and_spaces) - entities = [(len("I flew to "), len("I flew to San Francisco Valley"), 'LOC')] + words = ["I", "flew", "to", "San", "Francisco", "Valley."] + spaces = [True, True, True, True, True, False] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")] tags = biluo_tags_from_offsets(doc, entities) - assert tags == ['O', 'O', 'O', '-', '-', '-'] + assert tags == ["O", "O", "O", "-", "-", "-"] def test_roundtrip_offsets_biluo_conversion(en_tokenizer): text = "I flew to Silicon Valley via London." - biluo_tags = ['O', 'O', 'O', 'B-LOC', 'L-LOC', 'O', 'U-GPE', 'O'] - offsets = [(10, 24, 'LOC'), (29, 35, 'GPE')] + biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] + offsets = [(10, 24, "LOC"), (29, 35, "GPE")] doc = en_tokenizer(text) biluo_tags_converted = biluo_tags_from_offsets(doc, offsets) assert biluo_tags_converted == biluo_tags offsets_converted = offsets_from_biluo_tags(doc, biluo_tags) assert offsets_converted == offsets + def test_docs_to_json(en_vocab): - '''Test we can convert a list of Doc objects into the JSON-serializable + """Test we can convert a list of Doc objects into the JSON-serializable format we use for training. 
- ''' + """ docs = [ get_doc( en_vocab, - words=['a', 'b'], - pos=['VBP', 'NN'], + words=["a", "b"], + pos=["VBP", "NN"], heads=[0, -1], - deps=['ROOT', 'dobj'], - ents=[]), + deps=["ROOT", "dobj"], + ents=[], + ), get_doc( en_vocab, - words=['c', 'd', 'e'], - pos=['VBP', 'NN', 'NN'], + words=["c", "d", "e"], + pos=["VBP", "NN", "NN"], heads=[0, -1, -2], - deps=['ROOT', 'dobj', 'dobj'], - ents=[(1, 2, 'ORG')]), + deps=["ROOT", "dobj", "dobj"], + ents=[(1, 2, "ORG")], + ), ] json_doc = docs_to_json(0, docs) - assert json_doc['id'] == 0 - assert len(json_doc['paragraphs']) == 2 - assert len(json_doc['paragraphs'][0]['sentences']) == 1 - assert len(json_doc['paragraphs'][1]['sentences']) == 1 - assert len(json_doc['paragraphs'][0]['sentences'][0]['tokens']) == 2 - assert len(json_doc['paragraphs'][1]['sentences'][0]['tokens']) == 3 + assert json_doc["id"] == 0 + assert len(json_doc["paragraphs"]) == 2 + assert len(json_doc["paragraphs"][0]["sentences"]) == 1 + assert len(json_doc["paragraphs"][1]["sentences"]) == 1 + assert len(json_doc["paragraphs"][0]["sentences"][0]["tokens"]) == 2 + assert len(json_doc["paragraphs"][1]["sentences"][0]["tokens"]) == 3 diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index eb52e8a94..497525c39 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -11,19 +11,19 @@ from spacy._ml import PrecomputableAffine from .util import get_doc -@pytest.mark.parametrize('text', ['hello/world', 'hello world']) +@pytest.mark.parametrize("text", ["hello/world", "hello world"]) def test_util_ensure_path_succeeds(text): path = util.ensure_path(text) assert isinstance(path, Path) -@pytest.mark.parametrize('package', ['numpy']) +@pytest.mark.parametrize("package", ["numpy"]) def test_util_is_package(package): """Test that an installed package via pip is recognised by util.is_package.""" assert util.is_package(package) -@pytest.mark.parametrize('package', ['thinc']) +@pytest.mark.parametrize("package", ["thinc"]) def test_util_get_package_path(package): """Test that a Path object is returned for a package name.""" path = util.get_package_path(package) @@ -33,44 +33,47 @@ def test_util_get_package_path(package): def test_displacy_parse_ents(en_vocab): """Test that named entities on a Doc are converted into displaCy's format.""" doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) - doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings['ORG'])] + doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])] ents = displacy.parse_ents(doc) assert isinstance(ents, dict) - assert ents['text'] == 'But Google is starting from behind ' - assert ents['ents'] == [{'start': 4, 'end': 10, 'label': 'ORG'}] + assert ents["text"] == "But Google is starting from behind " + assert ents["ents"] == [{"start": 4, "end": 10, "label": "ORG"}] def test_displacy_parse_deps(en_vocab): """Test that deps and tags on a Doc are converted into displaCy's format.""" words = ["This", "is", "a", "sentence"] heads = [1, 0, 1, -2] - pos = ['DET', 'VERB', 'DET', 'NOUN'] - tags = ['DT', 'VBZ', 'DT', 'NN'] - deps = ['nsubj', 'ROOT', 'det', 'attr'] - doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, - deps=deps) + pos = ["DET", "VERB", "DET", "NOUN"] + tags = ["DT", "VBZ", "DT", "NN"] + deps = ["nsubj", "ROOT", "det", "attr"] + doc = get_doc(en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=deps) deps = displacy.parse_deps(doc) assert isinstance(deps, dict) - assert deps['words'] == [{'text': 'This', 'tag': 'DET'}, - {'text': 
'is', 'tag': 'VERB'}, - {'text': 'a', 'tag': 'DET'}, - {'text': 'sentence', 'tag': 'NOUN'}] - assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'}, - {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'}, - {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}] + assert deps["words"] == [ + {"text": "This", "tag": "DET"}, + {"text": "is", "tag": "VERB"}, + {"text": "a", "tag": "DET"}, + {"text": "sentence", "tag": "NOUN"}, + ] + assert deps["arcs"] == [ + {"start": 0, "end": 1, "label": "nsubj", "dir": "left"}, + {"start": 2, "end": 3, "label": "det", "dir": "left"}, + {"start": 1, "end": 3, "label": "attr", "dir": "right"}, + ] def test_displacy_spans(en_vocab): """Test that displaCy can render Spans.""" doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"]) - doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings['ORG'])] - html = displacy.render(doc[1:4], style='ent') - assert html.startswith('' - assert tokens[21].text == '....' + assert tokens[20].text == ":>" + assert tokens[21].text == "...." -@pytest.mark.parametrize('text,length', [("example:)", 3), ("108)", 2), ("XDN", 1)]) +@pytest.mark.parametrize("text,length", [("example:)", 3), ("108)", 2), ("XDN", 1)]) def test_tokenizer_excludes_false_pos_emoticons(tokenizer, text, length): tokens = tokenizer(text) assert len(tokens) == length -@pytest.mark.parametrize('text,length', [('can you still dunk?🍕🍔😵LOL', 8), - ('i💙you', 3), ('🤘🤘yay!', 4)]) + +@pytest.mark.parametrize( + "text,length", [("can you still dunk?🍕🍔😵LOL", 8), ("i💙you", 3), ("🤘🤘yay!", 4)] +) def test_tokenizer_handles_emoji(tokenizer, text, length): # These break on narrow unicode builds, e.g. Windows if sys.maxunicode >= 1114111: diff --git a/spacy/tests/tokenizer/test_naughty_strings.py b/spacy/tests/tokenizer/test_naughty_strings.py index 57e39e151..b383c8fc8 100644 --- a/spacy/tests/tokenizer/test_naughty_strings.py +++ b/spacy/tests/tokenizer/test_naughty_strings.py @@ -12,11 +12,9 @@ NAUGHTY_STRINGS = [ ",./;'[]\-=", '<>?:"{}|_+', '!@#$%^&*()`~"', - # Unicode additional control characters, byte order marks "­؀؁؂؃؄؅؜۝܏᠎​‌‍‎‏‪", "￾", - # Unicode Symbols "Ω≈ç√∫˜µ≤≥÷", "åß∂ƒ©˙∆˚¬…æ", @@ -29,13 +27,11 @@ NAUGHTY_STRINGS = [ "⅛⅜⅝⅞", "ЁЂЃЄЅІЇЈЉЊЋЌЍЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя", "٠١٢٣٤٥٦٧٨٩", - # Unicode Subscript/Superscript/Accents "⁰⁴⁵", "₀₁₂", "⁰⁴⁵₀₁₂", "ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็ ด้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็็้้้้้้้้็็็็็้้้้้็็็็", - # Two-Byte Characters "田中さんにあげて下さい", "パーティーへ行かないか", @@ -46,7 +42,6 @@ NAUGHTY_STRINGS = [ "社會科學院語學研究所", "울란바토르", "𠜎𠜱𠝹𠱓𠱸𠲖𠳏", - # Japanese Emoticons "ヽ༼ຈل͜ຈ༽ノ ヽ༼ຈل͜ຈ༽ノ", "(。◕ ∀ ◕。)", @@ -55,11 +50,9 @@ NAUGHTY_STRINGS = [ "・( ̄∀ ̄)・:*:", "゚・✿ヾ╲(。◕‿◕。)╱✿・゚", ",。・:*:・゜’( ☻ ω ☻ )。・:*:・゜’", - "(╯°□°)╯︵ ┻━┻)" - "(ノಥ益ಥ)ノ ┻━┻", + "(╯°□°)╯︵ ┻━┻)" "(ノಥ益ಥ)ノ ┻━┻", "┬─┬ノ( º _ ºノ)", "( ͡° ͜ʖ ͡°)", - # Emoji "😍", "👩🏽", @@ -69,18 +62,14 @@ NAUGHTY_STRINGS = [ "✋🏿 💪🏿 👐🏿 🙌🏿 👏🏿 🙏🏿", "🚾 🆒 🆓 🆕 🆖 🆗 🆙 🏧", "0️⃣ 1️⃣ 2️⃣ 3️⃣ 4️⃣ 5️⃣ 6️⃣ 7️⃣ 8️⃣ 9️⃣ 🔟", - # Regional Indicator Symbols "🇺🇸🇷🇺🇸 🇦🇫🇦🇲🇸", "🇺🇸🇷🇺🇸🇦🇫🇦🇲", "🇺🇸🇷🇺🇸🇦", - # Unicode Numbers "123", "١٢٣", - # Right-To-Left Strings - "ثم نفس سقطت وبالتحديد،, جزيرتي باستخدام أن دنو. إذ هنا؟ الستار وتنصيب كان. أهّل ايطاليا، بريطانيا-فرنسا قد أخذ. 
سليمان، إتفاقية بين ما, يذكر الحدود أي بعد, معاملة بولندا، الإطلاق عل إيو.", "إيو.", "בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ", @@ -88,34 +77,21 @@ NAUGHTY_STRINGS = [ "﷽", "ﷺ", "مُنَاقَشَةُ سُبُلِ اِسْتِخْدَامِ اللُّغَةِ فِي النُّظُمِ الْقَائِمَةِ وَفِيم يَخُصَّ التَّطْبِيقَاتُ الْحاسُوبِيَّةُ،", - # Trick Unicode "‪‪test‪", "‫test", "
test
", "test⁠test", "⁦test⁧", - # Zalgo Text "Ṱ̺̺̕o͞ ̷i̲̬͇̪͙n̝̗͕v̟̜̘̦͟o̶̙̰̠kè͚̮̺̪̹̱̤ ̖t̝͕̳̣̻̪͞h̼͓̲̦̳̘̲e͇̣̰̦̬͎ ̢̼̻̱̘h͚͎͙̜̣̲ͅi̦̲̣̰̤v̻͍e̺̭̳̪̰-m̢iͅn̖̺̞̲̯̰d̵̼̟͙̩̼̘̳ ̞̥̱̳̭r̛̗̘e͙p͠r̼̞̻̭̗e̺̠̣͟s̘͇̳͍̝͉e͉̥̯̞̲͚̬͜ǹ̬͎͎̟̖͇̤t͍̬̤͓̼̭͘ͅi̪̱n͠g̴͉ ͏͉ͅc̬̟h͡a̫̻̯͘o̫̟̖͍̙̝͉s̗̦̲.̨̹͈̣", - - "̡͓̞ͅI̗̘̦͝n͇͇͙v̮̫ok̲̫̙͈i̖͙̭̹̠̞n̡̻̮̣̺g̲͈͙̭͙̬͎ ̰t͔̦h̞̲e̢̤ ͍̬̲͖f̴̘͕̣è͖ẹ̥̩l͖͔͚i͓͚̦͠n͖͍̗͓̳̮g͍ ̨o͚̪͡f̘̣̬ ̖̘͖̟͙̮c҉͔̫͖͓͇͖ͅh̵̤̣͚͔á̗̼͕ͅo̼̣̥s̱͈̺̖̦̻͢.̛̖̞̠̫̰", - - "̗̺͖̹̯͓Ṯ̤͍̥͇͈h̲́e͏͓̼̗̙̼̣͔ ͇̜̱̠͓͍ͅN͕͠e̗̱z̘̝̜̺͙p̤̺̹͍̯͚e̠̻̠͜r̨̤͍̺̖͔̖̖d̠̟̭̬̝͟i̦͖̩͓͔̤a̠̗̬͉̙n͚͜ ̻̞̰͚ͅh̵͉i̳̞v̢͇ḙ͎͟-҉̭̩̼͔m̤̭̫i͕͇̝̦n̗͙ḍ̟ ̯̲͕͞ǫ̟̯̰̲͙̻̝f ̪̰̰̗̖̭̘͘c̦͍̲̞͍̩̙ḥ͚a̮͎̟̙͜ơ̩̹͎s̤.̝̝ ҉Z̡̖̜͖̰̣͉̜a͖̰͙̬͡l̲̫̳͍̩g̡̟̼̱͚̞̬ͅo̗͜.̟", - - "̦H̬̤̗̤͝e͜ ̜̥̝̻͍̟́w̕h̖̯͓o̝͙̖͎̱̮ ҉̺̙̞̟͈W̷̼̭a̺̪͍į͈͕̭͙̯̜t̶̼̮s̘͙͖̕ ̠̫̠B̻͍͙͉̳ͅe̵h̵̬͇̫͙i̹͓̳̳̮͎̫̕n͟d̴̪̜̖ ̰͉̩͇͙̲͞ͅT͖̼͓̪͢h͏͓̮̻e̬̝̟ͅ ̤̹̝W͙̞̝͔͇͝ͅa͏͓͔̹̼̣l̴͔̰̤̟͔ḽ̫.͕", - - "Z̮̞̠͙͔ͅḀ̗̞͈̻̗Ḷ͙͎̯̹̞͓G̻O̭̗̮", - - # Unicode Upsidedown "˙ɐnbᴉlɐ ɐuƃɐɯ ǝɹolop ʇǝ ǝɹoqɐl ʇn ʇunpᴉpᴉɔuᴉ ɹodɯǝʇ poɯsnᴉǝ op pǝs 'ʇᴉlǝ ƃuᴉɔsᴉdᴉpɐ ɹnʇǝʇɔǝsuoɔ 'ʇǝɯɐ ʇᴉs ɹolop ɯnsdᴉ ɯǝɹo˥", "00˙Ɩ$-", - # Unicode font "The quick brown fox jumps over the lazy dog", "𝐓𝐡𝐞 𝐪𝐮𝐢𝐜𝐤 𝐛𝐫𝐨𝐰𝐧 𝐟𝐨𝐱 𝐣𝐮𝐦𝐩𝐬 𝐨𝐯𝐞𝐫 𝐭𝐡𝐞 𝐥𝐚𝐳𝐲 𝐝𝐨𝐠", @@ -125,19 +101,17 @@ NAUGHTY_STRINGS = [ "𝕋𝕙𝕖 𝕢𝕦𝕚𝕔𝕜 𝕓𝕣𝕠𝕨𝕟 𝕗𝕠𝕩 𝕛𝕦𝕞𝕡𝕤 𝕠𝕧𝕖𝕣 𝕥𝕙𝕖 𝕝𝕒𝕫𝕪 𝕕𝕠𝕘", "𝚃𝚑𝚎 𝚚𝚞𝚒𝚌𝚔 𝚋𝚛𝚘𝚠𝚗 𝚏𝚘𝚡 𝚓𝚞𝚖𝚙𝚜 𝚘𝚟𝚎𝚛 𝚝𝚑𝚎 𝚕𝚊𝚣𝚢 𝚍𝚘𝚐", "⒯⒣⒠ ⒬⒰⒤⒞⒦ ⒝⒭⒪⒲⒩ ⒡⒪⒳ ⒥⒰⒨⒫⒮ ⒪⒱⒠⒭ ⒯⒣⒠ ⒧⒜⒵⒴ ⒟⒪⒢", - # File paths "../../../../../../../../../../../etc/passwd%00", "../../../../../../../../../../../etc/hosts", - # iOS Vulnerabilities "Powerلُلُصّبُلُلصّبُررً ॣ ॣh ॣ ॣ冗", - "🏳0🌈️" + "🏳0🌈️", ] @pytest.mark.slow -@pytest.mark.parametrize('text', NAUGHTY_STRINGS) +@pytest.mark.parametrize("text", NAUGHTY_STRINGS) def test_tokenizer_naughty_strings(tokenizer, text): tokens = tokenizer(text) assert tokens.text_with_ws == text diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 276ae7f04..e53b22186 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -12,7 +12,7 @@ def test_tokenizer_handles_no_word(tokenizer): assert len(tokens) == 0 -@pytest.mark.parametrize('text', ["lorem"]) +@pytest.mark.parametrize("text", ["lorem"]) def test_tokenizer_handles_single_word(tokenizer, text): tokens = tokenizer(text) assert tokens[0].text == text @@ -39,19 +39,24 @@ def test_tokenizer_handles_digits(tokenizer): assert tokens[3].text == "1984" -@pytest.mark.parametrize('text', ["google.com", "python.org", "spacy.io", "explosion.ai", "http://www.google.com"]) +@pytest.mark.parametrize( + "text", + ["google.com", "python.org", "spacy.io", "explosion.ai", "http://www.google.com"], +) def test_tokenizer_keep_urls(tokenizer, text): tokens = tokenizer(text) assert len(tokens) == 1 -@pytest.mark.parametrize('text', ["NASDAQ:GOOG"]) +@pytest.mark.parametrize("text", ["NASDAQ:GOOG"]) def test_tokenizer_colons(tokenizer, text): tokens = tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"]) +@pytest.mark.parametrize( + "text", ["hello123@example.com", "hi+there@gmail.it", "matt@explosion.ai"] +) def test_tokenizer_keeps_email(tokenizer, text): tokens = tokenizer(text) assert len(tokens) == 1 @@ -71,10 +76,10 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n assert len(tokens) > 5 -@pytest.mark.parametrize('file_name', ["sun.txt"]) +@pytest.mark.parametrize("file_name", ["sun.txt"]) def test_tokenizer_handle_text_from_file(tokenizer, file_name): loc = ensure_path(__file__).parent / file_name - text = loc.open('r', encoding='utf8').read() + text = 
loc.open("r", encoding="utf8").read() assert len(text) != 0 tokens = tokenizer(text) assert len(tokens) > 100 @@ -89,23 +94,23 @@ def test_tokenizer_suspected_freeing_strings(tokenizer): assert tokens2[0].text == "Lorem" -@pytest.mark.parametrize('text,tokens', [ - ("lorem", [{'orth': 'lo'}, {'orth': 'rem'}])]) +@pytest.mark.parametrize("text,tokens", [("lorem", [{"orth": "lo"}, {"orth": "rem"}])]) def test_tokenizer_add_special_case(tokenizer, text, tokens): tokenizer.add_special_case(text, tokens) doc = tokenizer(text) - assert doc[0].text == tokens[0]['orth'] - assert doc[1].text == tokens[1]['orth'] + assert doc[0].text == tokens[0]["orth"] + assert doc[1].text == tokens[1]["orth"] -@pytest.mark.parametrize('text,tokens', [ - ("lorem", [{'orth': 'lo', 'tag': 'NN'}, {'orth': 'rem'}])]) +@pytest.mark.parametrize( + "text,tokens", [("lorem", [{"orth": "lo", "tag": "NN"}, {"orth": "rem"}])] +) def test_tokenizer_add_special_case_tag(text, tokens): - vocab = Vocab(tag_map={'NN': {'pos': 'NOUN'}}) + vocab = Vocab(tag_map={"NN": {"pos": "NOUN"}}) tokenizer = Tokenizer(vocab, {}, None, None, None) tokenizer.add_special_case(text, tokens) doc = tokenizer(text) - assert doc[0].text == tokens[0]['orth'] - assert doc[0].tag_ == tokens[0]['tag'] - assert doc[0].pos_ == 'NOUN' - assert doc[1].text == tokens[1]['orth'] + assert doc[0].text == tokens[0]["orth"] + assert doc[0].tag_ == tokens[0]["tag"] + assert doc[0].pos_ == "NOUN" + assert doc[1].text == tokens[1]["orth"] diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 3bb6521f1..317e0bdbe 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -8,13 +8,12 @@ URLS_BASIC = [ "http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0", "www.red-stars.com", "mailto:foo.bar@baz.com", - ] URLS_FULL = URLS_BASIC + [ "mailto:foo-bar@baz-co.com", "www.google.com?q=google", - "http://foo.com/blah_(wikipedia)#cite-1" + "http://foo.com/blah_(wikipedia)#cite-1", ] # URL SHOULD_MATCH and SHOULD_NOT_MATCH patterns courtesy of https://mathiasbynens.be/demo/url-regex @@ -45,18 +44,19 @@ URLS_SHOULD_MATCH = [ "http://1337.net", "http://a.b-c.de", "http://223.255.255.254", - "http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014 - - pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"), - pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"), - pytest.mark.xfail("http://⌘.ws"), - pytest.mark.xfail("http://⌘.ws/"), - pytest.mark.xfail("http://☺.damowmow.com/"), - pytest.mark.xfail("http://✪df.ws/123"), - pytest.mark.xfail("http://➡.ws/䨹"), - pytest.mark.xfail("http://مثال.إختبار"), - pytest.mark.xfail("http://例子.测试"), - pytest.mark.xfail("http://उदाहरण.परीक्षा"), + "http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014 + pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()), + pytest.param( + "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail() + ), + pytest.param("http://⌘.ws", marks=pytest.mark.xfail()), + pytest.param("http://⌘.ws/", marks=pytest.mark.xfail()), + pytest.param("http://☺.damowmow.com/", marks=pytest.mark.xfail()), + pytest.param("http://✪df.ws/123", marks=pytest.mark.xfail()), + pytest.param("http://➡.ws/䨹", marks=pytest.mark.xfail()), + 
pytest.param("http://مثال.إختبار", marks=pytest.mark.xfail()), + pytest.param("http://例子.测试", marks=pytest.mark.xfail()), + pytest.param("http://उदाहरण.परीक्षा", marks=pytest.mark.xfail()), ] URLS_SHOULD_NOT_MATCH = [ @@ -94,23 +94,20 @@ URLS_SHOULD_NOT_MATCH = [ "http://.www.foo.bar./", "http://10.1.1.1", "NASDAQ:GOOG", - - pytest.mark.xfail("foo.com"), - pytest.mark.xfail("http://1.1.1.1.1"), - pytest.mark.xfail("http://www.foo.bar./"), - pytest.mark.xfail("http://-a.b.co"), + pytest.param("foo.com", marks=pytest.mark.xfail()), + pytest.param("http://1.1.1.1.1", marks=pytest.mark.xfail()), + pytest.param("http://www.foo.bar./", marks=pytest.mark.xfail()), + pytest.param("http://-a.b.co", marks=pytest.mark.xfail()), ] # Punctuation we want to check is split away before the URL -PREFIXES = [ - "(", '"', ">" -] +PREFIXES = ["(", '"', ">"] # Punctuation we want to check is split away after the URL -SUFFIXES = [ - '"', ":", ">"] +SUFFIXES = ['"', ":", ">"] + @pytest.mark.parametrize("url", URLS_SHOULD_MATCH) def test_should_match(en_tokenizer, url): @@ -118,12 +115,14 @@ def test_should_match(en_tokenizer, url): if token_match: assert token_match(url) + @pytest.mark.parametrize("url", URLS_SHOULD_NOT_MATCH) def test_should_not_match(en_tokenizer, url): token_match = en_tokenizer.token_match if token_match: assert not token_match(url) + @pytest.mark.parametrize("url", URLS_BASIC) def test_tokenizer_handles_simple_url(tokenizer, url): tokens = tokenizer(url) diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py index 7c53584cf..74c9b369b 100644 --- a/spacy/tests/tokenizer/test_whitespace.py +++ b/spacy/tests/tokenizer/test_whitespace.py @@ -4,45 +4,45 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text', ["lorem ipsum"]) +@pytest.mark.parametrize("text", ["lorem ipsum"]) def test_tokenizer_splits_single_space(tokenizer, text): tokens = tokenizer(text) assert len(tokens) == 2 -@pytest.mark.parametrize('text', ["lorem ipsum"]) +@pytest.mark.parametrize("text", ["lorem ipsum"]) def test_tokenizer_splits_double_space(tokenizer, text): tokens = tokenizer(text) assert len(tokens) == 3 assert tokens[1].text == " " -@pytest.mark.parametrize('text', ["lorem ipsum "]) +@pytest.mark.parametrize("text", ["lorem ipsum "]) def test_tokenizer_handles_double_trainling_ws(tokenizer, text): tokens = tokenizer(text) assert repr(tokens.text_with_ws) == repr(text) -@pytest.mark.parametrize('text', ["lorem\nipsum"]) +@pytest.mark.parametrize("text", ["lorem\nipsum"]) def test_tokenizer_splits_newline(tokenizer, text): tokens = tokenizer(text) assert len(tokens) == 3 assert tokens[1].text == "\n" -@pytest.mark.parametrize('text', ["lorem \nipsum"]) +@pytest.mark.parametrize("text", ["lorem \nipsum"]) def test_tokenizer_splits_newline_space(tokenizer, text): tokens = tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["lorem \nipsum"]) +@pytest.mark.parametrize("text", ["lorem \nipsum"]) def test_tokenizer_splits_newline_double_space(tokenizer, text): tokens = tokenizer(text) assert len(tokens) == 3 -@pytest.mark.parametrize('text', ["lorem \n ipsum"]) +@pytest.mark.parametrize("text", ["lorem \n ipsum"]) def test_tokenizer_splits_newline_space_wrap(tokenizer, text): tokens = tokenizer(text) assert len(tokens) == 3 diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 0d97c7907..80fbb5b1c 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -13,7 +13,7 @@ from spacy.compat import path2str 
@contextlib.contextmanager -def make_tempfile(mode='r'): +def make_tempfile(mode="r"): f = tempfile.TemporaryFile(mode=mode) yield f f.close() @@ -28,11 +28,11 @@ def make_tempdir(): def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None): """Create Doc object from given vocab, words and annotations.""" - pos = pos or [''] * len(words) - tags = tags or [''] * len(words) + pos = pos or [""] * len(words) + tags = tags or [""] * len(words) heads = heads or [0] * len(words) - deps = deps or [''] * len(words) - for value in (deps+tags+pos): + deps = deps or [""] * len(words) + for value in deps + tags + pos: vocab.strings.add(value) doc = Doc(vocab, words=words) @@ -43,8 +43,10 @@ def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=No attrs[i, 2] = doc.vocab.strings[dep] doc.from_array([POS, HEAD, DEP], attrs) if ents: - doc.ents = [Span(doc, start, end, label=doc.vocab.strings[label]) - for start, end, label in ents] + doc.ents = [ + Span(doc, start, end, label=doc.vocab.strings[label]) + for start, end, label in ents + ] if tags: for token in doc: token.tag_ = tags[token.i] @@ -55,8 +57,8 @@ def apply_transition_sequence(parser, doc, sequence): """Perform a series of pre-specified transitions, to put the parser in a desired state.""" for action_name in sequence: - if '-' in action_name: - move, label = action_name.split('-') + if "-" in action_name: + move, label = action_name.split("-") parser.add_label(label) with parser.step_through(doc) as stepwise: for transition in sequence: @@ -81,25 +83,25 @@ def get_cosine(vec1, vec2): def assert_docs_equal(doc1, doc2): """Compare two Doc objects and assert that they're equal. Tests for tokens, tags, dependencies and entities.""" - assert [ t.orth for t in doc1 ] == [ t.orth for t in doc2 ] + assert [t.orth for t in doc1] == [t.orth for t in doc2] - assert [ t.pos for t in doc1 ] == [ t.pos for t in doc2 ] - assert [ t.tag for t in doc1 ] == [ t.tag for t in doc2 ] + assert [t.pos for t in doc1] == [t.pos for t in doc2] + assert [t.tag for t in doc1] == [t.tag for t in doc2] - assert [ t.head.i for t in doc1 ] == [ t.head.i for t in doc2 ] - assert [ t.dep for t in doc1 ] == [ t.dep for t in doc2 ] + assert [t.head.i for t in doc1] == [t.head.i for t in doc2] + assert [t.dep for t in doc1] == [t.dep for t in doc2] if doc1.is_parsed and doc2.is_parsed: - assert [ s for s in doc1.sents ] == [ s for s in doc2.sents ] + assert [s for s in doc1.sents] == [s for s in doc2.sents] - assert [ t.ent_type for t in doc1 ] == [ t.ent_type for t in doc2 ] - assert [ t.ent_iob for t in doc1 ] == [ t.ent_iob for t in doc2 ] - assert [ ent for ent in doc1.ents ] == [ ent for ent in doc2.ents ] + assert [t.ent_type for t in doc1] == [t.ent_type for t in doc2] + assert [t.ent_iob for t in doc1] == [t.ent_iob for t in doc2] + assert [ent for ent in doc1.ents] == [ent for ent in doc2.ents] def assert_packed_msg_equal(b1, b2): """Assert that two packed msgpack messages are equal.""" - msg1 = msgpack.loads(b1, encoding='utf8') - msg2 = msgpack.loads(b2, encoding='utf8') + msg1 = msgpack.loads(b1, encoding="utf8") + msg2 = msgpack.loads(b2, encoding="utf8") assert sorted(msg1.keys()) == sorted(msg2.keys()) for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())): assert k1 == k2 diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index bc16267f1..fa32020e5 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ 
-5,7 +5,7 @@ import pytest from spacy.attrs import IS_ALPHA, IS_DIGIT -@pytest.mark.parametrize('text1,prob1,text2,prob2', [("NOUN", -1, "opera", -2)]) +@pytest.mark.parametrize("text1,prob1,text2,prob2", [("NOUN", -1, "opera", -2)]) def test_vocab_lexeme_lt(en_vocab, text1, text2, prob1, prob2): """More frequent is l.t. less frequent""" lex1 = en_vocab[text1] @@ -17,7 +17,7 @@ def test_vocab_lexeme_lt(en_vocab, text1, text2, prob1, prob2): assert lex2 > lex1 -@pytest.mark.parametrize('text1,text2', [("phantom", "opera")]) +@pytest.mark.parametrize("text1,text2", [("phantom", "opera")]) def test_vocab_lexeme_hash(en_vocab, text1, text2): """Test that lexemes are hashable.""" lex1 = en_vocab[text1] @@ -28,39 +28,39 @@ def test_vocab_lexeme_hash(en_vocab, text1, text2): def test_vocab_lexeme_is_alpha(en_vocab): - assert en_vocab['the'].flags & (1 << IS_ALPHA) - assert not en_vocab['1999'].flags & (1 << IS_ALPHA) - assert not en_vocab['hello1'].flags & (1 << IS_ALPHA) + assert en_vocab["the"].flags & (1 << IS_ALPHA) + assert not en_vocab["1999"].flags & (1 << IS_ALPHA) + assert not en_vocab["hello1"].flags & (1 << IS_ALPHA) def test_vocab_lexeme_is_digit(en_vocab): - assert not en_vocab['the'].flags & (1 << IS_DIGIT) - assert en_vocab['1999'].flags & (1 << IS_DIGIT) - assert not en_vocab['hello1'].flags & (1 << IS_DIGIT) + assert not en_vocab["the"].flags & (1 << IS_DIGIT) + assert en_vocab["1999"].flags & (1 << IS_DIGIT) + assert not en_vocab["hello1"].flags & (1 << IS_DIGIT) def test_vocab_lexeme_add_flag_auto_id(en_vocab): is_len4 = en_vocab.add_flag(lambda string: len(string) == 4) - assert en_vocab['1999'].check_flag(is_len4) == True - assert en_vocab['1999'].check_flag(IS_DIGIT) == True - assert en_vocab['199'].check_flag(is_len4) == False - assert en_vocab['199'].check_flag(IS_DIGIT) == True - assert en_vocab['the'].check_flag(is_len4) == False - assert en_vocab['dogs'].check_flag(is_len4) == True + assert en_vocab["1999"].check_flag(is_len4) == True + assert en_vocab["1999"].check_flag(IS_DIGIT) == True + assert en_vocab["199"].check_flag(is_len4) == False + assert en_vocab["199"].check_flag(IS_DIGIT) == True + assert en_vocab["the"].check_flag(is_len4) == False + assert en_vocab["dogs"].check_flag(is_len4) == True def test_vocab_lexeme_add_flag_provided_id(en_vocab): is_len4 = en_vocab.add_flag(lambda string: len(string) == 4, flag_id=IS_DIGIT) - assert en_vocab['1999'].check_flag(is_len4) == True - assert en_vocab['199'].check_flag(is_len4) == False - assert en_vocab['199'].check_flag(IS_DIGIT) == False - assert en_vocab['the'].check_flag(is_len4) == False - assert en_vocab['dogs'].check_flag(is_len4) == True + assert en_vocab["1999"].check_flag(is_len4) == True + assert en_vocab["199"].check_flag(is_len4) == False + assert en_vocab["199"].check_flag(IS_DIGIT) == False + assert en_vocab["the"].check_flag(is_len4) == False + assert en_vocab["dogs"].check_flag(is_len4) == True def test_lexeme_bytes_roundtrip(en_vocab): - one = en_vocab['one'] - alpha = en_vocab['alpha'] + one = en_vocab["one"] + alpha = en_vocab["alpha"] assert one.orth != alpha.orth assert one.lower != alpha.lower alpha.from_bytes(one.to_bytes()) diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index 5e73041b5..1567e8b57 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -18,6 +18,7 @@ def vocab(en_vocab, vectors): add_vecs_to_vocab(en_vocab, vectors) return en_vocab + def 
test_vectors_similarity_LL(vocab, vectors): [(word1, vec1), (word2, vec2)] = vectors lex1 = vocab[word1] diff --git a/spacy/tests/vocab_vectors/test_stringstore.py b/spacy/tests/vocab_vectors/test_stringstore.py index f70498344..f74f6c5f5 100644 --- a/spacy/tests/vocab_vectors/test_stringstore.py +++ b/spacy/tests/vocab_vectors/test_stringstore.py @@ -12,29 +12,29 @@ def stringstore(): def test_string_hash(stringstore): """Test that string hashing is stable across platforms""" - assert stringstore.add('apple') == 8566208034543834098 - heart = '\U0001f499' + assert stringstore.add("apple") == 8566208034543834098 + heart = "\U0001f499" h = stringstore.add(heart) assert h == 11841826740069053588 def test_stringstore_from_api_docs(stringstore): - apple_hash = stringstore.add('apple') + apple_hash = stringstore.add("apple") assert apple_hash == 8566208034543834098 - assert stringstore[apple_hash] == 'apple' - assert 'apple' in stringstore - assert 'cherry' not in stringstore - orange_hash = stringstore.add('orange') + assert stringstore[apple_hash] == "apple" + assert "apple" in stringstore + assert "cherry" not in stringstore + orange_hash = stringstore.add("orange") all_strings = [s for s in stringstore] - assert all_strings == ['apple', 'orange'] - banana_hash = stringstore.add('banana') + assert all_strings == ["apple", "orange"] + banana_hash = stringstore.add("banana") assert len(stringstore) == 3 assert banana_hash == 2525716904149915114 - assert stringstore[banana_hash] == 'banana' - assert stringstore['banana'] == banana_hash + assert stringstore[banana_hash] == "banana" + assert stringstore["banana"] == banana_hash -@pytest.mark.parametrize('text1,text2,text3', [(b'Hello', b'goodbye', b'hello')]) +@pytest.mark.parametrize("text1,text2,text3", [(b"Hello", b"goodbye", b"hello")]) def test_stringstore_save_bytes(stringstore, text1, text2, text3): key = stringstore.add(text1) assert stringstore[text1] == key @@ -42,7 +42,7 @@ def test_stringstore_save_bytes(stringstore, text1, text2, text3): assert stringstore[text3] != key -@pytest.mark.parametrize('text1,text2,text3', [('Hello', 'goodbye', 'hello')]) +@pytest.mark.parametrize("text1,text2,text3", [("Hello", "goodbye", "hello")]) def test_stringstore_save_unicode(stringstore, text1, text2, text3): key = stringstore.add(text1) assert stringstore[text1] == key @@ -50,19 +50,19 @@ def test_stringstore_save_unicode(stringstore, text1, text2, text3): assert stringstore[text3] != key -@pytest.mark.parametrize('text', [b'A']) +@pytest.mark.parametrize("text", [b"A"]) def test_stringstore_retrieve_id(stringstore, text): key = stringstore.add(text) assert len(stringstore) == 1 - assert stringstore[key] == text.decode('utf8') + assert stringstore[key] == text.decode("utf8") with pytest.raises(KeyError): stringstore[20000] -@pytest.mark.parametrize('text1,text2', [(b'0123456789', b'A')]) +@pytest.mark.parametrize("text1,text2", [(b"0123456789", b"A")]) def test_stringstore_med_string(stringstore, text1, text2): store = stringstore.add(text1) - assert stringstore[store] == text1.decode('utf8') + assert stringstore[store] == text1.decode("utf8") dummy = stringstore.add(text2) assert stringstore[text1] == store @@ -73,26 +73,26 @@ def test_stringstore_long_string(stringstore): assert stringstore[store] == text -@pytest.mark.parametrize('factor', [254, 255, 256]) +@pytest.mark.parametrize("factor", [254, 255, 256]) def test_stringstore_multiply(stringstore, factor): - text = 'a' * factor + text = "a" * factor store = stringstore.add(text) assert 
stringstore[store] == text def test_stringstore_massive_strings(stringstore): - text = 'a' * 511 + text = "a" * 511 store = stringstore.add(text) assert stringstore[store] == text - text2 = 'z' * 512 + text2 = "z" * 512 store = stringstore.add(text2) assert stringstore[store] == text2 - text3 = '1' * 513 + text3 = "1" * 513 store = stringstore.add(text3) assert stringstore[store] == text3 -@pytest.mark.parametrize('text', ["qqqqq"]) +@pytest.mark.parametrize("text", ["qqqqq"]) def test_stringstore_to_bytes(stringstore, text): store = stringstore.add(text) serialized = stringstore.to_bytes() @@ -101,7 +101,7 @@ def test_stringstore_to_bytes(stringstore, text): @pytest.mark.xfail -@pytest.mark.parametrize('text', [["a", "b", "c"]]) +@pytest.mark.parametrize("text", [["a", "b", "c"]]) def test_stringstore_freeze_oov(stringstore, text): """Test the possibly temporary workaround of flushing the stringstore of OOV words.""" diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index 0b95da59c..cd72cef8e 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -24,9 +24,10 @@ def vectors(): return [ ("apple", [1, 2, 3]), ("orange", [-1, -2, -3]), - ('and', [-1, -1, -1]), - ('juice', [5, 5, 10]), - ('pie', [7, 6.3, 8.9])] + ("and", [-1, -1, -1]), + ("juice", [5, 5, 10]), + ("pie", [7, 6.3, 8.9]), + ] @pytest.fixture @@ -34,8 +35,8 @@ def ngrams_vectors(): return [ ("apple", [1, 2, 3]), ("app", [-0.1, -0.2, -0.3]), - ('ppl', [-0.2, -0.3, -0.4]), - ('pl', [0.7, 0.8, 0.9]) + ("ppl", [-0.2, -0.3, -0.4]), + ("pl", [0.7, 0.8, 0.9]), ] @@ -47,12 +48,12 @@ def ngrams_vocab(en_vocab, ngrams_vectors): @pytest.fixture def data(): - return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype='f') + return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype="f") @pytest.fixture def resize_data(): - return numpy.asarray([[0.0, 1.0], [2.0, 3.0]], dtype='f') + return numpy.asarray([[0.0, 1.0], [2.0, 3.0]], dtype="f") @pytest.fixture() @@ -66,21 +67,21 @@ def tokenizer_v(vocab): return Tokenizer(vocab, {}, None, None, None) -def test_init_vectors_with_resize_shape(strings,resize_data): +def test_init_vectors_with_resize_shape(strings, resize_data): v = Vectors(shape=(len(strings), 3)) v.resize(shape=resize_data.shape) assert v.shape == resize_data.shape assert v.shape != (len(strings), 3) -def test_init_vectors_with_resize_data(data,resize_data): +def test_init_vectors_with_resize_data(data, resize_data): v = Vectors(data=data) v.resize(shape=resize_data.shape) assert v.shape == resize_data.shape assert v.shape != data.shape -def test_get_vector_resize(strings, data,resize_data): +def test_get_vector_resize(strings, data, resize_data): v = Vectors(data=data) v.resize(shape=resize_data.shape) strings = [hash_string(s) for s in strings] @@ -126,152 +127,165 @@ def test_set_vector(strings, data): assert list(v[strings[0]]) != list(orig[0]) -@pytest.mark.parametrize('text', ["apple and orange"]) +@pytest.mark.parametrize("text", ["apple and orange"]) def test_vectors_token_vector(tokenizer_v, vectors, text): doc = tokenizer_v(text) assert vectors[0] == (doc[0].text, list(doc[0].vector)) assert vectors[1] == (doc[2].text, list(doc[2].vector)) -@pytest.mark.parametrize('text', ["apple"]) -def test_vectors__ngrams_word(ngrams_vocab, text): - assert list(ngrams_vocab.get_vector(text)) == list(ngrams_vectors()[0][1]) +@pytest.mark.parametrize("text", ["apple"]) +def test_vectors__ngrams_word(ngrams_vocab, ngrams_vectors, 
-@pytest.mark.parametrize('text', ["apple"])
-def test_vectors__ngrams_word(ngrams_vocab, text):
-    assert list(ngrams_vocab.get_vector(text)) == list(ngrams_vectors()[0][1])
+@pytest.mark.parametrize("text", ["apple"])
+def test_vectors__ngrams_word(ngrams_vocab, ngrams_vectors, text):
+    assert list(ngrams_vocab.get_vector(text)) == list(ngrams_vectors[0][1])

-@pytest.mark.parametrize('text', ["applpie"])
-def test_vectors__ngrams_subword(ngrams_vocab, text):
-    truth = list(ngrams_vocab.get_vector(text,1,6))
-    test = list([(ngrams_vectors()[1][1][i] + ngrams_vectors()[2][1][i] + ngrams_vectors()[3][1][i])/3 for i in range(len(ngrams_vectors()[1][1]))])
+
+@pytest.mark.parametrize("text", ["applpie"])
+def test_vectors__ngrams_subword(ngrams_vocab, ngrams_vectors, text):
+    truth = list(ngrams_vocab.get_vector(text, 1, 6))
+    test = list(
+        [
+            (
+                ngrams_vectors[1][1][i]
+                + ngrams_vectors[2][1][i]
+                + ngrams_vectors[3][1][i]
+            )
+            / 3
+            for i in range(len(ngrams_vectors[1][1]))
+        ]
+    )
     eps = [abs(truth[i] - test[i]) for i in range(len(truth))]
     for i in eps:
-        assert i<1e-6
+        assert i < 1e-6

-@pytest.mark.parametrize('text', ["apple", "orange"])
+
+@pytest.mark.parametrize("text", ["apple", "orange"])
 def test_vectors_lexeme_vector(vocab, text):
     lex = vocab[text]
     assert list(lex.vector)
     assert lex.vector_norm

-@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
+@pytest.mark.parametrize("text", [["apple", "and", "orange"]])
 def test_vectors_doc_vector(vocab, text):
     doc = Doc(vocab, words=text)
     assert list(doc.vector)
     assert doc.vector_norm

-@pytest.mark.parametrize('text', [["apple", "and", "orange"]])
+@pytest.mark.parametrize("text", [["apple", "and", "orange"]])
 def test_vectors_span_vector(vocab, text):
     span = Doc(vocab, words=text)[0:2]
     assert list(span.vector)
     assert span.vector_norm

-@pytest.mark.parametrize('text', ["apple orange"])
+@pytest.mark.parametrize("text", ["apple orange"])
 def test_vectors_token_token_similarity(tokenizer_v, text):
     doc = tokenizer_v(text)
     assert doc[0].similarity(doc[1]) == doc[1].similarity(doc[0])
-    assert -1. < doc[0].similarity(doc[1]) < 1.0
+    assert -1.0 < doc[0].similarity(doc[1]) < 1.0

-@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
+@pytest.mark.parametrize("text1,text2", [("apple", "orange")])
 def test_vectors_token_lexeme_similarity(tokenizer_v, vocab, text1, text2):
     token = tokenizer_v(text1)
     lex = vocab[text2]
     assert token.similarity(lex) == lex.similarity(token)
-    assert -1. < token.similarity(lex) < 1.0
+    assert -1.0 < token.similarity(lex) < 1.0

-@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+@pytest.mark.parametrize("text", [["apple", "orange", "juice"]])
 def test_vectors_token_span_similarity(vocab, text):
     doc = Doc(vocab, words=text)
     assert doc[0].similarity(doc[1:3]) == doc[1:3].similarity(doc[0])
-    assert -1. < doc[0].similarity(doc[1:3]) < 1.0
+    assert -1.0 < doc[0].similarity(doc[1:3]) < 1.0

-@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+@pytest.mark.parametrize("text", [["apple", "orange", "juice"]])
 def test_vectors_token_doc_similarity(vocab, text):
     doc = Doc(vocab, words=text)
     assert doc[0].similarity(doc) == doc.similarity(doc[0])
-    assert -1. < doc[0].similarity(doc) < 1.0
+    assert -1.0 < doc[0].similarity(doc) < 1.0
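The symmetry assertions in these similarity tests hold because, as far as the tests are concerned, .similarity() behaves like a cosine between (averaged) vectors, and cosine is symmetric in its arguments. A quick numpy-only illustration using the "apple" and "juice" rows from the vectors fixture:

import numpy

def cosine(u, v):
    # dot(u, v) and norm(u) * norm(v) are both symmetric, so the ratio is too
    return numpy.dot(u, v) / (numpy.linalg.norm(u) * numpy.linalg.norm(v))

apple = numpy.asarray([1.0, 2.0, 3.0])
juice = numpy.asarray([5.0, 5.0, 10.0])
assert cosine(apple, juice) == cosine(juice, apple)
assert -1.0 <= cosine(apple, juice) <= 1.0  # bounded, as the tests expect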
-@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+@pytest.mark.parametrize("text", [["apple", "orange", "juice"]])
 def test_vectors_lexeme_span_similarity(vocab, text):
     doc = Doc(vocab, words=text)
     lex = vocab[text[0]]
     assert lex.similarity(doc[1:3]) == doc[1:3].similarity(lex)
-    assert -1. < doc.similarity(doc[1:3]) < 1.0
+    assert -1.0 < doc.similarity(doc[1:3]) < 1.0

-@pytest.mark.parametrize('text1,text2', [("apple", "orange")])
+@pytest.mark.parametrize("text1,text2", [("apple", "orange")])
 def test_vectors_lexeme_lexeme_similarity(vocab, text1, text2):
     lex1 = vocab[text1]
     lex2 = vocab[text2]
     assert lex1.similarity(lex2) == lex2.similarity(lex1)
-    assert -1. < lex1.similarity(lex2) < 1.0
+    assert -1.0 < lex1.similarity(lex2) < 1.0

-@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+@pytest.mark.parametrize("text", [["apple", "orange", "juice"]])
 def test_vectors_lexeme_doc_similarity(vocab, text):
     doc = Doc(vocab, words=text)
     lex = vocab[text[0]]
     assert lex.similarity(doc) == doc.similarity(lex)
-    assert -1. < lex.similarity(doc) < 1.0
+    assert -1.0 < lex.similarity(doc) < 1.0

-@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+@pytest.mark.parametrize("text", [["apple", "orange", "juice"]])
 def test_vectors_span_span_similarity(vocab, text):
     doc = Doc(vocab, words=text)
     with pytest.warns(None):
         assert doc[0:2].similarity(doc[1:3]) == doc[1:3].similarity(doc[0:2])
-    assert -1. < doc[0:2].similarity(doc[1:3]) < 1.0
+    assert -1.0 < doc[0:2].similarity(doc[1:3]) < 1.0

-@pytest.mark.parametrize('text', [["apple", "orange", "juice"]])
+@pytest.mark.parametrize("text", [["apple", "orange", "juice"]])
 def test_vectors_span_doc_similarity(vocab, text):
     doc = Doc(vocab, words=text)
     with pytest.warns(None):
         assert doc[0:2].similarity(doc) == doc.similarity(doc[0:2])
-    assert -1. < doc[0:2].similarity(doc) < 1.0
+    assert -1.0 < doc[0:2].similarity(doc) < 1.0

-@pytest.mark.parametrize('text1,text2', [
-    (["apple", "and", "apple", "pie"], ["orange", "juice"])])
+@pytest.mark.parametrize(
+    "text1,text2", [(["apple", "and", "apple", "pie"], ["orange", "juice"])]
+)
 def test_vectors_doc_doc_similarity(vocab, text1, text2):
     doc1 = Doc(vocab, words=text1)
     doc2 = Doc(vocab, words=text2)
     assert doc1.similarity(doc2) == doc2.similarity(doc1)
-    assert -1. < doc1.similarity(doc2) < 1.0
+    assert -1.0 < doc1.similarity(doc2) < 1.0

 def test_vocab_add_vector():
     vocab = Vocab()
-    data = numpy.ndarray((5,3), dtype='f')
-    data[0] = 1.
-    data[1] = 2.
-    vocab.set_vector('cat', data[0])
-    vocab.set_vector('dog', data[1])
-    cat = vocab['cat']
-    assert list(cat.vector) == [1., 1., 1.]
-    dog = vocab['dog']
-    assert list(dog.vector) == [2., 2., 2.]
+    data = numpy.ndarray((5, 3), dtype="f")
+    data[0] = 1.0
+    data[1] = 2.0
+    vocab.set_vector("cat", data[0])
+    vocab.set_vector("dog", data[1])
+    cat = vocab["cat"]
+    assert list(cat.vector) == [1.0, 1.0, 1.0]
+    dog = vocab["dog"]
+    assert list(dog.vector) == [2.0, 2.0, 2.0]
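The next test relies on the contract of Vocab.prune_vectors(n): keep only n vector rows, remap every pruned entry to its closest surviving neighbour, and return that mapping as {orth: (neighbour, similarity)}. A rough sketch of how the returned mapping might be consumed, assuming the same toy setup as the test (which rows survive depends on lexeme frequency ranking, so treat this as illustrative, not as the library's exact behaviour):

import numpy
from spacy.vocab import Vocab

vocab = Vocab()
data = numpy.zeros((3, 3), dtype="f")
data[0], data[1], data[2] = 1.0, 2.0, 1.1  # rows for "cat", "dog", "kitten"; "kitten" is parallel to "cat"
for word, row in zip(["cat", "dog", "kitten"], data):
    vocab.set_vector(word, row)

remap = vocab.prune_vectors(2)
for pruned, (neighbour, score) in remap.items():
    # e.g. "kitten" -> ("cat", ~1.0): pruned words resolve to their neighbour's row
    print(pruned, "now shares a row with", neighbour, "at cosine", score)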
 def test_vocab_prune_vectors():
     vocab = Vocab()
-    _ = vocab['cat']
-    _ = vocab['dog']
-    _ = vocab['kitten']
-    data = numpy.ndarray((5,3), dtype='f')
-    data[0] = 1.
-    data[1] = 2.
+    _ = vocab["cat"]
+    _ = vocab["dog"]
+    _ = vocab["kitten"]
+    data = numpy.ndarray((5, 3), dtype="f")
+    data[0] = 1.0
+    data[1] = 2.0
     data[2] = 1.1
-    vocab.set_vector('cat', data[0])
-    vocab.set_vector('dog', data[1])
-    vocab.set_vector('kitten', data[2])
+    vocab.set_vector("cat", data[0])
+    vocab.set_vector("dog", data[1])
+    vocab.set_vector("kitten", data[2])
     remap = vocab.prune_vectors(2)
-    assert list(remap.keys()) == ['kitten']
+    assert list(remap.keys()) == ["kitten"]
     neighbour, similarity = list(remap.values())[0]
-    assert neighbour == 'cat', remap
+    assert neighbour == "cat", remap
     assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py
index dc504c2f6..61b315350 100644
--- a/spacy/tests/vocab_vectors/test_vocab_api.py
+++ b/spacy/tests/vocab_vectors/test_vocab_api.py
@@ -6,32 +6,41 @@ from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA
 from spacy.parts_of_speech import NOUN, VERB

-@pytest.mark.parametrize('text1,text2', [
-    ("Hello", "bye"), ("Hello", "hello"), ("Hello", "Hello,")])
+@pytest.mark.parametrize(
+    "text1,text2", [("Hello", "bye"), ("Hello", "hello"), ("Hello", "Hello,")]
+)
 def test_vocab_api_neq(en_vocab, text1, text2):
     assert en_vocab[text1].orth != en_vocab[text2].orth

-@pytest.mark.parametrize('text', "Hello")
+@pytest.mark.parametrize("text", "Hello")
 def test_vocab_api_eq(en_vocab, text):
     lex = en_vocab[text]
     assert en_vocab[text].orth == lex.orth

-@pytest.mark.parametrize('text', ["example"])
+@pytest.mark.parametrize("text", ["example"])
 def test_vocab_api_shape_attr(en_vocab, text):
     lex = en_vocab[text]
     assert lex.orth != lex.shape

-@pytest.mark.parametrize('string,symbol', [
-    ('IS_ALPHA', IS_ALPHA), ('NOUN', NOUN), ('VERB', VERB), ('LEMMA', LEMMA),
-    ('ORTH', ORTH), ('PROB', PROB)])
+@pytest.mark.parametrize(
+    "string,symbol",
+    [
+        ("IS_ALPHA", IS_ALPHA),
+        ("NOUN", NOUN),
+        ("VERB", VERB),
+        ("LEMMA", LEMMA),
+        ("ORTH", ORTH),
+        ("PROB", PROB),
+    ],
+)
 def test_vocab_api_symbols(en_vocab, string, symbol):
     assert en_vocab.strings[string] == symbol

-@pytest.mark.parametrize('text', "Hello")
+@pytest.mark.parametrize("text", "Hello")
 def test_vocab_api_contains(en_vocab, text):
     _ = en_vocab[text]
     assert text in en_vocab
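For context on test_vocab_api_symbols and test_vocab_api_contains: attribute and part-of-speech names are interned as fixed symbols in the string store, and the vocab supports membership tests on raw strings once a lexeme has been created. A small sketch, assuming a freshly constructed English vocab behaves like the en_vocab fixture used here:

from spacy.lang.en import English
from spacy.parts_of_speech import NOUN

vocab = English().vocab
assert vocab.strings["NOUN"] == NOUN       # symbol name resolves to a stable integer ID
lex = vocab["Hello"]                       # interns the string and creates the lexeme
assert "Hello" in vocab                    # membership works on the raw text
assert lex.orth == vocab.strings["Hello"]  # .orth is the interned string's ID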