From 3a3cb2c90ce2591e04806d3e9ccfd2d1fbdc722d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 10 Jan 2017 15:53:15 +0100 Subject: [PATCH 1/8] Add unicode declaration --- spacy/tests/tokenizer/test_urls.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 426edee3b..f377179b1 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -1,7 +1,9 @@ +# coding: utf-8 from __future__ import unicode_literals import pytest + URLS = [ u"http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0", u"www.google.com?q=google", From e10d4ca9640e6cbd5bf0f70f61381aa15b5b382a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 10 Jan 2017 15:54:25 +0100 Subject: [PATCH 2/8] Remove semi-redundant URLs and punctuation for faster testing --- spacy/tests/tokenizer/test_urls.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index f377179b1..3a45a36bb 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -5,20 +5,17 @@ import pytest URLS = [ - u"http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0", - u"www.google.com?q=google", - u"google.com", - u"www.red-stars.com", - pytest.mark.xfail(u"red-stars.com"), - u"http://foo.com/blah_(wikipedia)#cite-1", - u"http://www.example.com/wpstyle/?bar=baz&inga=42&quux", - u"mailto:foo.bar@baz.com", - u"mailto:foo-bar@baz-co.com" + "http://www.nytimes.com/2016/04/20/us/politics/new-york-primary-preview.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=a-lede-package-region®ion=top-news&WT.nav=top-news&_r=0", + "www.google.com?q=google", + "www.red-stars.com", + "http://foo.com/blah_(wikipedia)#cite-1", + "mailto:foo.bar@baz.com", + "mailto:foo-bar@baz-co.com" ] # Punctuation we want to check is split away before the URL PREFIXES = [ - "(", '"', "...", ">" + "(", '"', ">" ] # Punctuation we want to check is split away after the URL From 2185d31907041f4e4c8856bfcb8635998648571b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 10 Jan 2017 15:56:35 +0100 Subject: [PATCH 3/8] Adjust names and formatting --- spacy/tests/tokenizer/test_urls.py | 34 ++++++++++++++++++------------ 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 3a45a36bb..cd2e5cd20 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -13,62 +13,68 @@ URLS = [ "mailto:foo-bar@baz-co.com" ] + # Punctuation we want to check is split away before the URL PREFIXES = [ "(", '"', ">" ] + # Punctuation we want to check is split away after the URL SUFFIXES = [ '"', ":", ">"] -@pytest.mark.parametrize("text", URLS) -def test_simple_url(tokenizer, text): - tokens = tokenizer(text) - assert tokens[0].orth_ == text + +@pytest.mark.parametrize("url", URLS) +def test_tokenizer_handles_simple_url(tokenizer, url): + tokens = tokenizer(url) assert len(tokens) == 1 + assert tokens[0].text == url @pytest.mark.parametrize("prefix", PREFIXES) @pytest.mark.parametrize("url", URLS) -def test_prefixed_url(tokenizer, prefix, url): +def 
test_tokenizer_handles_prefixed_url(tokenizer, prefix, url): tokens = tokenizer(prefix + url) assert tokens[0].text == prefix assert tokens[1].text == url assert len(tokens) == 2 - + + @pytest.mark.parametrize("suffix", SUFFIXES) @pytest.mark.parametrize("url", URLS) -def test_suffixed_url(tokenizer, url, suffix): +def test_tokenizer_handles_suffixed_url(tokenizer, url, suffix): tokens = tokenizer(url + suffix) assert tokens[0].text == url assert tokens[1].text == suffix assert len(tokens) == 2 - + + @pytest.mark.parametrize("prefix", PREFIXES) @pytest.mark.parametrize("suffix", SUFFIXES) @pytest.mark.parametrize("url", URLS) -def test_surround_url(tokenizer, prefix, suffix, url): +def test_tokenizer_handles_surround_url(tokenizer, prefix, suffix, url): tokens = tokenizer(prefix + url + suffix) assert tokens[0].text == prefix assert tokens[1].text == url assert tokens[2].text == suffix - assert len(tokens) == 3 - + + @pytest.mark.parametrize("prefix1", PREFIXES) @pytest.mark.parametrize("prefix2", PREFIXES) @pytest.mark.parametrize("url", URLS) -def test_two_prefix_url(tokenizer, prefix1, prefix2, url): +def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url): tokens = tokenizer(prefix1 + prefix2 + url) assert tokens[0].text == prefix1 assert tokens[1].text == prefix2 assert tokens[2].text == url assert len(tokens) == 3 - + + @pytest.mark.parametrize("suffix1", SUFFIXES) @pytest.mark.parametrize("suffix2", SUFFIXES) @pytest.mark.parametrize("url", URLS) -def test_two_prefix_url(tokenizer, suffix1, suffix2, url): +def test_tokenizer_handles_two_prefix_url(tokenizer, suffix1, suffix2, url): tokens = tokenizer(url + suffix1 + suffix2) assert tokens[0].text == url assert tokens[1].text == suffix1 From 0ba5cf51d2735432e367ff384cbb53e10e02bd74 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 10 Jan 2017 15:57:00 +0100 Subject: [PATCH 4/8] Assert length first --- spacy/tests/tokenizer/test_urls.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index cd2e5cd20..9dc5bbfab 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -36,18 +36,18 @@ def test_tokenizer_handles_simple_url(tokenizer, url): @pytest.mark.parametrize("url", URLS) def test_tokenizer_handles_prefixed_url(tokenizer, prefix, url): tokens = tokenizer(prefix + url) + assert len(tokens) == 2 assert tokens[0].text == prefix assert tokens[1].text == url - assert len(tokens) == 2 @pytest.mark.parametrize("suffix", SUFFIXES) @pytest.mark.parametrize("url", URLS) def test_tokenizer_handles_suffixed_url(tokenizer, url, suffix): tokens = tokenizer(url + suffix) + assert len(tokens) == 2 assert tokens[0].text == url assert tokens[1].text == suffix - assert len(tokens) == 2 @pytest.mark.parametrize("prefix", PREFIXES) @@ -55,6 +55,7 @@ def test_tokenizer_handles_suffixed_url(tokenizer, url, suffix): @pytest.mark.parametrize("url", URLS) def test_tokenizer_handles_surround_url(tokenizer, prefix, suffix, url): tokens = tokenizer(prefix + url + suffix) + assert len(tokens) == 3 assert tokens[0].text == prefix assert tokens[1].text == url assert tokens[2].text == suffix @@ -65,10 +66,10 @@ def test_tokenizer_handles_surround_url(tokenizer, prefix, suffix, url): @pytest.mark.parametrize("url", URLS) def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url): tokens = tokenizer(prefix1 + prefix2 + url) + assert len(tokens) == 3 assert tokens[0].text == prefix1 assert 
tokens[1].text == prefix2 assert tokens[2].text == url - assert len(tokens) == 3 @pytest.mark.parametrize("suffix1", SUFFIXES) @@ -76,7 +77,7 @@ def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url): @pytest.mark.parametrize("url", URLS) def test_tokenizer_handles_two_prefix_url(tokenizer, suffix1, suffix2, url): tokens = tokenizer(url + suffix1 + suffix2) + assert len(tokens) == 3 assert tokens[0].text == url assert tokens[1].text == suffix1 assert tokens[2].text == suffix2 - assert len(tokens) == 3 From 487e020ebe4e00bfa3b9ee96e91e99e8d7500299 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 10 Jan 2017 15:57:26 +0100 Subject: [PATCH 5/8] Add simple test for surrounding brackets --- spacy/tests/tokenizer/test_urls.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 9dc5bbfab..6f905914d 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -50,6 +50,15 @@ def test_tokenizer_handles_suffixed_url(tokenizer, url, suffix): assert tokens[1].text == suffix +@pytest.mark.parametrize("url", URLS) +def test_tokenizer_handles_simple_surround_url(tokenizer, url): + tokens = tokenizer("(" + url + ")") + assert len(tokens) == 3 + assert tokens[0].text == "(" + assert tokens[1].text == url + assert tokens[2].text == ")" + + @pytest.mark.parametrize("prefix", PREFIXES) @pytest.mark.parametrize("suffix", SUFFIXES) @pytest.mark.parametrize("url", URLS) From 869963c3c43e980cb7c0036760770f1c2acc0111 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 10 Jan 2017 15:57:35 +0100 Subject: [PATCH 6/8] Mark extensive prefix/suffix tests as slow --- spacy/tests/tokenizer/test_urls.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index 6f905914d..556afc2c9 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -59,6 +59,7 @@ def test_tokenizer_handles_simple_surround_url(tokenizer, url): assert tokens[2].text == ")" +@pytest.mark.slow @pytest.mark.parametrize("prefix", PREFIXES) @pytest.mark.parametrize("suffix", SUFFIXES) @pytest.mark.parametrize("url", URLS) @@ -70,6 +71,7 @@ def test_tokenizer_handles_surround_url(tokenizer, prefix, suffix, url): assert tokens[2].text == suffix +@pytest.mark.slow @pytest.mark.parametrize("prefix1", PREFIXES) @pytest.mark.parametrize("prefix2", PREFIXES) @pytest.mark.parametrize("url", URLS) @@ -81,6 +83,7 @@ def test_tokenizer_handles_two_prefix_url(tokenizer, prefix1, prefix2, url): assert tokens[2].text == url +@pytest.mark.slow @pytest.mark.parametrize("suffix1", SUFFIXES) @pytest.mark.parametrize("suffix2", SUFFIXES) @pytest.mark.parametrize("url", URLS) From 3e6e1f0251465a1c05ae9df0e82313711a2691e1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 10 Jan 2017 19:24:10 +0100 Subject: [PATCH 7/8] Tidy up regression tests --- spacy/tests/regression/test_issue351.py | 2 ++ spacy/tests/regression/test_issue360.py | 4 +++- spacy/tests/regression/test_issue429.py | 6 ++++-- spacy/tests/regression/test_issue587.py | 6 +++++- spacy/tests/regression/test_issue588.py | 6 +++++- spacy/tests/regression/test_issue589.py | 7 +++++-- spacy/tests/regression/test_issue590.py | 5 ++++- spacy/tests/regression/test_issue595.py | 10 ++++++---- spacy/tests/regression/test_issue599.py | 4 ++++ spacy/tests/regression/test_issue600.py | 4 +++- spacy/tests/regression/test_issue605.py | 11 +++++++---- 
spacy/tests/regression/test_issue615.py | 8 +++++--- spacy/tests/regression/test_issue617.py | 3 +++ 13 files changed, 56 insertions(+), 20 deletions(-) diff --git a/spacy/tests/regression/test_issue351.py b/spacy/tests/regression/test_issue351.py index 84d4398c5..06f24715c 100644 --- a/spacy/tests/regression/test_issue351.py +++ b/spacy/tests/regression/test_issue351.py @@ -1,4 +1,6 @@ +# coding: utf-8 from __future__ import unicode_literals + from ...en import English import pytest diff --git a/spacy/tests/regression/test_issue360.py b/spacy/tests/regression/test_issue360.py index 018289030..d0b55032a 100644 --- a/spacy/tests/regression/test_issue360.py +++ b/spacy/tests/regression/test_issue360.py @@ -1,4 +1,6 @@ +# coding: utf-8 from __future__ import unicode_literals + from ...en import English import pytest @@ -10,5 +12,5 @@ def en_tokenizer(): def test_big_ellipsis(en_tokenizer): - tokens = en_tokenizer(u'$45...............Asking') + tokens = en_tokenizer('$45...............Asking') assert len(tokens) > 2 diff --git a/spacy/tests/regression/test_issue429.py b/spacy/tests/regression/test_issue429.py index b3e6b2831..1412a54f2 100644 --- a/spacy/tests/regression/test_issue429.py +++ b/spacy/tests/regression/test_issue429.py @@ -1,9 +1,11 @@ +# coding: utf-8 from __future__ import unicode_literals -import pytest import spacy from spacy.attrs import ORTH +import pytest + @pytest.mark.models def test_issue429(): @@ -23,7 +25,7 @@ def test_issue429(): doc = nlp.tokenizer('a b c') nlp.tagger(doc) nlp.matcher(doc) - + for word in doc: print(word.text, word.ent_iob_, word.ent_type_) nlp.entity(doc) diff --git a/spacy/tests/regression/test_issue587.py b/spacy/tests/regression/test_issue587.py index 5b86801d6..8815b346a 100644 --- a/spacy/tests/regression/test_issue587.py +++ b/spacy/tests/regression/test_issue587.py @@ -1,14 +1,18 @@ +# coding: utf-8 +from __future__ import unicode_literals + import spacy import spacy.matcher from spacy.attrs import IS_PUNCT, ORTH import pytest + @pytest.mark.models def test_matcher_segfault(): nlp = spacy.load('en', parser=False, entity=False) matcher = spacy.matcher.Matcher(nlp.vocab) - content = u'''a b; c''' + content = '''a b; c''' matcher.add(entity_key='1', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}]]) matcher(nlp(content)) matcher.add(entity_key='2', label='TEST', attrs={}, specs=[[{ORTH: 'a'}, {ORTH: 'b'}, {IS_PUNCT: True}, {ORTH: 'c'}]]) diff --git a/spacy/tests/regression/test_issue588.py b/spacy/tests/regression/test_issue588.py index 0b7defe75..0b05ac74e 100644 --- a/spacy/tests/regression/test_issue588.py +++ b/spacy/tests/regression/test_issue588.py @@ -1,8 +1,12 @@ -import pytest +# coding: utf-8 +from __future__ import unicode_literals + from ...vocab import Vocab from ...tokens import Doc from ...matcher import Matcher +import pytest + def test_issue588(): matcher = Matcher(Vocab()) diff --git a/spacy/tests/regression/test_issue589.py b/spacy/tests/regression/test_issue589.py index bf2fda72a..bcbfb0a6a 100644 --- a/spacy/tests/regression/test_issue589.py +++ b/spacy/tests/regression/test_issue589.py @@ -1,10 +1,13 @@ -import pytest +# coding: utf-8 +from __future__ import unicode_literals from ...vocab import Vocab from ...tokens import Doc +import pytest + def test_issue589(): vocab = Vocab() vocab.strings.set_frozen(True) - doc = Doc(vocab, words=[u'whata']) + doc = Doc(vocab, words=['whata']) diff --git a/spacy/tests/regression/test_issue590.py b/spacy/tests/regression/test_issue590.py index a35d5d1a4..fedc9eaf4 100644 --- 
a/spacy/tests/regression/test_issue590.py +++ b/spacy/tests/regression/test_issue590.py @@ -1,9 +1,12 @@ +# coding: utf-8 from __future__ import unicode_literals + from ...attrs import * from ...matcher import Matcher from ...tokens import Doc from ...en import English + def test_overlapping_matches(): vocab = English.Defaults.create_vocab() doc = Doc(vocab, words=['n', '=', '1', ';', 'a', ':', '5', '%']) @@ -29,6 +32,6 @@ def test_overlapping_matches(): {ORTH: '='}, {LIKE_NUM: True}, ], label='b') - + matches = matcher(doc) assert len(matches) == 2 diff --git a/spacy/tests/regression/test_issue595.py b/spacy/tests/regression/test_issue595.py index 1f0ed3a3c..e61ff5273 100644 --- a/spacy/tests/regression/test_issue595.py +++ b/spacy/tests/regression/test_issue595.py @@ -1,11 +1,13 @@ +# coding: utf-8 from __future__ import unicode_literals -import pytest from ...symbols import POS, VERB, VerbForm_inf from ...tokens import Doc from ...vocab import Vocab from ...lemmatizer import Lemmatizer +import pytest + @pytest.fixture def index(): @@ -37,6 +39,6 @@ def vocab(lemmatizer, tag_map): def test_not_lemmatize_base_forms(vocab): doc = Doc(vocab, words=["Do", "n't", "feed", "the", "dog"]) feed = doc[2] - feed.tag_ = u'VB' - assert feed.text == u'feed' - assert feed.lemma_ == u'feed' + feed.tag_ = 'VB' + assert feed.text == 'feed' + assert feed.lemma_ == 'feed' diff --git a/spacy/tests/regression/test_issue599.py b/spacy/tests/regression/test_issue599.py index ce35c6db2..9f8721676 100644 --- a/spacy/tests/regression/test_issue599.py +++ b/spacy/tests/regression/test_issue599.py @@ -1,6 +1,10 @@ +# coding: utf-8 +from __future__ import unicode_literals + from ...tokens import Doc from ...vocab import Vocab + def test_issue599(): doc = Doc(Vocab()) doc.is_tagged = True diff --git a/spacy/tests/regression/test_issue600.py b/spacy/tests/regression/test_issue600.py index 90e700aed..5fc1bc68c 100644 --- a/spacy/tests/regression/test_issue600.py +++ b/spacy/tests/regression/test_issue600.py @@ -1,4 +1,6 @@ +# coding: utf-8 from __future__ import unicode_literals + from ...tokens import Doc from ...vocab import Vocab from ...attrs import POS @@ -6,4 +8,4 @@ from ...attrs import POS def test_issue600(): doc = Doc(Vocab(tag_map={'NN': {'pos': 'NOUN'}}), words=['hello']) - doc[0].tag_ = u'NN' + doc[0].tag_ = 'NN' diff --git a/spacy/tests/regression/test_issue605.py b/spacy/tests/regression/test_issue605.py index 64373950e..16bcea472 100644 --- a/spacy/tests/regression/test_issue605.py +++ b/spacy/tests/regression/test_issue605.py @@ -1,3 +1,6 @@ +# coding: utf-8 +from __future__ import unicode_literals + from ...attrs import LOWER, ORTH from ...tokens import Doc from ...vocab import Vocab @@ -9,16 +12,16 @@ def return_false(doc, ent_id, label, start, end): def test_matcher_accept(): - doc = Doc(Vocab(), words=[u'The', u'golf', u'club', u'is', u'broken']) + doc = Doc(Vocab(), words=['The', 'golf', 'club', 'is', 'broken']) - golf_pattern = [ + golf_pattern = [ { ORTH: "golf"}, { ORTH: "club"} ] matcher = Matcher(doc.vocab) - matcher.add_entity(u'Sport_Equipment', acceptor=return_false) - matcher.add_pattern(u"Sport_Equipment", golf_pattern) + matcher.add_entity('Sport_Equipment', acceptor=return_false) + matcher.add_pattern("Sport_Equipment", golf_pattern) match = matcher(doc) assert match == [] diff --git a/spacy/tests/regression/test_issue615.py b/spacy/tests/regression/test_issue615.py index 83f44037b..71c6de366 100644 --- a/spacy/tests/regression/test_issue615.py +++ 
b/spacy/tests/regression/test_issue615.py @@ -1,5 +1,7 @@ +# coding: utf-8 from __future__ import unicode_literals -import spacy + +import spacy from spacy.attrs import ORTH @@ -17,10 +19,10 @@ def merge_phrases(matcher, doc, i, matches): def test_entity_ID_assignment(): nlp = spacy.en.English() - text = u"""The golf club is broken""" + text = """The golf club is broken""" doc = nlp(text) - golf_pattern = [ + golf_pattern = [ { ORTH: "golf"}, { ORTH: "club"} ] diff --git a/spacy/tests/regression/test_issue617.py b/spacy/tests/regression/test_issue617.py index 600445c2f..0f4d63b97 100644 --- a/spacy/tests/regression/test_issue617.py +++ b/spacy/tests/regression/test_issue617.py @@ -1,3 +1,6 @@ +# coding: utf-8 +from __future__ import unicode_literals + from ...vocab import Vocab From 8e603cc917eb0a291e59754e882468446aa2c010 Mon Sep 17 00:00:00 2001 From: Daniel Hershcovich Date: Wed, 11 Jan 2017 11:18:22 +0200 Subject: [PATCH 8/8] Avoid "True if ... else False" --- spacy/vocab.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index cce85e095..7b9b705dc 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -305,7 +305,7 @@ cdef class Vocab: ''' key = hash_string(string) lex = self._by_hash.get(key) - return True if lex is not NULL else False + return lex is not NULL def __iter__(self): '''Iterate over the lexemes in the vocabulary.
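A note on the change in patch 8/8: an "is not" comparison already evaluates to a boolean, so wrapping it in "True if ... else False" adds nothing, which is why Vocab.__contains__ can simply return the comparison. The snippet below is a minimal, hypothetical pure-Python sketch of that idiom only; the class name, hashing and storage here are assumptions for illustration and not spaCy's actual Cython Vocab.

# Hypothetical sketch of the __contains__ idiom; not spaCy's Cython Vocab.
class TinyVocab(object):
    def __init__(self, strings):
        # Map a hash of each string to a placeholder "lexeme" entry.
        self._by_hash = {hash(s): s for s in strings}

    def __contains__(self, string):
        lex = self._by_hash.get(hash(string))
        # Equivalent to "return True if lex is not None else False":
        # the comparison already yields a bool, so return it directly.
        return lex is not None

vocab = TinyVocab(["apple", "banana"])
print("apple" in vocab)   # True
print("cherry" in vocab)  # False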