From f0986df94be01e122c5f1a8d8578dcd0c2a53ffe Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 3 Nov 2017 14:44:36 +0100 Subject: [PATCH 1/4] Add test for #1488 (passes on v2.0.0a18?) --- spacy/tests/regression/test_issue1488.py | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 spacy/tests/regression/test_issue1488.py diff --git a/spacy/tests/regression/test_issue1488.py b/spacy/tests/regression/test_issue1488.py new file mode 100644 index 000000000..5e82517d6 --- /dev/null +++ b/spacy/tests/regression/test_issue1488.py @@ -0,0 +1,26 @@ +# coding: utf8 +from __future__ import unicode_literals + +import regex as re +from ...lang.en import English +from ...tokenizer import Tokenizer + + +def test_issue1488(): + prefix_re = re.compile(r'''[\[\("']''') + suffix_re = re.compile(r'''[\]\)"']''') + infix_re = re.compile(r'''[-~\.]''') + simple_url_re = re.compile(r'''^https?://''') + + def my_tokenizer(nlp): + return Tokenizer(nlp.vocab, {}, + prefix_search=prefix_re.search, + suffix_search=suffix_re.search, + infix_finditer=infix_re.finditer, + token_match=simple_url_re.match) + + nlp = English() + nlp.tokenizer = my_tokenizer(nlp) + doc = nlp("This is a test.") + for token in doc: + print(token.text) From eef930c73e5ba4308473093a38fead383c85a6af Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 3 Nov 2017 18:50:57 +0100 Subject: [PATCH 2/4] Assert instead of print --- spacy/tests/regression/test_issue1488.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue1488.py b/spacy/tests/regression/test_issue1488.py index 5e82517d6..6b9ab9a70 100644 --- a/spacy/tests/regression/test_issue1488.py +++ b/spacy/tests/regression/test_issue1488.py @@ -23,4 +23,4 @@ def test_issue1488(): nlp.tokenizer = my_tokenizer(nlp) doc = nlp("This is a test.") for token in doc: - print(token.text) + assert token.text From 380f2441b4f2880ff28969583d3cf0261b1142d4 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 3 Nov 2017 18:51:03 +0100 Subject: [PATCH 3/4] Fix script includes --- website/_includes/_scripts.jade | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade index 0be2e2e98..572a50483 100644 --- a/website/_includes/_scripts.jade +++ b/website/_includes/_scripts.jade @@ -11,7 +11,7 @@ if environment == "deploy" script(src="/assets/js/vendor/prism.min.js") -if SECTION == "models" +if compare_models script(src="/assets/js/vendor/chart.min.js") script @@ -58,7 +58,7 @@ if environment == "deploy" !=NavHighlighter !=GitHubEmbed if HAS_MODELS - !=ModeLoader + !=ModelLoader if compare_models !=ModelComparer else From d6e831bf89b0ba36d5e9f0f7675f83d98fa42029 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 3 Nov 2017 19:46:34 +0100 Subject: [PATCH 4/4] Fix lemmatizer tests --- spacy/tests/lang/en/test_lemmatizer.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/spacy/tests/lang/en/test_lemmatizer.py b/spacy/tests/lang/en/test_lemmatizer.py index 22c8f2499..169cb2695 100644 --- a/spacy/tests/lang/en/test_lemmatizer.py +++ b/spacy/tests/lang/en/test_lemmatizer.py @@ -22,35 +22,37 @@ def test_doc_lemmatization(EN): ("ring", ["ring"]), ("axes", ["axis", "axe", "ax"])]) def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas): - assert en_lemmatizer.noun(text) == set(lemmas) + assert en_lemmatizer.noun(text) == lemmas @pytest.mark.models('en') @pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]), ("feed", ["feed"]), ("need", ["need"]), - ("ring", ["ring"]), - ("axes", ["axis", "axe", "ax"])]) + ("ring", ["ring"])]) def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas): - assert en_lemmatizer.noun(text) == set(lemmas) + # Cases like this are problematic -- not clear what we should do to resolve + # ambiguity? + # ("axes", ["ax", "axes", "axis"])]) + assert en_lemmatizer.noun(text) == lemmas @pytest.mark.xfail @pytest.mark.models('en') def test_en_lemmatizer_base_forms(en_lemmatizer): - assert en_lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive']) - assert en_lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva']) + assert en_lemmatizer.noun('dive', {'number': 'sing'}) == ['dive'] + assert en_lemmatizer.noun('dive', {'number': 'plur'}) == ['diva'] @pytest.mark.models('en') def test_en_lemmatizer_base_form_verb(en_lemmatizer): - assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see']) + assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == ['see'] @pytest.mark.models('en') def test_en_lemmatizer_punct(en_lemmatizer): - assert en_lemmatizer.punct('“') == set(['"']) - assert en_lemmatizer.punct('“') == set(['"']) + assert en_lemmatizer.punct('“') == ['"'] + assert en_lemmatizer.punct('“') == ['"'] @pytest.mark.models('en')