diff --git a/spacy/tests/lang/en/test_lemmatizer.py b/spacy/tests/lang/en/test_lemmatizer.py
index 22c8f2499..169cb2695 100644
--- a/spacy/tests/lang/en/test_lemmatizer.py
+++ b/spacy/tests/lang/en/test_lemmatizer.py
@@ -22,35 +22,37 @@ def test_doc_lemmatization(EN):
                                          ("ring", ["ring"]),
                                          ("axes", ["axis", "axe", "ax"])])
 def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
-    assert en_lemmatizer.noun(text) == set(lemmas)
+    assert en_lemmatizer.noun(text) == lemmas
 
 
 @pytest.mark.models('en')
 @pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
                                          ("feed", ["feed"]),
                                          ("need", ["need"]),
-                                         ("ring", ["ring"]),
-                                         ("axes", ["axis", "axe", "ax"])])
+                                         ("ring", ["ring"])])
 def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
-    assert en_lemmatizer.noun(text) == set(lemmas)
+    # Cases like this are problematic -- not clear what we should do to resolve
+    # ambiguity?
+    # ("axes", ["ax", "axes", "axis"])])
+    assert en_lemmatizer.noun(text) == lemmas
 
 
 @pytest.mark.xfail
 @pytest.mark.models('en')
 def test_en_lemmatizer_base_forms(en_lemmatizer):
-    assert en_lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive'])
-    assert en_lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
+    assert en_lemmatizer.noun('dive', {'number': 'sing'}) == ['dive']
+    assert en_lemmatizer.noun('dive', {'number': 'plur'}) == ['diva']
 
 
 @pytest.mark.models('en')
 def test_en_lemmatizer_base_form_verb(en_lemmatizer):
-    assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
+    assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == ['see']
 
 
 @pytest.mark.models('en')
 def test_en_lemmatizer_punct(en_lemmatizer):
-    assert en_lemmatizer.punct('“') == set(['"'])
-    assert en_lemmatizer.punct('”') == set(['"'])
+    assert en_lemmatizer.punct('“') == ['"']
+    assert en_lemmatizer.punct('”') == ['"']
 
 
 @pytest.mark.models('en')
diff --git a/spacy/tests/regression/test_issue1488.py b/spacy/tests/regression/test_issue1488.py
new file mode 100644
index 000000000..6b9ab9a70
--- /dev/null
+++ b/spacy/tests/regression/test_issue1488.py
@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import regex as re
+from ...lang.en import English
+from ...tokenizer import Tokenizer
+
+
+def test_issue1488():
+    prefix_re = re.compile(r'''[\[\("']''')
+    suffix_re = re.compile(r'''[\]\)"']''')
+    infix_re = re.compile(r'''[-~\.]''')
+    simple_url_re = re.compile(r'''^https?://''')
+
+    def my_tokenizer(nlp):
+        return Tokenizer(nlp.vocab, {},
+                         prefix_search=prefix_re.search,
+                         suffix_search=suffix_re.search,
+                         infix_finditer=infix_re.finditer,
+                         token_match=simple_url_re.match)
+
+    nlp = English()
+    nlp.tokenizer = my_tokenizer(nlp)
+    doc = nlp("This is a test.")
+    for token in doc:
+        assert token.text
diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade
index 0be2e2e98..572a50483 100644
--- a/website/_includes/_scripts.jade
+++ b/website/_includes/_scripts.jade
@@ -11,7 +11,7 @@ if environment == "deploy"
 
 script(src="/assets/js/vendor/prism.min.js")
 
-if SECTION == "models"
+if compare_models
     script(src="/assets/js/vendor/chart.min.js")
 
 script
@@ -58,7 +58,7 @@ if environment == "deploy"
         !=NavHighlighter
         !=GitHubEmbed
         if HAS_MODELS
-          !=ModeLoader
+          !=ModelLoader
         if compare_models
           !=ModelComparer
       else