# coding: utf-8
from __future__ import unicode_literals

import pytest
import re

from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.lang.lex_attrs import LEX_ATTRS
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.lemmatizer import Lemmatizer
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part


def test_issue1242():
    """Test that empty strings are handled, both in nlp() and nlp.pipe()."""
    nlp = English()
    doc = nlp('')
    assert len(doc) == 0
    docs = list(nlp.pipe(['', 'hello']))
    assert len(docs[0]) == 0
    assert len(docs[1]) == 1


def test_issue1250():
    """Test cached special cases."""
    special_case = [{ORTH: 'reimbur', LEMMA: 'reimburse', POS: 'VERB'}]
    nlp = English()
    nlp.tokenizer.add_special_case('reimbur', special_case)
    lemmas = [w.lemma_ for w in nlp('reimbur, reimbur...')]
    assert lemmas == ['reimburse', ',', 'reimburse', '...']
    # Tokenize the same text again: this time the result comes from the
    # tokenizer cache, which must also apply the special case.
    lemmas = [w.lemma_ for w in nlp('reimbur, reimbur...')]
    assert lemmas == ['reimburse', ',', 'reimburse', '...']


def test_issue1257():
    """Test that tokens compare correctly."""
    doc1 = Doc(Vocab(), words=['a', 'b', 'c'])
    doc2 = Doc(Vocab(), words=['a', 'c', 'e'])
    assert doc1[0] != doc2[0]
    assert not doc1[0] == doc2[0]


def test_issue1375():
    """Test that token.nbor() raises IndexError for out-of-bounds access."""
    doc = Doc(Vocab(), words=['0', '1', '2'])
    with pytest.raises(IndexError):
        assert doc[0].nbor(-1)
    assert doc[1].nbor(-1).text == '0'
    with pytest.raises(IndexError):
        assert doc[2].nbor(1)
    assert doc[1].nbor(1).text == '2'


def test_issue1387():
    """Test that a custom rule-based lemmatizer is applied once the tag
    is set."""
    tag_map = {'VBG': {POS: VERB, VerbForm_part: True}}
    index = {'verb': ('cope', 'cop')}
    exc = {'verb': {'coping': ('cope',)}}
    rules = {'verb': [['ing', '']]}
    lemmatizer = Lemmatizer(index, exc, rules)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=['coping'])
    doc[0].tag_ = 'VBG'
    assert doc[0].text == 'coping'
    assert doc[0].lemma_ == 'cope'


def test_issue1434():
    """Test matches occur when optional element at end of short doc."""
    pattern = [{'ORTH': 'Hello'}, {'IS_ALPHA': True, 'OP': '?'}]
    vocab = Vocab(lex_attr_getters=LEX_ATTRS)
    hello_world = Doc(vocab, words=['Hello', 'World'])
    hello = Doc(vocab, words=['Hello'])
    matcher = Matcher(vocab)
    matcher.add('MyMatcher', None, pattern)
    matches = matcher(hello_world)
    assert matches
    matches = matcher(hello)
    assert matches


@pytest.mark.parametrize('string,start,end', [
    ('a', 0, 1),
    ('a b', 0, 2),
    ('a c', 0, 1),
    ('a b c', 0, 2),
    ('a b b c', 0, 3),
    ('a b b', 0, 3),
])
def test_issue1450(string, start, end):
    """Test matcher works when patterns end with * operator."""
    pattern = [{'ORTH': 'a'}, {'ORTH': 'b', 'OP': '*'}]
    matcher = Matcher(Vocab())
    matcher.add('TSTEND', None, pattern)
    doc = Doc(Vocab(), words=string.split())
    matches = matcher(doc)
    if start is None or end is None:
        # No match expected: return here so the indexing below can't raise
        # on an empty match list.
        assert matches == []
        return
    assert matches[-1][1] == start
    assert matches[-1][2] == end


def test_issue1488():
    """Test that a custom tokenizer with a token_match pattern tokenizes
    without error."""
    prefix_re = re.compile(r'''[\[\("']''')
    suffix_re = re.compile(r'''[\]\)"']''')
    infix_re = re.compile(r'''[-~\.]''')
    simple_url_re = re.compile(r'''^https?://''')

    def my_tokenizer(nlp):
        return Tokenizer(nlp.vocab, {},
                         prefix_search=prefix_re.search,
                         suffix_search=suffix_re.search,
                         infix_finditer=infix_re.finditer,
                         token_match=simple_url_re.match)

    nlp = English()
    nlp.tokenizer = my_tokenizer(nlp)
    doc = nlp('This is a test.')
    for token in doc:
        assert token.text


def test_issue1494():
    """Test that a custom infix pattern splits tokens as expected."""
    infix_re = re.compile(r'''[^a-z]''')
    test_cases = [
        ('token 123test', ['token', '1', '2', '3', 'test']),
        ('token 1test', ['token', '1test']),
        ('hello...test', ['hello', '.', '.', '.', 'test']),
    ]
    def new_tokenizer(nlp):
        return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)

    nlp = English()
    nlp.tokenizer = new_tokenizer(nlp)
    for text, expected in test_cases:
        assert [token.text for token in nlp(text)] == expected