mirror of https://github.com/explosion/spaCy.git
synced 2025-01-15 20:16:23 +03:00
5861308910
Handle tokenizer special cases more generally by using the Matcher internally to match special cases after the affix/token_match tokenization is complete.

Instead of only matching special cases while processing balanced or nearly balanced prefixes and suffixes, this recognizes special cases in a wider range of contexts:

* Allows arbitrary numbers of prefixes/affixes around special cases
* Allows special cases separated by infixes

Existing tests/settings that couldn't be preserved as before:

* The emoticon '")' is no longer a supported special case
* The emoticon ':)' in "example:)" is a false positive again

When merged with #4258 (or the relevant cache bugfix), the affix and token_match properties should be modified to flush and reload all special cases to use the updated internal tokenization with the Matcher.
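As an illustration of the behavior described above, a minimal sketch (not part of this commit's diff; the expected tokenization in the comment is an assumption based on the description, not output from the test suite):

from spacy.lang.en import English
from spacy.symbols import ORTH

nlp = English()
# Register "_MATH_" as a single-token special case.
nlp.tokenizer.add_special_case("_MATH_", [{ORTH: "_MATH_"}])
# With the Matcher-based special-case handling, the special case should
# survive surrounding prefixes/suffixes, e.g. ["(", "_MATH_", ")", "!"].
print([t.text for t in nlp.tokenizer("(_MATH_)!")])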
187 lines
5.8 KiB
Python
# coding: utf-8
from __future__ import unicode_literals

import pytest
import re
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.lang.lex_attrs import LEX_ATTRS
from spacy.matcher import Matcher
from spacy.tokenizer import Tokenizer
from spacy.lemmatizer import Lemmatizer
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part


@pytest.mark.xfail
def test_issue1061():
    '''Test special-case works after tokenizing. Was caching problem.'''
    text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'
    tokenizer = English.Defaults.create_tokenizer()
    doc = tokenizer(text)
    assert 'MATH' in [w.text for w in doc]
    assert '_MATH_' not in [w.text for w in doc]

    tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
    doc = tokenizer(text)
    assert '_MATH_' in [w.text for w in doc]
    assert 'MATH' not in [w.text for w in doc]

    # For sanity, check it works when pipeline is clean.
    tokenizer = English.Defaults.create_tokenizer()
    tokenizer.add_special_case('_MATH_', [{ORTH: '_MATH_'}])
    doc = tokenizer(text)
    assert '_MATH_' in [w.text for w in doc]
    assert 'MATH' not in [w.text for w in doc]

    # A special case containing punctuation should also be preserved.
    text = '...gimme...? that ...gimme...? or else ...gimme...?!'

    tokenizer = English.Defaults.create_tokenizer()
    tokenizer.add_special_case(u'...gimme...?', [{ORTH: u'...gimme...?'}])
    assert [w.text for w in tokenizer(text)] == ['...gimme...?', 'that', '...gimme...?', 'or', 'else', '...gimme...?', '!']


@pytest.mark.xfail(
    reason="g is split off as a unit, as the suffix regular expression cannot look back further (variable-width)"
)
def test_issue1235():
    """Test that 'g' is not split off if preceded by a number and a letter"""
    nlp = English()
    testwords = "e2g 2g 52g"
    doc = nlp(testwords)
    assert len(doc) == 5
    assert doc[0].text == "e2g"
    assert doc[1].text == "2"
    assert doc[2].text == "g"
    assert doc[3].text == "52"
    assert doc[4].text == "g"


def test_issue1242():
    """Test that empty strings are handled by both nlp() and nlp.pipe()."""
    nlp = English()
    doc = nlp("")
    assert len(doc) == 0
    docs = list(nlp.pipe(["", "hello"]))
    assert len(docs[0]) == 0
    assert len(docs[1]) == 1


def test_issue1250():
    """Test cached special cases."""
    special_case = [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
    nlp = English()
    nlp.tokenizer.add_special_case("reimbur", special_case)
    lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")]
    assert lemmas == ["reimburse", ",", "reimburse", "..."]
    lemmas = [w.lemma_ for w in nlp("reimbur, reimbur...")]
    assert lemmas == ["reimburse", ",", "reimburse", "..."]


def test_issue1257():
    """Test that tokens compare correctly."""
    doc1 = Doc(Vocab(), words=["a", "b", "c"])
    doc2 = Doc(Vocab(), words=["a", "c", "e"])
    assert doc1[0] != doc2[0]
    assert not doc1[0] == doc2[0]


def test_issue1375():
    """Test that token.nbor() raises IndexError for out-of-bounds access."""
    doc = Doc(Vocab(), words=["0", "1", "2"])
    with pytest.raises(IndexError):
        assert doc[0].nbor(-1)
    assert doc[1].nbor(-1).text == "0"
    with pytest.raises(IndexError):
        assert doc[2].nbor(1)
    assert doc[1].nbor(1).text == "2"


def test_issue1387():
    """Test that lemmatization works after setting a token's tag."""
    tag_map = {"VBG": {POS: VERB, VerbForm_part: True}}
    index = {"verb": ("cope", "cop")}
    exc = {"verb": {"coping": ("cope",)}}
    rules = {"verb": [["ing", ""]]}
    lemmatizer = Lemmatizer(index, exc, rules)
    vocab = Vocab(lemmatizer=lemmatizer, tag_map=tag_map)
    doc = Doc(vocab, words=["coping"])
    doc[0].tag_ = "VBG"
    assert doc[0].text == "coping"
    assert doc[0].lemma_ == "cope"


def test_issue1434():
    """Test matches occur when optional element at end of short doc."""
    pattern = [{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]
    vocab = Vocab(lex_attr_getters=LEX_ATTRS)
    hello_world = Doc(vocab, words=["Hello", "World"])
    hello = Doc(vocab, words=["Hello"])
    matcher = Matcher(vocab)
    matcher.add("MyMatcher", None, pattern)
    matches = matcher(hello_world)
    assert matches
    matches = matcher(hello)
    assert matches


@pytest.mark.parametrize(
    "string,start,end",
    [
        ("a", 0, 1),
        ("a b", 0, 2),
        ("a c", 0, 1),
        ("a b c", 0, 2),
        ("a b b c", 0, 3),
        ("a b b", 0, 3),
    ],
)
def test_issue1450(string, start, end):
    """Test matcher works when patterns end with * operator."""
    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
    matcher = Matcher(Vocab())
    matcher.add("TSTEND", None, pattern)
    doc = Doc(Vocab(), words=string.split())
    matches = matcher(doc)
    if start is None or end is None:
        assert matches == []
    else:
        assert matches[-1][1] == start
        assert matches[-1][2] == end


def test_issue1488():
    """Test that a custom tokenizer with prefix/suffix/infix patterns and a
    token_match produces non-empty tokens."""
    prefix_re = re.compile(r"""[\[\("']""")
    suffix_re = re.compile(r"""[\]\)"']""")
    infix_re = re.compile(r"""[-~\.]""")
    simple_url_re = re.compile(r"""^https?://""")

    def my_tokenizer(nlp):
        return Tokenizer(
            nlp.vocab,
            {},
            prefix_search=prefix_re.search,
            suffix_search=suffix_re.search,
            infix_finditer=infix_re.finditer,
            token_match=simple_url_re.match,
        )

    nlp = English()
    nlp.tokenizer = my_tokenizer(nlp)
    doc = nlp("This is a test.")
    for token in doc:
        assert token.text


def test_issue1494():
    """Test tokenization with a custom tokenizer that only defines infix patterns."""
    infix_re = re.compile(r"""[^a-z]""")
    test_cases = [
        ("token 123test", ["token", "1", "2", "3", "test"]),
        ("token 1test", ["token", "1test"]),
        ("hello...test", ["hello", ".", ".", ".", "test"]),
    ]

    def new_tokenizer(nlp):
        return Tokenizer(nlp.vocab, {}, infix_finditer=infix_re.finditer)

    nlp = English()
    nlp.tokenizer = new_tokenizer(nlp)
    for text, expected in test_cases:
        assert [token.text for token in nlp(text)] == expected