mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
adc9745718
* Restructure tag maps for MorphAnalysis changes Prepare tag maps for upcoming MorphAnalysis changes that allow arbitrary features. * Use default tag map rather than duplicating for ca / uk / vi * Import tag map into defaults for ga * Modify tag maps so all morphological fields and features are strings * Move features from `"Other"` to the top level * Rewrite tuples as strings separated by `","` * Rewrite morph symbols for fr lemmatizer as strings * Export MorphAnalysis under spacy.tokens * Modify morphology to support arbitrary features Modify `Morphology` and `MorphAnalysis` so that arbitrary features are supported. * Modify `MorphAnalysisC` so that it can support arbitrary features and multiple values per field. `MorphAnalysisC` is redesigned to contain: * key: hash of UD FEATS string of morphological features * array of `MorphFeatureC` structs that each contain a hash of `Field` and `Field=Value` for a given morphological feature, which makes it possible to: * find features by field * represent multiple values for a given field * `get_field()` is renamed to `get_by_field()` and is no longer `nogil`. Instead a new helper function `get_n_by_field()` is `nogil` and returns `n` features by field. * `MorphAnalysis.get()` returns all possible values for a field as a list of individual features such as `["Tense=Pres", "Tense=Past"]`. * `MorphAnalysis`'s `str()` and `repr()` are the UD FEATS string. * `Morphology.feats_to_dict()` converts a UD FEATS string to a dict where: * Each field has one entry in the dict * Multiple values remain separated by a separator in the value string * `Token.morph_` returns the UD FEATS string and you can set `Token.morph_` with a UD FEATS string or with a tag map dict. * Modify get_by_field to use np.ndarray Modify `get_by_field()` to use np.ndarray. Remove `max_results` from `get_n_by_field()` and always iterate over all the fields. 
* Rewrite without MorphFeatureC * Add shortcut for existing feats strings as keys Add shortcut for existing feats strings as keys in `Morphology.add()`. * Check for '_' as empty analysis when adding morphs * Extend helper converters in Morphology Add and extend helper converters that convert and normalize between: * UD FEATS strings (`"Case=dat,gen|Number=sing"`) * per-field dict of feats (`{"Case": "dat,gen", "Number": "sing"}`) * list of individual features (`["Case=dat", "Case=gen", "Number=sing"]`) All converters sort fields and values where applicable.
180 lines
5.6 KiB
Python
180 lines
5.6 KiB
Python
import pytest
|
|
import re
|
|
from spacy.tokens import Doc
|
|
from spacy.vocab import Vocab
|
|
from spacy.lang.en import English
|
|
from spacy.lang.lex_attrs import LEX_ATTRS
|
|
from spacy.matcher import Matcher
|
|
from spacy.tokenizer import Tokenizer
|
|
from spacy.lemmatizer import Lemmatizer
|
|
from spacy.lookups import Lookups
|
|
from spacy.symbols import ORTH, LEMMA, POS, VERB
|
|
|
|
|
|
def test_issue1061():
    """Check that a special case added after tokenizing still takes effect.

    Regression test for a caching problem: the tokenizer is run once on the
    text, a special case for ``_MATH_`` is added, and the same text is
    tokenized again.
    """
    text = "I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_."
    math_special_case = [{ORTH: "_MATH_"}]

    # Without a special case the underscores are split away from "MATH".
    tokenizer = English.Defaults.create_tokenizer()
    token_texts = [token.text for token in tokenizer(text)]
    assert "MATH" in token_texts
    assert "_MATH_" not in token_texts

    # Same tokenizer instance, special case registered after the first run.
    tokenizer.add_special_case("_MATH_", math_special_case)
    token_texts = [token.text for token in tokenizer(text)]
    assert "_MATH_" in token_texts
    assert "MATH" not in token_texts

    # For sanity, check it works when pipeline is clean.
    fresh_tokenizer = English.Defaults.create_tokenizer()
    fresh_tokenizer.add_special_case("_MATH_", math_special_case)
    token_texts = [token.text for token in fresh_tokenizer(text)]
    assert "_MATH_" in token_texts
    assert "MATH" not in token_texts
|
|
|
|
|
|
@pytest.mark.xfail(
    reason="g is split of as a unit, as the suffix regular expression can not look back further (variable-width)"
)
def test_issue1235():
    """Test that g is not split off if preceded by a number and a letter."""
    nlp = English()
    doc = nlp("e2g 2g 52g")
    # Comparing the full list checks both the token count (5) and each text.
    token_texts = [token.text for token in doc]
    assert token_texts == ["e2g", "2", "g", "52", "g"]
|
|
|
|
|
|
def test_issue1242():
    """Check that empty strings yield empty docs, directly and via pipe()."""
    nlp = English()
    assert len(nlp("")) == 0

    empty_doc, hello_doc = list(nlp.pipe(["", "hello"]))
    assert len(empty_doc) == 0
    assert len(hello_doc) == 1
|
|
|
|
|
|
def test_issue1250():
    """Test cached special cases."""
    nlp = English()
    nlp.tokenizer.add_special_case(
        "reimbur", [{ORTH: "reimbur", LEMMA: "reimburse", POS: "VERB"}]
    )
    expected_lemmas = ["reimburse", ",", "reimburse", "..."]
    # Run the same text twice: the second pass repeats the first on an
    # already-used tokenizer and must give the same lemmas.
    for _ in range(2):
        lemmas = [token.lemma_ for token in nlp("reimbur, reimbur...")]
        assert lemmas == expected_lemmas
|
|
|
|
|
|
def test_issue1257():
    """Test that tokens compare correctly."""
    first_doc = Doc(Vocab(), words=["a", "b", "c"])
    second_doc = Doc(Vocab(), words=["a", "c", "e"])
    token_a, token_b = first_doc[0], second_doc[0]
    # Exercise both the inequality and the equality operator.
    assert token_a != token_b
    assert not token_a == token_b
|
|
|
|
|
|
def test_issue1375():
    """Test that token.nbor() raises IndexError for out-of-bounds access."""
    doc = Doc(Vocab(), words=["0", "1", "2"])
    first, middle, last = doc[0], doc[1], doc[2]

    # Left edge: no neighbour before the first token.
    with pytest.raises(IndexError):
        assert first.nbor(-1)
    assert middle.nbor(-1).text == "0"

    # Right edge: no neighbour after the last token.
    with pytest.raises(IndexError):
        assert last.nbor(1)
    assert middle.nbor(1).text == "2"
|
|
|
|
|
|
def test_issue1387():
    """Check that "coping" tagged as VBG gets the lemma "cope".

    Builds a vocab from a one-entry tag map and a lemmatizer backed by
    hand-written lookup tables, then sets the tag on a single-token doc.
    """
    lemma_tables = (
        ("lemma_index", {"verb": ("cope", "cop")}),
        ("lemma_exc", {"verb": {"coping": ("cope",)}}),
        ("lemma_rules", {"verb": [["ing", ""]]}),
    )
    lookups = Lookups()
    for table_name, table_data in lemma_tables:
        lookups.add_table(table_name, table_data)

    vocab = Vocab(
        lemmatizer=Lemmatizer(lookups),
        tag_map={"VBG": {POS: VERB, "VerbForm": "part"}},
    )
    doc = Doc(vocab, words=["coping"])
    token = doc[0]
    token.tag_ = "VBG"
    assert token.text == "coping"
    assert token.lemma_ == "cope"
|
|
|
|
|
|
def test_issue1434():
    """Test matches occur when optional element at end of short doc."""
    vocab = Vocab(lex_attr_getters=LEX_ATTRS)
    matcher = Matcher(vocab)
    matcher.add("MyMatcher", [[{"ORTH": "Hello"}, {"IS_ALPHA": True, "OP": "?"}]])

    # One doc where the optional token is present, one where it is absent.
    docs = (
        Doc(vocab, words=["Hello", "World"]),
        Doc(vocab, words=["Hello"]),
    )
    for doc in docs:
        found = matcher(doc)
        assert found
|
|
|
|
|
|
@pytest.mark.parametrize(
    "string,start,end",
    [
        ("a", 0, 1),
        ("a b", 0, 2),
        ("a c", 0, 1),
        ("a b c", 0, 2),
        ("a b b c", 0, 3),
        ("a b b", 0, 3),
    ],
)
def test_issue1450(string, start, end):
    """Test matcher works when patterns end with * operator.

    Each case supplies whitespace-separated words plus the expected start and
    end token offsets of the last match returned by the matcher. A case with
    ``start`` or ``end`` set to None expects no matches at all.
    """
    pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
    matcher = Matcher(Vocab())
    matcher.add("TSTEND", [pattern])
    doc = Doc(Vocab(), words=string.split())
    matches = matcher(doc)
    if start is None or end is None:
        # No expected span means "no match"; return here so the span checks
        # below never index into the empty result list.
        assert matches == []
        return
    # Each match is indexed as (id, start, end); only the last one is checked.
    last_match = matches[-1]
    assert last_match[1] == start
    assert last_match[2] == end
|
|
|
|
|
|
def test_issue1488():
    """Check that a custom-rule tokenizer produces no empty-text tokens.

    The tokenizer is built with hand-written prefix, suffix and infix
    patterns and a simple URL ``token_match``.
    """
    prefix_pattern = re.compile(r"""[\[\("']""")
    suffix_pattern = re.compile(r"""[\]\)"']""")
    infix_pattern = re.compile(r"""[-~\.]""")
    url_pattern = re.compile(r"""^https?://""")

    nlp = English()
    # Swap the default tokenizer for one with no special-case rules ({}).
    nlp.tokenizer = Tokenizer(
        nlp.vocab,
        {},
        prefix_search=prefix_pattern.search,
        suffix_search=suffix_pattern.search,
        infix_finditer=infix_pattern.finditer,
        token_match=url_pattern.match,
    )
    assert all(token.text for token in nlp("This is a test."))
|
|
|
|
|
|
def test_issue1494():
    """Check tokenization with an infix pattern matching any non a-z character.

    The tokenizer has no special-case rules and only the custom infix finder.
    """
    non_lowercase = re.compile(r"""[^a-z]""")
    nlp = English()
    nlp.tokenizer = Tokenizer(nlp.vocab, {}, infix_finditer=non_lowercase.finditer)

    expectations = (
        ("token 123test", ["token", "1", "2", "3", "test"]),
        ("token 1test", ["token", "1test"]),
        ("hello...test", ["hello", ".", ".", ".", "test"]),
    )
    for text, expected_tokens in expectations:
        actual_tokens = [token.text for token in nlp(text)]
        assert actual_tokens == expected_tokens
|