mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
2f981d5af1
Remove corpus-specific tag maps from the language data for languages without custom tokenizers. For languages with custom word segmenters that also provide tags (Japanese and Korean), the tag maps for the custom tokenizers are kept as the default. The default tag maps for languages without custom tokenizers are now the default tag map from `lang/tag_map.py`, UPOS -> UPOS.
82 lines
3.1 KiB
Python
82 lines
3.1 KiB
Python
from ...util import get_doc
|
|
|
|
|
|
def test_en_parser_noun_chunks_standard(en_tokenizer):
    """A single passive-subject noun phrase yields exactly one noun chunk.

    The parse is supplied by hand (heads, POS tags and dependency labels),
    so only the noun-chunk extraction over that parse is exercised.
    """
    sentence = "A base phrase should be recognized."
    # One annotation entry per token of the sentence, including the period.
    annotations = {
        "heads": [2, 1, 3, 2, 1, 0, -1],
        "pos": ["DET", "ADJ", "NOUN", "AUX", "VERB", "VERB", "PUNCT"],
        "deps": ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"],
    }
    tokenized = en_tokenizer(sentence)
    words = [token.text for token in tokenized]
    doc = get_doc(tokenized.vocab, words=words, **annotations)
    chunk_texts = [chunk.text_with_ws for chunk in doc.noun_chunks]
    assert chunk_texts == ["A base phrase "]
|
|
|
|
|
|
def test_en_parser_noun_chunks_coordinated(en_tokenizer):
    """Two coordinated noun phrases are returned as two separate chunks.

    The subject "A base phrase" and its conjunct "a good phrase" must each
    form their own chunk, in sentence order, and nothing else may be chunked.
    """
    sentence = "A base phrase and a good phrase are often the same."
    # fmt: off
    token_pos = ["DET", "NOUN", "NOUN", "CCONJ", "DET", "ADJ", "NOUN", "VERB", "ADV", "DET", "ADJ", "PUNCT"]
    token_deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"]
    token_heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4]
    # fmt: on
    tokenized = en_tokenizer(sentence)
    doc = get_doc(
        tokenized.vocab,
        words=[token.text for token in tokenized],
        pos=token_pos,
        deps=token_deps,
        heads=token_heads,
    )
    expected = ["A base phrase ", "a good phrase "]
    assert [chunk.text_with_ws for chunk in doc.noun_chunks] == expected
|
|
|
|
|
|
def test_en_parser_noun_chunks_pp_chunks(en_tokenizer):
    """A subject and the object of a preposition each form a noun chunk.

    The preposition "with" itself must not be part of either chunk.
    """
    sentence = "A phrase with another phrase occurs."
    tokenized = en_tokenizer(sentence)
    words = [token.text for token in tokenized]
    # Hand-written parse: "phrase" (nsubj) governs the prepositional phrase
    # "with another phrase", whose noun is labelled pobj.
    doc = get_doc(
        tokenized.vocab,
        words=words,
        pos=["DET", "NOUN", "ADP", "DET", "NOUN", "VERB", "PUNCT"],
        deps=["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"],
        heads=[1, 4, -1, 1, -2, 0, -1],
    )
    found = [chunk.text_with_ws for chunk in doc.noun_chunks]
    assert found == ["A phrase ", "another phrase "]
|
|
|
|
|
|
def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
    """An appositive is chunked separately from the noun it modifies.

    Expected chunks, in order: the subject "Sam", the appositive
    "my brother", and the prepositional object "the house".
    """
    sentence = "Sam, my brother, arrived to the house."
    # fmt: off
    parse = {
        "heads": [5, -1, 1, -3, -4, 0, -1, 1, -2, -4],
        "pos": ["PROPN", "PUNCT", "DET", "NOUN", "PUNCT", "VERB", "ADP", "DET", "NOUN", "PUNCT"],
        "deps": ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"],
    }
    # fmt: on
    tokenized = en_tokenizer(sentence)
    doc = get_doc(
        tokenized.vocab, words=[token.text for token in tokenized], **parse
    )
    chunk_texts = [chunk.text_with_ws for chunk in doc.noun_chunks]
    assert chunk_texts == ["Sam ", "my brother ", "the house "]
|
|
|
|
|
|
def test_en_parser_noun_chunks_dative(en_tokenizer):
    """Subject, dative object and direct object each become a noun chunk."""
    sentence = "She gave Bob a raise."
    tokenized = en_tokenizer(sentence)
    token_texts = [token.text for token in tokenized]
    # Hand-written parse: "Bob" carries the dative label, "raise" is the dobj.
    token_heads = [1, 0, -1, 1, -3, -4]
    token_pos = ["PRON", "VERB", "PROPN", "DET", "NOUN", "PUNCT"]
    token_deps = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"]
    doc = get_doc(
        tokenized.vocab,
        words=token_texts,
        pos=token_pos,
        deps=token_deps,
        heads=token_heads,
    )
    expected = ["She ", "Bob ", "a raise "]
    assert [chunk.text_with_ws for chunk in doc.noun_chunks] == expected
|