Adriane Boyd 2f981d5af1 Remove corpus-specific tag maps
Remove corpus-specific tag maps from the language data for languages
without custom tokenizers. For languages with custom word segmenters
that also provide tags (Japanese and Korean), the tag maps for the
custom tokenizers are kept as the default.

The default tag maps for languages without custom tokenizers are now the
default tag map from `lang/tag_map.py`, which maps UPOS -> UPOS.
2020-07-15 15:58:29 +02:00

82 lines
3.1 KiB

from ...util import get_doc
def test_en_parser_noun_chunks_standard(en_tokenizer):
    """A lone subject noun phrase is returned as exactly one noun chunk.

    The sentence is tokenized with the ``en_tokenizer`` fixture and a doc is
    built via ``get_doc`` from hand-written POS tags, dependency labels and
    heads, so the test does not depend on a trained model.
    """
    text = "A base phrase should be recognized."
    # NOTE(review): heads look like offsets relative to each token, with 0
    # marking the ROOT token -- confirm against the get_doc helper.
    heads = [2, 1, 3, 2, 1, 0, -1]
    pos = ["DET", "ADJ", "NOUN", "AUX", "VERB", "VERB", "PUNCT"]
    deps = ["det", "amod", "nsubjpass", "aux", "auxpass", "ROOT", "punct"]
    tokens = en_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
    )
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 1
    assert chunks[0].text_with_ws == "A base phrase "
def test_en_parser_noun_chunks_coordinated(en_tokenizer):
    """Two coordinated noun phrases are returned as two separate chunks.

    The doc is assembled with ``get_doc`` from hand-written annotations; the
    expected chunks are the subject ("A base phrase") and its conjunct
    ("a good phrase").
    """
    # fmt: off
    text = "A base phrase and a good phrase are often the same."
    # NOTE(review): heads look like offsets relative to each token, with 0
    # marking the ROOT token -- confirm against the get_doc helper.
    heads = [2, 1, 5, -1, 2, 1, -4, 0, -1, 1, -3, -4]
    pos = ["DET", "NOUN", "NOUN", "CCONJ", "DET", "ADJ", "NOUN", "VERB", "ADV", "DET", "ADJ", "PUNCT"]
    deps = ["det", "compound", "nsubj", "cc", "det", "amod", "conj", "ROOT", "advmod", "det", "attr", "punct"]
    # fmt: on
    tokens = en_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
    )
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 2
    assert chunks[0].text_with_ws == "A base phrase "
    assert chunks[1].text_with_ws == "a good phrase "
def test_en_parser_noun_chunks_pp_chunks(en_tokenizer):
    """A noun phrase inside a prepositional phrase is its own chunk.

    The doc is assembled with ``get_doc`` from hand-written annotations; the
    expected chunks are the subject ("A phrase") and the object of the
    preposition ("another phrase").
    """
    text = "A phrase with another phrase occurs."
    # NOTE(review): heads look like offsets relative to each token, with 0
    # marking the ROOT token -- confirm against the get_doc helper.
    heads = [1, 4, -1, 1, -2, 0, -1]
    pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB", "PUNCT"]
    deps = ["det", "nsubj", "prep", "det", "pobj", "ROOT", "punct"]
    tokens = en_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
    )
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 2
    assert chunks[0].text_with_ws == "A phrase "
    assert chunks[1].text_with_ws == "another phrase "
def test_en_parser_noun_chunks_appositional_modifiers(en_tokenizer):
    """An appositive yields its own chunk next to the phrase it modifies.

    The doc is assembled with ``get_doc`` from hand-written annotations; the
    expected chunks are the subject ("Sam"), the appositive ("my brother")
    and the object of the preposition ("the house").
    """
    # fmt: off
    text = "Sam, my brother, arrived to the house."
    # NOTE(review): heads look like offsets relative to each token, with 0
    # marking the ROOT token -- confirm against the get_doc helper.
    heads = [5, -1, 1, -3, -4, 0, -1, 1, -2, -4]
    pos = ["PROPN", "PUNCT", "DET", "NOUN", "PUNCT", "VERB", "ADP", "DET", "NOUN", "PUNCT"]
    deps = ["nsubj", "punct", "poss", "appos", "punct", "ROOT", "prep", "det", "pobj", "punct"]
    # fmt: on
    tokens = en_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
    )
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 3
    assert chunks[0].text_with_ws == "Sam "
    assert chunks[1].text_with_ws == "my brother "
    assert chunks[2].text_with_ws == "the house "
def test_en_parser_noun_chunks_dative(en_tokenizer):
    """Subject, dative object and direct object each form a noun chunk.

    The doc is assembled with ``get_doc`` from hand-written annotations; the
    expected chunks are "She", "Bob" and "a raise", in document order.
    """
    text = "She gave Bob a raise."
    # NOTE(review): heads look like offsets relative to each token, with 0
    # marking the ROOT token -- confirm against the get_doc helper.
    heads = [1, 0, -1, 1, -3, -4]
    pos = ["PRON", "VERB", "PROPN", "DET", "NOUN", "PUNCT"]
    deps = ["nsubj", "ROOT", "dative", "det", "dobj", "punct"]
    tokens = en_tokenizer(text)
    doc = get_doc(
        tokens.vocab, words=[t.text for t in tokens], pos=pos, deps=deps, heads=heads
    )
    chunks = list(doc.noun_chunks)
    assert len(chunks) == 3
    assert chunks[0].text_with_ws == "She "
    assert chunks[1].text_with_ws == "Bob "
    assert chunks[2].text_with_ws == "a raise "