Set default lemmas in retokenizer (#6667)
Instead of unsetting lemmas on retokenized tokens, set the default lemmas to:

* merge: concatenate any existing lemmas with `SPACY` preserved
* split: use the new `ORTH` values if lemmas were previously set, otherwise leave unset
This commit is contained in:
parent 0041dfbc7f
commit bf9096437e
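
To illustrate the behavior described in the commit message, here is a minimal usage sketch (the sentence, the hand-assigned lemma values, and the blank English pipeline are illustrative assumptions, not part of this commit):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["New", "York", "is", "big"])
# Assign lemmas by hand so the retokenizer has something to preserve.
for token, lemma in zip(doc, ["new", "york", "be", "big"]):
    token.lemma_ = lemma

# Merge: the default lemma is the concatenation of the existing lemmas,
# with a space wherever the original token had trailing whitespace (SPACY).
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2])
assert doc[0].lemma_ == "new york"

# Split: lemmas were previously set, so the new tokens default to their ORTH values.
with doc.retokenize() as retokenizer:
    retokenizer.split(doc[0], ["New", "York"], [(doc[0], 1), doc[1]])
assert doc[0].lemma_ == "New"
assert doc[1].lemma_ == "York"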
@@ -21,11 +21,13 @@ def test_doc_retokenize_merge(en_tokenizer):
     assert doc[4].text == "the beach boys"
     assert doc[4].text_with_ws == "the beach boys "
     assert doc[4].tag_ == "NAMED"
+    assert doc[4].lemma_ == "LEMMA"
     assert str(doc[4].morph) == "Number=Plur"
     assert doc[5].text == "all night"
     assert doc[5].text_with_ws == "all night"
     assert doc[5].tag_ == "NAMED"
     assert str(doc[5].morph) == "Number=Plur"
+    assert doc[5].lemma_ == "LEMMA"


 def test_doc_retokenize_merge_children(en_tokenizer):

@@ -103,25 +105,29 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer):

 def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     words = ["The", "players", "start", "."]
+    lemmas = [t.lower() for t in words]
     heads = [1, 2, 2, 2]
     tags = ["DT", "NN", "VBZ", "."]
     pos = ["DET", "NOUN", "VERB", "PUNCT"]
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
     assert doc[0].pos_ == "DET"
+    assert doc[0].lemma_ == "the"
     with doc.retokenize() as retokenizer:
         retokenizer.merge(doc[0:2])
     assert len(doc) == 3
     assert doc[0].text == "The players"
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
+    assert doc[0].lemma_ == "the players"
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
     assert doc[0].pos_ == "DET"
+    assert doc[0].lemma_ == "the"
     with doc.retokenize() as retokenizer:
         retokenizer.merge(doc[0:2])
         retokenizer.merge(doc[2:4])

@@ -129,9 +135,11 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     assert doc[0].text == "The players"
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
+    assert doc[0].lemma_ == "the players"
     assert doc[1].text == "start ."
     assert doc[1].tag_ == "VBZ"
     assert doc[1].pos_ == "VERB"
+    assert doc[1].lemma_ == "start ."


 def test_doc_retokenize_spans_merge_heads(en_vocab):

@@ -39,6 +39,36 @@ def test_doc_retokenize_split(en_vocab):
     assert len(str(doc)) == 19


+def test_doc_retokenize_split_lemmas(en_vocab):
+    # If lemmas are not set, leave unset
+    words = ["LosAngeles", "start", "."]
+    heads = [1, 2, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(
+            doc[0],
+            ["Los", "Angeles"],
+            [(doc[0], 1), doc[1]],
+        )
+    assert doc[0].lemma_ == ""
+    assert doc[1].lemma_ == ""
+
+    # If lemmas are set, use split orth as default lemma
+    words = ["LosAngeles", "start", "."]
+    heads = [1, 2, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
+    for t in doc:
+        t.lemma_ = "a"
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(
+            doc[0],
+            ["Los", "Angeles"],
+            [(doc[0], 1), doc[1]],
+        )
+    assert doc[0].lemma_ == "Los"
+    assert doc[1].lemma_ == "Angeles"
+
+
 def test_doc_retokenize_split_dependencies(en_vocab):
     doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
     dep1 = doc.vocab.strings.add("amod")

@@ -188,8 +188,15 @@ def _merge(Doc doc, merges):
                 and doc.c[start - 1].ent_type == token.ent_type:
             merged_iob = 1
         token.ent_iob = merged_iob
+        # Set lemma to concatenated lemmas
+        merged_lemma = ""
+        for span_token in span:
+            merged_lemma += span_token.lemma_
+            if doc.c[span_token.i].spacy:
+                merged_lemma += " "
+        merged_lemma = merged_lemma.strip()
+        token.lemma = doc.vocab.strings.add(merged_lemma)
         # Unset attributes that don't match new token
-        token.lemma = 0
         token.norm = 0
         tokens[merge_index] = token
         # Resize the doc.tensor, if it's set. Let the last row for each token stand

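The concatenation in the merge hunk above can be read as this pure-Python sketch over a span's public attributes; `Token.whitespace_` reflects the same `SPACY` (trailing whitespace) flag the Cython code checks, and the helper name is hypothetical, not part of the spaCy API:

def merged_lemma(span):
    # Join each token's lemma, inserting a space only where the original
    # token was followed by whitespace, then strip surrounding whitespace
    # (mirroring the .strip() in the hunk above).
    pieces = []
    for token in span:
        pieces.append(token.lemma_)
        if token.whitespace_:
            pieces.append(" ")
    return "".join(pieces).strip()

Because the whitespace flag is honoured, a span tokenized as ["non", "-", "profit"] with lemmas equal to the token texts merges to the default lemma "non-profit" rather than "non - profit".
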
@@ -335,7 +342,9 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
         token = &doc.c[token_index + i]
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
-        token.lemma = 0 # reset lemma
+        # If lemma is currently set, set default lemma to orth
+        if token.lemma != 0:
+            token.lemma = lex.orth
         token.norm = 0 # reset norm
         if to_process_tensor:
             # setting the tensors of the split tokens to array of zeros