Set default lemmas in retokenizer (#6667)

Instead of unsetting lemmas on retokenized tokens, set the default
lemmas as follows (a usage sketch follows this list):

* merge: concatenate any existing lemmas, preserving the tokens'
  trailing whitespace (`SPACY`)
* split: use the new `ORTH` values if lemmas were previously set,
  otherwise leave unset
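
A minimal sketch of the merge case, mirroring the updated default-attrs test
below (`Vocab()` stands in for the test fixture's `en_vocab`; the words and
lemmas are illustrative):

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    words = ["The", "players", "start", "."]
    doc = Doc(Vocab(), words=words, heads=[1, 2, 2, 2],
              lemmas=[w.lower() for w in words])
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:2])
    # The merged token's lemma now defaults to the old lemmas joined on the
    # tokens' trailing whitespace (SPACY) instead of being unset.
    assert doc[0].lemma_ == "the players"
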
Adriane Boyd 2021-01-06 05:29:44 +01:00 committed by GitHub
parent 0041dfbc7f
commit bf9096437e
3 changed files with 51 additions and 4 deletions


@@ -21,11 +21,13 @@ def test_doc_retokenize_merge(en_tokenizer):
     assert doc[4].text == "the beach boys"
     assert doc[4].text_with_ws == "the beach boys "
     assert doc[4].tag_ == "NAMED"
+    assert doc[4].lemma_ == "LEMMA"
     assert str(doc[4].morph) == "Number=Plur"
     assert doc[5].text == "all night"
     assert doc[5].text_with_ws == "all night"
     assert doc[5].tag_ == "NAMED"
     assert str(doc[5].morph) == "Number=Plur"
+    assert doc[5].lemma_ == "LEMMA"


 def test_doc_retokenize_merge_children(en_tokenizer):
@@ -103,25 +105,29 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer):

 def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     words = ["The", "players", "start", "."]
+    lemmas = [t.lower() for t in words]
     heads = [1, 2, 2, 2]
     tags = ["DT", "NN", "VBZ", "."]
     pos = ["DET", "NOUN", "VERB", "PUNCT"]
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
     assert doc[0].pos_ == "DET"
+    assert doc[0].lemma_ == "the"
     with doc.retokenize() as retokenizer:
         retokenizer.merge(doc[0:2])
     assert len(doc) == 3
     assert doc[0].text == "The players"
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
+    assert doc[0].lemma_ == "the players"
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
     assert doc[0].pos_ == "DET"
+    assert doc[0].lemma_ == "the"
     with doc.retokenize() as retokenizer:
         retokenizer.merge(doc[0:2])
         retokenizer.merge(doc[2:4])
@@ -129,9 +135,11 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     assert doc[0].text == "The players"
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
+    assert doc[0].lemma_ == "the players"
     assert doc[1].text == "start ."
     assert doc[1].tag_ == "VBZ"
     assert doc[1].pos_ == "VERB"
+    assert doc[1].lemma_ == "start ."


 def test_doc_retokenize_spans_merge_heads(en_vocab):


@@ -39,6 +39,36 @@ def test_doc_retokenize_split(en_vocab):
     assert len(str(doc)) == 19


+def test_doc_retokenize_split_lemmas(en_vocab):
+    # If lemmas are not set, leave unset
+    words = ["LosAngeles", "start", "."]
+    heads = [1, 2, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(
+            doc[0],
+            ["Los", "Angeles"],
+            [(doc[0], 1), doc[1]],
+        )
+    assert doc[0].lemma_ == ""
+    assert doc[1].lemma_ == ""
+
+    # If lemmas are set, use split orth as default lemma
+    words = ["LosAngeles", "start", "."]
+    heads = [1, 2, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
+    for t in doc:
+        t.lemma_ = "a"
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(
+            doc[0],
+            ["Los", "Angeles"],
+            [(doc[0], 1), doc[1]],
+        )
+    assert doc[0].lemma_ == "Los"
+    assert doc[1].lemma_ == "Angeles"
+
+
 def test_doc_retokenize_split_dependencies(en_vocab):
     doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
     dep1 = doc.vocab.strings.add("amod")


@@ -188,8 +188,15 @@ def _merge(Doc doc, merges):
                 and doc.c[start - 1].ent_type == token.ent_type:
             merged_iob = 1
         token.ent_iob = merged_iob
+        # Set lemma to concatenated lemmas
+        merged_lemma = ""
+        for span_token in span:
+            merged_lemma += span_token.lemma_
+            if doc.c[span_token.i].spacy:
+                merged_lemma += " "
+        merged_lemma = merged_lemma.strip()
+        token.lemma = doc.vocab.strings.add(merged_lemma)
         # Unset attributes that don't match new token
-        token.lemma = 0
         token.norm = 0
         tokens[merge_index] = token
     # Resize the doc.tensor, if it's set. Let the last row for each token stand
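
At the Python level, the concatenation above behaves roughly like this sketch
(`default_merged_lemma` is a hypothetical helper, not part of the diff;
`whitespace_` stands in for the C-level `SPACY` flag):

    from spacy.tokens import Span

    def default_merged_lemma(span: Span) -> str:
        # Join each token's lemma, inserting a space wherever the token has
        # trailing whitespace, then strip the trailing space (hypothetical
        # helper illustrating the Cython code above).
        return "".join(t.lemma_ + (" " if t.whitespace_ else "") for t in span).strip()
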
@@ -335,7 +342,9 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
         token = &doc.c[token_index + i]
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
-        token.lemma = 0  # reset lemma
+        # If lemma is currently set, set default lemma to orth
+        if token.lemma != 0:
+            token.lemma = lex.orth
         token.norm = 0  # reset norm
         if to_process_tensor:
             # setting the tensors of the split tokens to array of zeros
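
From the user's perspective, the split-side default amounts to the following
sketch, mirroring the new test above (`Vocab()` stands in for the `en_vocab`
fixture; the placeholder lemma "a" is illustrative):

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    words = ["LosAngeles", "start", "."]
    doc = Doc(Vocab(), words=words, heads=[1, 2, 2])
    for t in doc:
        t.lemma_ = "a"  # lemmas are set before the split
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["Los", "Angeles"], [(doc[0], 1), doc[1]])
    # Because lemmas were set, each new token's lemma defaults to its ORTH.
    assert doc[0].lemma_ == "Los"
    assert doc[1].lemma_ == "Angeles"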