diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py
index 14e4bc44b..60cc66d66 100644
--- a/spacy/tests/doc/test_retokenize_merge.py
+++ b/spacy/tests/doc/test_retokenize_merge.py
@@ -21,11 +21,13 @@ def test_doc_retokenize_merge(en_tokenizer):
     assert doc[4].text == "the beach boys"
     assert doc[4].text_with_ws == "the beach boys "
     assert doc[4].tag_ == "NAMED"
+    assert doc[4].lemma_ == "LEMMA"
     assert str(doc[4].morph) == "Number=Plur"
     assert doc[5].text == "all night"
     assert doc[5].text_with_ws == "all night"
     assert doc[5].tag_ == "NAMED"
     assert str(doc[5].morph) == "Number=Plur"
+    assert doc[5].lemma_ == "LEMMA"


 def test_doc_retokenize_merge_children(en_tokenizer):
@@ -103,25 +105,29 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer):

 def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     words = ["The", "players", "start", "."]
+    lemmas = [t.lower() for t in words]
     heads = [1, 2, 2, 2]
     tags = ["DT", "NN", "VBZ", "."]
     pos = ["DET", "NOUN", "VERB", "PUNCT"]
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
     assert doc[0].pos_ == "DET"
+    assert doc[0].lemma_ == "the"
     with doc.retokenize() as retokenizer:
         retokenizer.merge(doc[0:2])
     assert len(doc) == 3
     assert doc[0].text == "The players"
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
+    assert doc[0].lemma_ == "the players"
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
     assert doc[0].pos_ == "DET"
+    assert doc[0].lemma_ == "the"
     with doc.retokenize() as retokenizer:
         retokenizer.merge(doc[0:2])
         retokenizer.merge(doc[2:4])
@@ -129,9 +135,11 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     assert doc[0].text == "The players"
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
+    assert doc[0].lemma_ == "the players"
     assert doc[1].text == "start ."
     assert doc[1].tag_ == "VBZ"
     assert doc[1].pos_ == "VERB"
+    assert doc[1].lemma_ == "start ."


 def test_doc_retokenize_spans_merge_heads(en_vocab):
diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index 30f945165..21c3ffd4b 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -39,6 +39,36 @@ def test_doc_retokenize_split(en_vocab):
     assert len(str(doc)) == 19


+def test_doc_retokenize_split_lemmas(en_vocab):
+    # If lemmas are not set, leave unset
+    words = ["LosAngeles", "start", "."]
+    heads = [1, 2, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(
+            doc[0],
+            ["Los", "Angeles"],
+            [(doc[0], 1), doc[1]],
+        )
+    assert doc[0].lemma_ == ""
+    assert doc[1].lemma_ == ""
+
+    # If lemmas are set, use split orth as default lemma
+    words = ["LosAngeles", "start", "."]
+    heads = [1, 2, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
+    for t in doc:
+        t.lemma_ = "a"
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(
+            doc[0],
+            ["Los", "Angeles"],
+            [(doc[0], 1), doc[1]],
+        )
+    assert doc[0].lemma_ == "Los"
+    assert doc[1].lemma_ == "Angeles"
+
+
 def test_doc_retokenize_split_dependencies(en_vocab):
     doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
     dep1 = doc.vocab.strings.add("amod")
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index 0069e36bf..ed8c4323e 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -188,8 +188,15 @@ def _merge(Doc doc, merges):
                 and doc.c[start - 1].ent_type == token.ent_type:
             merged_iob = 1
         token.ent_iob = merged_iob
+        # Set lemma to concatenated lemmas
+        merged_lemma = ""
+        for span_token in span:
+            merged_lemma += span_token.lemma_
+            if doc.c[span_token.i].spacy:
+                merged_lemma += " "
+        merged_lemma = merged_lemma.strip()
+        token.lemma = doc.vocab.strings.add(merged_lemma)
         # Unset attributes that don't match new token
-        token.lemma = 0
         token.norm = 0
         tokens[merge_index] = token
     # Resize the doc.tensor, if it's set. Let the last row for each token stand
@@ -335,7 +342,9 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
         token = &doc.c[token_index + i]
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
-        token.lemma = 0  # reset lemma
+        # If lemma is currently set, set default lemma to orth
+        if token.lemma != 0:
+            token.lemma = lex.orth
         token.norm = 0  # reset norm
         if to_process_tensor:
             # setting the tensors of the split tokens to array of zeros
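
Not part of the patch: a minimal usage sketch of the user-facing defaults these changes introduce, mirroring the tests above (bare Vocab, illustrative words and lemmas).

from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()

# Merging: the merged token's lemma defaults to the space-joined
# lemmas of the original span tokens.
doc = Doc(
    vocab,
    words=["the", "beach", "boys", "rock"],
    lemmas=["the", "beach", "boy", "rock"],
)
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:3])
assert doc[0].lemma_ == "the beach boy"

# Splitting: if the original token had a lemma set, each new token's
# lemma defaults to its own orth; if no lemma was set, it stays unset.
doc = Doc(vocab, words=["LosAngeles", "start", "."], heads=[1, 2, 2])
for t in doc:
    t.lemma_ = "a"
with doc.retokenize() as retokenizer:
    retokenizer.split(doc[0], ["Los", "Angeles"], [(doc[0], 1), doc[1]])
assert [t.lemma_ for t in doc[:2]] == ["Los", "Angeles"]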