Mirror of https://github.com/explosion/spaCy.git
Set default lemmas in retokenizer (#6667)

Instead of unsetting lemmas on retokenized tokens, set the default lemmas to:

* merge: concatenate any existing lemmas, with `SPACY` (trailing whitespace) preserved
* split: use the new `ORTH` values if lemmas were previously set, otherwise leave unset
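Not part of the commit itself, but as a quick illustration of the merge-side default, here is a minimal sketch assuming this patch is applied and using a blank English pipeline:

import spacy

nlp = spacy.blank("en")
doc = nlp("New York is big")
# Set lemmas on the tokens that will be merged
doc[0].lemma_ = "new"
doc[1].lemma_ = "york"
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[0:2])
# The merged token's default lemma is the concatenation of the old lemmas,
# with the whitespace between them preserved
assert doc[0].lemma_ == "new york"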
This commit is contained in:
parent 0041dfbc7f
commit bf9096437e
@@ -21,11 +21,13 @@ def test_doc_retokenize_merge(en_tokenizer):
     assert doc[4].text == "the beach boys"
     assert doc[4].text_with_ws == "the beach boys "
     assert doc[4].tag_ == "NAMED"
+    assert doc[4].lemma_ == "LEMMA"
     assert str(doc[4].morph) == "Number=Plur"
     assert doc[5].text == "all night"
     assert doc[5].text_with_ws == "all night"
     assert doc[5].tag_ == "NAMED"
     assert str(doc[5].morph) == "Number=Plur"
+    assert doc[5].lemma_ == "LEMMA"


 def test_doc_retokenize_merge_children(en_tokenizer):
@@ -103,25 +105,29 @@ def test_doc_retokenize_spans_merge_tokens(en_tokenizer):


 def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     words = ["The", "players", "start", "."]
+    lemmas = [t.lower() for t in words]
     heads = [1, 2, 2, 2]
     tags = ["DT", "NN", "VBZ", "."]
     pos = ["DET", "NOUN", "VERB", "PUNCT"]
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
     assert doc[0].pos_ == "DET"
+    assert doc[0].lemma_ == "the"
     with doc.retokenize() as retokenizer:
         retokenizer.merge(doc[0:2])
     assert len(doc) == 3
     assert doc[0].text == "The players"
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
-    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads)
+    assert doc[0].lemma_ == "the players"
+    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, heads=heads, lemmas=lemmas)
     assert len(doc) == 4
     assert doc[0].text == "The"
     assert doc[0].tag_ == "DT"
     assert doc[0].pos_ == "DET"
+    assert doc[0].lemma_ == "the"
     with doc.retokenize() as retokenizer:
         retokenizer.merge(doc[0:2])
         retokenizer.merge(doc[2:4])
@@ -129,9 +135,11 @@ def test_doc_retokenize_spans_merge_tokens_default_attrs(en_vocab):
     assert doc[0].text == "The players"
     assert doc[0].tag_ == "NN"
     assert doc[0].pos_ == "NOUN"
+    assert doc[0].lemma_ == "the players"
     assert doc[1].text == "start ."
     assert doc[1].tag_ == "VBZ"
     assert doc[1].pos_ == "VERB"
+    assert doc[1].lemma_ == "start ."


 def test_doc_retokenize_spans_merge_heads(en_vocab):

@@ -39,6 +39,36 @@ def test_doc_retokenize_split(en_vocab):
     assert len(str(doc)) == 19


+def test_doc_retokenize_split_lemmas(en_vocab):
+    # If lemmas are not set, leave unset
+    words = ["LosAngeles", "start", "."]
+    heads = [1, 2, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(
+            doc[0],
+            ["Los", "Angeles"],
+            [(doc[0], 1), doc[1]],
+        )
+    assert doc[0].lemma_ == ""
+    assert doc[1].lemma_ == ""
+
+    # If lemmas are set, use split orth as default lemma
+    words = ["LosAngeles", "start", "."]
+    heads = [1, 2, 2]
+    doc = Doc(en_vocab, words=words, heads=heads)
+    for t in doc:
+        t.lemma_ = "a"
+    with doc.retokenize() as retokenizer:
+        retokenizer.split(
+            doc[0],
+            ["Los", "Angeles"],
+            [(doc[0], 1), doc[1]],
+        )
+    assert doc[0].lemma_ == "Los"
+    assert doc[1].lemma_ == "Angeles"
+
+
 def test_doc_retokenize_split_dependencies(en_vocab):
     doc = Doc(en_vocab, words=["LosAngeles", "start", "."])
     dep1 = doc.vocab.strings.add("amod")

@@ -188,8 +188,15 @@ def _merge(Doc doc, merges):
                     and doc.c[start - 1].ent_type == token.ent_type:
                 merged_iob = 1
         token.ent_iob = merged_iob
+        # Set lemma to concatenated lemmas
+        merged_lemma = ""
+        for span_token in span:
+            merged_lemma += span_token.lemma_
+            if doc.c[span_token.i].spacy:
+                merged_lemma += " "
+        merged_lemma = merged_lemma.strip()
+        token.lemma = doc.vocab.strings.add(merged_lemma)
         # Unset attributes that don't match new token
-        token.lemma = 0
         token.norm = 0
         tokens[merge_index] = token
     # Resize the doc.tensor, if it's set. Let the last row for each token stand
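For readers less familiar with the Cython internals, the concatenation above can be mirrored in pure Python (a hypothetical helper, not part of the patch); `doc.c[span_token.i].spacy` corresponds to the truthiness of `Token.whitespace_` in the Python API:

def merged_lemma(span):
    # Join the lemmas of the tokens in the span, keeping a space wherever
    # the original token was followed by whitespace, then strip the tail
    out = ""
    for token in span:
        out += token.lemma_
        if token.whitespace_:  # Python-level equivalent of the SPACY flag
            out += " "
    return out.strip()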
@@ -335,7 +342,9 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
         token = &doc.c[token_index + i]
         lex = doc.vocab.get(doc.mem, orth)
         token.lex = lex
-        token.lemma = 0  # reset lemma
+        # If lemma is currently set, set default lemma to orth
+        if token.lemma != 0:
+            token.lemma = lex.orth
         token.norm = 0  # reset norm
         if to_process_tensor:
             # setting the tensors of the split tokens to array of zeros
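Again not part of the commit, a minimal sketch of the split-side default, assuming the patch is applied and a blank English pipeline: a lemma that was previously set is replaced by each subtoken's new `ORTH`, while unset lemmas stay unset.

import spacy

nlp = spacy.blank("en")
doc = nlp("LosAngeles is big")
doc[0].lemma_ = "losangeles"  # lemma is set, so the split tokens default to their ORTH
with doc.retokenize() as retokenizer:
    retokenizer.split(doc[0], ["Los", "Angeles"], heads=[(doc[0], 1), doc[1]])
assert doc[0].lemma_ == "Los"
assert doc[1].lemma_ == "Angeles"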