mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 21:51:24 +03:00 
			
		
		
		
	Fix Dutch noun chunks to skip overlapping spans (#11275)
* Add test for overlapping noun chunks
* Skip overlapping noun chunks
* Update spacy/tests/lang/nl/test_noun_chunks.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									231a17817d
								
							
						
					
					
						commit
						ed4ad309e6
					
				|  | @@ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: | |||
|     span_label = doc.vocab.strings.add("NP") | ||||
| 
 | ||||
|     # Only NOUNS and PRONOUNS matter | ||||
|     end_span = -1 | ||||
|     for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)): | ||||
|         # For NOUNS | ||||
|         # Pick children from syntactic parse (only those with certain dependencies) | ||||
|  | @@ -58,6 +59,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: | |||
|             children_i = [c.i for c in children] + [word.i] | ||||
| 
 | ||||
|             start_span = min(children_i) | ||||
|             if start_span >= end_span: | ||||
|                 end_span = max(children_i) + 1 | ||||
|                 yield start_span, end_span, span_label | ||||
| 
 | ||||
|  | @@ -65,6 +67,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: | |||
|         elif word.pos == PRON: | ||||
|             if word.dep in pronoun_deps: | ||||
|                 start_span = word.i | ||||
|                 if start_span >= end_span: | ||||
|                     end_span = word.i + 1 | ||||
|                     yield start_span, end_span, span_label | ||||
| 
 | ||||
|  |  | |||
|  | @@ -1,5 +1,6 @@ | |||
| from spacy.tokens import Doc | ||||
| import pytest | ||||
| from spacy.tokens import Doc | ||||
| from spacy.util import filter_spans | ||||
| 
 | ||||
| 
 | ||||
| @pytest.fixture | ||||
|  | @@ -207,3 +208,18 @@ def test_chunking(nl_sample, nl_reference_chunking): | |||
|     """ | ||||
|     chunks = [s.text.lower() for s in nl_sample.noun_chunks] | ||||
|     assert chunks == nl_reference_chunking | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.issue(10846) | ||||
| def test_no_overlapping_chunks(nl_vocab): | ||||
|     # fmt: off | ||||
|     doc = Doc( | ||||
|         nl_vocab, | ||||
|         words=["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"], | ||||
|         deps=["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"], | ||||
|         heads=[1, 3, 3, 3, 8, 8, 5, 8, 3], | ||||
|         pos=["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"], | ||||
|     ) | ||||
|     # fmt: on | ||||
|     chunks = list(doc.noun_chunks) | ||||
|     assert filter_spans(chunks) == chunks | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user