mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 21:51:24 +03:00 
			
		
		
		
	Fix Dutch noun chunks to skip overlapping spans (#11275)
* Add test for overlapping noun chunks
* Skip overlapping noun chunks
* Update spacy/tests/lang/nl/test_noun_chunks.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									231a17817d
								
							
						
					
					
						commit
						ed4ad309e6
					
				|  | @ -40,6 +40,7 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: | ||||||
|     span_label = doc.vocab.strings.add("NP") |     span_label = doc.vocab.strings.add("NP") | ||||||
| 
 | 
 | ||||||
|     # Only NOUNS and PRONOUNS matter |     # Only NOUNS and PRONOUNS matter | ||||||
|  |     end_span = -1 | ||||||
|     for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)): |     for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)): | ||||||
|         # For NOUNS |         # For NOUNS | ||||||
|         # Pick children from syntactic parse (only those with certain dependencies) |         # Pick children from syntactic parse (only those with certain dependencies) | ||||||
|  | @ -58,15 +59,17 @@ def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Tuple[int, int, int]]: | ||||||
|             children_i = [c.i for c in children] + [word.i] |             children_i = [c.i for c in children] + [word.i] | ||||||
| 
 | 
 | ||||||
|             start_span = min(children_i) |             start_span = min(children_i) | ||||||
|             end_span = max(children_i) + 1 |             if start_span >= end_span: | ||||||
|             yield start_span, end_span, span_label |                 end_span = max(children_i) + 1 | ||||||
|  |                 yield start_span, end_span, span_label | ||||||
| 
 | 
 | ||||||
|         # PRONOUNS only if it is the subject of a verb |         # PRONOUNS only if it is the subject of a verb | ||||||
|         elif word.pos == PRON: |         elif word.pos == PRON: | ||||||
|             if word.dep in pronoun_deps: |             if word.dep in pronoun_deps: | ||||||
|                 start_span = word.i |                 start_span = word.i | ||||||
|                 end_span = word.i + 1 |                 if start_span >= end_span: | ||||||
|                 yield start_span, end_span, span_label |                     end_span = word.i + 1 | ||||||
|  |                     yield start_span, end_span, span_label | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} | SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} | ||||||
|  |  | ||||||
|  | @ -1,5 +1,6 @@ | ||||||
| from spacy.tokens import Doc |  | ||||||
| import pytest | import pytest | ||||||
|  | from spacy.tokens import Doc | ||||||
|  | from spacy.util import filter_spans | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
|  | @ -207,3 +208,18 @@ def test_chunking(nl_sample, nl_reference_chunking): | ||||||
|     """ |     """ | ||||||
|     chunks = [s.text.lower() for s in nl_sample.noun_chunks] |     chunks = [s.text.lower() for s in nl_sample.noun_chunks] | ||||||
|     assert chunks == nl_reference_chunking |     assert chunks == nl_reference_chunking | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @pytest.mark.issue(10846) | ||||||
|  | def test_no_overlapping_chunks(nl_vocab): | ||||||
|  |     # fmt: off | ||||||
|  |     doc = Doc( | ||||||
|  |         nl_vocab, | ||||||
|  |         words=["Dit", "programma", "wordt", "beschouwd", "als", "'s", "werelds", "eerste", "computerprogramma"], | ||||||
|  |         deps=["det", "nsubj:pass", "aux:pass", "ROOT", "mark", "det", "fixed", "amod", "xcomp"], | ||||||
|  |         heads=[1, 3, 3, 3, 8, 8, 5, 8, 3], | ||||||
|  |         pos=["DET", "NOUN", "AUX", "VERB", "SCONJ", "DET", "NOUN", "ADJ", "NOUN"], | ||||||
|  |     ) | ||||||
|  |     # fmt: on | ||||||
|  |     chunks = list(doc.noun_chunks) | ||||||
|  |     assert filter_spans(chunks) == chunks | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user